Upload 3 files
- app.py +73 -0
- processing_qwen_vl.py +26 -0
- requirements.txt +15 -0
app.py
ADDED
@@ -0,0 +1,73 @@
import gradio as gr
import torch
import cv2
from PIL import Image
from auto_gptq import AutoGPTQForCausalLM
from processing_qwen_vl import QWenVLProcessor
import os

model_id = "Qwen/Qwen-VL-Chat-Int4"

processor = QWenVLProcessor.from_pretrained(model_id, trust_remote_code=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device="cuda" if torch.cuda.is_available() else "cpu",
    trust_remote_code=True,
).eval()

def capture_photo(filename="sitting.jpg"):
    """Grab one frame from the default webcam and save it to disk."""
    cap = cv2.VideoCapture(0)
    ret, frame = cap.read()
    cap.release()
    if ret:
        cv2.imwrite(filename, frame)
        return filename
    return None

def speak_text(text, lang="zh"):
    """Synthesize speech with the edge-tts CLI and play it back."""
    voice = "zh-CN-XiaoxiaoNeural" if lang == "zh" else "en-US-AriaNeural"
    os.system(f'edge-tts --text "{text}" --voice "{voice}" --write-media output.mp3')
    # "start" covers Windows, "afplay" covers macOS; Linux needs another player (e.g. mpg123).
    os.system('start output.mp3' if os.name == 'nt' else 'afplay output.mp3')

def analyze_posture(image=None, auto_capture=False):
    if auto_capture:
        image_path = capture_photo()
        if image_path is None:
            return "❌ Unable to start the camera", None
        image = Image.open(image_path)
    elif image is None:
        return "❌ Please upload an image or use auto-capture", None

    question = (
        "Please judge whether this person's sitting posture is poor "
        "(e.g. hunched back, leaning forward, or slouching sideways). "
        "Answer in both Chinese and English."
    )
    inputs = processor(text=question, images=image, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

    # Crude language check on the reply: speak the Chinese part if "請" appears,
    # and the English part if "please" appears.
    if "請" in answer:
        speak_text(answer, lang="zh")
    if "please" in answer.lower():
        speak_text(answer, lang="en")

    return answer, image

def run_auto_capture():
    return analyze_posture(auto_capture=True)

with gr.Blocks(title="Hunchback Detection Assistant") as demo:
    gr.Markdown("## Qwen-VL-Chat-Int4 Hunchback Detection Demo")

    with gr.Row():
        with gr.Column():
            auto_btn = gr.Button("📷 Auto-capture a photo and analyze")
            image_input = gr.Image(type="pil", label="Or upload an image manually")
            submit_btn = gr.Button("📤 Upload and analyze")
        with gr.Column():
            output_text = gr.Textbox(label="🧠 Model verdict", lines=6)
            output_image = gr.Image(type="pil", label="Analyzed image")

    auto_btn.click(fn=run_auto_capture, outputs=[output_text, output_image])
    submit_btn.click(fn=analyze_posture, inputs=[image_input], outputs=[output_text, output_image])

demo.launch(share=True)
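Note on inference: QWenVLProcessor (defined in processing_qwen_vl.py below) is a custom shim that pairs the Qwen tokenizer with a stock CLIP image processor, so the pixel_values it produces are not necessarily what Qwen-VL-Chat-Int4 expects in generate(). If that path misbehaves, a minimal, untested sketch of the chat-style interface described on the Qwen-VL-Chat model card looks roughly like the following; the "sitting.jpg" path is simply the file written by capture_photo().

# Sketch only: chat-style inference following the Qwen-VL-Chat model card; not wired into app.py.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True
).eval()

# Interleave the captured image with the posture question.
query = tokenizer.from_list_format([
    {"image": "sitting.jpg"},  # file written by capture_photo()
    {"text": "Is this person's sitting posture poor (hunched back, leaning forward)? "
             "Answer in both Chinese and English."},
])
response, history = model.chat(tokenizer, query=query, history=None)
print(response)

Switching analyze_posture() over to this interface would make the custom processor below unnecessary, at the cost of relying on the model's remote code.
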
processing_qwen_vl.py
ADDED
@@ -0,0 +1,26 @@
from transformers import AutoTokenizer, BatchFeature, CLIPImageProcessor

class QWenVLProcessor:
    """Minimal shim pairing the Qwen tokenizer with a stock CLIP image processor."""

    def __init__(self, tokenizer, image_processor):
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    @classmethod
    def from_pretrained(cls, model_id, **kwargs):
        tokenizer = AutoTokenizer.from_pretrained(model_id, **kwargs)
        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
        return cls(tokenizer=tokenizer, image_processor=image_processor)

    def __call__(self, text=None, images=None, return_tensors=None):
        if images is not None:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
        else:
            image_inputs = {}
        if text is not None:
            text_inputs = self.tokenizer(text, return_tensors=return_tensors, padding=True)
        else:
            text_inputs = {}
        # Wrap in a BatchFeature so callers can move the whole batch with .to(device).
        return BatchFeature(data={**text_inputs, **image_inputs})

    def batch_decode(self, *args, **kwargs):
        return self.tokenizer.batch_decode(*args, **kwargs)
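For a quick standalone check of this shim, a minimal usage sketch (the "sitting.jpg" filename is only an example, and the exact keys returned depend on the Qwen tokenizer):

# Hypothetical smoke test for QWenVLProcessor; "sitting.jpg" is an example path.
from PIL import Image
from processing_qwen_vl import QWenVLProcessor

processor = QWenVLProcessor.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
image = Image.open("sitting.jpg")
batch = processor(text="Describe this person's posture.", images=image, return_tensors="pt")
print(list(batch.keys()))  # typically input_ids/attention_mask from the tokenizer, pixel_values from CLIP
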
requirements.txt
ADDED
@@ -0,0 +1,15 @@
gradio>=4.12.0
torch>=2.1.0
transformers>=4.37.0
accelerate
matplotlib
tiktoken
einops
transformers_stream_generator
torchvision
opencv-python
optimum
auto-gptq
edge-tts  # CLI used by speak_text() in app.py