taybeyond committed (verified)
Commit c132fb4 · Parent: 72501ac

Upload 3 files
Files changed (3):
  1. app.py +78 -0
  2. processing_qwen_vl.py +30 -0
  3. requirements.txt +14 -0
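
Together, the three files implement a small Gradio demo: app.py wires a webcam-or-upload posture check around Qwen-VL-Chat-Int4 with spoken feedback via the edge-tts CLI, processing_qwen_vl.py supplies a lightweight processor shim, and requirements.txt lists the dependencies.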
app.py ADDED
@@ -0,0 +1,78 @@
+ import gradio as gr
+ import torch
+ import cv2
+ from PIL import Image
+ from auto_gptq import AutoGPTQForCausalLM
+ from processing_qwen_vl import QWenVLProcessor
+ import os
+
+ model_id = "Qwen/Qwen-VL-Chat-Int4"
+
+ processor = QWenVLProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+ model = AutoGPTQForCausalLM.from_quantized(
+     model_id,
+     device="cuda" if torch.cuda.is_available() else "cpu",
+     trust_remote_code=True,
+ ).eval()
+
+ def capture_photo(filename="sitting.jpg"):
+     # Grab a single frame from the default webcam and save it to disk.
+     cap = cv2.VideoCapture(0)
+     ret, frame = cap.read()
+     cap.release()
+     if ret:
+         cv2.imwrite(filename, frame)
+         return filename
+     return None
+
+ def speak_text(text, lang="zh"):
+     # Synthesize speech with the edge-tts CLI, then play the file back
+     # ("start" on Windows, "afplay" on macOS).
+     voice = "zh-CN-XiaoxiaoNeural" if lang == "zh" else "en-US-AriaNeural"
+     os.system(f'edge-tts --text "{text}" --voice "{voice}" --write-media output.mp3')
+     os.system('start output.mp3' if os.name == 'nt' else 'afplay output.mp3')
+
+ def analyze_posture(image=None, auto_capture=False):
+     if auto_capture:
+         image_path = capture_photo()
+         if image_path is None:
+             return "❌ 無法啟動攝像頭", None  # "Could not open the webcam"
+         image = Image.open(image_path)
+     elif image is None:
+         return "❌ 請上傳圖片或啟用自動拍照", None  # "Please upload an image or enable auto-capture"
+
+     # Ask whether the person shows poor sitting posture (hunched back,
+     # forward lean, or tilt), with the answer in both Chinese and English.
+     question = "請判斷這個人是否坐姿不良，如駝背、前傾或歪斜？用中英文回答。"
+     inputs = processor(text=question, images=image, return_tensors="pt").to(model.device)
+     outputs = model.generate(**inputs, max_new_tokens=512)
+     answer = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+
+     # Read the answer aloud in whichever language(s) it appears to contain.
+     if "請" in answer:
+         speak_text(answer, lang="zh")
+     if "please" in answer.lower():
+         speak_text(answer, lang="en")
+
+     return answer, image
+
+ def run_auto_capture():
+     return analyze_posture(auto_capture=True)
+
+ with gr.Blocks(title="駝背識別助手") as demo:  # "hunchback-detection assistant"
+     gr.Markdown("## 🪑 Qwen-VL-Chat-Int4 駝背識別 Demo")
+
+     with gr.Row():
+         with gr.Column():
+             auto_btn = gr.Button("📷 自動攝像頭拍照並判斷")  # capture from webcam and analyze
+             image_input = gr.Image(type="pil", label="或手動上傳圖片")  # or upload an image manually
+             submit_btn = gr.Button("📤 上傳並判斷")  # upload and analyze
+         with gr.Column():
+             output_text = gr.Textbox(label="🧠 模型判斷結果", lines=6)  # model verdict
+             output_image = gr.Image(type="pil", label="分析圖片")  # analyzed image
+
+     auto_btn.click(fn=run_auto_capture, outputs=[output_text, output_image])
+     submit_btn.click(fn=analyze_posture, inputs=[image_input], outputs=[output_text, output_image])
+
+ demo.launch(share=True)
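
A quick way to sanity-check the webcam path outside the UI is to mirror what capture_photo() does; a minimal sketch, assuming a camera at index 0 and opencv-python installed:

import cv2

# Probe the default webcam the same way capture_photo() in app.py does.
cap = cv2.VideoCapture(0)
ok, frame = cap.read()
cap.release()
print("camera ok:", ok, "frame shape:", frame.shape if ok else None)

If ok prints False, the auto-capture button will return the camera error message rather than an analysis.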
processing_qwen_vl.py ADDED
@@ -0,0 +1,30 @@
+ from transformers import AutoTokenizer, BatchFeature, CLIPImageProcessor
+
+ class QWenVLProcessor:
+     # Minimal stand-in processor: the Qwen tokenizer paired with a CLIP
+     # image processor behind the usual text-plus-images interface.
+     def __init__(self, tokenizer, image_processor):
+         self.tokenizer = tokenizer
+         self.image_processor = image_processor
+
+     @classmethod
+     def from_pretrained(cls, model_id, **kwargs):
+         tokenizer = AutoTokenizer.from_pretrained(model_id, **kwargs)
+         image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
+         return cls(tokenizer=tokenizer, image_processor=image_processor)
+
+     def __call__(self, text=None, images=None, return_tensors=None):
+         if images is not None:
+             image_inputs = self.image_processor(images, return_tensors=return_tensors)
+         else:
+             image_inputs = {}
+         if text is not None:
+             text_inputs = self.tokenizer(text, return_tensors=return_tensors, padding=True)
+         else:
+             text_inputs = {}
+         # Wrap in a BatchFeature so callers can move every tensor at once
+         # with .to(device), as app.py does.
+         return BatchFeature(data={**text_inputs, **image_inputs})
+
+     def batch_decode(self, *args, **kwargs):
+         return self.tokenizer.batch_decode(*args, **kwargs)
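
A minimal sketch of how app.py exercises this shim, assuming the Qwen/Qwen-VL-Chat-Int4 repo is reachable (the blank image is just a placeholder):

from PIL import Image
from processing_qwen_vl import QWenVLProcessor

processor = QWenVLProcessor.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
image = Image.new("RGB", (448, 448))  # placeholder input for illustration
inputs = processor(text="describe this image", images=image, return_tensors="pt")
print(sorted(inputs.keys()))  # expect input_ids/attention_mask plus pixel_values

Note that the shim only concatenates tokenizer and CLIP outputs; it does not reproduce the image-tag prompt format that Qwen-VL's own remote code expects.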
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ gradio>=4.12.0
+ torch>=2.1.0
+ transformers>=4.37.0
+ accelerate
+ matplotlib
+ tiktoken
+ einops
+ transformers_stream_generator
+ torchvision
+
+ opencv-python
+ optimum
+ auto-gptq
+ edge-tts  # provides the edge-tts CLI used by speak_text() in app.py
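
Setup is the usual single step; a CUDA-enabled torch build is needed for the Int4 GPTQ weights to run on GPU, and CPU execution will be very slow:

pip install -r requirements.txt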