chenjoya committed
Commit 82c2aee · 1 Parent(s): c1fab3e
Files changed (2):
  1. app.py +143 -4
  2. demo/infer.py +212 -0
app.py CHANGED
@@ -1,7 +1,146 @@
+import os
 import gradio as gr
 
-def greet(name):
-    return "Hello " + name + "!!"
+from demo.infer import LiveCCDemoInfer
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+class GradioBackend:
+    waiting_video_response = 'Waiting for video input...'
+    not_found_video_response = 'Video does not exist...'
+    mode2api = {
+        'Real-Time Commentary': 'live_cc',
+        'Conversation': 'video_qa'
+    }
+    def __init__(self, model_path: str = 'chenjoya/LiveCC-7B-Instruct'):
+        self.infer = LiveCCDemoInfer(model_path)
+        from kokoro import KPipeline
+        self.audio_pipeline = KPipeline(lang_code='a')
+
+    def __call__(self, query: str = None, state: dict = {}, mode: str = 'Real-Time Commentary', **kwargs):
+        # dispatch to LiveCCDemoInfer.live_cc or .video_qa according to the selected mode
+        return getattr(self.infer, self.mode2api[mode])(query=query, state=state, **kwargs)
+
+gradio_backend = GradioBackend()
+
+with gr.Blocks() as demo:
+    gr.Markdown("## LiveCC Real-Time Commentary and Conversation - Gradio Demo")
+    gr.Markdown("#### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale](https://showlab.github.io/livecc/)")
+    gr_state = gr.State({}, render=False)  # control all useful state, including kv cache
+    gr_video_state = gr.JSON({}, visible=False)  # only record video state, belongs to gr_state but lightweight
+    gr_static_trigger = gr.Number(value=0, visible=False)  # control start streaming or stop
+    gr_dynamic_trigger = gr.Number(value=0, visible=False)  # for continuous refresh
+    with gr.Row():
+        with gr.Column():
+            gr_video = gr.Video(
+                label="video",
+                elem_id="gr_video",
+                visible=True,
+                sources=['upload'],
+                autoplay=True,
+                include_audio=False,
+                width=720,
+                height=480
+            )
+            gr_examples = gr.Examples(
+                examples=[
+                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
+                ],
+                inputs=[gr_video],
+            )
+            gr_clean_button = gr.Button("Clean (Press me before changing video)", elem_id="gr_button")
+
+        with gr.Column():
+            with gr.Row():
+                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)
+
+            def gr_chatinterface_fn(message, history, state, mode):
+                response, state = gradio_backend(query=message, state=state, mode=mode)
+                return response, state
+            def gr_chatinterface_chatbot_clear_fn():
+                return {}, {}, 0, 0
+            gr_chatinterface = gr.ChatInterface(
+                fn=gr_chatinterface_fn,
+                type="messages",
+                additional_inputs=[gr_state, gr_radio_mode],
+                additional_outputs=[gr_state],
+            )
+            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
+            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
+
+            def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
+                # if static_trigger == 0:
+                #     return gr_chatinterface_chatbot_clear_fn()
+                # if video_state['video_path'] != state.get('video_path', None):
+                #     return gr_chatinterface_chatbot_clear_fn()
+                state.update(video_state)
+                query, assistant_waiting_message = None, None
+                for message in history[::-1]:
+                    if message['role'] == 'user':
+                        if message['metadata'] is None or message['metadata'].get('status', '') == '':
+                            query = message['content']
+                            if message['metadata'] is None:
+                                message['metadata'] = {}
+                            message['metadata']['status'] = 'pending'
+                        continue
+                    if query is not None:  # put others as done
+                        message['metadata']['status'] = 'done'
+                    elif message['content'] == GradioBackend.waiting_video_response:
+                        assistant_waiting_message = message
+
+                for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
+                    if start_timestamp >= 0:
+                        response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
+                        if assistant_waiting_message is None:
+                            history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
+                        else:
+                            assistant_waiting_message['content'] = response_with_timestamp
+                            assistant_waiting_message = None
+                    yield history, state, dynamic_trigger
+                # flipping the trigger re-fires gr_dynamic_trigger.change, keeping the streaming loop alive
+                yield history, state, 1 - dynamic_trigger
+
+            js_video_timestamp_fetcher = """
+                (state, video_state) => {
+                    const videoEl = document.querySelector("#gr_video video");
+                    return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
+                }
+            """
+            gr_video.change(fn=lambda: [1, 1], outputs=[gr_static_trigger, gr_dynamic_trigger])
+
+            def gr_get_video_state(video_state):
+                print(video_state)
+                if 'file=' in video_state['video_path']:
+                    video_state['video_path'] = video_state['video_path'].split('file=')[1]
+                return video_state
+            gr_dynamic_trigger.change(
+                fn=gr_get_video_state,
+                inputs=[gr_video_state],
+                outputs=[gr_video_state],
+                js=js_video_timestamp_fetcher
+            ).then(
+                fn=gr_for_streaming,
+                inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
+                outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
+            )
+
+demo.queue(max_size=5, default_concurrency_limit=5)
+demo.launch(share=True)
+
+
+# --- for streaming ---
+
+# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
+# def tts():
+#     while True:
+#         contents = ''
+#         while not gradio_backend.contents.empty():
+#             content = gradio_backend.contents.get()
+#             contents += ' ' + content.rstrip(' ...')
+#         contents = contents.strip()
+#         if contents:
+#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
+#             for _, _, audio_torch in generator:
+#                 audio_np = audio_torch.cpu().numpy()
+#                 max_val = np.max(np.abs(audio_np))
+#                 if max_val > 0:
+#                     audio_np = audio_np / max_val
+#                 audio_int16 = (audio_np * 32767).astype(np.int16)
+#                 yield (24000, audio_int16)
+# gr_video.change(fn=tts, outputs=[gr_tts])
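
For reference, a minimal sketch of the streaming contract the UI above relies on, driven outside Gradio. The video path comes from gr_examples; the playback timestamps are illustrative stand-ins for what js_video_timestamp_fetcher reports from the browser, and note that GradioBackend.__init__ also loads the kokoro TTS pipeline, so kokoro must be installed:

backend = GradioBackend()
state = {'video_path': 'demo/sources/howto_fix_laptop_mute_1080p.mp4'}
for playback_position in (3.0, 6.0, 9.0):  # pretend the browser reports these timestamps
    state['video_timestamp'] = playback_position
    for (start, stop), response, state in backend(query='Please describe the video.', state=state, mode='Real-Time Commentary'):
        print(f'{start:.1f}s-{stop:.1f}s: {response}')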
demo/infer.py ADDED
@@ -0,0 +1,212 @@
+import functools, torch, os, tqdm
+from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
+apply_liger_kernel_to_qwen2_vl()
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
+from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader, _read_video_decord_plus, _spatial_resize_video
+from qwen_vl_utils import process_vision_info
+
+logger = logging.get_logger(__name__)
+
+class ThresholdLogitsProcessor(LogitsProcessor):
+    # masks `token_id` whenever its probability is at or below a threshold that grows by `step` per generated token
+    def __init__(self, token_id: int, base_threshold: float, step: float):
+        self.token_id = token_id
+        self.base_threshold = base_threshold
+        self.step = step
+        self.count = 0
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        threshold = self.base_threshold + self.step * self.count
+        low_confidence = torch.softmax(scores, dim=-1)[:, self.token_id] <= threshold
+        if low_confidence.any():
+            scores[low_confidence, self.token_id] = -float("inf")
+        self.count += 1
+        return scores
+
+class LiveCCDemoInfer:
+    VIDEO_PLAY_END = object()
+    VIDEO_PLAY_CONTINUE = object()
+    fps = 2
+    initial_fps_frames = 6
+    streaming_fps_frames = 2
+    initial_time_interval = initial_fps_frames / fps
+    streaming_time_interval = streaming_fps_frames / fps
+    frame_time_interval = 1 / fps
+    def __init__(self, model_path: str = None, device_id: int = 0):
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_path, torch_dtype="auto",
+            device_map=f'cuda:{device_id}',
+            attn_implementation='flash_attention_2'
+        )
+        self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
+        self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
+        self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
+        message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": 'livecc'},
+            ]
+        }
+        texts = self.processor.apply_chat_template([message], tokenize=False)
+        self.system_prompt_offset = texts.index('<|im_start|>user')
+        self._cached_video_readers_with_hw = {}
+
+    @torch.inference_mode()
+    def live_cc(
+        self,
+        query: str,
+        state: dict,
+        max_pixels: int = 384 * 28 * 28,
+        default_query: str = 'Please describe the video.',
+        do_sample: bool = False,
+        repetition_penalty: float = 1.05,
+        streaming_eos_base_threshold: float = None,
+        streaming_eos_threshold_step: float = None,
+        **kwargs,
+    ):
+        """
+        state: dict, (maybe) with keys:
+            video_path: str, video path
+            video_timestamp: float, current video timestamp
+            last_timestamp: float, last processed video timestamp
+            last_video_pts_index: int, last processed video frame index
+            video_pts: np.ndarray, video pts
+            last_history: list, last processed history
+        """
+        # 1. preparation: video_reader, and last processing info
+        video_timestamp, last_timestamp = state.get('video_timestamp', 0), state.get('last_timestamp', -1 / self.fps)
+        video_path = state['video_path']
+        if video_path not in self._cached_video_readers_with_hw:
+            self._cached_video_readers_with_hw[video_path] = get_smart_resized_video_reader(video_path, max_pixels)
+            video_reader = self._cached_video_readers_with_hw[video_path][0]
+            video_reader.get_frame_timestamp(0)
+            state['video_pts'] = torch.from_numpy(video_reader._frame_pts[:, 1])
+            state['last_video_pts_index'] = -1
+        video_pts = state['video_pts']
+        if last_timestamp + self.frame_time_interval > video_pts[-1]:
+            state['video_end'] = True
+            return
+        video_reader, resized_height, resized_width = self._cached_video_readers_with_hw[video_path]
+        last_video_pts_index = state['last_video_pts_index']
+
+        # 2. which frames will be processed
+        initialized = last_timestamp >= 0
+        if not initialized:
+            video_timestamp = max(video_timestamp, self.initial_time_interval)
+        if video_timestamp <= last_timestamp + self.frame_time_interval:
+            return
+        timestamps = torch.arange(last_timestamp + self.frame_time_interval, video_timestamp, self.frame_time_interval)  # add compensation
+
+        # 3. fetch frames in required timestamps
+        clip, clip_timestamps, clip_idxs = get_smart_resized_clip(video_reader, resized_height, resized_width, timestamps, video_pts, video_pts_index_from=last_video_pts_index+1)
+        state['last_video_pts_index'] = clip_idxs[-1]
+        state['last_timestamp'] = clip_timestamps[-1]
+
+        # 4. organize to interleave frames
+        interleave_clips, interleave_timestamps = [], []
+        if not initialized:
+            interleave_clips.append(clip[:self.initial_fps_frames])
+            interleave_timestamps.append(clip_timestamps[:self.initial_fps_frames])
+            clip = clip[self.initial_fps_frames:]
+            clip_timestamps = clip_timestamps[self.initial_fps_frames:]
+        if len(clip) > 0:
+            interleave_clips.extend(list(clip.split(self.streaming_fps_frames)))
+            interleave_timestamps.extend(list(clip_timestamps.split(self.streaming_fps_frames)))
+
+        # 5. make conversation and send to model
+        for clip, timestamps in zip(interleave_clips, interleave_timestamps):
+            start_timestamp, stop_timestamp = timestamps[0].item(), timestamps[-1].item() + self.frame_time_interval
+            message = {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": f'Time={start_timestamp:.1f}-{stop_timestamp:.1f}s'},
+                    {"type": "video", "video": clip}
+                ]
+            }
+            if not query and not state.get('query', None):
+                query = default_query
+                logger.warning(f'No query provided, use default_query={default_query}')
+            if query and state.get('query', None) != query:
+                message['content'].append({"type": "text", "text": query})
+                state['query'] = query
+            texts = self.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
+            past_ids = state.get('past_ids', None)
+            if past_ids is not None:
+                # continue the running conversation: close the previous turn and drop the system prompt
+                texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
+            inputs = self.processor(
+                text=texts,
+                images=None,
+                videos=[clip],
+                return_tensors="pt",
+                return_attention_mask=False
+            )
+            inputs.to('cuda')
+            if past_ids is not None:
+                inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
+            if streaming_eos_base_threshold is not None:
+                logits_processor = [ThresholdLogitsProcessor(self.streaming_eos_token_id, streaming_eos_base_threshold, streaming_eos_threshold_step)]
+            else:
+                logits_processor = None
+            outputs = self.model.generate(
+                **inputs, past_key_values=state.get('past_key_values', None),
+                return_dict_in_generate=True, do_sample=do_sample,
+                repetition_penalty=repetition_penalty,
+                logits_processor=logits_processor,
+            )
+            state['past_key_values'] = outputs.past_key_values
+            state['past_ids'] = outputs.sequences[:, :-1]
+            yield (start_timestamp, stop_timestamp), self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True), state
+
+    def video_qa(
+        self,
+        query: str,
+        state: dict,
+        answer_prefix: str = '',
+        video_start: float = None,
+        video_end: float = None,
+        strict_fps: bool = False,
+        strict_abcd_ids: list[int] = None,
+        do_sample: bool = False,
+        max_new_tokens: int = 128,
+        **kwargs,
+    ):
+        # conversation mode: answer a query over the whole (or clipped) video, returning (response, state)
+        # so that GradioBackend can dispatch it the same way as live_cc
+        model, processor = self.model, self.processor
+        video_path = state['video_path']
+        if strict_fps:
+            video_inputs, _ = _read_video_decord_plus({'video': video_path, 'video_start': video_start, 'video_end': video_end}, strict_fps=True, drop_last=False)
+            video_inputs = _spatial_resize_video(video_inputs)
+            conversation = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video", "video": video_inputs},
+                        {"type": "text", "text": query},
+                    ],
+                }
+            ]
+            image_inputs = None
+        else:
+            conversation = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video", "video": video_path, "video_start": video_start, "video_end": video_end},
+                        {"type": "text", "text": query},
+                    ],
+                }
+            ]
+            image_inputs, video_inputs = process_vision_info(conversation)
+        text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) + answer_prefix
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            return_tensors="pt",
+        )
+        print(text)
+        inputs = inputs.to("cuda")
+        if not strict_abcd_ids:
+            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=do_sample)
+            output_text = processor.decode(generated_ids[0, inputs.input_ids.size(1):], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        else:
+            # multiple-choice: take one decoding step and compare the scores of the candidate option tokens only
+            outputs = model.generate(**inputs, do_sample=do_sample, top_p=None, temperature=None, top_k=None, max_new_tokens=1, return_dict_in_generate=True, output_scores=True, repetition_penalty=1)
+            print(outputs.scores[0][0, strict_abcd_ids])
+            output_text = ['A', 'B', 'C', 'D'][outputs.scores[0][0, strict_abcd_ids].argmax()]
+        return output_text, state
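
A minimal usage sketch of the multiple-choice path of video_qa, reusing the example video from the demo. The way the A/B/C/D candidate ids are computed here (last token id of each letter under this tokenizer) is an illustrative assumption, not something this commit defines:

infer = LiveCCDemoInfer('chenjoya/LiveCC-7B-Instruct')
abcd_ids = [infer.processor.tokenizer(letter).input_ids[-1] for letter in 'ABCD']  # assumed mapping from option letters to token ids
state = {'video_path': 'demo/sources/howto_fix_laptop_mute_1080p.mp4'}
answer, state = infer.video_qa(
    query='What is the video mainly about? A. Cooking B. Fixing a muted laptop C. Gardening D. Traveling',
    state=state,
    answer_prefix='Answer:',
    strict_abcd_ids=abcd_ids,
)
print(answer)  # one of 'A', 'B', 'C', 'D'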