|
|
|
conda install -c conda-forge ffmpeg -y |
|
import os |
|
import contextlib |
|
|
|
@contextlib.contextmanager

def new_cd(x):
    d = os.getcwd()

    # Switch to the target directory for the duration of the with-block.
    os.chdir(x)

    try:
        yield

    finally:
        # Always restore the original working directory, even if an error occurs.
        os.chdir(d)
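
A quick usage sketch of new_cd (the directory name here is just an illustration): the working directory is restored even if the body raises.

with new_cd("checkpoints"):   # hypothetical directory
    print(os.getcwd())        # now inside ./checkpoints
print(os.getcwd())            # back to the original directory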
|
from ipex_llm.transformers import AutoModelForCausalLM |
|
from transformers import LlamaTokenizer |
|
|
|
|
|
# Convert the full-precision Llama 2 checkpoint to 4-bit (sym_int4) weights and save
# the low-bit copy (plus its tokenizer) so later runs can load it directly.
llm = AutoModelForCausalLM.from_pretrained("checkpoints\\Llama-2-7b-chat-hf",
                                           load_in_low_bit="sym_int4")
llm.save_low_bit("checkpoints\\Llama-2-7b-chat-hf-INT4")

tokenizer = LlamaTokenizer.from_pretrained("checkpoints\\Llama-2-7b-chat-hf")
tokenizer.save_pretrained("checkpoints\\Llama-2-7b-chat-hf-INT4")
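
Later runs can reload the saved INT4 checkpoint directly instead of re-quantizing; a minimal sketch, assuming ipex_llm's load_low_bit API:

# Reload the previously saved low-bit model and tokenizer (no re-quantization needed).
llm = AutoModelForCausalLM.load_low_bit("checkpoints\\Llama-2-7b-chat-hf-INT4")
tokenizer = LlamaTokenizer.from_pretrained("checkpoints\\Llama-2-7b-chat-hf-INT4")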
|
|
|
from huggingface_hub import snapshot_download |
|
|
|
|
|
snapshot_download(repo_id='openai/clip-vit-base-patch32', |
|
local_dir="./checkpoints/clip-vit-base-patch32") |
|
|
|
|
|
# Llama 2 is a gated model on the Hugging Face Hub: set hf_token to an access token
# that has been granted access to meta-llama/Llama-2-7b-chat-hf.
snapshot_download(repo_id='meta-llama/Llama-2-7b-chat-hf',
                  local_dir="./checkpoints/Llama-2-7b-chat-hf", token=hf_token)
|
|
|
|
|
snapshot_download(repo_id='Helsinki-NLP/opus-mt-en-zh', |
|
local_dir="./checkpoints/Helsinki-NLP-opus-mt-en-zh") |
|
snapshot_download(repo_id='Helsinki-NLP/opus-mt-zh-en', |
|
local_dir="./checkpoints/Helsinki-NLP-opus-mt-zh-en") |
|
|
|
|
|
snapshot_download(repo_id='sentence-transformers/all-MiniLM-L12-v2', |
|
local_dir="./checkpoints/all-MiniLM-L12-v2") |
|
import argparse |
|
import gradio as gr |
|
import os |
|
from models.helperbot_bigdl import Chat |
|
from models.sum_model import Sum |
|
|
|
from models.whisper_model import AudioTranslator |
|
from models.llm_model import LlmReasoner |
|
|
|
import os |
|
from langchain.chains import ConversationalRetrievalChain, StuffDocumentsChain |
|
from langchain.prompts import PromptTemplate |
|
from ipex_llm.langchain.llms import TransformersLLM |
|
from langchain.vectorstores import FAISS |
|
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter |
|
from ipex_llm.langchain.embeddings import TransformersEmbeddings |
|
from langchain import LLMChain |
|
from utils.utils import new_cd |
|
|
|
from ipex_llm.langchain.llms import TransformersLLM |
|
from langchain import LLMChain |
|
from langchain.chains.summarize import load_summarize_chain |
|
from langchain.docstore.document import Document |
|
from langchain.prompts import PromptTemplate |
|
from langchain.chains.combine_documents.stuff import StuffDocumentsChain |
|
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain |
|
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter |
|
|
|
import whisper |
|
from ipex_llm import optimize_model |
|
|
|
def has_intersection(t1, t2): |
|
if t1[1] < t2[0] or t2[1] < t1[0]: |
|
return False |
|
else: |
|
return True |
|
|
|
class AudioTranslator(): |
|
def __init__(self, args): |
|
        # Load the Whisper ASR model, then accelerate it with ipex-llm optimizations.
        self.model = whisper.load_model(args.whisper_version, download_root='checkpoints')
        self.model = optimize_model(self.model)
|
|
|
def __call__(self, video_path): |
|
""" |
|
input: video_path (str) |
|
output: audio_results (list) |
|
""" |
|
print("Extract the audio results.") |
|
        # Whisper's "translate" task outputs English segments regardless of the source language.
        audio_results = self.model.transcribe(video_path, task='translate')["segments"]
|
print("Finished.") |
|
return audio_results |
|
|
|
def match(self, audio_results): |
|
transcript = '' |
|
for res in audio_results: |
|
transcript += res['text'] + ' ' |
|
|
|
|
|
return transcript |
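
A minimal sketch of driving AudioTranslator end to end (the args namespace and file name are hypothetical); each Whisper segment is a dict with keys such as 'start', 'end', and 'text', and match() simply concatenates the 'text' fields:

import argparse

args = argparse.Namespace(whisper_version="small")   # hypothetical args
translator = AudioTranslator(args)
segments = translator("meeting.mp4")                  # hypothetical recording file
transcript = translator.match(segments)
print(transcript)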
|
|
|
class Sum(): |
|
def __init__(self, args): |
|
self.llm_version = args.llm_version |
|
|
|
|
|
def summarize_refine(self, script): |
|
text_splitter = CharacterTextSplitter(chunk_size=1024, separator="\n", chunk_overlap=0) |
|
texts = text_splitter.split_text(script) |
|
docs = [Document(page_content=t) for t in texts] |
|
        llm = TransformersLLM.from_model_id_low_bit(f"checkpoints\\{self.llm_version}")
|
|
|
prompt_template = """Write a concise summary of the following: |
|
{text} |
|
CONCISE SUMMARY:""" |
|
prompt = PromptTemplate.from_template(prompt_template) |
|
refine_template = ( |
|
"Your job is to produce a final summary\n" |
|
"We have provided an existing summary up to a certain point: {existing_answer}\n" |
|
"We have the opportunity to refine the existing summary" |
|
"(only if needed) with some more context below.\n" |
|
"------------\n" |
|
"{text}\n" |
|
"------------\n" |
|
"If the context isn't useful, return the original summary." |
|
) |
|
refine_prompt = PromptTemplate.from_template(refine_template) |
|
chain = load_summarize_chain( |
|
llm=llm, |
|
chain_type="refine", |
|
question_prompt=prompt, |
|
refine_prompt=refine_prompt, |
|
return_intermediate_steps=True, |
|
input_key="input_documents", |
|
output_key="output_text", |
|
) |
|
result = chain({"input_documents": docs}, return_only_outputs=True) |
|
|
|
return result |
|
|
|
def summarize_mapreduce(self, script): |
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0) |
|
texts = text_splitter.split_text(script) |
|
text = [Document(page_content=t) for t in texts] |
|
|
|
        llm = TransformersLLM.from_model_id_low_bit(f"checkpoints\\{self.llm_version}")
|
|
|
|
|
map_template = """The following is a meeting recording |
|
========= |
|
{texts} |
|
========= |
|
Based on this list of recordings, please summarize the main idea briefly
|
Helpful Answer:""" |
|
map_prompt = PromptTemplate.from_template(map_template) |
|
map_chain = LLMChain(llm=llm, prompt=map_prompt, llm_kwargs={"max_new_tokens": 512}) |
|
|
|
|
|
        reduce_template = """The following is a set of summaries:
|
========= |
|
{texts} |
|
========= |
|
Take these and distill them into a final, consolidated summary of the meeting.
|
Helpful Answer:""" |
|
reduce_prompt = PromptTemplate.from_template(reduce_template) |
|
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt, llm_kwargs={"max_new_tokens": 4096}) |
|
|
|
|
|
combine_documents_chain = StuffDocumentsChain( |
|
llm_chain=reduce_chain, document_variable_name="texts" |
|
) |
|
|
|
|
|
reduce_documents_chain = ReduceDocumentsChain( |
|
combine_documents_chain=combine_documents_chain, |
|
collapse_documents_chain=combine_documents_chain, |
|
token_max=4000, |
|
) |
|
|
|
|
|
map_reduce_chain = MapReduceDocumentsChain( |
|
llm_chain=map_chain, |
|
reduce_documents_chain=reduce_documents_chain, |
|
document_variable_name="texts", |
|
return_intermediate_steps=False, |
|
) |
|
|
|
result = map_reduce_chain({"input_documents": text}, return_only_outputs=True) |
|
|
|
|
|
        result = result['output_text'].split("Helpful Answer:")[-1].strip()
|
return result |
|
|
|
def summarize(self, script): |
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0) |
|
texts = text_splitter.split_text(script) |
|
|
|
        prompt_template = """The following is a piece of a meeting recording:
<<<{text}>>>
Based on the recording, summarize the main idea fluently.
JUST THE SUMMARY! NO OTHER WORDS!
SUMMARY:"""
|
|
|
        reduce_template = """The following are pieces of a meeting recording:
<<<{text}>>>
Take these and distill them into a final, consolidated summary of the meeting.
JUST THE SUMMARY! NO OTHER WORDS!
SUMMARY:"""
|
|
|
print(len(texts)) |
|
for text in texts: |
|
print(text) |
|
print("\n") |
|
|
|
        llm = TransformersLLM.from_model_id_low_bit(
            f"checkpoints\\{self.llm_version}")
|
sum_split = [] |
|
|
|
for text in texts: |
|
response = llm(prompt=prompt_template.format(text=text), max_new_tokens=1024) |
|
print(response) |
|
response_answer = response.split("SUMMARY:") |
|
|
|
sum_split.append(response_answer[1]) |
|
|
|
sum_all = "\n".join(sum_split) |
|
|
|
result = llm(prompt=reduce_template.format(text=sum_all), max_new_tokens=4000) |
|
result_split = result.split("SUMMARY:") |
|
return result_split[1] |
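
A minimal sketch of using Sum on its own, assuming the INT4 checkpoint saved earlier and a transcript string produced by the ASR step:

import argparse

args = argparse.Namespace(llm_version="Llama-2-7b-chat-hf-INT4")   # hypothetical args
sumbot = Sum(args)
summary = sumbot.summarize(transcript)   # transcript: str from AudioTranslator.match()
print(summary)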
|
|
|
|
parent_dir = os.path.dirname(__file__) |
|
|
|
condense_template = """ |
|
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. |
|
You can assume the discussion is about the video content. |
|
REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \ |
|
not sure." Don't try to make up an answer. \ |
|
Chat History: |
|
{chat_history} |
|
Follow Up Question: {question} |
|
Standalone question: |
|
""" |
|
|
|
qa_template = """ |
|
You are an AI assistant designed for answering questions about a meeting. |
|
You are given a written record of this meeting.
Try to comprehend the dialogue and provide an answer based on it.
|
========= |
|
{context} |
|
========= |
|
Question: {question} |
|
Answer: |
|
""" |
|
|
|
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(condense_template) |
|
|
|
QA_PROMPT = PromptTemplate(template=qa_template, input_variables=["question", "context"]) |
|
|
|
DOC_PROMPT = PromptTemplate.from_template("{page_content}") |
|
|
|
|
|
class LlmReasoner(): |
|
def __init__(self, args): |
|
self.history = [] |
|
self.llm_version = args.llm_version |
|
self.embed_version = args.embed_version |
|
self.qa_chain = None |
|
self.vectorstore = None |
|
self.top_k = args.top_k |
|
self.qa_max_new_tokens = args.qa_max_new_tokens |
|
self.init_model() |
|
|
|
def init_model(self): |
|
with new_cd(parent_dir): |
|
self.llm = TransformersLLM.from_model_id_low_bit( |
|
f"..\\checkpoints\\{self.llm_version}") |
|
self.llm.streaming = False |
|
self.embeddings = TransformersEmbeddings.from_model_id( |
|
model_id=f"..\\checkpoints\\{self.embed_version}") |
|
|
|
def create_qa_chain(self, args, input_log): |
|
self.top_k = args.top_k |
|
self.qa_max_new_tokens = args.qa_max_new_tokens |
|
self.question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT) |
|
self.answer_generator = LLMChain(llm=self.llm, prompt=QA_PROMPT, |
|
llm_kwargs={"max_new_tokens": self.qa_max_new_tokens}) |
|
self.doc_chain = StuffDocumentsChain(llm_chain=self.answer_generator, document_prompt=DOC_PROMPT, |
|
document_variable_name='context') |
|
|
|
|
|
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=0) |
|
texts = self.text_splitter.split_text(input_log) |
|
self.vectorstore = FAISS.from_texts(texts, self.embeddings, |
|
metadatas=[{"video_clip": str(i)} for i in range(len(texts))]) |
|
retriever = self.vectorstore.as_retriever(search_kwargs={"k": self.top_k}) |
|
self.qa_chain = ConversationalRetrievalChain(retriever=retriever, |
|
question_generator=self.question_generator, |
|
combine_docs_chain=self.doc_chain, |
|
return_generated_question=True, |
|
return_source_documents=True, |
|
rephrase_question=False) |
|
|
|
def __call__(self, question): |
|
response = self.qa_chain({"question": question, "chat_history": self.history}) |
|
answer = response["answer"] |
|
generated_question = response["generated_question"] |
|
source_documents = response["source_documents"] |
|
self.history.append([question, answer]) |
|
return self.history, generated_question, source_documents |
|
|
|
def clean_history(self): |
|
self.history = [] |
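
And a minimal sketch of querying LlmReasoner directly (hypothetical args; transcript is again the ASR output string):

import argparse

args = argparse.Namespace(llm_version="Llama-2-7b-chat-hf-INT4",   # hypothetical args
                          embed_version="all-MiniLM-L12-v2",
                          top_k=3, qa_max_new_tokens=128)
reasoner = LlmReasoner(args)
reasoner.create_qa_chain(args, transcript)   # transcript: str from the ASR step
history, generated_question, sources = reasoner("What decisions were made in the meeting?")
print(history[-1][1])   # latest answer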
|
|
|
|
|
class Chat: |
|
|
|
def __init__(self, args) -> None: |
|
self.args = args |
|
|
|
def init_model(self): |
|
print('\033[1;33m' + "Initializing models...".center(50, '-') + '\033[0m') |
|
self.audio_translator = AudioTranslator(self.args) |
|
self.llm_reasoner = LlmReasoner(self.args) |
|
|
|
print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m') |
|
|
|
def video2log(self, video_path): |
|
audio_results = self.audio_translator(video_path) |
|
|
|
en_log_result = [] |
|
en_log_result_tmp = "" |
|
audio_transcript = self.audio_translator.match(audio_results) |
|
en_log_result_tmp += f"\n{audio_transcript}" |
|
|
|
en_log_result.append(en_log_result_tmp) |
|
|
|
en_log_result = "\n\n".join(en_log_result) |
|
print(f"\033[1;34mLog: \033[0m\n{en_log_result}\n") |
|
|
|
return en_log_result |
|
|
|
def chat2video(self, args, user_input, en_log_result): |
|
self.llm_reasoner.create_qa_chain(args, en_log_result) |
|
en_user_input = user_input |
|
|
|
        print("\n\033[1;32mGenerating response...\033[0m")
|
answer, generated_question, source_documents = self.llm_reasoner(en_user_input) |
|
print(f"\033[1;32mQuestion: \033[0m{user_input}") |
|
print(f"\033[1;32mAnswer: \033[0m{answer[0][1]}") |
|
self.clean_history() |
|
|
|
return answer, generated_question, source_documents |
|
|
|
def clean_history(self): |
|
self.llm_reasoner.clean_history() |
|
return |
|
|
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" |
|
parser = argparse.ArgumentParser() |
|
|
|
|
|
parser.add_argument("--whisper_version", default="small", help="Whisper model version for video ASR")

parser.add_argument("--llm_version", default="Llama-2-7b-chat-hf-INT4", help="LLM model version")
parser.add_argument("--embed_version", default="all-MiniLM-L12-v2", help="Embedding model version")
parser.add_argument("--top_k", default=3, type=int, help="Number of top relevant contexts returned to the LLM")
parser.add_argument("--qa_max_new_tokens", default=128, type=int, help="Max new tokens the LLM may generate per answer")

parser.add_argument("--port", type=int, default=7860, help="Gradio server port")
|
|
|
args = parser.parse_args() |
|
|
|
chat = Chat(args) |
|
sumbot = Sum(args) |
|
chat.init_model() |
|
|
|
global_chat_history = [] |
|
global_result = "" |
|
|
|
global_summary = "" |
|
|
|
|
|
def clean_conversation():
    global global_chat_history
    chat.clean_history()
    global_chat_history = []
    return ('', gr.update(value=None, interactive=True), None,
            gr.update(value=None, visible=True), gr.update(value=None, visible=True))
|
|
|
|
|
def clean_chat_history(): |
|
global global_chat_history |
|
chat.clean_history() |
|
global_chat_history = [] |
|
return '', None |
|
|
|
|
|
def submit_message(message, max_tokens, top_k):
    # The sliders control generation length and retrieval depth for each question.
    args.qa_max_new_tokens = max_tokens
    args.top_k = top_k
|
|
|
print(args) |
|
chat_history, generated_question, source_documents = chat.chat2video(args, message, global_result) |
|
global_chat_history.append((message, chat_history[0][1])) |
|
return '', global_chat_history |
|
|
|
|
|
def gen_script(vid_path):
    print(vid_path)
    global global_result
    if vid_path is None:
        log_text = "===== Please upload video! ====="
        return gr.update(value=log_text, visible=True), None

    global_result = chat.video2log(vid_path)
    return gr.update(value=global_result, visible=True), download_script_file()
|
|
|
|
|
def download_script_file(): |
|
try: |
|
with open("script_result.txt", "w") as file: |
|
file.write(global_result) |
|
return "script_result.txt" |
|
except Exception as e: |
|
return f"Error preparing file for download: {str(e)}" |
|
|
|
|
|
def download_sum_file(): |
|
try: |
|
with open("sum_result.txt", "w") as file: |
|
file.write(global_summary) |
|
return "sum_result.txt" |
|
except Exception as e: |
|
return f"Error preparing file for download: {str(e)}" |
|
|
|
|
|
def upload_file(files):
    global global_result
    # Only the first uploaded file is read.
    file_path = files[0].name
    file_content = ""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            file_content = file.read()
        global_result = file_content
    except FileNotFoundError:
        print("File not found")
    except IOError:
        print("Error occurred while reading the file")
    return file_content, download_script_file()
|
|
|
|
|
def summary(): |
|
global global_summary |
|
global_summary = sumbot.summarize(global_result) |
|
return gr.update(value=global_summary, visible=True), download_sum_file() |
|
|
|
|
|
css = """ |
|
#col-container {max-width: 80%; margin-left: auto; margin-right: auto;} |
|
#video_inp {min-height: 100px} |
|
#chatbox {min-height: 100px;} |
|
#header {text-align: center;} |
|
#hint {font-size: 1.0em; padding: 0.5em; margin: 0;} |
|
.message { font-size: 1.2em; } |
|
""" |
|
|
|
with gr.Blocks(css=css) as demo: |
|
with gr.Column(elem_id="col-container"): |
|
        gr.Markdown("""## Meeting Helper Bot
                    Upload a meeting recording in mp3/mp4/txt format to get a summary and chat about its content.
                    (You can adjust the parameters below to suit your needs.)
                    Powered by BigDL, Llama, Whisper, and LangChain.""",
                    elem_id="header")
|
|
|
with gr.Column() as advanced_column: |
|
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=1024, step=1, value=128) |
|
top_k = gr.Slider(label="Top-k", minimum=1, maximum=50, step=1, value=3) |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
                video_inp = gr.Video(label="1. Upload MP3/MP4 File")
|
|
|
upload_button = gr.UploadButton("1. Or Click to Upload a txt File", file_types=["doc", "txt"], |
|
file_count="multiple") |
|
gen_btn = gr.Button("2. Generate Script") |
|
                sum_outp = gr.Textbox(label="Summarization output", lines=15)
|
|
|
save_sum_dl = gr.outputs.File(label="Download Summary") |
|
|
|
|
|
with gr.Column(): |
|
script_outp = gr.Textbox(label="Script output", lines=30) |
|
with gr.Row(): |
|
                    script_summarization_btn = gr.Button("3. Script Summarization")
|
|
|
|
|
save_script_dl = gr.outputs.File(label="Download Script") |
|
|
|
|
|
with gr.Column(): |
|
chatbot = gr.Chatbot(elem_id="chatbox") |
|
input_message = gr.Textbox(show_label=False, placeholder="Enter text and press enter", visible=True) |
|
btn_submit = gr.Button("Submit") |
|
with gr.Row(): |
|
btn_clean_chat_history = gr.Button("Clean Chat History") |
|
btn_clean_conversation = gr.Button("Start New Conversation") |
|
|
|
upload_button.upload(upload_file, upload_button, [script_outp, save_script_dl]) |
|
|
|
gen_btn.click(gen_script, [video_inp], [script_outp, save_script_dl]) |
|
script_summarization_btn.click(summary, [], [sum_outp, save_sum_dl]) |
|
|
|
btn_submit.click(submit_message, [input_message, max_new_tokens, top_k], [input_message, chatbot]) |
|
input_message.submit(submit_message, [input_message, max_new_tokens, top_k], [input_message, chatbot]) |
|
|
|
btn_clean_conversation.click(clean_conversation, [], [input_message, video_inp, chatbot, sum_outp, script_outp]) |
|
btn_clean_chat_history.click(clean_chat_history, [], [input_message, chatbot]) |
|
|
|
    demo.load(queue=False)
|
|
|
demo.queue(concurrency_count=1) |
|
demo.launch(height='800px', server_port=args.port, debug=True, share=False) |