import gradio as gr from smolagents import HfApiModel, CodeAgent, Tool from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool from huggingface_hub import login from llama_index.retrievers.bm25 import BM25Retriever import spaces import torch from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer from string import punctuation import re from parler_tts import ParlerTTSForConditionalGeneration from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed device = "cuda:0" if torch.cuda.is_available() else "cpu" repo_id = "parler-tts/parler-tts-mini-v1" # repo_id_large = "parler-tts/parler-tts-large-v1" tts_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device) tokenizer = AutoTokenizer.from_pretrained(repo_id) feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id) SAMPLE_RATE = feature_extractor.sampling_rate SEED = 42 number_normalizer = EnglishNumberNormalizer() def preprocess(text): text = number_normalizer(text).strip() text = text.replace("-", " ") if text[-1] not in punctuation: text = f"{text}." abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b' def separate_abb(chunk): chunk = chunk.replace(".","") print(chunk) return " ".join(chunk) abbreviations = re.findall(abbreviations_pattern, text) for abv in abbreviations: if abv in text: text = text.replace(abv, separate_abb(abv)) return text @spaces.GPU def gen_tts(text, description): inputs = tokenizer(description.strip(), return_tensors="pt").to(device) prompt = tokenizer(preprocess(text), return_tensors="pt").to(device) set_seed(SEED) generation = tts_model.generate( input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0 ) audio_arr = generation.cpu().numpy().squeeze() return SAMPLE_RATE, audio_arr class RetrieverTool(Tool): name = "retriever" description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query." inputs = { "query": { "type": "string", "description": "The query to perform. Ask the question as an human would, with simple explanation. The underlying index is BM25.", } } output_type = "string" def __init__(self, path, **kwargs): super().__init__(**kwargs) self.retriever = BM25Retriever.from_persist_dir(path) def forward(self, query: str) -> str: assert isinstance(query, str), "Your search query must be a string" docs = self.retriever.retrieve( query, ) return "\nRetrieved documents:\n" + "".join( [ f"\n\n===== Document {str(i)} =====\n" + doc.text for i, doc in enumerate(docs) ] ) path = "./ml_notes_index" model = HfApiModel( max_tokens=4086, temperature=0.5, model_id='Qwen/Qwen2.5-Coder-32B-Instruct', custom_role_conversions=None ) retriever_tool = RetrieverTool(path) agent = CodeAgent( tools=[retriever_tool], model=model, max_steps=4, verbosity_level=2 ) summarization_agent = CodeAgent( tools=[], model=model, max_steps=1, verbosity_level=2 ) def greet(question): agent_output = agent.run(question) result = summarization_agent.run(f"Rephrase the following out since it will be passed to an Text-To-Speach Model: {agent_output}") # Generate audio from the text description = "Laura's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise." sample_rate, audio = gen_tts(result, description) return result, (sample_rate, audio) # login() css = """ #share-btn-container { display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem; margin-top: 10px; margin-left: auto; flex: unset !important; } """ with gr.Blocks(css=css) as block: gr.HTML( """