Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Extract speaker notes from a presentation.md file and convert them to audio
files.

Usage:
    python transcription_to_audio.py path/to/chapter/directory

This will:
    1. Parse the presentation.md file in the specified directory
    2. Extract speaker notes (the text after each ??? marker, up to the next
       --- slide separator or the end of the file)
    3. Generate audio files using FAL AI with optional voice customization
    4. Save audio files in {dir}/audio/{n}.wav format
""" | |
import argparse | |
import hashlib | |
import json | |
import logging | |
import os | |
import re | |
import sys | |
import time | |
from pathlib import Path | |
import fal_client | |
import requests | |
from dotenv import load_dotenv | |
# Load variables from a local .env file into the process environment before
# any of the settings below are read.
load_dotenv()

# Root logging setup: INFO and above, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default voice for speech generation; None when VOICE_ID is not set.
VOICE_ID = os.environ.get("VOICE_ID")
def extract_speaker_notes(markdown_content):
    """Return the speaker notes found in a presentation's markdown.

    A note starts right after a ``???`` marker and runs up to the next
    newline followed by ``---``, a newline at the very end of the text, or
    the end of the text itself. Surrounding whitespace is trimmed from every
    note, so a marker with nothing after it yields an empty string.
    """
    # DOTALL lets a single note span several lines.
    note_regex = re.compile(r"\?\?\?(.*?)(?=\n---|\n$|\Z)", re.DOTALL)

    stripped_notes = []
    for found in note_regex.finditer(markdown_content):
        stripped_notes.append(found.group(1).strip())
    return stripped_notes
def get_cache_key(text, voice, speed, emotion, language):
    """Build the lookup key identifying one text-to-speech request.

    The five parameters are rendered as text, joined with ``|`` and hashed;
    the MD5 hex digest of that string is returned, so identical requests
    always map to the same key.
    """
    fields = (text, voice, speed, emotion, language)
    joined = "|".join(format(field) for field in fields)
    digest = hashlib.md5(joined.encode())
    return digest.hexdigest()
def load_cache(cache_file):
    """Load the audio-URL cache from a JSON file.

    Loading is best-effort: any problem with the file is logged as a warning
    and treated as an empty cache rather than raised.

    Args:
        cache_file: Location of the JSON cache file (``pathlib.Path`` or
            string).

    Returns:
        The cached mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    cache_path = Path(cache_file)
    if not cache_path.exists():
        return {}

    try:
        with open(cache_path, "r", encoding="utf-8") as f:
            cache_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a corrupt or mis-encoded file cannot escape as an exception.
        logger.warning(f"Error loading cache: {e}")
        return {}

    # Callers look keys up and assign new ones, which only works on a dict.
    if not isinstance(cache_data, dict):
        logger.warning(
            "Error loading cache: expected a JSON object, "
            f"got {type(cache_data).__name__}"
        )
        return {}

    return cache_data
def save_cache(cache_data, cache_file):
    """Save the audio-URL cache to a JSON file.

    The data is serialised first and written to a temporary sibling file
    (``<name>.tmp``) that then replaces the target, so a failed or
    interrupted write never leaves the existing cache truncated.

    Saving is best-effort: file-system errors are logged as warnings and not
    raised. Data that cannot be serialised to JSON still raises ``TypeError``.

    Args:
        cache_data: JSON-serialisable mapping to store.
        cache_file: Destination of the JSON cache file (``pathlib.Path`` or
            string).
    """
    cache_path = Path(cache_file)
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")

    # Serialise before touching the disk so bad data cannot clobber the file.
    payload = json.dumps(cache_data)

    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        # Swap the finished file into place in a single step; a leftover
        # .tmp from an earlier failure is simply overwritten above.
        os.replace(tmp_path, cache_path)
    except OSError as e:
        logger.warning(f"Error saving cache: {e}")
def _download_audio(audio_url, output_file, timeout):
    """Download ``audio_url`` and write the response body to ``output_file``.

    Returns:
        ``None`` on success, otherwise a short description of the failure:
        ``"status: <code>"`` for a non-200 response or ``"error: <details>"``
        when the request itself failed (connection error, timeout, ...).
    """
    try:
        response = requests.get(audio_url, timeout=timeout)
    except requests.RequestException as e:
        return f"error: {e}"

    if response.status_code != 200:
        return f"status: {response.status_code}"

    with open(output_file, "wb") as f:
        f.write(response.content)
    return None


def text_to_speech(
    text,
    output_file,
    voice=None,
    speed=1.0,
    emotion="happy",
    language="English",
    cache_dir=None,
    timeout=60,
):
    """Convert text to speech using FAL AI and save as audio file.

    When caching is enabled, the audio URL returned for a request is stored
    in ``<cache_dir>/audio_cache.json`` under a key derived from the request
    parameters. A later identical request downloads from that URL instead of
    generating again; if that download fails for any reason, the audio is
    regenerated.

    Args:
        text: The text to convert to speech
        output_file: Path to save the output audio file
        voice: The voice ID to use
        speed: Speech speed (0.5-2.0)
        emotion: Emotion to apply (neutral, happy, sad, etc.)
        language: Language for language boost
        cache_dir: Directory to store cache files
        timeout: Seconds to wait for each audio download request before
            giving up

    Returns:
        True when the audio file was written, False on any failure (the
        error is logged, never raised).
    """
    try:
        start_time = time.monotonic()

        # Create the output directory (and any missing parents).
        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Set up caching
        cache_file = None
        cache_data = {}
        cache_key = get_cache_key(text, voice, speed, emotion, language)

        if cache_dir:
            cache_dir_path = Path(cache_dir)
            cache_dir_path.mkdir(parents=True, exist_ok=True)
            cache_file = cache_dir_path / "audio_cache.json"
            cache_data = load_cache(cache_file)

            # Check if we have a cached URL for this request
            if cache_key in cache_data:
                audio_url = cache_data[cache_key]
                logger.info(f"Using cached audio URL: {audio_url}")

                failure = _download_audio(audio_url, output_file, timeout)
                if failure is None:
                    logger.info(f"Downloaded cached audio to {output_file}")
                    return True

                # A bad status or a network error on the cached URL is not
                # fatal: fall through and generate the audio again.
                logger.warning(f"Cached URL failed, {failure}")

        # Set up voice settings
        voice_setting = {"speed": speed, "emotion": emotion}

        # Add custom voice ID if provided
        if voice:
            voice_setting["custom_voice_id"] = voice

        def on_queue_update(update):
            # Relay progress logs from the queued request at debug level.
            if isinstance(update, fal_client.InProgress):
                for log in update.logs:
                    logger.debug(log["message"])

        # Generate speech with FAL AI
        logger.info(f"Generating speech with voice ID: {voice}")
        result = fal_client.subscribe(
            "fal-ai/minimax-tts/text-to-speech/turbo",
            arguments={
                "text": text,
                "voice_setting": voice_setting,
                "language_boost": language,
            },
            with_logs=True,
            on_queue_update=on_queue_update,
        )

        if not ("audio" in result and "url" in result["audio"]):
            logger.error(f"Unexpected response format: {result}")
            return False

        audio_url = result["audio"]["url"]
        logger.info(f"Downloading audio from {audio_url}")

        # Cache the URL before downloading: generation is the step worth
        # skipping next time, even if this particular download fails.
        if cache_file:
            cache_data[cache_key] = audio_url
            save_cache(cache_data, cache_file)
            logger.info("Cached audio URL for future use")

        failure = _download_audio(audio_url, output_file, timeout)
        if failure is not None:
            logger.error(f"Failed to download audio: {failure}")
            return False

        end_time = time.monotonic()
        logger.info(
            f"Generated audio in {end_time - start_time:.2f} seconds, "
            f"saved to {output_file}"
        )
        return True
    except Exception as e:
        # Boundary for a single slide: report the failure and let the caller
        # carry on with the remaining slides.
        logger.error(f"Error generating audio: {e}")
        return False
def process_presentation(
    chapter_dir,
    voice=None,
    speed=1.0,
    emotion="happy",
    language="English",
    cache_dir=None,
):
    """Process the presentation.md file in the given directory.

    Reads ``<chapter_dir>/presentation.md``, extracts its speaker notes and
    generates one audio file per non-empty note at
    ``<chapter_dir>/audio/<n>.wav``, where ``n`` is the 1-based position of
    the note among the extracted notes. Every note is attempted even if an
    earlier one fails.

    Args:
        chapter_dir: Directory containing presentation.md
        voice: The voice ID to use
        speed: Speech speed (0.5-2.0)
        emotion: Emotion to apply (neutral, happy, sad, etc.)
        language: Language for language boost
        cache_dir: Directory to store cache files, or None to disable caching

    Returns:
        True when audio was generated for every non-empty note. False when
        the presentation file is missing, contains no speaker notes, or audio
        generation failed for at least one note.
    """
    # Construct paths
    chapter_path = Path(chapter_dir)
    presentation_file = chapter_path / "presentation.md"
    audio_dir = chapter_path / "audio"

    # Check if presentation file exists
    if not presentation_file.exists():
        logger.error(f"Presentation file not found: {presentation_file}")
        return False

    # Create audio directory if it doesn't exist
    audio_dir.mkdir(exist_ok=True)

    # Read the presentation file
    with open(presentation_file, "r", encoding="utf-8") as file:
        content = file.read()

    # Extract speaker notes
    notes = extract_speaker_notes(content)
    if not notes:
        logger.warning("No speaker notes found in the presentation file.")
        return False

    logger.info(f"Found {len(notes)} slides with speaker notes.")

    # Generate audio files for each note, remembering which ones failed so
    # the overall result reflects them.
    failed_slides = []
    for i, note in enumerate(notes, 1):
        if not note.strip():
            logger.warning(f"Skipping empty note for slide {i}")
            continue

        output_file = audio_dir / f"{i}.wav"
        logger.info(f"Generating audio for slide {i}")
        success = text_to_speech(
            note,
            output_file,
            voice,
            speed,
            emotion,
            language,
            cache_dir,
        )
        if success:
            logger.info(f"Saved audio to {output_file}")
        else:
            logger.error(f"Failed to generate audio for slide {i}")
            failed_slides.append(i)

    if failed_slides:
        logger.error(
            f"Audio generation failed for {len(failed_slides)} of "
            f"{len(notes)} slides: {failed_slides}"
        )
        return False

    return True
def main(argv=None):
    """Command-line entry point.

    Parses the arguments, runs the presentation through audio generation and
    exits with status 1 when processing reports failure. Invalid arguments,
    including a ``--speed`` outside 0.5-2.0, end the program through
    argparse's usual usage error.

    Args:
        argv: Argument list to parse; None (the default) means the
            process's own command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Extract speaker notes from presentation.md and convert to"
        " audio files."
    )
    parser.add_argument(
        "chapter_dir", help="Path to the chapter directory containing presentation.md"
    )
    parser.add_argument(
        "--voice",
        default=VOICE_ID,
        help="Voice ID to use (defaults to VOICE_ID from .env)",
    )
    parser.add_argument(
        "--speed", type=float, default=1.0, help="Speech speed (0.5-2.0, default: 1.0)"
    )
    parser.add_argument(
        "--emotion",
        default="happy",
        help="Emotion to apply (neutral, happy, sad, etc.)",
    )
    parser.add_argument(
        "--language", default="English", help="Language for language boost"
    )
    parser.add_argument(
        "--cache-dir",
        default=".cache",
        help="Directory to store cache files (default: .cache)",
    )
    parser.add_argument(
        "--no-cache", action="store_true", help="Disable caching of audio URLs"
    )
    args = parser.parse_args(argv)

    # Reject an out-of-range speed up front instead of discovering it once
    # per slide during generation.
    if not 0.5 <= args.speed <= 2.0:
        parser.error(f"--speed must be between 0.5 and 2.0 (got {args.speed})")

    # Determine cache directory
    cache_dir = None if args.no_cache else args.cache_dir

    logger.info(f"Processing presentation in {args.chapter_dir}")
    success = process_presentation(
        args.chapter_dir,
        args.voice,
        args.speed,
        args.emotion,
        args.language,
        cache_dir,
    )
    if success:
        logger.info("Audio generation completed successfully.")
    else:
        logger.error("Audio generation failed.")
        sys.exit(1)


if __name__ == "__main__":
    main()