import os
import torch
import numpy as np
import gradio as gr
import zipfile
import json
import requests
import subprocess
import shutil
from transformers import BlipProcessor, BlipForConditionalGeneration
title = "# 🗜️ CLaMP 3 - Multimodal & Multilingual Semantic Music Search"
badges = """
<div style="text-align: center;">
<a href="https://sanderwood.github.io/clamp3/">
<img src="https://img.shields.io/badge/CLaMP%203%20Homepage-GitHub-181717?style=for-the-badge&logo=home-assistant" alt="Homepage">
</a>
<a href="https://arxiv.org/abs/2502.10362">
<img src="https://img.shields.io/badge/CLaMP%203%20Paper-Arxiv-red?style=for-the-badge&logo=arxiv" alt="Paper">
</a>
<a href="https://github.com/sanderwood/clamp3">
<img src="https://img.shields.io/badge/CLaMP%203%20Code-GitHub-181717?style=for-the-badge&logo=github" alt="GitHub">
</a>
<a href="https://huggingface.co./spaces/sander-wood/clamp3">
<img src="https://img.shields.io/badge/CLaMP%203%20Demo-Gradio-green?style=for-the-badge&logo=gradio" alt="Demo">
</a>
<a href="https://huggingface.co./sander-wood/clamp3/tree/main">
<img src="https://img.shields.io/badge/Model%20Weights-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Model Weights">
</a>
<a href="https://huggingface.co./datasets/sander-wood/m4-rag">
<img src="https://img.shields.io/badge/M4--RAG%20Dataset-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Dataset">
</a>
<a href="https://huggingface.co./datasets/sander-wood/wikimt-x">
<img src="https://img.shields.io/badge/WikiMT--X%20Benchmark-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Benchmark">
</a>
</div>
<style>
div a {
display: inline-block;
margin: 5px;
}
div a img {
height: 30px;
}
</style>
"""
description = """CLaMP 3 is a **multimodal and multilingual** music information retrieval (MIR) framework, supporting **sheet music, audio, and performance signals** in **100 languages**. Using **contrastive learning**, it aligns these modalities in a shared space for **cross-modal retrieval**.
### 🔍 **How This Demo Works**
- You can **retrieve music using any text input (in any language) or an image** (`.png`, `.jpg`).
- When using an image, **BLIP** generates a caption, which is then used for retrieval.
- Since CLaMP 3's training data includes **rich visual descriptions of musical scenes**, it can **match images to semantically relevant music**.
- For simplicity, this demo retrieves music based on **metadata (text descriptions)** rather than directly searching sheet music, MIDI, or audio files.
### ⚠️ **Limitations**
- This demo retrieves music **only from the WikiMT-X benchmark (1,000 pieces)**.
- These pieces are **mainly from the U.S. and Western Europe (especially the U.S.)** and **mostly from the 20th century**.
- Thus, retrieval results are **mostly limited to Western 20th-century music**, so you **won’t** find music from **other regions or historical periods**.
🔧 **Need retrieval for a different music collection?** Deploy **[CLaMP 3](https://github.com/sanderwood/clamp3)** on your own dataset.
In general, the larger and more diverse the reference music collection, the better the retrieval quality and the higher the likelihood of finding relevant, accurately matched music.
**Note: This project is for research use only.**
"""
# Load BLIP image captioning model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# Download weight file if it does not exist
weights_url = "https://huggingface.co./sander-wood/clamp3/resolve/main/weights_clamp3_saas_h_size_768_t_model_FacebookAI_xlm-roberta-base_t_length_128_a_size_768_a_layers_12_a_length_128_s_size_768_s_layers_12_p_size_64_p_length_512.pth"
weights_filename = "weights_clamp3_saas_h_size_768_t_model_FacebookAI_xlm-roberta-base_t_length_128_a_size_768_a_layers_12_a_length_128_s_size_768_s_layers_12_p_size_64_p_length_512.pth"
if not os.path.exists(weights_filename):
print("Downloading weights file...")
response = requests.get(weights_url, stream=True)
response.raise_for_status()
with open(weights_filename, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print("Weights file downloaded.")
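# features.zip is expected to unpack the precomputed reference features (the text/ and image/ folders searched below).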
ZIP_PATH = "features.zip"
if os.path.exists(ZIP_PATH):
print(f"Extracting {ZIP_PATH}...")
with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
zip_ref.extractall(".")
print("Extraction complete.")
# Load metadata
metadata_map = {}
METADATA_FILE = "wikimt-x-public.jsonl"
if os.path.exists(METADATA_FILE):
with open(METADATA_FILE, "r", encoding="utf-8") as f:
for line in f:
data = json.loads(line)
metadata_map[data["id"]] = data
else:
print(f"Warning: {METADATA_FILE} not found.")
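# Cache of loaded reference features: folder path -> {file stem: feature vector}, so each folder is read from disk only once.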
features_cache = {}
def get_info(folder_path):
"""
Load all .npy files from the specified folder and return a dictionary
with the file names (without extension) as keys.
"""
if folder_path in features_cache:
return features_cache[folder_path]
if not os.path.exists(folder_path):
return {}
files = sorted(os.listdir(folder_path))
features = {}
for file in files:
if file.endswith(".npy"):
key = file.split(".")[0]
try:
features[key] = np.load(os.path.join(folder_path, file))[0]
except Exception as e:
print(f"Error loading {file}: {e}")
features_cache[folder_path] = features
return features
def find_top_similar(query_file, reference_folder):
"""
Compare the query feature with all reference features in the specified folder
using cosine similarity and return the top 10 candidate results in the format:
Title | Artists | sim: SimilarityScore.
"""
top_k = 10
try:
query_feature = np.load(query_file.name)[0]
except Exception as e:
return [], f"Error loading query feature: {e}"
query_tensor = torch.tensor(query_feature, dtype=torch.float32).unsqueeze(dim=0)
key_features = get_info(reference_folder)
if not key_features:
return [], f"No reference features found in {reference_folder}."
ref_keys = list(key_features.keys())
ref_array = np.array([key_features[k] for k in ref_keys])
key_feats_tensor = torch.tensor(ref_array, dtype=torch.float32)
query_tensor_expanded = query_tensor.expand(key_feats_tensor.size(0), -1)
similarities = torch.cosine_similarity(query_tensor_expanded, key_feats_tensor, dim=1)
ranked_indices = torch.argsort(similarities, descending=True)
candidate_ids = []
candidate_display = []
for i in range(top_k):
if i < len(ref_keys):
candidate_idx = ranked_indices[i].item()
candidate_id = ref_keys[candidate_idx]
sim = round(similarities[candidate_idx].item(), 4)
meta = metadata_map.get(candidate_id, {})
title = meta.get("title", candidate_id)
artists = meta.get("artists", "Unknown")
if isinstance(artists, list):
artists = ", ".join(artists)
candidate_ids.append(candidate_id)
candidate_display.append(f"{title} | {artists} | sim: {sim}")
else:
candidate_ids.append("N/A")
candidate_display.append("N/A")
return candidate_ids, candidate_display
def show_details(selected_id):
"""
Return detailed metadata and embedded YouTube video HTML based on the candidate ID.
"""
if selected_id == "N/A":
return ("", "", "", "", "", "", "", "")
data = metadata_map.get(selected_id, {})
if not data:
return ("No details found", "", "", "", "", "", "", "")
title = data.get("title", "")
artists = data.get("artists", "")
if isinstance(artists, list):
artists = ", ".join(artists)
genre = data.get("genre", "")
background = data.get("background", "")
analysis = data.get("analysis", "")
description = data.get("description", "")
scene = data.get("scene", "")
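# Each candidate ID appears to double as the source YouTube video ID, so it can be embedded directly.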
youtube_html = (
f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{selected_id}" '
f'frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; '
f'gyroscope; picture-in-picture" allowfullscreen></iframe>'
)
return title, artists, genre, background, analysis, description, scene, youtube_html
def extract_features_from_text(text):
"""
Save the input text to a file, call the CLaMP 3 feature extraction script,
and return the generated feature file path.
"""
input_dir = "input_dir"
output_dir = "output_dir"
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
# Clear input_dir and output_dir
for d in [input_dir, output_dir]:
for filename in os.listdir(d):
file_path = os.path.join(d, filename)
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
input_file = os.path.join(input_dir, "input.txt")
print("Text input:", text)
with open(input_file, "w", encoding="utf-8") as f:
f.write(text)
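# extract_clamp3.py (from the CLaMP 3 repo) is expected to write one global feature .npy per input file into output_dir.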
command = ["python", "extract_clamp3.py", input_dir, output_dir, "--get_global"]
subprocess.run(command, check=True)
output_file = os.path.join(output_dir, "input.npy")
return output_file
def generate_caption(image):
"""
Use the BLIP model to generate a descriptive caption for the given image.
"""
inputs = processor(image, return_tensors="pt")
outputs = blip_model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)
return caption
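# find_top_similar reads the query path from a .name attribute (mirroring Gradio file objects), so plain paths are wrapped in this minimal stand-in.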
class FileWrapper:
"""
Simulate a file object with a .name attribute.
"""
def __init__(self, path):
self.name = path
def search_wrapper(search_mode, text_input, image_input):
"""
Perform retrieval based on the selected input mode:
- If search_mode is "Image", use the uploaded image to generate a caption, then extract features
and search in the "image/" folder.
- If search_mode is "Text", use the provided text to extract features and search in the "text/" folder.
"""
if search_mode == "Image":
if image_input is None:
return text_input, gr.update(choices=[]), "Please upload an image.", "", "", "", "", "", "", ""
caption = generate_caption(image_input)
text_to_use = caption
reference_folder = "image/"
elif search_mode == "Text":
if not text_input or text_input.strip() == "":
return "Describe the music you're looking for (in any language)", gr.update(choices=[]), "Please enter text for retrieval.", "", "", "", "", "", "", ""
text_to_use = text_input
reference_folder = "text/"
else:
return "Describe the music you're looking for (in any language)", gr.update(choices=[]), "Invalid search mode selected.", "", "", "", "", "", "", ""
try:
output_file = extract_features_from_text(text_to_use)
query_file = FileWrapper(output_file)
except Exception as e:
return text_to_use, gr.update(choices=[]), f"Error during feature extraction: {e}", "", "", "", "", "", "", ""
candidate_ids, candidate_display = find_top_similar(query_file, reference_folder)
if not candidate_ids:
return text_to_use, gr.update(choices=[]), "", "", "", "", "", "", "", ""
choices = [(f"{i+1}. {disp}", cid) for i, (cid, disp) in enumerate(zip(candidate_ids, candidate_display))]
top_candidate = candidate_ids[0]
details = show_details(top_candidate)
return text_to_use, gr.update(choices=choices), *details
# Define example data (the examples also work fine when placed after the component definitions)
examples = [
["Image", None, "V4EauuhVEw4.jpg"],
["Image", None, "Kw-_Ew5bVxs.jpg"],
["Image", None, "BuYf0taXoNw.webp"],
["Image", None, "4tDYMayp6Dk.jpg"],
["Text", "classic rock, British, 1960s, upbeat", None],
["Text", "A Latin jazz piece with rhythmic percussion and brass", None],
["Text", "big band, major key, swing, brass-heavy, syncopation, baritone vocal", None],
["Text", "Heartfelt and nostalgic, with a bittersweet, melancholic feel", None],
["Text", "Melodía instrumental en re mayor con progresión armónica repetitiva y fluida", None],
["Text", "D大调四四拍的爱尔兰舞曲", None],
["Text", "Ιερή μουσική με πνευματική ατμόσφαιρα", None],
["Text", "የፍቅር ሙዚቃ ሞቅ እና ስሜታማ ከሆነ ነገር ግን ድንቅ እና አስደሳች ቃላት ያካትታል", None],
]
with gr.Blocks() as demo:
gr.Markdown(title)
gr.HTML(badges)
gr.Markdown(description)
with gr.Row():
with gr.Column():
search_mode = gr.Radio(
choices=["Text", "Image"],
label="Select Search Mode",
value="Text",
interactive=True,
elem_classes=["vertical-radio"]
)
text_input = gr.Textbox(
placeholder="Describe the music you're looking for (in any language)",
lines=4
)
image_input = gr.Image(
label="Or upload an image (PNG, JPG)",
type="pil"
)
search_button = gr.Button("Search 1,000 Western 20th-century pieces in WikiMT-X")
candidate_radio = gr.Radio(choices=[], label="Select Retrieval Result", interactive=True, elem_classes=["vertical-radio"])
with gr.Column():
gr.Markdown("### YouTube Video")
youtube_box = gr.HTML(label="YouTube Video")
gr.Markdown("### Metadata")
title_box = gr.Textbox(label="Title", interactive=False)
artists_box = gr.Textbox(label="Artists", interactive=False)
genre_box = gr.Textbox(label="Genre", interactive=False)
background_box = gr.Textbox(label="Background", interactive=False)
analysis_box = gr.Textbox(label="Analysis", interactive=False)
description_box = gr.Textbox(label="Description", interactive=False)
scene_box = gr.Textbox(label="Scene", interactive=False)
gr.HTML(
"""
<style>
.vertical-radio .gradio-radio label {
display: block !important;
margin-bottom: 5px;
}
</style>
"""
)
gr.Examples(
examples=examples,
inputs=[search_mode, text_input, image_input],
outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box],
fn=search_wrapper,
cache_examples=False,
)
search_button.click(
fn=search_wrapper,
inputs=[search_mode, text_input, image_input],
outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box]
)
candidate_radio.change(
fn=show_details,
inputs=candidate_radio,
outputs=[title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box]
)
demo.launch()