Spaces:

xiaoyao9184
/

audio-seal

Sleeping

App Files Files Community

audio-seal / gradio_app.py

xiaoyao9184

Synced repo using 'sync_with_huggingface' Github Action

ef7bf13 verified 4 months ago

raw

history blame contribute delete

10.4 kB

	import os
	import sys

	if "APP_PATH" in os.environ:
	app_path = os.path.abspath(os.environ["APP_PATH"])
	if os.getcwd() != app_path:
	# fix sys.path for import
	os.chdir(app_path)
	if app_path not in sys.path:
	sys.path.append(app_path)

	import gradio as gr

	import torch
	import torchaudio
	import numpy as np
	import matplotlib.pyplot as plt
	import re
	import random
	import string
	from audioseal import AudioSeal

	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load generator if not already loaded in reload mode
	if 'generator' not in globals():
	generator = AudioSeal.load_generator("audioseal_wm_16bits")
	generator = generator.to(device)
	generator_nbytes = int(generator.msg_processor.nbits / 8)

	# Load detector if not already loaded in reload mode
	if 'detector' not in globals():
	detector = AudioSeal.load_detector("audioseal_detector_16bits")
	detector = detector.to(device)


	def load_audio(file):
	wav, sample_rate = torchaudio.load(file)
	return wav, sample_rate

	def generate_msg_pt_by_format_string(format_string, bytes_count):
	msg_hex = format_string.replace("-", "")
	hex_length = bytes_count * 2
	binary_list = []
	for i in range(0, len(msg_hex), hex_length):
	chunk = msg_hex[i:i+hex_length]
	binary = bin(int(chunk, 16))[2:].zfill(bytes_count * 8)
	binary_list.append([int(b) for b in binary])
	# torch.randint(0, 2, (1, 16), dtype=torch.int32)
	msg_pt = torch.tensor(binary_list, dtype=torch.int32)
	return msg_pt.to(device)

	def embed_watermark(audio, sr, msg_pt):
	original_audio = audio.to(device)

	# If the audio has more than one channel, average all channels to 1 channel
	if original_audio.shape[0] > 1:
	mono_audio = torch.mean(original_audio, dim=0, keepdim=True)
	else:
	mono_audio = original_audio

	# We add the batch dimension to the single audio to mimic the batch watermarking
	batched_audio = mono_audio.unsqueeze(0)

	watermark = generator.get_watermark(batched_audio, sr, message=msg_pt)

	watermarked_audio = batched_audio + watermark

	# Alternatively, you can also call forward() function directly with different tune-down / tune-up rate
	# watermarked_audio = generator(audios, sample_rate=sr, alpha=1)

	# Need remove batch dimension and to cpu
	return watermarked_audio.squeeze(0).detach().cpu()

	def generate_format_string_by_msg_pt(msg_pt, bytes_count):
	hex_length = bytes_count * 2
	binary_int = 0
	for bit in msg_pt:
	binary_int = (binary_int << 1) \| int(bit.item())
	hex_string = format(binary_int, f'0{hex_length}x')

	split_hex = [hex_string[i:i + 4] for i in range(0, len(hex_string), 4)]
	format_hex = "-".join(split_hex)

	return hex_string, format_hex

	def detect_watermark(audio, sr):
	watermarked_audio = audio.to(device)

	# If the audio has more than one channel, average all channels to 1 channel
	if watermarked_audio.shape[0] > 1:
	mono_audio = torch.mean(watermarked_audio, dim=0, keepdim=True)
	else:
	mono_audio = watermarked_audio

	# We add the batch dimension to the single audio to mimic the batch watermarking
	batched_audio = mono_audio.unsqueeze(0)

	result, message = detector.detect_watermark(batched_audio, sr)

	# pred_prob is a tensor of size batch x 2 x frames, indicating the probability (positive and negative) of watermarking for each frame
	# A watermarked audio should have pred_prob[:, 1, :] > 0.5
	# message_prob is a tensor of size batch x 16, indicating of the probability of each bit to be 1.
	# message will be a random tensor if the detector detects no watermarking from the audio
	pred_prob, message_prob = detector(batched_audio, sr)

	return result, message, pred_prob, message_prob

	def get_waveform_and_specgram(waveform, sample_rate):
	# If the audio has more than one channel, average all channels to 1 channel
	if waveform.shape[0] > 1:
	waveform = torch.mean(waveform, dim=0, keepdim=True)

	waveform = waveform.squeeze().detach().cpu().numpy()

	num_frames = waveform.shape[-1]
	time_axis = torch.arange(0, num_frames) / sample_rate

	figure, (ax1, ax2) = plt.subplots(2, 1)

	ax1.plot(time_axis, waveform, linewidth=1)
	ax1.grid(True)
	ax2.specgram(waveform, Fs=sample_rate)

	figure.suptitle(f"Waveform and specgram")

	return figure

	def generate_hex_format_regex(bytes_count):
	hex_length = bytes_count * 2
	hex_string = 'F' * hex_length
	split_hex = [hex_string[i:i + 4] for i in range(0, len(hex_string), 4)]
	format_like = "-".join(split_hex)
	regex_pattern = '^' + '-'.join([r'[0-9A-Fa-f]{4}'] * len(split_hex)) + '$'
	return format_like, regex_pattern

	def generate_hex_random_message(bytes_count):
	hex_length = bytes_count * 2
	hex_string = ''.join(random.choice(string.hexdigits) for _ in range(hex_length))
	split_hex = [hex_string[i:i + 4] for i in range(0, len(hex_string), 4)]
	random_str = "-".join(split_hex)
	return random_str, "".join(split_hex)

	with gr.Blocks(title="AudioSeal") as demo:
	gr.Markdown("""
	# AudioSeal Demo

	Find the project [here](https://github.com/facebookresearch/audioseal.git).
	""")

	with gr.Tabs():
	with gr.TabItem("Embed Watermark"):
	with gr.Row():
	with gr.Column():
	embedding_aud = gr.Audio(label="Input Audio", type="filepath")
	embedding_specgram = gr.Checkbox(label="Show specgram", value=False, info="Show debug information")

	embedding_type = gr.Radio(["random", "input"], value="random", label="Type", info="Type of watermarks")

	format_like, regex_pattern = generate_hex_format_regex(generator_nbytes)
	msg, _ = generate_hex_random_message(generator_nbytes)
	embedding_msg = gr.Textbox(
	label=f"Message ({generator_nbytes} bytes hex string)",
	info=f"format like {format_like}",
	value=msg,
	interactive=False, show_copy_button=True)

	embedding_btn = gr.Button("Embed Watermark")
	with gr.Column():
	marked_aud = gr.Audio(label="Output Audio", show_download_button=True)
	specgram_original = gr.Plot(label="Original Audio", format="png", visible=False)
	specgram_watermarked = gr.Plot(label="Watermarked Audio", format="png", visible=False)


	def change_embedding_type(type):
	if type == "random":
	msg, _ = generate_hex_random_message(generator_nbytes)
	return gr.update(interactive=False, value=msg)
	else:
	return gr.update(interactive=True)
	embedding_type.change(
	fn=change_embedding_type,
	inputs=[embedding_type],
	outputs=[embedding_msg]
	)

	def check_embedding_msg(msg):
	if not re.match(regex_pattern, msg):
	gr.Warning(
	f"Invalid format. Please use like '{format_like}'",
	duration=0)
	embedding_msg.change(
	fn=check_embedding_msg,
	inputs=[embedding_msg],
	outputs=[]
	)

	def run_embed_watermark(file, show_specgram, type, msg):
	if file is None:
	raise gr.Erro("No file uploaded", duration=5)
	if not re.match(regex_pattern, msg):
	raise gr.Error(f"Invalid format. Please use like '{format_like}'", duration=5)

	audio_original, rate = load_audio(file)
	msg_pt = generate_msg_pt_by_format_string(msg, generator_nbytes)
	audio_watermarked = embed_watermark(audio_original, rate, msg_pt)
	output = rate, audio_watermarked.squeeze().numpy().astype(np.float32)

	if show_specgram:
	fig_original = get_waveform_and_specgram(audio_original, rate)
	fig_watermarked = get_waveform_and_specgram(audio_watermarked, rate)
	return [
	output,
	gr.update(visible=True, value=fig_original),
	gr.update(visible=True, value=fig_watermarked)]
	else:
	return [
	output,
	gr.update(visible=False),
	gr.update(visible=False)]

	embedding_btn.click(
	fn=run_embed_watermark,
	inputs=[embedding_aud, embedding_specgram, embedding_type, embedding_msg],
	outputs=[marked_aud, specgram_original, specgram_watermarked]
	)

	with gr.TabItem("Detect Watermark"):
	with gr.Row():
	with gr.Column():
	detecting_aud = gr.Audio(label="Input Audio", type="filepath")
	detecting_btn = gr.Button("Detect Watermark")
	with gr.Column():
	predicted_messages = gr.JSON(label="Detected Messages")

	def run_detect_watermark(file):
	if file is None:
	raise gr.Error("No file uploaded", duration=5)

	audio_watermarked, rate = load_audio(file)
	result, message, pred_prob, message_prob = detect_watermark(audio_watermarked, rate)

	_, fromat_msg = generate_format_string_by_msg_pt(message[0], generator_nbytes)

	sum_above_05 = (pred_prob[:, 1, :] > 0.5).sum(dim=1)

	# Create message output as JSON
	message_json = {
	"socre": result,
	"message": fromat_msg,
	"frames_count_all": pred_prob.shape[2],
	"frames_count_above_05": sum_above_05[0].item(),
	"bits_probability": message_prob[0].tolist(),
	"bits_massage": message[0].tolist()
	}
	return message_json
	detecting_btn.click(
	fn=run_detect_watermark,
	inputs=[detecting_aud],
	outputs=[predicted_messages]
	)

	if __name__ == "__main__":
	demo.launch()