import imageio
import librosa
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
def resize_image_by_longest_edge(image_path, target_size):
    # Resize so the longest edge equals target_size, preserving aspect ratio.
    image = Image.open(image_path).convert("RGB")
    width, height = image.size
    scale = target_size / max(width, height)
    new_size = (int(width * scale), int(height * scale))
    return image.resize(new_size, Image.LANCZOS)
def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
    # Encode a sequence of frames (PIL images or arrays) into a video file.
    writer = imageio.get_writer(
        save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
    )
    for frame in tqdm(frames, desc="Saving video"):
        frame = np.array(frame)
        writer.append_data(frame)
    writer.close()
def get_audio_features(wav2vec, audio_processor, audio_path, fps, num_frames):
    # Load the audio resampled to 16 kHz, the rate wav2vec 2.0 expects.
    sr = 16000
    audio_input, sample_rate = librosa.load(audio_path, sr=sr)

    # Keep only the audio that spans the requested number of video frames.
    start_time = 0
    # end_time = (0 + (num_frames - 1) * 1) / fps
    end_time = num_frames / fps
    start_sample = int(start_time * sr)
    # Clamp so the segment never runs past the end of the clip.
    end_sample = min(int(end_time * sr), len(audio_input))
    audio_segment = audio_input[start_sample:end_sample]

    # Encode the segment with wav2vec and return frame-level hidden states.
    input_values = audio_processor(
        audio_segment, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values.to("cuda")
    with torch.no_grad():
        fea = wav2vec(input_values).last_hidden_state
    return fea
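
# Minimal usage sketch (illustrative, not part of the original Space).
# It assumes a wav2vec 2.0 checkpoint loaded via Hugging Face Transformers,
# a CUDA device, and placeholder file paths ("reference.png", "speech.wav",
# "output.mp4"); swap in whatever model and assets the Space actually uses.
if __name__ == "__main__":
    from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

    # Placeholder checkpoint; the real Space may use a different encoder.
    ckpt = "facebook/wav2vec2-base-960h"
    audio_processor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)
    wav2vec = Wav2Vec2Model.from_pretrained(ckpt).to("cuda").eval()

    fps, num_frames = 25, 125  # 5 seconds of video at 25 fps
    ref_image = resize_image_by_longest_edge("reference.png", target_size=512)
    audio_features = get_audio_features(
        wav2vec, audio_processor, "speech.wav", fps, num_frames
    )
    print("audio feature shape:", audio_features.shape)

    # Stand-in for generated frames; a real pipeline would produce num_frames
    # synthesized images driven by audio_features.
    frames = [ref_image] * num_frames
    save_video(frames, "output.mp4", fps=fps)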