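#!/usr/bin/env bash
# NOTE: the text that stood here ("Spaces: Running on Zero ...") was residue
# from the web page hosting this script and is not part of the script itself.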
# Positional arguments:
#   $1  root       directory where the tar files are located
#                  (e.g. /data/scratch/pyp/datasets/emilia/downloads)
#   $2  save_root  directory the archives are extracted into
#                  (e.g. /data/scratch/pyp/datasets/emilia/preprocessed/audio)
# Both are required: without them every entry would be reported as a failure,
# so stop early with a usage message instead.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  printf 'Usage: %s <tar_root_dir> <save_root_dir>\n' "${0##*/}" >&2
  exit 2
fi
root=$1
save_root=$2

if ! mkdir -p -- "${save_root}"; then
  printf 'Cannot create output directory: %s\n' "${save_root}" >&2
  exit 1
fi

# Log files; the names are relative, so they live in the current directory.
log_file="file_log.txt"              # Full log of files to process
exist_log_file="file_log_debug.txt"  # Log of already processed files
failure_log="untar_failures.log"     # Log file for untar failures

# Start every run with an empty failure log.
: > "$failure_log"
#######################################
# Write the entries of a log that still have to be processed.
#
# A log line is dropped when it contains any non-blank line of the
# "already processed" file as a literal substring (grep -F semantics).
# Blank lines in that file are ignored: as fixed-string patterns they would
# match every log line and silently filter everything out.
#
# Globals:   existing_files (written) - non-blank lines of the processed file
# Arguments: $1 - full log of files to process
#            $2 - log of already processed files (may be missing)
#            $3 - output file receiving the remaining entries
# Returns:   0 on success (including "nothing left"), non-zero when the
#            full log is missing or grep reports an error
#######################################
filter_pending_entries() {
  local log_path=$1 done_path=$2 out_path=$3
  local -a recorded=()
  local entry status

  existing_files=()
  if [[ -f "$done_path" ]]; then
    mapfile -t recorded < "$done_path"
    for entry in "${recorded[@]}"; do
      if [[ "$entry" =~ [^[:space:]] ]]; then
        existing_files+=("$entry")
      fi
    done
  fi

  if [[ ! -f "$log_path" ]]; then
    printf 'Warning: log file not found: %s\n' "$log_path" >&2
    : > "$out_path"
    return 1
  fi

  if (( ${#existing_files[@]} == 0 )); then
    # Nothing processed yet: keep every entry. awk (rather than cp) makes
    # sure the last line ends with a newline for the line-based consumers.
    awk '{ print }' "$log_path" > "$out_path"
    return
  fi

  grep -v -F -f <(printf '%s\n' "${existing_files[@]}") -- "$log_path" \
    > "$out_path"
  status=$?
  # grep exits with 1 when no line was selected (everything is already
  # processed); only a status above 1 is a real error.
  (( status <= 1 ))
}

# Create a temporary filtered log of files to process
filtered_log="filtered_file_log.txt"
filter_pending_entries "$log_file" "$exist_log_file" "$filtered_log"

# Count total filtered files (grep -c '' also counts an unterminated last line)
total_files=$(grep -c '' "$filtered_log")
echo "Found $total_files entries to process in $filtered_log."

# Print the filtered files
echo "Filtered files to process:"
cat "$filtered_log"
echo
# Confirm before starting processing.
# -r keeps backslashes in the reply literal. If read fails (EOF on stdin) the
# reply stays empty unless a partial answer was read; anything other than
# y / yes (any letter case) cancels the run.
confirm=""
read -r -p "Do you want to proceed with the above files? (y/n): " confirm || true
case "$confirm" in
  [yY] | [yY][eE][sS])
    ;;
  *)
    echo "Operation canceled."
    # The filtered list is only a scratch file; do not leave it behind.
    rm -f -- "$filtered_log"
    exit 1
    ;;
esac
#######################################
# Print a value without its leading and trailing whitespace.
# Uses parameter expansion only, so quotes, backslashes and inner whitespace
# in the value are preserved.
# Arguments: $1 - value to trim
# Outputs:   trimmed value on stdout (no trailing newline)
#######################################
trim_field() {
  local value=$1
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"
  printf '%s' "$value"
}

#######################################
# Format a number of seconds as HH:MM:SS.
# The hour field is not wrapped at 24, so long estimates stay readable.
# Arguments: $1 - non-negative integer number of seconds
# Outputs:   formatted duration on stdout (no trailing newline)
#######################################
format_duration() {
  local total=$1
  if (( total < 0 )); then
    total=0
  fi
  printf '%02d:%02d:%02d' \
    $(( total / 3600 )) $(( total % 3600 / 60 )) $(( total % 60 ))
}

#######################################
# Extract every archive listed in a work list and report progress.
#
# Each line of the list has the comma-separated fields
#   filename, size, local_sha256, original_filename, url
# (the last field keeps any further commas). The archive is looked up as
# ${root}/<original_filename> and extracted into ${save_root}.
#
# Globals:   root, save_root, total_files (read)
#            failure_log (appended: full entry of each missing/failed archive)
#            exist_log_file (appended: filename of each extracted archive)
# Arguments: $1 - work list file
# Outputs:   progress, elapsed time and ETA on stdout
# Returns:   1 if the work list cannot be read, 0 otherwise
#######################################
process_entries() {
  local list_file=$1
  local line filename size local_sha256 original_filename url
  local tar_file record now elapsed remain stats avg_time eta_seconds
  local count=0   # number of entries handled so far
  local start_time
  start_time=$(date +%s)

  if [[ ! -f "$list_file" || ! -r "$list_file" ]]; then
    printf 'Cannot read work list: %s\n' "$list_file" >&2
    return 1
  fi

  # "|| [[ -n $line ]]" keeps a last line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    IFS=',' read -r filename size local_sha256 original_filename url \
      <<< "$line"
    count=$((count + 1))

    # Trim leading/trailing whitespace
    filename=$(trim_field "$filename")
    size=$(trim_field "$size")
    local_sha256=$(trim_field "$local_sha256")
    original_filename=$(trim_field "$original_filename")
    url=$(trim_field "$url")

    # Construct the full path to the tar file
    tar_file="${root}/${original_filename}"
    record="$filename, $size, $local_sha256, $original_filename, $url"

    if [[ ! -f "$tar_file" ]]; then
      echo "❌ File not found: $tar_file"
      printf '%s\n' "$record" >> "$failure_log"
    else
      echo "[$count/$total_files] Untarring $tar_file..."
      if tar -xf "$tar_file" -C "${save_root}"; then
        echo "✅ Successfully untarred: $tar_file"
        # Remember the archive so that a later run skips it.
        printf '%s\n' "$filename" >> "$exist_log_file"
      else
        echo "❌ Failed to untar: $tar_file"
        printf '%s\n' "$record" >> "$failure_log"
      fi
    fi

    # Elapsed time, average time per file, and ETA. count is at least 1
    # here, so the division is always defined.
    now=$(date +%s)
    elapsed=$(( now - start_time ))
    remain=$(( total_files - count ))
    # One awk call yields "<avg> <eta>"; the ETA is derived from the average
    # already rounded to two decimals. Values are passed with -v instead of
    # being pasted into the awk program text.
    stats=$(awk -v e="$elapsed" -v c="$count" -v r="$remain" \
      'BEGIN { avg = sprintf("%.2f", e / c); printf "%s %.0f", avg, avg * r }')
    avg_time=${stats% *}
    eta_seconds=${stats#* }
    echo "Elapsed: ${elapsed}s | Avg/f: ${avg_time}s | Remaining: $remain files | ETA: ~$(format_duration "$eta_seconds")"
  done < "$list_file"

  return 0
}

# Process filtered log
process_entries "$filtered_log"
# Clean up the temporary filtered log; it is rebuilt from the logs on each run.
rm -f -- "$filtered_log"

# Summary: details of individual failures are only in the failure log.
printf '%s\n' "Untar operation completed. Check $failure_log for any failures."