#!/bin/bash
#
# Extract downloaded tar archives listed in a log file.
#
# Usage: <script> ROOT SAVE_ROOT
#   ROOT       directory where the tar files are located
#              (e.g. /data/scratch/pyp/datasets/emilia/downloads)
#   SAVE_ROOT  directory the archives are extracted into; created if missing
#              (e.g. /data/scratch/pyp/datasets/emilia/preprocessed/audio)
#
# The log files named below are relative paths, so they are read from and
# written to the current working directory.

# Both directories are required; without them every later path would be built
# from an empty string.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  echo "Usage: ${0##*/} ROOT SAVE_ROOT" >&2
  exit 2
fi

# Define the root directory where the tar files are located
root=$1
save_root=$2

if [[ ! -d "$root" ]]; then
  echo "Error: root directory not found: $root" >&2
  exit 1
fi

if ! mkdir -p -- "${save_root}"; then
  echo "Error: cannot create output directory: $save_root" >&2
  exit 1
fi

# Input log files
log_file="file_log.txt"             # Full log of files to process
exist_log_file="file_log_debug.txt" # Log of already processed files
failure_log="untar_failures.log"    # Log file for untar failures

# Clear previous failure log (":" makes the truncation an explicit no-op command)
: > "$failure_log"
#######################################
# Print the entries of a comma-separated log whose first field is not yet
# recorded in the processed log.
# Matching is exact on the whitespace-trimmed first field, so a processed
# name never hides a longer name that merely contains it, and blank lines in
# the processed log are ignored. For each processed-log line only the text
# before the first comma is used as the name.
# Arguments: $1 - full log, one comma-separated entry per line
#            $2 - processed log, one name per line; may be missing or empty
# Outputs:   the pending entries, unchanged, on stdout
# Returns:   1 (printing nothing to stdout) if the full log is not a file
#######################################
filter_pending_entries() {
  local full_log=$1
  local done_log=$2

  if [[ ! -f "$full_log" ]]; then
    echo "Error: log file not found: $full_log" >&2
    return 1
  fi

  # A missing processed log simply means nothing has been processed yet.
  [[ -f "$done_log" ]] || done_log=/dev/null

  # FILENAME == ARGV[1] (rather than NR == FNR) stays correct when the
  # processed log is empty.
  awk -F',' '
    function trim(s) { gsub(/^[ \t\r]+|[ \t\r]+$/, "", s); return s }
    FILENAME == ARGV[1] {
      key = trim($1)
      if (key != "") seen[key] = 1
      next
    }
    !(trim($1) in seen)
  ' "$done_log" "$full_log"
}

# Create an array of filenames already processed (from exist_log_file)
if [ -f "$exist_log_file" ]; then
  mapfile -t existing_files < "$exist_log_file"
else
  existing_files=()
fi

# Create a temporary filtered log of files to process
filtered_log="filtered_file_log.txt"
filter_pending_entries "$log_file" "$exist_log_file" > "$filtered_log"

# Count total filtered files; the arithmetic expansion strips the padding
# some wc implementations put around the number.
total_files=$(( $(wc -l < "$filtered_log") ))
echo "Found $total_files entries to process in $filtered_log."

# Print the filtered files
echo "Filtered files to process:"
cat "$filtered_log"
echo
# Confirm before starting processing.
# Only an exact "y" proceeds; any other answer, including end-of-input,
# cancels the run, removes the filtered log and exits with status 1.
confirm=""
# -r keeps backslashes in the answer literal instead of treating them as escapes.
read -r -p "Do you want to proceed with the above files? (y/n): " confirm
if [[ "$confirm" != "y" ]]; then
  echo "Operation canceled."
  rm -f "$filtered_log"
  exit 1
fi
#######################################
# Strip leading and trailing whitespace from a value.
# Uses parameter expansion only, so quotes, backslashes and inner spacing in
# the value are preserved and no external process is spawned.
# Arguments: $1 - value to trim
# Outputs:   the trimmed value on stdout (no trailing newline)
#######################################
trim_field() {
  local value=$1
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"
  printf '%s' "$value"
}

#######################################
# Format a non-negative number of seconds as HH:MM:SS.
# Hours are not wrapped at 24, so long estimates stay readable.
# Arguments: $1 - whole seconds
# Outputs:   the formatted duration on stdout (no trailing newline)
#######################################
format_hms() {
  local total=$1
  printf '%02d:%02d:%02d' \
    "$(( total / 3600 ))" "$(( total % 3600 / 60 ))" "$(( total % 60 ))"
}

# Start time
start_time=$(date +%s)

# Counter for how many lines we've processed
count=0

# Process filtered log: one comma-separated entry per line; the last variable
# (url) receives the rest of the line, including any further commas.
while IFS=',' read -r filename size local_sha256 original_filename url; do
  count=$((count + 1))

  # Trim leading/trailing whitespace
  filename=$(trim_field "$filename")
  size=$(trim_field "$size")
  local_sha256=$(trim_field "$local_sha256")
  original_filename=$(trim_field "$original_filename")
  url=$(trim_field "$url")

  # Construct the full path to the tar file
  tar_file="${root}/${original_filename}"

  # Check if the tar file exists
  if [ ! -f "$tar_file" ]; then
    echo "❌ File not found: $tar_file"
    printf '%s\n' "$filename, $size, $local_sha256, $original_filename, $url" >> "$failure_log"
  else
    # Try to untar the file
    echo "[$count/$total_files] Untarring $tar_file..."
    if ! tar -xf "$tar_file" -C "${save_root}"; then
      # If untar fails, log the failure
      echo "❌ Failed to untar: $tar_file"
      printf '%s\n' "$filename, $size, $local_sha256, $original_filename, $url" >> "$failure_log"
    else
      echo "✅ Successfully untarred: $tar_file"
      # Append successfully untarred filename to exist_log_file so a later
      # run can skip it
      printf '%s\n' "$filename" >> "$exist_log_file"
    fi
  fi

  # Calculate elapsed time, average time per file, and ETA.
  # count is at least 1 here, so the divisions are safe.
  now=$(date +%s)
  elapsed=$(( now - start_time )) # total seconds since the start
  # Values are passed with -v rather than spliced into the awk program text.
  avg_time=$(awk -v e="$elapsed" -v c="$count" 'BEGIN { printf "%.2f", e / c }')
  remain=$(( total_files - count ))
  # Integer ETA rounded to the nearest second, computed from the raw totals
  # instead of the already-rounded average.
  eta_seconds=$(( (elapsed * remain + count / 2) / count ))
  eta_formatted=$(format_hms "$eta_seconds")
  echo "Elapsed: ${elapsed}s | Avg/f: ${avg_time}s | Remaining: $remain files | ETA: ~${eta_formatted}"
done < "$filtered_log"
# Final step: report completion and point the user at the failure log, then
# discard the temporary filtered log, which is no longer needed.
printf '%s\n' "Untar operation completed. Check $failure_log for any failures."
rm -f "$filtered_log"