# VoiceStar / data / emilia_preprocessing / step2_log_tar_files.sh
# mrfakename's picture
# Upload 51 files
# 82bc972 verified
emilia_root=$1 # /data/scratch/pyp/datasets/emilia/downloads
for file in ${emilia_root}/* ; do
# Check if gzip compressed archive
if file "$file" | grep -q 'gzip compressed data'; then
# Extract string of form 'was "EN_B00100.tar"'' from the output of the file command to keep EN_B00100
filename=$(file "$file" | grep -oP '(?<=was ")[^"]*' | sed 's/\.tar$//')
# Get the file size
size=$(du -sh "$file" | cut -f1)
original_filename=$(basename "$file")
# Get URL string from corresponding JSON file with same basename
json_file=$file.json
if [ -f "$json_file" ]; then
# url=$(jq -r '.url' "$json_file") # jq is not installed on the server
url=$(python3 -c "import sys, json; print(json.load(open('$json_file'))['url'])")
else
url="N/A"
fi
# Compute SHA256 hash of the file
hash=$(python sha256hash.py "$file")
echo $original_filename
# Write filename, size, hash, original filename, URL to output file
echo "$filename, $size, $hash, $original_filename, $url" >> file_log.txt
fi
done
# Sort the output file by filename
sort -o file_log.txt file_log.txt