Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,134 Bytes
82bc972 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
#!/usr/bin/env bash
#
# Build an inventory of the gzip-compressed archives in an Emilia download
# directory.
#
# Usage: <this script> EMILIA_ROOT
#   EMILIA_ROOT  directory holding the downloads,
#                e.g. /data/scratch/pyp/datasets/emilia/downloads
#
# For every gzip archive found directly inside EMILIA_ROOT, one line
#   <stem>, <size>, <hash>, <on-disk name>, <url>
# is appended to file_log.txt in the current directory, and the on-disk name
# is echoed to stdout as a progress indicator. The log is sorted at the end.
#
# Requires: bash, file(1), du, python3, and sha256hash.py in the current
# directory.
#
# `set -e` is deliberately not used: a failure on one archive is reported on
# stderr and recorded as "N/A" so the remaining archives are still logged.

readonly LOG_FILE=file_log.txt

#######################################
# Print the archive name recorded in a `file` description, minus ".tar".
# Arguments: $1 - output of `file -b`, e.g.
#                 'gzip compressed data, was "EN_B00100.tar", from Unix'
# Outputs:   the stem (EN_B00100 above); an empty line if no name is recorded
#######################################
archive_stem() {
  local -r desc=$1
  local -r was_re='was "([^"]*)"'
  local name=''
  # Bash regex instead of `grep -oP`, which needs GNU grep.
  if [[ $desc =~ $was_re ]]; then
    name=${BASH_REMATCH[1]}
  fi
  printf '%s\n' "${name%.tar}"
}

#######################################
# Print the "url" field of a JSON file.
# Arguments: $1 - path to the JSON file
# Returns:   non-zero if the file cannot be parsed or has no "url" key
#######################################
json_url() {
  # jq is not installed on the server, hence python3. The path is passed via
  # argv rather than spliced into the Python source, so quotes or backslashes
  # in the path cannot break (or inject into) the snippet.
  python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["url"])' "$1"
}

#######################################
# Log every gzip archive in a directory to LOG_FILE, then sort the log.
# Arguments: $1 - directory to scan (non-recursive)
# Outputs:   on-disk archive names on stdout; diagnostics on stderr
# Returns:   2 on usage error, 1 if $1 is not a directory, else sort's status
#######################################
main() {
  # Without this guard an empty $1 would turn the glob below into "/*".
  if (( $# < 1 )) || [[ -z $1 ]]; then
    printf 'Usage: %s EMILIA_ROOT\n' "${0##*/}" >&2
    return 2
  fi
  local -r emilia_root=$1
  if [[ ! -d $emilia_root ]]; then
    printf 'error: not a directory: %s\n' "$emilia_root" >&2
    return 1
  fi

  local file desc stem size original_filename json_file url digest
  for file in "$emilia_root"/*; do
    # Skips sub-directories and the literal pattern left by an empty directory.
    [[ -f $file ]] || continue

    # One `file` call per entry; -b drops the path from the output so a path
    # containing "gzip compressed data" cannot produce a false match.
    desc=$(file -b -- "$file") || continue
    [[ $desc == *'gzip compressed data'* ]] || continue

    stem=$(archive_stem "$desc")
    size=$(du -sh -- "$file" | cut -f1)
    original_filename=${file##*/}

    # The download URL lives in a sidecar JSON named <archive>.json.
    json_file=$file.json
    url='N/A'
    if [[ -f $json_file ]]; then
      if ! url=$(json_url "$json_file"); then
        printf 'warning: could not read url from %s\n' "$json_file" >&2
        url='N/A'
      fi
    fi

    # NOTE(review): uses `python`, not `python3` like the JSON lookup above;
    # left unchanged because sha256hash.py is not visible here — confirm.
    if ! digest=$(python sha256hash.py "$file"); then
      printf 'warning: could not hash %s\n' "$file" >&2
      digest='N/A'
    fi

    printf '%s\n' "$original_filename"
    printf '%s, %s, %s, %s, %s\n' \
      "$stem" "$size" "$digest" "$original_filename" "$url" >> "$LOG_FILE"
  done

  # Sort the log by its first field; nothing to sort if no archive was logged.
  if [[ -f $LOG_FILE ]]; then
    sort -o "$LOG_FILE" "$LOG_FILE"
  fi
}

main "$@"