Spaces:
Running
on
Zero
Running
on
Zero
emilia_root=$1 # /data/scratch/pyp/datasets/emilia/downloads | |
for file in ${emilia_root}/* ; do | |
# Check if gzip compressed archive | |
if file "$file" | grep -q 'gzip compressed data'; then | |
# Extract string of form 'was "EN_B00100.tar"'' from the output of the file command to keep EN_B00100 | |
filename=$(file "$file" | grep -oP '(?<=was ")[^"]*' | sed 's/\.tar$//') | |
# Get the file size | |
size=$(du -sh "$file" | cut -f1) | |
original_filename=$(basename "$file") | |
# Get URL string from corresponding JSON file with same basename | |
json_file=$file.json | |
if [ -f "$json_file" ]; then | |
# url=$(jq -r '.url' "$json_file") # jq is not installed on the server | |
url=$(python3 -c "import sys, json; print(json.load(open('$json_file'))['url'])") | |
else | |
url="N/A" | |
fi | |
# Compute SHA256 hash of the file | |
hash=$(python sha256hash.py "$file") | |
echo $original_filename | |
# Write filename, size, hash, original filename, URL to output file | |
echo "$filename, $size, $hash, $original_filename, $url" >> file_log.txt | |
fi | |
done | |
# Sort the output file by filename | |
sort -o file_log.txt file_log.txt |