#!/bin/bash
#
# Grab a random sample of article .warc files from arc.
#
# Usage: grabsamplearts.sh [ARCHIVE_DIR]
#
# For every non-hidden top-level entry of the archive directory, up to
# SAMPLE_SIZE regular files are picked at random and copied into a fresh
# temporary directory, preserving their path relative to the archive root.
# The temporary directory is deliberately left in place (it is the output);
# its location is printed at the start and at the end of the run.
#
# Requires GNU coreutils/findutils (cp --parents, shuf -z, find -print0).
set -euo pipefail

# Archive root; the first positional argument overrides the default.
#ARC=/srv/jl_inna_box/arc
ARC=${1:-/home/ben/scotref/scrape/archive}

# Maximum number of files sampled from each top-level entry.
readonly SAMPLE_SIZE=50

cd -- "$ARC" || {
  printf 'cannot cd to %s\n' "$ARC" >&2
  exit 1
}

# Created after the cd so that a relative TMPDIR resolves against the same
# directory the cp below runs in.
SCRATCH=$(mktemp -d)
echo "output in $SCRATCH"

# An empty archive should yield zero iterations, not a literal '*'.
shopt -s nullglob

# grab SAMPLE_SIZE from each publication
for DIR in *; do
  printf '%s\n' "$DIR"

  # Collect the sample NUL-delimited so names containing whitespace or
  # newlines survive intact. The ./ prefix keeps names that start with '-'
  # from being parsed as options by find.
  FILES=()
  while IFS= read -r -d '' path; do
    FILES+=("$path")
  done < <(find "./$DIR" -type f -print0 | shuf -z -n "$SAMPLE_SIZE")

  # Nothing to copy: calling cp with only a destination would fail and,
  # under set -e, abort the whole run.
  if (( ${#FILES[@]} == 0 )); then
    continue
  fi

  cp --parents -- "${FILES[@]}" "$SCRATCH/"
done

echo "output in $SCRATCH"