github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/hack/grabsamplearts.sh (about)

     1  #/bin/bash
     2  set -e
     3  
     4  # Grab a random sample of article .warc files from arc
     5  
     6  #ARC=/srv/jl_inna_box/arc
     7  ARC=/home/ben/scotref/scrape/archive
     8  
     9  pushd $ARC >/dev/null
    10  DIRS=$(ls .)
    11  
    12  
    13  SCRATCH=$(mktemp -d)
    14  echo "output in $SCRATCH"
    15  
    16  # grab 50 from each publication
    17  for DIR in $DIRS; do
    18      echo $DIR
    19      FILES=$(find $DIR -type f | shuf -n 50)
    20      cp --parents $FILES $SCRATCH/
    21  done
    22  
    23  popd >/dev/null
    24  
    25  echo "output in $SCRATCH"
    26