Mercurial > hg > cc > cirrus_home
view bin/bigpdf.sh @ 143:ddff993994be
too clever by half, keys won't work in parallel for e.g. media types
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 20 Oct 2021 15:47:55 +0000 |
parents | f1bf3effa893 |
children |
line wrap: on
line source
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
# First line thereof gives CC identifier
#
# Usage: bigpdf.sh HN
#   HN selects the work list /dev/shm/bigpdf_HN.txt.  Its first line is the
#   CC identifier (used as CC-MAIN-<id>); every following line names one
#   segment to process.
#
# Exit status: 0 when every segment was processed, 1 when any step failed,
# 2 on a usage error.

work_root=/dev/shm
crawl_root=/beegfs/common_crawl
nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch

# Sum the first field of every stdin line and print the total as an
# unsigned integer (prints 0 for empty input).
tot () {
  awk '{sum+=$1} END {printf "%u\n",sum}'
}

# Print the arguments prefixed with the current date and host name.
log () {
  printf '%s %s %s\n' "$(date)" "$(hostname)" "$*"
}

# Build bu.txt and nb.txt in the current directory.
# Arguments: $1 - CC identifier, $2 - segment name
# For every extract_*.tar of the segment, the lines matching
# 'X-HST-Truncated: length' plus the line following each match are pulled
# from the *.hdr members; their count is appended to nb.txt and the second
# space-separated field of the X-HST-Target-URI lines goes to bu.txt.
# Returns: non-zero if the scratch file or bu.txt could not be created.
collect_urls () {
  local cc=$1 s=$2 f matched
  matched=$(mktemp) || return 1
  # nb.txt is appended to, so it must start empty whenever bu.txt is
  # rebuilt; counts left over from an interrupted run would otherwise be
  # added in again and trip the consistency check.
  rm -f nb.txt bu.txt.part
  for f in "$crawl_root/CC-MAIN-$cc/$s"/extract_*.tar; do
    # The matches go through a file rather than 'tee >(wc -l >>nb.txt)':
    # bash does not wait for a process substitution, so the count could
    # still be unwritten when nb.txt is read back.
    tar -xOf "$f" '*.hdr' |
      grep -aF -A1 --no-group-separator 'X-HST-Truncated: length' > "$matched"
    wc -l < "$matched" >> nb.txt
    grep -F X-HST-Target-URI "$matched" | cut -f 2 -d ' '
  done > bu.txt.part
  rm -f -- "$matched"
  # bu.txt doubles as the "already collected" marker, so it only appears
  # once it is complete.
  mv -- bu.txt.part bu.txt
}

# Check that the total in $1 (nb file) is exactly twice the line count of
# $2 (bu file), i.e. two extracted header lines per URL.
# Outputs: a "length mismatch" message on stderr when the check fails.
# Returns: 0 when consistent, 1 otherwise (including unreadable files).
counts_consistent () {
  local nb_file=$1 bu_file=$2 tcount ucount
  tcount=$(tot < "$nb_file") || return 1
  ucount=$(wc -l < "$bu_file") || return 1
  if (( tcount != 2 * ucount )); then
    printf "length mismatch: tcount: %d != ucount: %d\n" \
      "$tcount" "$ucount" 1>&2
    return 1
  fi
  return 0
}

# Succeed when some segment under ./segments already has a generated fetch
# list.  Any segment directory name is accepted: the former test only
# looked at names starting with 2020 and broke with several matches.
# NOTE(review): presumably segment directories are timestamp-named, so a
# fixed year prefix goes stale - confirm against the Nutch version in use.
has_generated_segment () {
  local part
  for part in segments/*/crawl_generate/part-r-00000; do
    [[ -f $part ]] && return 0
  done
  return 1
}

# Fetch one segment and copy the results to the shared crawl directory.
# Arguments: $1 - CC identifier, $2 - segment name
# Returns: 0 on success,
#          1 when the run must stop (work dir unusable or count mismatch),
#          2 when only this segment failed (fetch, copy or cleanup).
process_segment () {
  local cc=$1 s=$2 f
  local work=$work_root/pcrawl/$s
  local dest=$crawl_root/CC-MAIN-$cc/$s

  log "starting $s"
  mkdir -p "$work/segments" || return 1
  # Everything below, including the final rm -rf, uses relative paths, so
  # failing to enter the work directory has to stop the run.
  cd "$work" || return 1

  if [[ ! -f bu.txt ]]; then
    collect_urls "$cc" "$s" || return 1
  fi
  counts_consistent nb.txt bu.txt || return 1

  export NUTCH_HEAPSIZE=20000
  export NUTCH_LOG_DIR=$work
  if ! has_generated_segment; then
    "$nutch" freegen -Dmapreduce.job.maps=1 bu.txt segments
  fi

  # segments/* is deliberately left unquoted: every segment is passed on.
  if ! "$nutch" fetch -Dmapreduce.job.reduces=34 \
      segments/* -threads 144 >/dev/null 2>&1; then
    log "fetch failed for $s" >&2
    return 2
  fi
  log "finished $s"

  if ! cp -a -- ./* "$dest"; then
    log "copy to $dest failed for $s" >&2
    return 2
  fi

  # Collect each WARC-Truncated header with the 15 lines before it.
  for f in warc/warc/*.gz; do
    [[ -e $f ]] || continue   # glob matched nothing
    unpigz -dp 1 -c "$f" | grep -E -aiB15 '^WARC-Truncated: '
  done > truncated.txt
  cp -a truncated.txt "$dest/warc/warc"
  rm -rf cdx segments warc || return 2
  return 0
}

# Process every segment listed in the work list $1.
# Returns: 0 when all segments succeeded, 1 otherwise.
run_list () {
  local list=$1 cc='' s='' status=0

  if [[ ! -r $list ]]; then
    printf 'cannot read work list %s\n' "$list" >&2
    return 1
  fi
  # The list is read on fd 3 so that commands run inside the loop cannot
  # swallow the remaining segment names from stdin.
  {
    read -r cc <&3
    if [[ -z $cc ]]; then
      printf 'no CC identifier on first line of %s\n' "$list" >&2
      return 1
    fi
    while read -r s <&3 || [[ -n $s ]]; do
      # A blank line would make the work directory /dev/shm/pcrawl itself.
      [[ -n $s ]] || continue
      process_segment "$cc" "$s"
      case $? in
        0) ;;
        1) return 1 ;;   # setup failure or length mismatch: stop, as before
        *) status=1 ;;   # remember the failure, carry on with the next one
      esac
    done
  } 3< "$list"
  return "$status"
}

# Entry point: validate the argument, run the list, log the final status.
main () {
  if (( $# < 1 )) || [[ -z $1 ]]; then
    printf 'Usage: %s HN\n' "${0##*/}" >&2
    return 2
  fi
  local hn=$1 status=0
  printf '%s %s\n' "$(date)" "$hn"
  run_list "$work_root/bigpdf_${hn}.txt" || status=$?
  log "$status"
  # Hand the status to the caller: previously the script always exited 0
  # because its last command was the echo of the status.
  return "$status"
}

main "$@"