Mercurial > hg > cc > cirrus_home
comparison bin/preExtract.sh @ 45:bd0010ac88ce
parallelised version of reExtract.sh
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 08 Apr 2020 11:27:33 +0100 |
parents | |
children | 2e5b3439a2ed |
comparison
equal
deleted
inserted
replaced
44:abc1b05996c9 | 45:bd0010ac88ce |
---|---|
1 #!/usr/bin/bash | |
2 # Usage: preExtract.sh 20..-.. < files... | |
3 # Input is a list of relative paths of warc files | |
4 # under /beegfs/common_crawl/CC-MAIN-$1 | |
5 | |
# sus: sort | uniq -c | sort.
# Prints every distinct input line prefixed by its occurrence count, ordered
# by descending count.
# Arguments: passed unchanged to the first sort (file names and/or sort
#            options); with no arguments the first sort reads stdin.
# Outputs:   "<count> <line>" records on stdout, in uniq -c format.
function sus ()
{
  sort "$@" | uniq -c | sort -k1nr,1
}
10 | |
# edex: print the zero-based index of the first line of ~/by11n.txt that
# contains $1 as a fixed string.
# Arguments: $1 - key to look up (matched literally, not as a regex)
# Outputs:   the index on stdout, or -1 when there is no match, the key is
#            empty, or the file cannot be read
# Returns:   0 when a line matched, 1 otherwise
function edex () {
  local hit=
  # An empty key would match every line, so treat it as "not found".
  # -m 1 keeps only the first hit, so the arithmetic below always sees one
  # number; -e protects keys that start with a dash.
  if [ -n "$1" ]
  then
    hit=$(grep -F -n -m 1 -e "$1" ~/by11n.txt | cut -f 1 -d :)
  fi
  if [ -z "$hit" ]
  then
    echo -1
    return 1
  fi
  echo $((hit - 1))
}
14 | |
# join_by: print the remaining arguments joined by a delimiter, with no
# trailing newline.
# Adapted from https://stackoverflow.com/a/17841619/2595465
# Arguments: $1   - delimiter (any string, used literally)
#            $2.. - elements to join
# Outputs:   the joined string on stdout; nothing when there are no elements
function join_by () {
  local d=$1
  local item
  shift
  if [ $# -eq 0 ]
  then
    return 0
  fi
  # printf rather than echo -n, so an element such as -n or -e is printed
  # instead of being taken as an echo option.
  printf '%s' "$1"
  shift
  for item in "$@"
  do
    printf '%s%s' "$d" "$item"
  done
}
23 | |
# Directory of the crawl being processed. $1 is the part of the crawl name
# after "CC-MAIN-"; abort with a usage message when it is missing instead of
# silently working on ".../CC-MAIN-".
h=/beegfs/common_crawl/CC-MAIN-${1:?usage: preExtract.sh 20..-.. < files...}

# All intermediate files (in.txt and one directory per s value) are created
# relative to this scratch directory, so stop if it cannot be entered.
mkdir -p /dev/shm/rex || exit 1
cd /dev/shm/rex || exit 1
28 | |
# Build in.txt from stdin. For each input line the first "-00" is replaced by
# a space and ".warc.gz" is removed; the result is read as three
# whitespace-separated fields s, p and i. One tab-separated record
#   <edex of i> <s> <p> <i>
# is written per input line.
sed 's/-00/ /;s/\.warc\.gz//' | \
 while read -r s p i
 do
  # A line with fewer than three fields would call edex without a key.
  if [ -z "$i" ]
  then
   echo "skipping malformed input line: $s $p" 1>&2
   continue
  fi
  # edex gets /dev/null as stdin so it can never consume the loop input.
  printf "%s\t%s\t%s\t%s\n" "$(edex "$i" </dev/null)" "$s" "$p" "$i"
 done > in.txt
# Run one GNU parallel job (16 at a time) per distinct (column 2, column 3)
# pair of in.txt; the pair arrives in the job as s and p.
# Each job works in its own directory ./$s and, for every column-1 value e
# recorded for s:
#   1. pipes each listed warc through $HOME/lib/valhalla/bin/warc.sh, logging
#      to logs/<jobid>_<i>_log, and records size and name of the resulting
#      <id>_* files in lsl<e>.txt;
#   2. diffs the matching members of extract_<e>.tar against that listing;
#   3. requires the diff to consist of exactly one addition hunk per warc;
#   4. replaces the log members and appends the newly listed files.
# A job whose check in step 3 fails exits with status 1, so parallel reports
# it in its own exit status; the remaining jobs still run.
# The job body is a single-quoted string: it must not contain single quotes.
cut -f 2,3 in.txt | sort -u | tr '\011' '\012' | \
 parallel --will-cite -j 16 -N 2 h="$h"'
  function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; }
  s={1}
  p={2}
  # Everything below uses paths relative to the segment directory.
  mkdir -p "$s/logs" || exit 1
  cd "$s" || exit 1
  # Sigh, should not have used this in the extraction ...
  jobid=$(tar -tf "${h}/${s}/extract_0.tar" logs/ | head -1 | \
          cut -f 2 -d / | cut -f 1 -d _)
  for e in $(grep -E "\b$s\b" ../in.txt | cut -f 1 | sort -u)
  # this could be parallel
  do
   lsf=lsl${e}.txt
   rm -f "$lsf"
   lff=()
   ii=()
   for i in $(grep -E "^$e\\s$s\\b" ../in.txt | cut -f 4)
   # this could be parallel
   do
    id=${p#CC-MAIN-*}-00$i
    echo "$id" 1>&2
    lf=logs/${jobid}_${i}_log
    lff+=("${lf}") # accumulate list of log files
    echo starting ${id} $(date) > "$lf"
    unpigz -dp 1 -c "${h}/${s}/CC-MAIN-${id}.warc.gz" | "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$lf"
    echo finished ${id} $(date) >> "$lf"
    # size and name columns of the files produced for this warc
    ls -l "${id}"_* | tr -s " " "\t" | cut -f 5,9 >> "$lsf"
    ii+=("*-00${i}_*")
    echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
   done
   # now compare ls vs. tar
   echo "${ii[@]}" ${#ii[@]} 1>&2
   echo lff "${lff[@]}" 1>&2
   # NOTE(review): GNU tar treats member arguments as glob patterns only
   # when --wildcards is in effect; confirm the environment enables it.
   tar -tvf "${h}/${s}/extract_${e}.tar" "${ii[@]}" | \
     tr -s " " "\t" | cut -f 3,6 | sort -k2.1,2.36 -k2.38,2n | diff -bw \
       - <(sort -k2.1,2.36 -k2.38,2n "$lsf") > "${e}_diff.txt"
   # count diff lines by their first space-separated field
   cut -f 1 -d " " "${e}_diff.txt" | sus > "${e}_check.txt"
   ni=${#ii[@]}
   if [ "$(wc -l < "${e}_check.txt")" -ne $((ni + 1)) ]
   then
    echo "extra lines in ${e}_check.txt" 1>&2
    exit 1
   fi
   if [ "$(grep -F -c a "${e}_check.txt")" -ne "$ni" ]
   then
    echo "non-addition lines in ${e}_check.txt" 1>&2
    exit 1
   fi
   # lines present only in the ls listing name the files to be added
   grep -E "^> " "${e}_diff.txt" | cut -f 2 > "${e}_new.txt"
   tar --delete -f "${h}/${s}/extract_${e}.tar" "${lff[@]}"
   tar --append -f "${h}/${s}/extract_${e}.tar" --files-from="${e}_new.txt" "${lff[@]}"
  done
  cd ..
 '