changeset 6:0f494c76a887

refactor to address tarred-up pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 23 Feb 2020 16:48:34 +0000
parents a28d731977da
children 25ca3505b4d7
files bin/doPlinks.sh bin/plinks.py bin/plinks.sh
diffstat 3 files changed, 28 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/bin/doPlinks.sh	Wed Feb 19 10:41:59 2020 +0000
+++ b/bin/doPlinks.sh	Sun Feb 23 16:48:34 2020 +0000
@@ -1,14 +1,17 @@
 #!/usr/bin/bash
-mkdir -p /dev/shm/x
-plinks.py $1
-# while read f
-# do
-#     if plinks.py $f > /dev/shm/x/links_${me}_${mine} 2>/dev/null
-#     then
-# 	((mine+=1))
-#     else
-# 	echo $f >> /dev/shm/x/badpdfs_$me
-# 	rm -f /dev/shm/x/links_${me}_${mine}
-#     fi
-# done
-rsync -a /dev/shm/x/ links
+hn=$1
+jn=$2
+tfn=$3
+
+mkdir -p /dev/shm/x$hn/${tfn}
+cd /dev/shm/x$hn/${tfn}
+tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.pdf'
+echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn
+ls *.pdf | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
+plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
+echo $(date) $hn tarring results from job $jn for $tfn in $(pwd)
+tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
+rm *
+cd ..
+rmdir ${tfn}
+echo $(date) $hn finished job ${jn} for ${tfn}
--- a/bin/plinks.py	Wed Feb 19 10:41:59 2020 +0000
+++ b/bin/plinks.py	Sun Feb 23 16:48:34 2020 +0000
@@ -7,20 +7,24 @@
   pdf=pdfx.PDFx(file)
   return pdf.get_references_as_dict()
 
-me=sys.argv[1]
-with open('/dev/shm/x/badpdfs_%s'%me,'w') as bf:
+tarnum=sys.argv[1]
+with open('badpdfs_%s'%tarnum,'w') as bf:
   for l in sys.stdin:
-    (fno,f)=l.rstrip().split()
+    (fno,f)=l.split()
     try:
       links=run(f)
       if bool(links) and (links.get('scrape',False) or
                           links.get('annot',False)):
-        with open('/dev/shm/x/links_%s'%fno,'w') as of:
+        with open('links_%s_%s'%(tarnum,fno),'w') as of:
           for k in links.keys():
             for l in links[k]:
               print("%s\t%s"%(k,l),file=of)
     except Exception as e:
-      print("%s\t%s"%(fno,e),file=bf)
+      if str(e)=='Unexpected EOF':
+        print("%s\t%s\t%s"%(tarnum,fno,e),file=bf)
+      else:
+        traceback.print_exc(file=bf)
+
     if (path.exists('/dev/shm/stopJob')):
-      print("Quiting early: %s %s"%(me,fno),file=sys.stderr)
+      print("Quiting early: %s %s"%(tarnum,fno),file=sys.stderr)
       exit(1)          
--- a/bin/plinks.sh	Wed Feb 19 10:41:59 2020 +0000
+++ b/bin/plinks.sh	Sun Feb 23 16:48:34 2020 +0000
@@ -1,8 +1,8 @@
 #!/usr/bin/bash
 mkdir -p $TMPDIR
 echo $(date) $(hostname)
-cd /beegfs/common_crawl/CC-MAIN-2019-35/pdfs
 h=$(hostname)
 hn=${h##*n}
-parallel --will-cite -j 71 --pipepart -a lm/$(hostname)_pdfFilesWithLM doPlinks.sh ${hn}_'{#}'
+if [ $hn -eq 0 ]; then echo {001..002}; else echo {003..004}; fi |\
+tr ' ' '\012' |parallel --will-cite -j 71 -N 1 doPlinks.sh ${hn} '{#}' '{}'
 echo $(date) $(hostname)