changeset 7:25ca3505b4d7

more logging
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 24 Feb 2020 00:44:53 +0000
parents 0f494c76a887
children 3b56c2c9d0ee
files bin/doPlinks.sh bin/plinks.py bin/plinks.sh plinksJob.sh
diffstat 4 files changed, 21 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/doPlinks.sh	Sun Feb 23 16:48:34 2020 +0000
+++ b/bin/doPlinks.sh	Mon Feb 24 00:44:53 2020 +0000
@@ -6,12 +6,14 @@
 mkdir -p /dev/shm/x$hn/${tfn}
 cd /dev/shm/x$hn/${tfn}
 tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.pdf'
-echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn
+echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn 1>&2
 ls *.pdf | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
 plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
-echo $(date) $hn tarring results from job $jn for $tfn in $(pwd)
+echo $(date) $hn tarring $(ls badpdfs_*|wc -l)/$(ls links_*_*|wc -l) results from job $jn for $tfn in $(pwd) 1>&2
 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
+echo $(pwd) rm * 1>&2
 rm *
 cd ..
+echo $(pwd) rmdir ${tfn} 1>&2
 rmdir ${tfn}
 echo $(date) $hn finished job ${jn} for ${tfn}
--- a/bin/plinks.py	Sun Feb 23 16:48:34 2020 +0000
+++ b/bin/plinks.py	Mon Feb 24 00:44:53 2020 +0000
@@ -1,5 +1,6 @@
 #!/lustre/sw/miniconda3/bin/python3
-import sys,pdfx,traceback
+import sys,pdfx,traceback,os
+from datetime import datetime
 from os import path
 
 def run(file):
@@ -8,6 +9,8 @@
   return pdf.get_references_as_dict()
 
 tarnum=sys.argv[1]
+print(tarnum, sys.argv, os.getcwd(),file=sys.stderr)
+gf=0
 with open('badpdfs_%s'%tarnum,'w') as bf:
   for l in sys.stdin:
     (fno,f)=l.split()
@@ -15,16 +18,25 @@
       links=run(f)
       if bool(links) and (links.get('scrape',False) or
                           links.get('annot',False)):
+        gf+=1
         with open('links_%s_%s'%(tarnum,fno),'w') as of:
           for k in links.keys():
             for l in links[k]:
               print("%s\t%s"%(k,l),file=of)
     except Exception as e:
       if str(e)=='Unexpected EOF':
-        print("%s\t%s\t%s"%(tarnum,fno,e),file=bf)
+        print("%s:\t%s\t%s\t%s"%(datetime.now().isoformat(),
+                                 tarnum,fno,e),file=bf)
       else:
+        print("%s: "%(datetime.now().isoformat()),end='',file=bf)
         traceback.print_exc(file=bf)
 
     if (path.exists('/dev/shm/stopJob')):
-      print("Quiting early: %s %s"%(tarnum,fno),file=sys.stderr)
+      print("%s: Quiting early: %s %s"%(datetime.now().isoformat(),tarnum,fno),
+            file=sys.stderr)
       exit(1)          
+now=datetime.now().isoformat()
+print('%s: exiting from %s having found %s files with links out of %s'%(now,
+                                                                        tarnum,
+                                                                        gf,
+                                                                        fno))
--- a/bin/plinks.sh	Sun Feb 23 16:48:34 2020 +0000
+++ b/bin/plinks.sh	Mon Feb 24 00:44:53 2020 +0000
@@ -5,4 +5,4 @@
 hn=${h##*n}
 if [ $hn -eq 0 ]; then echo {001..002}; else echo {003..004}; fi |\
 tr ' ' '\012' |parallel --will-cite -j 71 -N 1 doPlinks.sh ${hn} '{#}' '{}'
-echo $(date) $(hostname)
+echo $(date) $(hostname) $?
--- a/plinksJob.sh	Sun Feb 23 16:48:34 2020 +0000
+++ b/plinksJob.sh	Mon Feb 24 00:44:53 2020 +0000
@@ -11,6 +11,7 @@
 cd ${PBS_O_WORKDIR}
 
 export MPI_SHEPHERD=true
+export MPI_UNBUFFERED_STDIO=true
 
 mpiexec_mpt -ppn 1 -n 2 bin/plinks.sh 2019-35