Mercurial > hg > cc > cirrus_home
changeset 7:25ca3505b4d7
more logging
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 24 Feb 2020 00:44:53 +0000 |
parents | 0f494c76a887 |
children | 3b56c2c9d0ee |
files | bin/doPlinks.sh bin/plinks.py bin/plinks.sh plinksJob.sh |
diffstat | 4 files changed, 21 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/doPlinks.sh Sun Feb 23 16:48:34 2020 +0000
+++ b/bin/doPlinks.sh Mon Feb 24 00:44:53 2020 +0000
@@ -6,12 +6,14 @@
 mkdir -p /dev/shm/x$hn/${tfn}
 cd /dev/shm/x$hn/${tfn}
 tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.pdf'
-echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn
+echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn 1>&2
 ls *.pdf | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
 plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
-echo $(date) $hn tarring results from job $jn for $tfn in $(pwd)
+echo $(date) $hn tarring $(ls badpdfs_*|wc -l)/$(ls links_*_*|wc -l) results from job $jn for $tfn in $(pwd) 1>&2
 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
+echo $(pwd) rm * 1>&2
 rm *
 cd ..
+echo $(pwd) rmdir ${tfn} 1>&2
 rmdir ${tfn}
 echo $(date) $hn finished job ${jn} for ${tfn}
--- a/bin/plinks.py Sun Feb 23 16:48:34 2020 +0000 +++ b/bin/plinks.py Mon Feb 24 00:44:53 2020 +0000 @@ -1,5 +1,6 @@ #!/lustre/sw/miniconda3/bin/python3 -import sys,pdfx,traceback +import sys,pdfx,traceback,os +from datetime import datetime from os import path def run(file): @@ -8,6 +9,8 @@ return pdf.get_references_as_dict() tarnum=sys.argv[1] +print(tarnum, sys.argv, os.getcwd(),file=sys.stderr) +gf=0 with open('badpdfs_%s'%tarnum,'w') as bf: for l in sys.stdin: (fno,f)=l.split() @@ -15,16 +18,25 @@ links=run(f) if bool(links) and (links.get('scrape',False) or links.get('annot',False)): + gf+=1 with open('links_%s_%s'%(tarnum,fno),'w') as of: for k in links.keys(): for l in links[k]: print("%s\t%s"%(k,l),file=of) except Exception as e: if str(e)=='Unexpected EOF': - print("%s\t%s\t%s"%(tarnum,fno,e),file=bf) + print("%s:\t%s\t%s\t%s"%(datetime.now().isoformat(), + tarnum,fno,e),file=bf) else: + print("%s: "%(datetime.now().isoformat()),end='',file=bf) traceback.print_exc(file=bf) if (path.exists('/dev/shm/stopJob')): - print("Quiting early: %s %s"%(tarnum,fno),file=sys.stderr) + print("%s: Quiting early: %s %s"%(datetime.now().isoformat(),tarnum,fno), + file=sys.stderr) exit(1) +now=datetime.now().isoformat() +print('%s: exiting from %s having found %s files with links out of %s'%(now, + tarnum, + gf, + fno))
--- a/bin/plinks.sh Sun Feb 23 16:48:34 2020 +0000
+++ b/bin/plinks.sh Mon Feb 24 00:44:53 2020 +0000
@@ -5,4 +5,4 @@
 hn=${h##*n}
 if [ $hn -eq 0 ]; then echo {001..002}; else echo {003..004}; fi |\
 tr ' ' '\012' |parallel --will-cite -j 71 -N 1 doPlinks.sh ${hn} '{#}' '{}'
-echo $(date) $(hostname)
+echo $(date) $(hostname) $?