changeset 77:bfff01c139ea

bare framework working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 18:28:52 +0100
parents 6cf3dc7ff022
children 846b38f8b204
files bin/track.py
diffstat 1 files changed, 35 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/bin/track.py	Wed May 06 14:25:44 2020 +0100
+++ b/bin/track.py	Wed May 06 18:28:52 2020 +0100
@@ -5,6 +5,7 @@
 import re,sys,glob,gzip,json
 
 CDX=re.compile("(.*)\)(.*) (\{.*\})$")
+FAIL=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 seg=sys.argv[2]
 try:
@@ -16,6 +17,9 @@
     ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
     wwff=glob.glob(wwfp)
     ddff=glob.glob(ddfp)
+    wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))
+    buf=open("%s/%s/bu.txt"%(cc,seg))
+    log=open("%s/%s/hadoop.log"%(cc,seg))
     assert len(wwff)!=0,wwfp
     assert len(ddff)!=0,ddfp
 except:
@@ -39,10 +43,40 @@
             except:
                 print(resFileName,n,c,file=sys.stderr)
                 raise
-        print (n,len(res))
+        #print (n,len(res),file=sys.stderr)
     return res
 
 fetches=readCDX(wwff)
 diags=readCDX(ddff)
+trunc={}
+for l in wtf:
+    if l.startswith('WARC-'):
+        (k,rest)=l.split(' ',1)
+        if k=='WARC-Target-URI:':
+            uri=rest.rstrip()
+        elif k=='WARC-Truncated:':
+            trunc[uri]=rest.rstrip()
+bu=list(map(str.rstrip,buf.readlines()))
 
+fails={}
+for l in log:
+    r=FAIL.match(l)
+    if r:
+        (u,m2)=r.groups()
+        fails[u]=m2
 
+print("""For %s/%s:
+ %4s requested
+ %4s retrieved
+ %4s truncated
+ %4s diagnosed
+ %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)),
+      file=sys.stderr)
+
+for u in bu:
+    sig=0
+    sig+=8 if u in fetches else 0
+    sig+=4 if u in diags else 0
+    sig+=2 if u in fails else 0
+    sig+=1 if u in trunc else 0
+    print(format(sig,'04b'))