changeset 76:6cf3dc7ff022

Starting on a tool to assemble as complete a record as we have information for, with respect to a seed URI
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:25:44 +0100
parents 1c5dab2e1cb3
children bfff01c139ea
files bin/track.py
diffstat 1 files changed, 48 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/track.py	Wed May 06 14:25:44 2020 +0100
@@ -0,0 +1,48 @@
+#!/lustre/sw/miniconda3/bin/python3
+'''Track a list of URIs through nutch results'''
+# Usage: track.py year-nn segmentid [file]
+
+import re,sys,glob,gzip,json
+
+CDX=re.compile("(.*)\)(.*) (\{.*\})$")
+cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
+seg=sys.argv[2]
+try:
+    if len(sys.argv)==4:
+        uuf=open(sys.argv[3])
+    else:
+        uuf=sys.stdin
+    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
+    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
+    wwff=glob.glob(wwfp)
+    ddff=glob.glob(ddfp)
+    assert len(wwff)!=0,wwfp
+    assert len(ddff)!=0,ddfp
+except:
+    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
+    raise
+
+def readCDX(files):
+    res={}
+    c=None
+    # Ref. https://github.com/ikreymer/webarchive-indexing
+    for resFileName in files:
+        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
+            n=0
+            try:
+                for c in rf:
+                    r=CDX.match(c)
+                    (dom,path,props)=r.groups()
+                    d=json.loads(props)
+                    res[d["url"]]=d
+                    n+=1
+            except:
+                print(resFileName,n,c,file=sys.stderr)
+                raise
+        print (n,len(res))
+    return res
+
+fetches=readCDX(wwff)  # URL -> index entry, from the segment's cdx/warc files
+diags=readCDX(ddff)  # URL -> index entry, from the segment's cdx/crawldiagnostics files
+
+