Mercurial repository cirrus_work, changeset 120:1d1bd22124c0
moved from bin
| field | value |
|---|---|
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
| date | Thu, 28 Sep 2023 08:46:01 +0100 |
| parents | 1d12b51c4d59 |
| children | f8d5e5355c4c |
| files | bin/build_idx.py bin/count_warc.py bin/deltas.py bin/fix_key.py bin/ix.py bin/lmh_warc.py bin/merge_date.py bin/per_segment.py bin/percent_encode.py bin/plinks.py bin/sort_date.py bin/spearman.py bin/test_warc.py bin/warc.py lib/python/cc/build_idx.py lib/python/cc/count_warc.py lib/python/cc/fix_key.py lib/python/cc/ix.py lib/python/cc/lmh_warc.py lib/python/cc/merge_date.py lib/python/cc/per_segment.py lib/python/cc/percent_encode.py lib/python/cc/sort_date.py lib/python/cc/spearman.py lib/python/cc/test_warc.py lib/python/cc/warc.py lib/python/deltas.py lib/python/plinks.py |
| diffstat | 28 files changed, 1237 insertions(+), 1237 deletions(-) |
--- a/bin/build_idx.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -#!/usr/bin/python3 -'''Turn a merge_nnn.log file into a cluster.idx file - We cheat and use the old cluster.idx to save having to read - all the cdx-....gz files''' -import sys - -with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx: - i=-1 - curpos=0 - target="cdx-00%03d.gz"%i - log=open("/dev/null",'r') # embarassing hack - for ol in oidx: - (surt, datestamp, file, offset, length, cnt) = ol.split() - if file!=target: - i+=1 - target="cdx-00%03d.gz"%i - log.close() - curpos=0 - log=open('merge_%d.log'%(i+1),'r') - hdr=log.readline() - (j,f) = hdr.split() - sys.stderr.write(hdr) - if int(j)!=i+1: - raise ValueError("wrong file: i=%s, j=%s"%(i,j)) - nl=log.readline() - if not nl: - sys.stderr.write('quiting early: %s\n'%i) - exit(1) - nlen=int(nl) - nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt)) - curpos+=nlen
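For orientation, the cluster.idx rows this script consumes and re-emits are six whitespace-separated fields (SURT key, timestamp, cdx shard name, offset, length, count); only offset and length are recomputed, from the record lengths read out of the corresponding merge_nnn.log. A minimal sketch with made-up field values:

```python
# Hypothetical cluster.idx row; the field values are illustrative only.
row = "com,example)/ 20230928084601\tcdx-00000.gz\t0\t187\t42\n"
(surt_key, datestamp, shard, offset, length, cnt) = row.split()
print(surt_key, shard, int(offset), int(length), int(cnt))
```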
--- a/bin/count_warc.py Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-#!/usr/bin/env python3
-import warc,sys
-
-OUT=open(sys.stdout.fileno(),'wb')
-
-if (debug:=(sys.argv[1]=='-d')):
-    sys.argv.pop(1)
-
-def countme(wtype,buf,part):
-    if debug:
-        breakpoint()
-    OUT.write(b"%d\n"%len(buf))
-
-#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
-#warc(showme,[b'response'],whole=True)
-
-warc.warc(sys.argv[1],countme,[b'response'],parts=int(sys.argv[2]),debug=debug)
--- a/bin/deltas.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#!/usr/bin/python3 -'''Extract and tabulate runtimes per file from a slurm output log''' -import sys, re -from datetime import datetime -pending={} -first=None -SPAT=re.compile('... (.*) BST start ([0-9]+ [0-9]+)') -EPAT=re.compile('... (.*) BST end ([0-9]+ [0-9]+)') -with open(sys.argv[1],'r') as f: - for l in f: - if m:=SPAT.match(l): - b=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p") - id=m[2] - if id in pending: - print('%s started twice at %s, %s'%(id,pending[id],b),file=sys.stderr) - else: - pending[id]=b - if first is None: - first=b - elif m:=EPAT.match(l): - e=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p") - id=m[2] - if id in pending: - delta=(e-pending[id]).seconds - print(delta,"%2d:%02d"%(delta/60,delta%60),sep='\t') - del pending[id] - else: - print('%s ended w/o start at %s'%(id,e),file=sys.stderr) -w=(e-first).seconds -sys.stdout.flush() -print('From %s to %s:'%(first.strftime("%d %b %Y %I:%M:%S %p"), - e.strftime("%d %b %Y %I:%M:%S %p")),file=sys.stderr) -print(' %d:%02d:%02d'%(w/3600,(w/60)%60,w%60),(e-first).seconds,sep='\t',file=sys.stderr)
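A sketch of the slurm log lines the two patterns expect; the timestamp and ids below are invented, but the shape (weekday, datetime, 'BST', start/end, two space-separated numbers identifying the work item) is what SPAT and EPAT match:

```python
import re
from datetime import datetime

SPAT = re.compile('... (.*) BST start ([0-9]+ [0-9]+)')

line = "Thu 28 Sep 2023 08:46:01 AM BST start 1234567 89\n"  # hypothetical
m = SPAT.match(line)
started = datetime.strptime(m[1], "%d %b %Y %I:%M:%S %p")
print(m[2], started.isoformat())   # 1234567 89 2023-09-28T08:46:01
```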
--- a/bin/fix_key.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -#!/usr/bin/python3 -from percent_encode import percent_encode -from urllib.parse import quote, unquote -import sys - -# From RFC-3986: -# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" -# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" -# / "*" / "+" / "," / ";" / "=" -# But # _is_ escaped in Java surt results -# and additionally " \ : < = > ? \ ^ _ ` { | } are not -# Note also that quote already does _not_ quote - . / _ ~ - -# Also, Java surt strips _all_ leading 'www.', -# where python3 surt only strips the first one. - -with open(sys.argv[1],"r") as f: - for l in f: - while l.endswith(',www',0,ploc:=l.index(')')): - l=l[:ploc-4]+l[ploc:] - if '%' in l: - (key,wt,ts)=l.split('\t') - sys.stdout.write(quote(unquote(key,errors='percent'), - safe='!"$&\'()*+,:;<=>?@[\\]^`{|}').lower()) - sys.stdout.write('\t') - sys.stdout.write(wt) - sys.stdout.write('\t') - sys.stdout.write(ts) - else: - sys.stdout.write(l) -
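The while loop reproduces the Java surt behaviour of stripping every leading 'www' label from the key, not just the first. A worked example with a hypothetical key:

```python
l = "com,example,www,www)/index.html"   # hypothetical SURT key
while l.endswith(',www', 0, (ploc := l.index(')'))):
    l = l[:ploc-4] + l[ploc:]
print(l)   # com,example)/index.html
```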
--- a/bin/ix.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,315 +0,0 @@ -#!/usr/bin/env python3 -'''Extract response records from Common Crawl WARC-format files -given length, offset and filename triples. -Input one triple on command line, or -triples from stdin as tab-delimited lines -or complete cdx index lines. -In all cases by 'filename' is meant crawlid/segmentid/type/filename - -Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' - -import sys, argparse, regex, os, shutil, io, gzip, time, shlex -from isal import igzip -from subprocess import Popen, PIPE -#import asyncio - -HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') -BINOUT=sys.stdout.buffer -FPAT="/%s/%s/orig/%s/%s" - -CMD_PROC=None -TMPFILENAME=None - -class HackFormat(argparse.RawDescriptionHelpFormatter): - def format_help(self): - FOO=argparse.RawDescriptionHelpFormatter.format_help(self) - return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', - FOO) - -def process(options,buf,filename,offset,length,whole): - try: - process0(options,buf,filename,offset,length,whole) - except Exception as e: - if options.debug: - import traceback - traceback.print_exc(file=sys.stderr) - else: - print("Process fail: %s, input line:\n %s"%(e,l), - file=sys.stderr,end='') - exit(3) - -def process0(options,buf,filename,offset,length,whole): - global TMPFILENAME, TMPFILE - if options.save: - (tf,TMPFILENAME)=tempfile.mkstemp() - TMPFILE=open(tf,mode='wb') - if options.cmd and not options.process: - launch(options.cmd) - process1(options,buf,filename,offset,length,whole) - if options.save: - TMPFILE.close() - if options.cmd: - _output_subproc(bytes(TMPFILENAME,'utf-8')) - _output_subproc(b"\n") - else: - BINOUT.write(bytes(TMPFILENAME,'utf-8')) - BINOUT.write(b"\n") - if options.cmd: - if not options.process: - windup(filename,options,length) - if options.save: - os.unlink(TMPFILENAME) - TMPFILENAME=None - elif options.save: - print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr) - TMPFILENAME=None - -def launch(cmd): - global CMD_PROC, BINOUT - CMD_PROC=Popen(shlex.split(cmd),stdin=PIPE,bufsize=0) - BINOUT=CMD_PROC.stdin - -def windup(length,offset,filename): - # Wind up subproc - BINOUT.close() - if CMD_PROC.wait()!=0: # could/should be async? 
- print("subproc of %s:%s:%s failed with %s"%(length,offset,filename, - CMD_PROC.returncode), - file=sys.stderr) - -def _output_tmpfile(buf): - TMPFILE.write(buf) - -def _output_stdout(buf): - BINOUT.write(buf) - -def _output_subproc(buf): - toWrite=len(buf) - while toWrite>0: - toWrite -= BINOUT.write(buf) - -def process1(options,buf,filename,offset,length,whole): - root=options.root - rfn=root+filename - if root!="/beegfs/common_crawl": - # Support using ramdisk or other local disk as a faster cached - if not os.path.exists(rfn): - if not os.path.exists(os.path.dirname(rfn)): - os.makedirs(os.path.dirname(rfn)) - with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \ - io.FileIO(rfn,'w') as outfile: - #shutil.copyfileobj(infile,outfile,128*1024*1024) - while True: - l=infile.readinto(buf) - if l==0: - break - outfile.write(memoryview(buf)[:l]) - file=open(rfn,'rb',0) - file.seek(offset) - bv=memoryview(buf)[:length] - nb=file.readinto(bv) - file.close() - if nb!=length: - raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(file.name, - nb,length,offset)) - if whole and options.zipped: - _output(bv) - return - gzip_chunk = io.BytesIO(bv) - uv=memoryview(buf)[length:] - with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: - ll=0 - while True: - l=gzip_fin.readinto(uv) - if not l: - break - ll+=l - cb=memoryview(uv)[:ll] - if whole: - _output(cb) - return - # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted - state=0 - tr=None # Was this record truncated? - bl=None # for HTTP Content-Length for the length of the body? - # Could we make this faster by working purely within the cb memoryview? - # It would be messy, but avoid copying huge amounts - # The outer loop would just be something like - # clbv=memoryview(bytearray(b"Content-Length: ")) - # i=s=0 - # while i<ll: - # if cb[i]==10: # need to handle \r\n - # L=cb[s:i] - # s=i=i+1 - # if L[:16]==clbv: - # wl=int(L[16:]) - # else: - # i+=1 - # - with io.BytesIO(cb) as clear_text: - for L in clear_text: - if state==0: - # WARC header - if L.startswith(b"Content-Length: "): - wl=int(L[16:].rstrip()) - elif L.startswith(b"WARC-Truncated: "): - tr=L[16:].rstrip() - tr="EMPTY" if tr=="" else tr - elif L==b"" or L.startswith(b"\r"): # for idempotency - # Blank line, WARC header is finished - if not (options.headers or options.body): - return - state=1 - # Note we preserve the empty line - if options.warc: - _output(L) - continue - if state==1: - # HTTP header - wl -= len(L) - if not (L==b"" or L.startswith(b"\r")): - # Non-blank, it's a header - (h,_,v)=L.partition(b": ") - if bl is None and (h==b"Content-Length"): - bl=int(v) - if options.headers: - if isinstance(options.headers,dict): - if h in options.headers: - options.headers[h]=v - else: - _output(L) - else: - # Blank line, HTTP header is finished - if isinstance(options.headers,dict): - _output(bytes(str(options.headers),'utf-8')) - if not options.body: - return - if options.headers: - _output(L) - state=2 - # The above is just for sanity, because we do _not_ - # continue with the outer loop, - # since we can now block-output the entire rest of the - # input buffer. 
- if bl is not None: - if bl!=wl: - print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ - (length,offset,filename,wl,bl,tr),file=sys.stderr) - # HTTP body - balance=clear_text.tell() - #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) - # Output whatever is left - _output(cb[balance:balance+wl]) - return - -def main(): - global _output,TMPFILE,TMPFILENAME,tempfile - parser = argparse.ArgumentParser( - description='''Extract records from warc files given length, offset and file triples. - Input one triple on command line, or - triples from stdin as tab-delimited lines - or complete cdx index lines. - In all cases by 'filename' is meant crawlid/segmentid/type/filename''', - epilog='''Note that if no output flag(s) is/are given, - the whole WARC record will be output, more efficiently than - would be the case if all three flags were given.''', - add_help=False, - conflict_handler='resolve', - formatter_class=HackFormat - ) - fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') - parser.add_argument('--help',help='Show help',action='help') - parser.add_argument('-d','--debug',help='Debug output',action='store_true') - parser.add_argument('-w','--warc',help='output WARC headers', - action='store_true') - parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output', - nargs='?',default=None,const=True) - parser.add_argument('-b','--body',help='output HTTP body', - action='store_true') - parser.add_argument('-c','--cmd',help='pipes each result thru CMD') - parser.add_argument('-p','--process',help='with -c, launches CMD only once', - action='store_true') - parser.add_argument('-m','--module.function',help='module.function to call with a stream'), - parser.add_argument('-s','--save',action='store_true', - help="write to a temporary file and output the name") - parser.add_argument('-f','--fpath', - help=fphelp, - default=FPAT) - parser.add_argument('-r','--root',nargs='?', - help='File path root, create a copy there if necessary', - default='/beegfs/common_crawl'), - parser.add_argument('-z','--zipped', - help="output raw gzipped record, ignored if any of -bhw supplied", - action='store_true') - sg=parser.add_mutually_exclusive_group() - sg.add_argument('-x','--index', - help='take lines of triples from a cdx index file as input', - action='store_true') - sg.add_argument('length',type=int, - help='length in bytes of gzipped record', - nargs='?') - parser.add_argument('offset',type=int, - help='start position in bytes of gzipped record', - nargs='?') - parser.add_argument('filename', - help='pathname of gzipped Common Crawl WARC-format file', - nargs='?') - # Hack the order of optional and positional in the help output - parser._action_groups.sort(key=lambda g:g.title) - #parser.print_help() - pa=parser.parse_args(sys.argv[1:]) - #print(pa,file=sys.stderr) - if pa.length is not None: - # We have to enforce our own check.. 
- if pa.offset is None or pa.filename is None: - parser.error("length, offset and filename must all be supplied together") - if isinstance(pa.headers,str): - pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(',')) - - buf=bytearray(128*1024*1024) - - whole=not (pa.warc or pa.headers or pa.body) - if pa.save: - _output=_output_tmpfile - import tempfile - elif pa.cmd: - _output = _output_subproc - else: - _output = _output_stdout - if pa.cmd and pa.process: - launch(pa.cmd) - # three different ways to process - if pa.index: - CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet... - for l in sys.stdin: - m=CDX.search(l) - if m is None: - if l.find('/robotstxt/')>-1: - continue - print("index line problem: \"%s\""%l,file=sys.stderr,end='') - exit(2) - filename=pa.fpath%(m[3:7]) - process(pa,buf,filename, - int(offset:=m[2]),int(length:=m[1]),whole) - elif pa.length is not None: - print(pa.filename,file=sys.stderr) - process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), - pa.offset,pa.length,whole) - else: - print("Reading length, offset, filename tab-delimited triples from stdin...", - file=sys.stderr) - for l in sys.stdin: - try: - (length,offset,filename)=l.rstrip().split('\t') - length=int(length) - offset=int(offset) - except ValueError as e: - parser.error('Invalid input line: %s\n "%s"'%(e,l)) - process(pa,buf,pa.fpath%tuple(filename.split('/')), - offset,length,whole) - # processing done one way or another - if pa.cmd and pa.process: - windup(length,offset,filename) - # if pa.save and pa.process, deleting temp files is down to cmd -if __name__ == "__main__": - main()
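The core retrieval step in process1 can be shown standalone: seek to the byte offset inside the gzipped WARC file, read exactly length bytes, and decompress that chunk to recover one record (each record is stored as its own gzip member, which is what makes this work). A minimal sketch using stdlib gzip rather than isal.igzip, with a hypothetical path/offset/length triple:

```python
import gzip, io

def fetch_record(path, offset, length):
    '''Return one decompressed WARC record stored as its own gzip member.'''
    with open(path, 'rb') as f:
        f.seek(offset)
        chunk = f.read(length)                 # the raw gzipped record
    with gzip.GzipFile(fileobj=io.BytesIO(chunk)) as gz:
        return gz.read()                       # WARC header + HTTP header + body

# rec = fetch_record('/beegfs/common_crawl/.../orig/warc/....warc.gz',
#                    offset=123456, length=7890)   # values are illustrative
```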
--- a/bin/lmh_warc.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -'''Extract identifying info + LastModified header value for all entries - that have one - - Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' - -import re,warc,sys,glob,codecs - -TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) -DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) -LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) - -DTAB=bytearray(range(256)) -DDEL=b'TZ-:' - -OUT=open(sys.stdout.fileno(),'wb') - -def showmeLMH(wtype,buf,part): - global URI, DATE, SEGMENT, FILETYPE, FILENO - if part==1: - if (m:=TUPAT.search(buf)): - URI=m[1] - else: - raise ValueError(b"No target URI in %s ??"%buf) - if (md:=DPAT.search(buf)): - DATE=md[1] - else: - raise ValueError(b"No date in %s ??"%buf) - else: - mm=LMPAT.search(buf) - OUT.write(URI) - if mm: - OUT.write(b'\t') - OUT.write(DATE.translate(DTAB,DDEL)) - OUT.write(b'\t') - OUT.write(SEGMENT) - OUT.write(b'\t') - OUT.write(FILETYPE) - OUT.write(b'\t') - OUT.write(FILENO) - OUT.write(b'\t') - OUT.write(mm[1]) - OUT.write(b'\n') - -(CCdate, segment, filetype, fileno) = sys.argv[1:] -fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( - CCdate, segment, filetype, fileno) - -SEGMENT=codecs.encode(segment,'ascii') -FILETYPE=codecs.encode(filetype,'ascii') -FILENO=codecs.encode(fileno,'ascii') - -warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) -
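The DTAB/DDEL pair is an identity translation table plus a delete set, used to squash the WARC-Date header into a compact timestamp. A quick illustration (the date value is hypothetical):

```python
DTAB = bytearray(range(256))   # identity translation table
DDEL = b'TZ-:'                 # characters to delete

print(b'2023-09-28T08:46:01Z'.translate(DTAB, DDEL))   # b'20230928084601'
```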
--- a/bin/merge_date.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -#!/usr/bin/python3 -'''Add timestamps from Last-Modified-dated (ks.tsv) files into - that year's index - -Usage: merge_date.py ksvstream cdx-dir outdir - -ksvstream consists of tab-separated key, CC date, url and Unix timestamp -''' # ' - -import sys, io, os, os.path, time, re -from isal import igzip - - -DEBUG = 0 -while sys.argv[1] == '-d': - sys.argv.pop(1) - DEBUG += 1 - -XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] -NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] - -RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' -b'(crawldiagnostics|robotstxt)/') -SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' - b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' - b'=[^&]*)') -ISESSION = re.compile(SESSION.pattern,flags=re.I) -URL=re.compile(b'\{"url": "([^"]*)"') -WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') - -# Above based on this from broken Java code: -# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 -#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), -#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), -#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), -#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), -#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", - -#print(sys.argv[3],NPATH,file=sys.stderr) - -os.makedirs(sys.argv[3], exist_ok=True) - -FN = 0 - -XCNT = WCNT = 0 -DCNT = 0 - -XF = igzip.IGzipFile(filename=XPATH%0) -NF = open(NN:=(NPATH%0),'wb') - -def nextLine(): - '''Move on to next index file if current has run out''' - global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT - while True: - xl=XF.readline() - XCNT += 1 - if xl == b'': - # need to move to next index file - FN += 1 - XF.close() - NF.close() - print(NN, flush=True) # so we can compress it - print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True) - time.sleep(0.1) # so they flush? 
- XN=XPATH%FN - if not os.path.exists(XN): - return None - XF = igzip.IGzipFile(filename=XN) - NF = open((NN:=NPATH%FN), 'wb') - xl = XF.readline() - WCNT = XCNT = 1 - if WARC.search(xl): - WCNT += 1 - return xl - else: - NF.write(xl) - if DEBUG: - sys.stderr.write("out_rc\n") - - -def nextDate(df,dn): - global DEBUG, DCNT, XCNT - dl = df.readline() - if dl == b'': - # write out the last of the last index file, if any - return "", "", "", 0 - if DEBUG: - sys.stderr.write("dl%s: %s\n"%(dn,dl)) - dkey, ddate, durl, dtime = dl.split(b'\t') - DCNT += 1 - return dkey, ddate, durl, dtime - -with open(sys.argv[1], 'rb') as df: - DCNT = 0 - - dkey, ddate, durl, dtime = nextDate(df,1) - - while (xl := nextLine()) is not None: - xkey, xdate, xprops = xl.split(b' ', maxsplit=2) - m = URL.match(xprops) - if m: - xurl = m[1] - else: - raise ValueError("No url in %s"%xprops) - if DEBUG: - sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') - for xp in (xkey, xdate, xurl)))) - if dkey==xkey and ddate==xdate and durl==xurl: - # Got it - NF.write(xkey) - NF.write(b' ') - NF.write(xdate) - NF.write(b' ') - NF.write(xprops[:-2]) - NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) - if DEBUG: - sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') - for xp in (xkey, xdate, xurl)))) - sys.stderr.write(" %d\n"%int(dtime[:-3])) - - dkey, ddate, durl, dtime = nextDate(df,2) - continue - else: - if dkey and xkey.decode('ascii')>(dkey.decode('ascii')): - # we've missed something, disaster looms - print("Fail2:" - " xkey: %s\n" - " dkey: %s\n" - " xdate: %s\n" - " ddate: %s\n" - " xurl: %s\n" - " durl: %s\n" - "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(xkey, dkey, xdate, ddate, - xurl, durl, - FN, XCNT, DCNT, xl), - file=sys.stderr) - # try to force recovery - dkey, ddate, durl, dtime = nextDate(df,3) - continue - # else fall through to write - NF.write(xl) - if DEBUG: - sys.stderr.write("out_nl\n")
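Structurally, the loop above is a sorted merge join: both streams are assumed ordered by (key, date, url); index lines with a match get a lastmod field appended, everything else passes through unchanged, and on an ordering violation it reports and advances the dated stream to resynchronise. A minimal sketch of the underlying pattern (names hypothetical, field handling simplified):

```python
def merge_join(index_rows, dated_rows):
    """index_rows: (key, date, url, props) tuples; dated_rows: (key, date, url, ts);
    both iterables sorted on (key, date, url)."""
    dated = iter(dated_rows)
    d = next(dated, None)
    for key, date, url, props in index_rows:
        if d is not None and (key, date, url) == d[:3]:
            yield key, date, url, props, d[3]   # annotated with the timestamp
            d = next(dated, None)
        else:
            yield key, date, url, props, None   # passed through unchanged
```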
--- a/bin/per_segment.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -#!/usr/bin/python3 -'''refactor a per-cdx count table to be per-segment -input on STDIN -Usage: per_segment segment-column -Assumes column 0 is empty, count is in column 1 -Segment column is 0-origin -''' - -import sys - -c=int(sys.argv[1]) - -ss=[dict() for i in range(100)] - -for l in sys.stdin: - try: - cc=l.split('\t') - s=int(cc.pop(c)) - n=int(cc.pop(1)) - ll='\t'.join(cc[1:]) # note we ditch the initial empty column - #print(s,n,cc,ll,sep='|') - #exit(0) - t=ss[s].get(ll,0) - ss[s][ll]=t+n - except: - sys.stdout.write(l) - print(cc) - exit(1) - -# note this won't work if c is last column! -for s in range(100): - with open('s%s.tsv'%s,'w') as f: - for (l,c) in sorted(ss[s].items(),key=lambda p:p[1],reverse=True): - f.write(str(c)) - f.write('\t') - f.write(l)
--- a/bin/percent_encode.py Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-'''Handle unquoting of non-UTF-8 bytes by %-encoding them'''
-import codecs
-
-def percent_encode(ude):
-    #print(ude.object,ude.object[ude.start:ude.end])
-    return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
-            ude.end)
-
-codecs.register_error('percent',percent_encode)
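What registering the 'percent' error handler buys, concretely: urllib.parse.unquote first %-decodes to bytes and then decodes as UTF-8, so with errors='percent' any byte sequence that is not valid UTF-8 is simply re-escaped rather than raising. A small sketch (the handler must be registered first, as this module does on import):

```python
import codecs
from urllib.parse import unquote

def percent_encode(ude):
    return (''.join('%%%X' % c for c in ude.object[ude.start:ude.end]),
            ude.end)

codecs.register_error('percent', percent_encode)

print(unquote('caf%C3%A9', errors='percent'))   # 'café'   -- valid UTF-8, decoded
print(unquote('caf%E9', errors='percent'))      # 'caf%E9' -- lone 0xE9 re-escaped
```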
--- a/bin/plinks.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -import sys,pdfx,traceback,os -from datetime import datetime - -def run(file): - try: - pdf=pdfx.PDFx(file) - links=pdf.get_references_as_dict() - if bool(links) and (links.get('scrape',False) or - links.get('annot',False)): - for k in links.keys(): - for l in links[k]: - print("%s\t%s"%(k,l)) - else: - print("None") - except Exception as e: - if str(e)=='Unexpected EOF': - print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr) - print("badpdf") - else: - print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr) - traceback.print_exc(file=sys.stderr) - -if sys.argv[1]=='-': - i=0 - for l in sys.stdin: - print(i,file=sys.stderr) - i+=1 - f=l.rstrip() - if os.path.getsize(f)==1048576: # truncated - print("truncated",file=sys.stderr) - print("truncated") - else: - run(f) - os.unlink(f) -else: - run(sys.argv[1])
--- a/bin/sort_date.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,142 +0,0 @@ -#!/usr/bin/python3 -'''Process output of lmh_warc [new 3-column version] - Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") -''' - -# Assumes you have used grep -v $'\t' on input for speed -# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' -# to fix a common 'bad' timestamp (~ .2% of inputs) - -import email.utils -import sys -from urllib.parse import urlsplit, quote, unquote -import surt - -import re, codecs -from itertools import chain - -WPAT = re.compile('(,www\\d*)+\\)') - -# Thanks to https://stackoverflow.com/a/8776871 -import locale -from functools import cmp_to_key - -def percent_encode(ude): - #print(ude.object,ude.object[ude.start:ude.end]) - return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), - ude.end) - -codecs.register_error('percent',percent_encode) - -def _u_esc(c): - if c<65536: - return '\\u%04X'%c - else: - return '\\U%08X'%c - -def java_unicode_encode(ude): - '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn''' - return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]), - ude.end) - -codecs.register_error('java_unicode',java_unicode_encode) - -# From RFC-3986: -# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" -# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" -# / "*" / "+" / "," / ";" / "=" -# But # _is_ escaped in Java surt results -# and additionally " \ : < = > ? \ ^ _ ` { | } are not - -# Note also that although quote already does _not_ quote - . / _ ~ -# they are included below as that's what we find in surt.surt 0.3.1 - -# Also, Java surt strips _all_ leading 'www\d*.', -# where python3 surt only strips the first one. - -# And Java strips so-called option session-ids, but python doesn't - -import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer -import surt.URLRegexTransformer - -ident = ''.join(chr(i) for i in range(256)).encode('latin-1') - -IDMAP=bytes.maketrans(ident,ident) - -# For removal of non-printing characters: -# Note, this is only a guess, only example so are is DEL -NONPRINT= ''.join(chr(i) for i in chain(range(9), - range(14,32), - [127] # DEL - )).encode('latin-1') - -def notDefaultCanon(hu,**options): - if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): - # Try to fix the incompatibility between Java and - # Python surt handling of 'octal' numbers in numeric IPv4 addresses - # and it should! 
See "After this line: - # - # 15,225,107,143)" in .../azure/notes.txt - try: - bytestrs = hu.host.split(b'.') - hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) - except ValueError: - pass - if hu.query: - hu.query = hu.query.translate(IDMAP,delete=NONPRINT) - return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) - -# Hack this to reproduce the Java bug -surt.URLRegexTransformer._RES_QUERY_SESSIONID = [ - re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), - ] - -# Above based on this from broken Java code: -# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 -#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), -#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), -#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), -#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), -#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", - -def cdx_key(uristring): - _surt = quote(unquote(surt.surt(unquote(uristring), - canonicalizer=notDefaultCanon), - errors='percent'), - safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' - ).lower() - # Wrt \x7f (DEL), see discussion in notes wrt - # "biz,televida)" case - # It remains to be seen whether other non-printing bytes - # will need to be treated as 'safe' - return WPAT.sub(')',_surt) - -def keyed(l): - uri, cc_stamp, dateTime = l.split('\t',2) - #print('ul',uri,file=sys.stderr) - try: - try: - epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() - except OverflowError: - epoch = 32535215999.0 - return ((cdx_key(uri), cc_stamp, uri), epoch) - except (TypeError,IndexError,ValueError) as e: - print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) - return - -fstr = sys.argv[1] - -with open(fstr,"r") as ff: - # crucial that the following is done _after_ the file is opened - # with the default (utf-8) locale! - locale.setlocale(locale.LC_ALL, "C") - ctk=cmp_to_key(locale.strcoll) - for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None), - key=lambda x:x[0]): - print(key[0],key[1], - key[2].encode('ascii',errors='java_unicode').decode('ascii'), - ts,sep='\t')
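The java_unicode error handler exists so that non-ASCII characters in URLs are escaped Java-style (uppercase \uXXXX) rather than with Python's default lowercase \xNN form. A quick illustration:

```python
import codecs

def _u_esc(c):
    return '\\u%04X' % c if c < 65536 else '\\U%08X' % c

def java_unicode_encode(ude):
    return (''.join(_u_esc(ord(ch)) for ch in ude.object[ude.start:ude.end]),
            ude.end)

codecs.register_error('java_unicode', java_unicode_encode)

print('café'.encode('ascii', errors='java_unicode'))
# yields the bytes caf\u00E9; compare errors='backslashreplace', which gives caf\xe9
```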
--- a/bin/spearman.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,188 +0,0 @@ -#!/usr/bin/env python3 -'''Rank correlation processing for a csv tabulation of counts by segment - First column is for whole crawl, then 100 columns for segs 0-99 - Each row is counts for some property, e.g. mime-detected or tld - - For example, assuming all.tsv has the whole-crawl warc-only counts - and s...tsv have the segment counts, all with counts in column 1, - - tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv - - will produce such a file with - * 100 rows, one for each of the top 100 counts - * 101 columns, 0 for all and 1--100 for segs 0--99 - - Usage: python3 -i spearman.py name id - where name.csv has the input -''' - -import numpy as np -from numpy import loadtxt -from scipy import stats -import statsmodels.api as sm -import matplotlib.pyplot as plt -import pylab - -import sys, math - -def qqa(): - # q-q plot for the whole crawl - sm.qqplot(all, line='s') - plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id) - plt.show() - -def qqs(): - # q-q plots for the best and worst (by variance) segments - global xv, xworst, xbest - xv=[d.variance for d in xd] - xworst=xv.index(max(xv)) - xbest=xv.index(min(xv)) - print(xbest,xworst) - sm.qqplot(x[xbest], line='s') - plt.gca().set_title('Best segment (least variance): %s'%xbest) - plt.show() - sm.qqplot(x[xworst], line='s') - plt.gca().set_title('Worst segment (most variance): %s'%xworst) - plt.show() - -def plot_x(sort=False,block=True,all_only=True,title=None): - # Make these two subplots, w. and w/o sorting - # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot - # for legend hacking - if sort: - aso=np.argsort(-all) - plot_all=all[aso] - plot_x=np.array([xd[i].mean for i in range(N)])[aso] - else: - plot_all=all - plot_x=[xd[i].mean for i in range(N)] - if title is None: - l1='Rank correlation of segment x whole crawl' - l2='Mean of segment x whole crawl' - plt.legend(loc='best',fontsize='small') - else: - l1=l2=None - plt.plot(plot_all,'rx',label=l1) - plt.plot([0,N-1],[all_m,all_m],'r',label=l2) - if not(all_only): - plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') - plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') - plt.axis([0,N-1,0.85 if all_only else 0.8,1.0]) - plt.grid(True) - if title is not None: - plt.title(title) - plt.show(block=block) - -def hist_x(align='mid'): - hist(xm,xsd,[xd[i].mean for i in range(N)], - 'Mean of rank correlation of each segment x all other segments', - align) - -def hist_all(align='mid'): - hist(all_m,np.sqrt(all_s.variance),all, - 'Rank correlation of each segment x whole crawl %s'%id, - align) - -def hist(m,sd,hh,title,align): - sdd=[(i,m-(i*sd)) for i in range(-2,3)] - fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 - sdax=hax.twiny() - hax.hist(hh,color='lightblue',align=align) - hax.set_title(title) - for s,v in sdd: - sdax.plot([v,v],[0,18],'b') - sdax.set_xlim(hax.get_xlim()) - sdax.set_ylim(hax.get_ylim()) - sdax.set_xticks([v for s,v in sdd]) - sdax.set_xticklabels([str(s) for s,v in sdd]) - plt.show() - -def ci(rho,n,conf=0.95): - # Courtesy of https://stats.stackexchange.com/a/18904 - # rho is (rank) correlation, n is sample size - stderr=1.0/math.sqrt(n-3) - 
z=stats.norm.ppf(1.0-((1.0-conf)/2)) - delta=z*stderr - lower=math.tanh(math.atanh(rho)-delta) - upper=math.tanh(math.atanh(rho)+delta) - return (lower,upper) - -def plot_ci(rhos,n,trim=None,conf=0.95): - # rhos are (rank) correlation values - rhos_s=rhos[(-rhos).argsort()] - if trim is None: - l=len(rhos) - else: - rhos_s=rhos_s[:trim] - l=trim - cc=(np.array([ci(r,n,conf) for r in rhos_s])).T - ue=cc[1]-rhos_s - le=rhos_s-cc[0] - #for i in range(len(rhos)): - #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) - plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') - plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100)) - plt.show() - -def first_diff(ranks): - # first disagreement with baseline == {1,2,...} - for i in range(len(ranks)): - if ranks[i]!=i+1.0: - return i - return i+1 - -def ranks(): - # Combine segment measures: - # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement - # convert to ranks, smallest value == highest rank - all_ranked=stats.rankdata(-all,method='average') # invert since - # large corr is good - x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) - # small corr variance is good - x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) - # invert since - # large mean corr is good - fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) - # invert since - # large first diff is good - return np.array([[i, - all_ranked[i], - x_variance_ranked[i], - x_mean_ranked[i], - fd_ranked[i]] for i in range(N)]) - -def main(): - global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr - global aa, aa_by_all, N - counts=loadtxt(sys.argv[1]+".csv",delimiter=',') - id=sys.argv[2] - N=counts.shape[1]-1 - # "If axis=0 (default), then each column represents a variable, with - # observations in the rows" - # So each column is a sequence of counts, for whole crawl in column 0 - # and for segments 0--N-1 in columns 1--N - corr=stats.spearmanr(counts,nan_policy='omit').correlation - - all=corr[0][1:] - all_s=stats.describe(all) - all_m=all_s.mean - - x=np.array([np.concatenate((corr[i][1:i], - corr[i][i+1:])) for i in range(1,N+1)]) - # The above, although transposed, works because the correlation matrix - # is symmetric - xd=[stats.describe(x[i]) for i in range(N)] - xs=stats.describe(np.array([xd[i].mean for i in range(N)])) - xm=xs.mean - xsd=np.sqrt(xs.variance) - - x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] - - aa=ranks() - aa_by_all=aa[aa[:,1].argsort()] - -### I need to review rows, e.g. counts[0] is an array of N+1 counts -### for the most common label in the complete crawl, -### from the complete crawl and all the segments -### versus columns, e.g. counts[:,0] is an array of N decreasing counts -### for all the labels in the complete crawl
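The ci() helper above is the standard Fisher z-transform interval for a correlation coefficient: se = 1/sqrt(n-3), z the two-sided normal quantile for the requested confidence, and the bounds mapped back through tanh. A worked check with illustrative numbers:

```python
import math
from scipy import stats

rho, n, conf = 0.9, 100, 0.95
stderr = 1.0 / math.sqrt(n - 3)               # ~0.1015
z = stats.norm.ppf(1.0 - (1.0 - conf) / 2)    # ~1.96
delta = z * stderr
lower = math.tanh(math.atanh(rho) - delta)    # ~0.854
upper = math.tanh(math.atanh(rho) + delta)    # ~0.932
print(lower, upper)
```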
--- a/bin/test_warc.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -import warc,sys - -OUT=open(sys.stdout.fileno(),'wb') - -if (debug:=(sys.argv[1]=='-d')): - sys.argv.pop(1) - -tt=int(sys.argv.pop(1)) - -def showme(wtype,buf,part): - # This should exactly reproduce a complete warc file if called - # as per version 1 below - if debug: - OUT.write(b"----start %d-----\n"%part) - OUT.write(buf) - if buf[-1]!=10: - OUT.write(b'\r\n') - if part==7: - OUT.write(b'\r\n') # to match complete file formatting - if debug: - OUT.write(b"----end %d-----\n"%part) - return OUT - -if tt==1: - warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],parts=int(sys.argv[2]),debug=debug) -elif tt==2: - warc.warc(sys.argv[1],showme,[b'warcinfo'],parts=int(sys.argv[2]),debug=debug) -elif tt==3: - warc.warc(sys.argv[1],showme,[b'warcinfo'],whole=True,debug=debug) -elif tt==4: - warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],whole=True,debug=debug) -elif tt==5: - warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
--- a/bin/warc.py Wed Sep 27 17:29:51 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ -#!/usr/bin/env python3 -'''Stream a warc format file, unzipping if necessary, invoking a -callback on each record. Callback can be limited by WARC-Type, record -part''' - -import sys,io -from isal import igzip - -RESP = b'response' -REQ = b'request' -META = b'metadata' -INFO = b'warcinfo' - -def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): - '''parts is a bit-mask: - 1 for warc header; - 2 for req/resp HTTP header, warcinfo/metadata features; - 4 for req/resp body''' - # should do some sanity checking wrt parts and types - types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] - nb=0 - if filename.endswith(".gz"): - stream=igzip.IGzipFile(filename=filename) - else: - stream=open(filename,'rb',0) - bufSize=2*1024*1024 - hdrMax=16*1024 - buf=bytearray(bufSize) - bufView=memoryview(buf) - fpos=bl=stream.readinto(buf) - bp=0 - done=False - while True: - while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty - bp+=2 - start_1=bp - if not buf.startswith(b'WARC/1.0\r\n',bp): - if done and bl-bp==0: - # really done - return - raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename, - bp,bl,fpos, - (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'), - bl-bp)) - bp+=10 - wtype=None - length=None - state=1 - tr=None # Was this record truncated? - while not buf.startswith(b'\r\n',bp): - # there should always be enough in the buffer to complete this loop, - # because of the buffer update logic below - eol=buf.index(b'\r\n',bp)+2 - if buf.startswith(b"Content-Length: ",bp): - length=wl=int(bufView[bp+16:eol-2]) - elif buf.startswith(b"WARC-Truncated: ",bp): - if bp+16==eol-2: - tr=b"EMPTY" - else: - tr=bytes(bufView[bp+16:eol-2]) - elif buf.startswith(b'WARC-Type: ',bp): - if buf.startswith(b's',bp+13): - wtype = RESP - elif buf.startswith(b'q',bp+13): - wtype = REQ - elif buf.startswith(b'm',bp+11): - wtype = META - elif buf.startswith(b'w',bp+11): - wtype = INFO - else: - raise ValueError("Unknown WARC-Type: %s in %s at %s"%( - bytes(bufView[bp+11:eol-2]),filename, - fpos-(bl-bp))) - bp=eol - bp=eol+2 - if done: - if (bp+length)>bl: - raise ValueError("Done but need more! 
%s + %s > %s in %s"%(bp, - length,bl,filename)) - elif (bp+(length+hdrMax))>bl: - # Need more data - if wtype in types: - # we need to keep from start_1 to bl - keepFrom=start_1 - keepLen=bl-keepFrom - buf[0:keepLen]=bufView[keepFrom:bl] - eol=eol-start_1 - start_1=0 - bp=eol+2 - else: - # we can skip the rest of this part - if (bp+length)<=bl: - # we have at least some bytes from the next part - keepLen=bl-(bp+length) - buf[0:keepLen]=bufView[bl-keepLen:bl] - else: - # we don't have all of the bytes from the current part - # so can skip the rest of it - keepLen=0 - fpos=stream.seek(fpos+(bp+length-bl)) - bp=0 - spaceToFill=bufSize-keepLen - with memoryview(buf)[keepLen:bufSize] as xBuf: - nb=stream.readinto(xBuf) - fpos+=nb - bl=keepLen+nb - if nb<spaceToFill: - done=True - if wtype not in types: - continue - if (wtype in types): - # Output whole or part 1 as required - if whole: - bp+=length - OUT=callback(wtype,bufView[start_1:bp],7) - continue - elif (parts & 1): - OUT=callback(wtype,bufView[start_1:eol],1) - if parts!=1: - while buf.startswith(b'\r\n',bp): - bp+=2 - start_2=bp - eob=bp+length - while buf.startswith(b'\r\n',eob-2): - eob-=2 - # Only output parts (2 = HTTP header, 4 = body) that are wanted - if parts & 2: - if wtype is META or wtype is INFO: - # rest of the part - OUT=callback(wtype,bufView[start_2:eob],2) - else: - # request and response have http headers - eo2=buf.index(b'\r\n\r\n',start_2) - OUT=callback(wtype,bufView[start_2:eo2+2],2) - if parts & 4: - for L in rec_text: - if state==2: - # HTTP header - wl -= len(L) - if not (L==b"" or L.startswith(b"\r")): - # Non-empty, it's (a continuation of) a header - if bl is None and L.startswith(b"Content-Length: "): - bl=int(L[16:].rstrip()) - else: - # Blank line, HTTP header is finished - if parts & 2: - callback(wtype,bufView[start_2:start_2+L_start],2) - state=4 - # The above is just for sanity, because we do _not_ - # continue with the outer loop, - # since we can now block-output the entire rest of the - # input buffer. - if bl is not None: - if bl!=wl: - print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ - (length,offset,filename,wl,bl,tr),file=sys.stderr) - # HTTP body - balance=start_2+rec_text.tell() - #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) - # Output whatever is left - if parts & 4: - callback(wtype,bufView[balance:balance+wl],4) - state=1 - - L_start=rec_text.tell() - bp+=length - #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
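Since parts is a bitmask (1 = WARC header, 2 = HTTP header or warcinfo/metadata features, 4 = body), the usual invocations look like the sketch below, re-using the callback shape seen in count_warc.py and test_warc.py; the file name is hypothetical:

```python
import warc

def dump(wtype, buf, part):
    # part is 1, 2 or 4 for the pieces, or 7 when whole=True
    print(wtype, part, len(buf))

warc.warc('example.warc.gz', dump, [b'response'], parts=1)      # WARC headers only
warc.warc('example.warc.gz', dump, [b'response'], parts=3)      # + HTTP headers
warc.warc('example.warc.gz', dump, [b'response'], whole=True)   # whole records
```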
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/build_idx.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +'''Turn a merge_nnn.log file into a cluster.idx file + We cheat and use the old cluster.idx to save having to read + all the cdx-....gz files''' +import sys + +with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx: + i=-1 + curpos=0 + target="cdx-00%03d.gz"%i + log=open("/dev/null",'r') # embarassing hack + for ol in oidx: + (surt, datestamp, file, offset, length, cnt) = ol.split() + if file!=target: + i+=1 + target="cdx-00%03d.gz"%i + log.close() + curpos=0 + log=open('merge_%d.log'%(i+1),'r') + hdr=log.readline() + (j,f) = hdr.split() + sys.stderr.write(hdr) + if int(j)!=i+1: + raise ValueError("wrong file: i=%s, j=%s"%(i,j)) + nl=log.readline() + if not nl: + sys.stderr.write('quiting early: %s\n'%i) + exit(1) + nlen=int(nl) + nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt)) + curpos+=nlen
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/count_warc.py Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+import warc,sys
+
+OUT=open(sys.stdout.fileno(),'wb')
+
+if (debug:=(sys.argv[1]=='-d')):
+    sys.argv.pop(1)
+
+def countme(wtype,buf,part):
+    if debug:
+        breakpoint()
+    OUT.write(b"%d\n"%len(buf))
+
+#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
+#warc(showme,[b'response'],whole=True)
+
+warc.warc(sys.argv[1],countme,[b'response'],parts=int(sys.argv[2]),debug=debug)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/fix_key.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +from percent_encode import percent_encode +from urllib.parse import quote, unquote +import sys + +# From RFC-3986: +# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +# / "*" / "+" / "," / ";" / "=" +# But # _is_ escaped in Java surt results +# and additionally " \ : < = > ? \ ^ _ ` { | } are not +# Note also that quote already does _not_ quote - . / _ ~ + +# Also, Java surt strips _all_ leading 'www.', +# where python3 surt only strips the first one. + +with open(sys.argv[1],"r") as f: + for l in f: + while l.endswith(',www',0,ploc:=l.index(')')): + l=l[:ploc-4]+l[ploc:] + if '%' in l: + (key,wt,ts)=l.split('\t') + sys.stdout.write(quote(unquote(key,errors='percent'), + safe='!"$&\'()*+,:;<=>?@[\\]^`{|}').lower()) + sys.stdout.write('\t') + sys.stdout.write(wt) + sys.stdout.write('\t') + sys.stdout.write(ts) + else: + sys.stdout.write(l) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/ix.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +'''Extract response records from Common Crawl WARC-format files +given length, offset and filename triples. +Input one triple on command line, or +triples from stdin as tab-delimited lines +or complete cdx index lines. +In all cases by 'filename' is meant crawlid/segmentid/type/filename + +Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' + +import sys, argparse, regex, os, shutil, io, gzip, time, shlex +from isal import igzip +from subprocess import Popen, PIPE +#import asyncio + +HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') +BINOUT=sys.stdout.buffer +FPAT="/%s/%s/orig/%s/%s" + +CMD_PROC=None +TMPFILENAME=None + +class HackFormat(argparse.RawDescriptionHelpFormatter): + def format_help(self): + FOO=argparse.RawDescriptionHelpFormatter.format_help(self) + return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', + FOO) + +def process(options,buf,filename,offset,length,whole): + try: + process0(options,buf,filename,offset,length,whole) + except Exception as e: + if options.debug: + import traceback + traceback.print_exc(file=sys.stderr) + else: + print("Process fail: %s, input line:\n %s"%(e,l), + file=sys.stderr,end='') + exit(3) + +def process0(options,buf,filename,offset,length,whole): + global TMPFILENAME, TMPFILE + if options.save: + (tf,TMPFILENAME)=tempfile.mkstemp() + TMPFILE=open(tf,mode='wb') + if options.cmd and not options.process: + launch(options.cmd) + process1(options,buf,filename,offset,length,whole) + if options.save: + TMPFILE.close() + if options.cmd: + _output_subproc(bytes(TMPFILENAME,'utf-8')) + _output_subproc(b"\n") + else: + BINOUT.write(bytes(TMPFILENAME,'utf-8')) + BINOUT.write(b"\n") + if options.cmd: + if not options.process: + windup(filename,options,length) + if options.save: + os.unlink(TMPFILENAME) + TMPFILENAME=None + elif options.save: + print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr) + TMPFILENAME=None + +def launch(cmd): + global CMD_PROC, BINOUT + CMD_PROC=Popen(shlex.split(cmd),stdin=PIPE,bufsize=0) + BINOUT=CMD_PROC.stdin + +def windup(length,offset,filename): + # Wind up subproc + BINOUT.close() + if CMD_PROC.wait()!=0: # could/should be async? 
+ print("subproc of %s:%s:%s failed with %s"%(length,offset,filename, + CMD_PROC.returncode), + file=sys.stderr) + +def _output_tmpfile(buf): + TMPFILE.write(buf) + +def _output_stdout(buf): + BINOUT.write(buf) + +def _output_subproc(buf): + toWrite=len(buf) + while toWrite>0: + toWrite -= BINOUT.write(buf) + +def process1(options,buf,filename,offset,length,whole): + root=options.root + rfn=root+filename + if root!="/beegfs/common_crawl": + # Support using ramdisk or other local disk as a faster cached + if not os.path.exists(rfn): + if not os.path.exists(os.path.dirname(rfn)): + os.makedirs(os.path.dirname(rfn)) + with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \ + io.FileIO(rfn,'w') as outfile: + #shutil.copyfileobj(infile,outfile,128*1024*1024) + while True: + l=infile.readinto(buf) + if l==0: + break + outfile.write(memoryview(buf)[:l]) + file=open(rfn,'rb',0) + file.seek(offset) + bv=memoryview(buf)[:length] + nb=file.readinto(bv) + file.close() + if nb!=length: + raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(file.name, + nb,length,offset)) + if whole and options.zipped: + _output(bv) + return + gzip_chunk = io.BytesIO(bv) + uv=memoryview(buf)[length:] + with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: + ll=0 + while True: + l=gzip_fin.readinto(uv) + if not l: + break + ll+=l + cb=memoryview(uv)[:ll] + if whole: + _output(cb) + return + # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted + state=0 + tr=None # Was this record truncated? + bl=None # for HTTP Content-Length for the length of the body? + # Could we make this faster by working purely within the cb memoryview? + # It would be messy, but avoid copying huge amounts + # The outer loop would just be something like + # clbv=memoryview(bytearray(b"Content-Length: ")) + # i=s=0 + # while i<ll: + # if cb[i]==10: # need to handle \r\n + # L=cb[s:i] + # s=i=i+1 + # if L[:16]==clbv: + # wl=int(L[16:]) + # else: + # i+=1 + # + with io.BytesIO(cb) as clear_text: + for L in clear_text: + if state==0: + # WARC header + if L.startswith(b"Content-Length: "): + wl=int(L[16:].rstrip()) + elif L.startswith(b"WARC-Truncated: "): + tr=L[16:].rstrip() + tr="EMPTY" if tr=="" else tr + elif L==b"" or L.startswith(b"\r"): # for idempotency + # Blank line, WARC header is finished + if not (options.headers or options.body): + return + state=1 + # Note we preserve the empty line + if options.warc: + _output(L) + continue + if state==1: + # HTTP header + wl -= len(L) + if not (L==b"" or L.startswith(b"\r")): + # Non-blank, it's a header + (h,_,v)=L.partition(b": ") + if bl is None and (h==b"Content-Length"): + bl=int(v) + if options.headers: + if isinstance(options.headers,dict): + if h in options.headers: + options.headers[h]=v + else: + _output(L) + else: + # Blank line, HTTP header is finished + if isinstance(options.headers,dict): + _output(bytes(str(options.headers),'utf-8')) + if not options.body: + return + if options.headers: + _output(L) + state=2 + # The above is just for sanity, because we do _not_ + # continue with the outer loop, + # since we can now block-output the entire rest of the + # input buffer. 
+ if bl is not None: + if bl!=wl: + print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ + (length,offset,filename,wl,bl,tr),file=sys.stderr) + # HTTP body + balance=clear_text.tell() + #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) + # Output whatever is left + _output(cb[balance:balance+wl]) + return + +def main(): + global _output,TMPFILE,TMPFILENAME,tempfile + parser = argparse.ArgumentParser( + description='''Extract records from warc files given length, offset and file triples. + Input one triple on command line, or + triples from stdin as tab-delimited lines + or complete cdx index lines. + In all cases by 'filename' is meant crawlid/segmentid/type/filename''', + epilog='''Note that if no output flag(s) is/are given, + the whole WARC record will be output, more efficiently than + would be the case if all three flags were given.''', + add_help=False, + conflict_handler='resolve', + formatter_class=HackFormat + ) + fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') + parser.add_argument('--help',help='Show help',action='help') + parser.add_argument('-d','--debug',help='Debug output',action='store_true') + parser.add_argument('-w','--warc',help='output WARC headers', + action='store_true') + parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output', + nargs='?',default=None,const=True) + parser.add_argument('-b','--body',help='output HTTP body', + action='store_true') + parser.add_argument('-c','--cmd',help='pipes each result thru CMD') + parser.add_argument('-p','--process',help='with -c, launches CMD only once', + action='store_true') + parser.add_argument('-m','--module.function',help='module.function to call with a stream'), + parser.add_argument('-s','--save',action='store_true', + help="write to a temporary file and output the name") + parser.add_argument('-f','--fpath', + help=fphelp, + default=FPAT) + parser.add_argument('-r','--root',nargs='?', + help='File path root, create a copy there if necessary', + default='/beegfs/common_crawl'), + parser.add_argument('-z','--zipped', + help="output raw gzipped record, ignored if any of -bhw supplied", + action='store_true') + sg=parser.add_mutually_exclusive_group() + sg.add_argument('-x','--index', + help='take lines of triples from a cdx index file as input', + action='store_true') + sg.add_argument('length',type=int, + help='length in bytes of gzipped record', + nargs='?') + parser.add_argument('offset',type=int, + help='start position in bytes of gzipped record', + nargs='?') + parser.add_argument('filename', + help='pathname of gzipped Common Crawl WARC-format file', + nargs='?') + # Hack the order of optional and positional in the help output + parser._action_groups.sort(key=lambda g:g.title) + #parser.print_help() + pa=parser.parse_args(sys.argv[1:]) + #print(pa,file=sys.stderr) + if pa.length is not None: + # We have to enforce our own check.. 
+ if pa.offset is None or pa.filename is None: + parser.error("length, offset and filename must all be supplied together") + if isinstance(pa.headers,str): + pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(',')) + + buf=bytearray(128*1024*1024) + + whole=not (pa.warc or pa.headers or pa.body) + if pa.save: + _output=_output_tmpfile + import tempfile + elif pa.cmd: + _output = _output_subproc + else: + _output = _output_stdout + if pa.cmd and pa.process: + launch(pa.cmd) + # three different ways to process + if pa.index: + CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet... + for l in sys.stdin: + m=CDX.search(l) + if m is None: + if l.find('/robotstxt/')>-1: + continue + print("index line problem: \"%s\""%l,file=sys.stderr,end='') + exit(2) + filename=pa.fpath%(m[3:7]) + process(pa,buf,filename, + int(offset:=m[2]),int(length:=m[1]),whole) + elif pa.length is not None: + print(pa.filename,file=sys.stderr) + process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), + pa.offset,pa.length,whole) + else: + print("Reading length, offset, filename tab-delimited triples from stdin...", + file=sys.stderr) + for l in sys.stdin: + try: + (length,offset,filename)=l.rstrip().split('\t') + length=int(length) + offset=int(offset) + except ValueError as e: + parser.error('Invalid input line: %s\n "%s"'%(e,l)) + process(pa,buf,pa.fpath%tuple(filename.split('/')), + offset,length,whole) + # processing done one way or another + if pa.cmd and pa.process: + windup(length,offset,filename) + # if pa.save and pa.process, deleting temp files is down to cmd +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/lmh_warc.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +'''Extract identifying info + LastModified header value for all entries + that have one + + Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' + +import re,warc,sys,glob,codecs + +TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) +DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) +LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) + +DTAB=bytearray(range(256)) +DDEL=b'TZ-:' + +OUT=open(sys.stdout.fileno(),'wb') + +def showmeLMH(wtype,buf,part): + global URI, DATE, SEGMENT, FILETYPE, FILENO + if part==1: + if (m:=TUPAT.search(buf)): + URI=m[1] + else: + raise ValueError(b"No target URI in %s ??"%buf) + if (md:=DPAT.search(buf)): + DATE=md[1] + else: + raise ValueError(b"No date in %s ??"%buf) + else: + mm=LMPAT.search(buf) + OUT.write(URI) + if mm: + OUT.write(b'\t') + OUT.write(DATE.translate(DTAB,DDEL)) + OUT.write(b'\t') + OUT.write(SEGMENT) + OUT.write(b'\t') + OUT.write(FILETYPE) + OUT.write(b'\t') + OUT.write(FILENO) + OUT.write(b'\t') + OUT.write(mm[1]) + OUT.write(b'\n') + +(CCdate, segment, filetype, fileno) = sys.argv[1:] +fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( + CCdate, segment, filetype, fileno) + +SEGMENT=codecs.encode(segment,'ascii') +FILETYPE=codecs.encode(filetype,'ascii') +FILENO=codecs.encode(fileno,'ascii') + +warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/merge_date.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,144 @@ +#!/usr/bin/python3 +'''Add timestamps from Last-Modified-dated (ks.tsv) files into + that year's index + +Usage: merge_date.py ksvstream cdx-dir outdir + +ksvstream consists of tab-separated key, CC date, url and Unix timestamp +''' # ' + +import sys, io, os, os.path, time, re +from isal import igzip + + +DEBUG = 0 +while sys.argv[1] == '-d': + sys.argv.pop(1) + DEBUG += 1 + +XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] +NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] + +RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' +b'(crawldiagnostics|robotstxt)/') +SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' + b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' + b'=[^&]*)') +ISESSION = re.compile(SESSION.pattern,flags=re.I) +URL=re.compile(b'\{"url": "([^"]*)"') +WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') + +# Above based on this from broken Java code: +# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 +#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", + +#print(sys.argv[3],NPATH,file=sys.stderr) + +os.makedirs(sys.argv[3], exist_ok=True) + +FN = 0 + +XCNT = WCNT = 0 +DCNT = 0 + +XF = igzip.IGzipFile(filename=XPATH%0) +NF = open(NN:=(NPATH%0),'wb') + +def nextLine(): + '''Move on to next index file if current has run out''' + global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT + while True: + xl=XF.readline() + XCNT += 1 + if xl == b'': + # need to move to next index file + FN += 1 + XF.close() + NF.close() + print(NN, flush=True) # so we can compress it + print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True) + time.sleep(0.1) # so they flush? 
+ XN=XPATH%FN + if not os.path.exists(XN): + return None + XF = igzip.IGzipFile(filename=XN) + NF = open((NN:=NPATH%FN), 'wb') + xl = XF.readline() + WCNT = XCNT = 1 + if WARC.search(xl): + WCNT += 1 + return xl + else: + NF.write(xl) + if DEBUG: + sys.stderr.write("out_rc\n") + + +def nextDate(df,dn): + global DEBUG, DCNT, XCNT + dl = df.readline() + if dl == b'': + # write out the last of the last index file, if any + return "", "", "", 0 + if DEBUG: + sys.stderr.write("dl%s: %s\n"%(dn,dl)) + dkey, ddate, durl, dtime = dl.split(b'\t') + DCNT += 1 + return dkey, ddate, durl, dtime + +with open(sys.argv[1], 'rb') as df: + DCNT = 0 + + dkey, ddate, durl, dtime = nextDate(df,1) + + while (xl := nextLine()) is not None: + xkey, xdate, xprops = xl.split(b' ', maxsplit=2) + m = URL.match(xprops) + if m: + xurl = m[1] + else: + raise ValueError("No url in %s"%xprops) + if DEBUG: + sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + if dkey==xkey and ddate==xdate and durl==xurl: + # Got it + NF.write(xkey) + NF.write(b' ') + NF.write(xdate) + NF.write(b' ') + NF.write(xprops[:-2]) + NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) + if DEBUG: + sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + sys.stderr.write(" %d\n"%int(dtime[:-3])) + + dkey, ddate, durl, dtime = nextDate(df,2) + continue + else: + if dkey and xkey.decode('ascii')>(dkey.decode('ascii')): + # we've missed something, disaster looms + print("Fail2:" + " xkey: %s\n" + " dkey: %s\n" + " xdate: %s\n" + " ddate: %s\n" + " xurl: %s\n" + " durl: %s\n" + "FN: %s XCNT: %s DCNT: %s\n" + "xl: %s"%(xkey, dkey, xdate, ddate, + xurl, durl, + FN, XCNT, DCNT, xl), + file=sys.stderr) + # try to force recovery + dkey, ddate, durl, dtime = nextDate(df,3) + continue + # else fall through to write + NF.write(xl) + if DEBUG: + sys.stderr.write("out_nl\n")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/per_segment.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,36 @@ +#!/usr/bin/python3 +'''refactor a per-cdx count table to be per-segment +input on STDIN +Usage: per_segment segment-column +Assumes column 0 is empty, count is in column 1 +Segment column is 0-origin +''' + +import sys + +c=int(sys.argv[1]) + +ss=[dict() for i in range(100)] + +for l in sys.stdin: + try: + cc=l.split('\t') + s=int(cc.pop(c)) + n=int(cc.pop(1)) + ll='\t'.join(cc[1:]) # note we ditch the initial empty column + #print(s,n,cc,ll,sep='|') + #exit(0) + t=ss[s].get(ll,0) + ss[s][ll]=t+n + except: + sys.stdout.write(l) + print(cc) + exit(1) + +# note this won't work if c is last column! +for s in range(100): + with open('s%s.tsv'%s,'w') as f: + for (l,c) in sorted(ss[s].items(),key=lambda p:p[1],reverse=True): + f.write(str(c)) + f.write('\t') + f.write(l)
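A worked example with made-up numbers, assuming the segment number sits in (0-origin) column 2: the input line

    <TAB>42<TAB>17<TAB>text/html

processed by per_segment.py 2 adds 42 to the running total for 'text/html' in segment 17; once all input is read that total is written to s17.tsv as 'count<TAB>text/html'. As the note in the code says, the segment column must not be the last one, because the trailing newline travels with the last field.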
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/percent_encode.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,9 @@ +'''Handle unquoting of non-UTF-8 bytes by %-encoding them''' +import codecs + +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('percent',percent_encode)
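A quick interactive illustration of the handler this module registers: once the module has been imported (adjust the import to wherever it sits on your path), any undecodable byte is replaced by its %XX form.

    >>> import percent_encode        # registers the 'percent' error handler
    >>> b'caf\xe9'.decode('utf-8', errors='percent')
    'caf%E9'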
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/sort_date.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,142 @@ +#!/usr/bin/python3 +'''Process output of lmh_warc [new 3-column version] + Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") +''' + +# Assumes you have used grep -v $'\t' on input for speed +# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' +# to fix a common 'bad' timestamp (~ .2% of inputs) + +import email.utils +import sys +from urllib.parse import urlsplit, quote, unquote +import surt + +import re, codecs +from itertools import chain + +WPAT = re.compile('(,www\\d*)+\\)') + +# Thanks to https://stackoverflow.com/a/8776871 +import locale +from functools import cmp_to_key + +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('percent',percent_encode) + +def _u_esc(c): + if c<65536: + return '\\u%04X'%c + else: + return '\\U%08X'%c + +def java_unicode_encode(ude): + '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn''' + return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('java_unicode',java_unicode_encode) + +# From RFC-3986: +# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +# / "*" / "+" / "," / ";" / "=" +# But # _is_ escaped in Java surt results +# and additionally " \ : < = > ? \ ^ _ ` { | } are not + +# Note also that although quote already does _not_ quote - . / _ ~ +# they are included below as that's what we find in surt.surt 0.3.1 + +# Also, Java surt strips _all_ leading 'www\d*.', +# where python3 surt only strips the first one. + +# And Java strips so-called option session-ids, but python doesn't + +import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer +import surt.URLRegexTransformer + +ident = ''.join(chr(i) for i in range(256)).encode('latin-1') + +IDMAP=bytes.maketrans(ident,ident) + +# For removal of non-printing characters: +# Note, this is only a guess, only example so are is DEL +NONPRINT= ''.join(chr(i) for i in chain(range(9), + range(14,32), + [127] # DEL + )).encode('latin-1') + +def notDefaultCanon(hu,**options): + if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): + # Try to fix the incompatibility between Java and + # Python surt handling of 'octal' numbers in numeric IPv4 addresses + # and it should! 
See "After this line: + # + # 15,225,107,143)" in .../azure/notes.txt + try: + bytestrs = hu.host.split(b'.') + hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) + except ValueError: + pass + if hu.query: + hu.query = hu.query.translate(IDMAP,delete=NONPRINT) + return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) + +# Hack this to reproduce the Java bug +surt.URLRegexTransformer._RES_QUERY_SESSIONID = [ + re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + ] + +# Above based on this from broken Java code: +# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 +#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", + +def cdx_key(uristring): + _surt = quote(unquote(surt.surt(unquote(uristring), + canonicalizer=notDefaultCanon), + errors='percent'), + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' + ).lower() + # Wrt \x7f (DEL), see discussion in notes wrt + # "biz,televida)" case + # It remains to be seen whether other non-printing bytes + # will need to be treated as 'safe' + return WPAT.sub(')',_surt) + +def keyed(l): + uri, cc_stamp, dateTime = l.split('\t',2) + #print('ul',uri,file=sys.stderr) + try: + try: + epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() + except OverflowError: + epoch = 32535215999.0 + return ((cdx_key(uri), cc_stamp, uri), epoch) + except (TypeError,IndexError,ValueError) as e: + print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) + return + +fstr = sys.argv[1] + +with open(fstr,"r") as ff: + # crucial that the following is done _after_ the file is opened + # with the default (utf-8) locale! + locale.setlocale(locale.LC_ALL, "C") + ctk=cmp_to_key(locale.strcoll) + for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None), + key=lambda x:x[0]): + print(key[0],key[1], + key[2].encode('ascii',errors='java_unicode').decode('ascii'), + ts,sep='\t')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/spearman.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +'''Rank correlation processing for a csv tabulation of counts by segment + First column is for whole crawl, then 100 columns for segs 0-99 + Each row is counts for some property, e.g. mime-detected or tld + + For example, assuming all.tsv has the whole-crawl warc-only counts + and s...tsv have the segment counts, all with counts in column 1, + + tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv + + will produce such a file with + * 100 rows, one for each of the top 100 counts + * 101 columns, 0 for all and 1--100 for segs 0--99 + + Usage: python3 -i spearman.py name id + where name.csv has the input +''' + +import numpy as np +from numpy import loadtxt +from scipy import stats +import statsmodels.api as sm +import matplotlib.pyplot as plt +import pylab + +import sys, math + +def qqa(): + # q-q plot for the whole crawl + sm.qqplot(all, line='s') + plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id) + plt.show() + +def qqs(): + # q-q plots for the best and worst (by variance) segments + global xv, xworst, xbest + xv=[d.variance for d in xd] + xworst=xv.index(max(xv)) + xbest=xv.index(min(xv)) + print(xbest,xworst) + sm.qqplot(x[xbest], line='s') + plt.gca().set_title('Best segment (least variance): %s'%xbest) + plt.show() + sm.qqplot(x[xworst], line='s') + plt.gca().set_title('Worst segment (most variance): %s'%xworst) + plt.show() + +def plot_x(sort=False,block=True,all_only=True,title=None): + # Make these two subplots, w. and w/o sorting + # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot + # for legend hacking + if sort: + aso=np.argsort(-all) + plot_all=all[aso] + plot_x=np.array([xd[i].mean for i in range(N)])[aso] + else: + plot_all=all + plot_x=[xd[i].mean for i in range(N)] + if title is None: + l1='Rank correlation of segment x whole crawl' + l2='Mean of segment x whole crawl' + plt.legend(loc='best',fontsize='small') + else: + l1=l2=None + plt.plot(plot_all,'rx',label=l1) + plt.plot([0,N-1],[all_m,all_m],'r',label=l2) + if not(all_only): + plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') + plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') + plt.axis([0,N-1,0.85 if all_only else 0.8,1.0]) + plt.grid(True) + if title is not None: + plt.title(title) + plt.show(block=block) + +def hist_x(align='mid'): + hist(xm,xsd,[xd[i].mean for i in range(N)], + 'Mean of rank correlation of each segment x all other segments', + align) + +def hist_all(align='mid'): + hist(all_m,np.sqrt(all_s.variance),all, + 'Rank correlation of each segment x whole crawl %s'%id, + align) + +def hist(m,sd,hh,title,align): + sdd=[(i,m-(i*sd)) for i in range(-2,3)] + fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 + sdax=hax.twiny() + hax.hist(hh,color='lightblue',align=align) + hax.set_title(title) + for s,v in sdd: + sdax.plot([v,v],[0,18],'b') + sdax.set_xlim(hax.get_xlim()) + sdax.set_ylim(hax.get_ylim()) + sdax.set_xticks([v for s,v in sdd]) + sdax.set_xticklabels([str(s) for s,v in sdd]) + plt.show() + +def ci(rho,n,conf=0.95): + # Courtesy of https://stats.stackexchange.com/a/18904 + # rho is (rank) correlation, n is sample size + stderr=1.0/math.sqrt(n-3) + 
z=stats.norm.ppf(1.0-((1.0-conf)/2)) + delta=z*stderr + lower=math.tanh(math.atanh(rho)-delta) + upper=math.tanh(math.atanh(rho)+delta) + return (lower,upper) + +def plot_ci(rhos,n,trim=None,conf=0.95): + # rhos are (rank) correlation values + rhos_s=rhos[(-rhos).argsort()] + if trim is None: + l=len(rhos) + else: + rhos_s=rhos_s[:trim] + l=trim + cc=(np.array([ci(r,n,conf) for r in rhos_s])).T + ue=cc[1]-rhos_s + le=rhos_s-cc[0] + #for i in range(len(rhos)): + #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) + plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') + plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100)) + plt.show() + +def first_diff(ranks): + # first disagreement with baseline == {1,2,...} + for i in range(len(ranks)): + if ranks[i]!=i+1.0: + return i + return i+1 + +def ranks(): + # Combine segment measures: + # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement + # convert to ranks, smallest value == highest rank + all_ranked=stats.rankdata(-all,method='average') # invert since + # large corr is good + x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) + # small corr variance is good + x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) + # invert since + # large mean corr is good + fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) + # invert since + # large first diff is good + return np.array([[i, + all_ranked[i], + x_variance_ranked[i], + x_mean_ranked[i], + fd_ranked[i]] for i in range(N)]) + +def main(): + global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr + global aa, aa_by_all, N + counts=loadtxt(sys.argv[1]+".csv",delimiter=',') + id=sys.argv[2] + N=counts.shape[1]-1 + # "If axis=0 (default), then each column represents a variable, with + # observations in the rows" + # So each column is a sequence of counts, for whole crawl in column 0 + # and for segments 0--N-1 in columns 1--N + corr=stats.spearmanr(counts,nan_policy='omit').correlation + + all=corr[0][1:] + all_s=stats.describe(all) + all_m=all_s.mean + + x=np.array([np.concatenate((corr[i][1:i], + corr[i][i+1:])) for i in range(1,N+1)]) + # The above, although transposed, works because the correlation matrix + # is symmetric + xd=[stats.describe(x[i]) for i in range(N)] + xs=stats.describe(np.array([xd[i].mean for i in range(N)])) + xm=xs.mean + xsd=np.sqrt(xs.variance) + + x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] + + aa=ranks() + aa_by_all=aa[aa[:,1].argsort()] + +### I need to review rows, e.g. counts[0] is an array of N+1 counts +### for the most common label in the complete crawl, +### from the complete crawl and all the segments +### versus columns, e.g. counts[:,0] is an array of N decreasing counts +### for all the labels in the complete crawl
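A hypothetical interactive session, assuming all_100.csv was produced as described in the docstring (101 columns, whole crawl first) and that main() is invoked by hand once the -i prompt appears; CC-MAIN-XXXX-XX stands in for whatever crawl id should appear in plot titles:

    python3 -i spearman.py all_100 CC-MAIN-XXXX-XX
    >>> main()
    >>> all_m                           # mean rank correlation, segments vs whole crawl
    >>> plot_ci(all, counts.shape[0])   # per-segment correlations with 95% confidence bars
    >>> hist_all()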
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/test_warc.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,33 @@ +import warc,sys + +OUT=open(sys.stdout.fileno(),'wb') + +if (debug:=(sys.argv[1]=='-d')): + sys.argv.pop(1) + +tt=int(sys.argv.pop(1)) + +def showme(wtype,buf,part): + # This should exactly reproduce a complete warc file if called + # as per version 1 below + if debug: + OUT.write(b"----start %d-----\n"%part) + OUT.write(buf) + if buf[-1]!=10: + OUT.write(b'\r\n') + if part==7: + OUT.write(b'\r\n') # to match complete file formatting + if debug: + OUT.write(b"----end %d-----\n"%part) + return OUT + +if tt==1: + warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],parts=int(sys.argv[2]),debug=debug) +elif tt==2: + warc.warc(sys.argv[1],showme,[b'warcinfo'],parts=int(sys.argv[2]),debug=debug) +elif tt==3: + warc.warc(sys.argv[1],showme,[b'warcinfo'],whole=True,debug=debug) +elif tt==4: + warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],whole=True,debug=debug) +elif tt==5: + warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
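Reading the argument handling above: after an optional -d, the first argument is the test number, the second the WARC file, and for tests 1, 2 and 5 the third is the parts bit-mask passed through to warc(). So, with a hypothetical filename,

    python3 test_warc.py 1 example.warc.gz 7 > copy.warc

should, per the comment in showme, reproduce the uncompressed record stream of example.warc.gz.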
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/warc.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +'''Stream a warc format file, unzipping if necessary, invoking a +callback on each record. Callback can be limited by WARC-Type, record +part''' + +import sys,io +from isal import igzip + +RESP = b'response' +REQ = b'request' +META = b'metadata' +INFO = b'warcinfo' + +def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): + '''parts is a bit-mask: + 1 for warc header; + 2 for req/resp HTTP header, warcinfo/metadata features; + 4 for req/resp body''' + # should do some sanity checking wrt parts and types + types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] + nb=0 + if filename.endswith(".gz"): + stream=igzip.IGzipFile(filename=filename) + else: + stream=open(filename,'rb',0) + bufSize=2*1024*1024 + hdrMax=16*1024 + buf=bytearray(bufSize) + bufView=memoryview(buf) + fpos=bl=stream.readinto(buf) + bp=0 + done=False + while True: + while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty + bp+=2 + start_1=bp + if not buf.startswith(b'WARC/1.0\r\n',bp): + if done and bl-bp==0: + # really done + return + raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename, + bp,bl,fpos, + (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'), + bl-bp)) + bp+=10 + wtype=None + length=None + state=1 + tr=None # Was this record truncated? + while not buf.startswith(b'\r\n',bp): + # there should always be enough in the buffer to complete this loop, + # because of the buffer update logic below + eol=buf.index(b'\r\n',bp)+2 + if buf.startswith(b"Content-Length: ",bp): + length=wl=int(bufView[bp+16:eol-2]) + elif buf.startswith(b"WARC-Truncated: ",bp): + if bp+16==eol-2: + tr=b"EMPTY" + else: + tr=bytes(bufView[bp+16:eol-2]) + elif buf.startswith(b'WARC-Type: ',bp): + if buf.startswith(b's',bp+13): + wtype = RESP + elif buf.startswith(b'q',bp+13): + wtype = REQ + elif buf.startswith(b'm',bp+11): + wtype = META + elif buf.startswith(b'w',bp+11): + wtype = INFO + else: + raise ValueError("Unknown WARC-Type: %s in %s at %s"%( + bytes(bufView[bp+11:eol-2]),filename, + fpos-(bl-bp))) + bp=eol + bp=eol+2 + if done: + if (bp+length)>bl: + raise ValueError("Done but need more! 
%s + %s > %s in %s"%(bp, + length,bl,filename)) + elif (bp+(length+hdrMax))>bl: + # Need more data + if wtype in types: + # we need to keep from start_1 to bl + keepFrom=start_1 + keepLen=bl-keepFrom + buf[0:keepLen]=bufView[keepFrom:bl] + eol=eol-start_1 + start_1=0 + bp=eol+2 + else: + # we can skip the rest of this part + if (bp+length)<=bl: + # we have at least some bytes from the next part + keepLen=bl-(bp+length) + buf[0:keepLen]=bufView[bl-keepLen:bl] + else: + # we don't have all of the bytes from the current part + # so can skip the rest of it + keepLen=0 + fpos=stream.seek(fpos+(bp+length-bl)) + bp=0 + spaceToFill=bufSize-keepLen + with memoryview(buf)[keepLen:bufSize] as xBuf: + nb=stream.readinto(xBuf) + fpos+=nb + bl=keepLen+nb + if nb<spaceToFill: + done=True + if wtype not in types: + continue + if (wtype in types): + # Output whole or part 1 as required + if whole: + bp+=length + OUT=callback(wtype,bufView[start_1:bp],7) + continue + elif (parts & 1): + OUT=callback(wtype,bufView[start_1:eol],1) + if parts!=1: + while buf.startswith(b'\r\n',bp): + bp+=2 + start_2=bp + eob=bp+length + while buf.startswith(b'\r\n',eob-2): + eob-=2 + # Only output parts (2 = HTTP header, 4 = body) that are wanted + if parts & 2: + if wtype is META or wtype is INFO: + # rest of the part + OUT=callback(wtype,bufView[start_2:eob],2) + else: + # request and response have http headers + eo2=buf.index(b'\r\n\r\n',start_2) + OUT=callback(wtype,bufView[start_2:eo2+2],2) + if parts & 4: + for L in rec_text: + if state==2: + # HTTP header + wl -= len(L) + if not (L==b"" or L.startswith(b"\r")): + # Non-empty, it's (a continuation of) a header + if bl is None and L.startswith(b"Content-Length: "): + bl=int(L[16:].rstrip()) + else: + # Blank line, HTTP header is finished + if parts & 2: + callback(wtype,bufView[start_2:start_2+L_start],2) + state=4 + # The above is just for sanity, because we do _not_ + # continue with the outer loop, + # since we can now block-output the entire rest of the + # input buffer. + if bl is not None: + if bl!=wl: + print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ + (length,offset,filename,wl,bl,tr),file=sys.stderr) + # HTTP body + balance=start_2+rec_text.tell() + #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) + # Output whatever is left + if parts & 4: + callback(wtype,bufView[balance:balance+wl],4) + state=1 + + L_start=rec_text.tell() + bp+=length + #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
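The callback protocol, pieced together from the docstring and test_warc.py: callback(wtype, buf, part) receives the record's WARC-Type, a buffer holding the requested part, and the part code (1 WARC header, 2 HTTP header or warcinfo/metadata features, 4 body, 7 whole record when whole=True). A minimal sketch, with a placeholder filename, that prints the header size of each response record:

    import sys
    import warc   # the module added above; adjust the import path as needed

    def show_header(wtype, buf, part):
        # With parts=1 the callback sees only the WARC header block of each
        # selected record; buf is a memoryview into warc()'s read buffer, so
        # copy it with bytes(buf) if it must outlive the callback.
        sys.stdout.write("%s: %d header bytes\n" % (wtype.decode('ascii'), len(buf)))

    # 'example.warc.gz' is a placeholder filename
    warc.warc('example.warc.gz', show_header, [b'response'], parts=1)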
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/deltas.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +'''Extract and tabulate runtimes per file from a slurm output log''' +import sys, re +from datetime import datetime +pending={} +first=None +SPAT=re.compile('... (.*) BST start ([0-9]+ [0-9]+)') +EPAT=re.compile('... (.*) BST end ([0-9]+ [0-9]+)') +with open(sys.argv[1],'r') as f: + for l in f: + if m:=SPAT.match(l): + b=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p") + id=m[2] + if id in pending: + print('%s started twice at %s, %s'%(id,pending[id],b),file=sys.stderr) + else: + pending[id]=b + if first is None: + first=b + elif m:=EPAT.match(l): + e=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p") + id=m[2] + if id in pending: + delta=(e-pending[id]).seconds + print(delta,"%2d:%02d"%(delta/60,delta%60),sep='\t') + del pending[id] + else: + print('%s ended w/o start at %s'%(id,e),file=sys.stderr) +w=(e-first).seconds +sys.stdout.flush() +print('From %s to %s:'%(first.strftime("%d %b %Y %I:%M:%S %p"), + e.strftime("%d %b %Y %I:%M:%S %p")),file=sys.stderr) +print(' %d:%02d:%02d'%(w/3600,(w/60)%60,w%60),(e-first).seconds,sep='\t',file=sys.stderr)
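For reference, the kind of slurm log lines the two patterns above expect (contents invented): a start line such as

    Mon 01 May 2023 10:00:01 AM BST start 1234567 12

paired with a matching "... BST end 1234567 12" line. The two-number group is what pairs a start with its end, and the datetime part must parse with "%d %b %Y %I:%M:%S %p".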
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/plinks.py Thu Sep 28 08:46:01 2023 +0100 @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +import sys,pdfx,traceback,os +from datetime import datetime + +def run(file): + try: + pdf=pdfx.PDFx(file) + links=pdf.get_references_as_dict() + if bool(links) and (links.get('scrape',False) or + links.get('annot',False)): + for k in links.keys(): + for l in links[k]: + print("%s\t%s"%(k,l)) + else: + print("None") + except Exception as e: + if str(e)=='Unexpected EOF': + print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr) + print("badpdf") + else: + print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr) + traceback.print_exc(file=sys.stderr) + +if sys.argv[1]=='-': + i=0 + for l in sys.stdin: + print(i,file=sys.stderr) + i+=1 + f=l.rstrip() + if os.path.getsize(f)==1048576: # truncated + print("truncated",file=sys.stderr) + print("truncated") + else: + run(f) + os.unlink(f) +else: + run(sys.argv[1])
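Usage note: plinks.py takes either a single PDF path or '-' to read one path per line from stdin; in the stdin mode each file is unlinked after processing, so point it at copies. A hypothetical invocation,

    python3 plinks.py paper.pdf

prints one 'source<TAB>url' line per link pdfx finds (keyed by e.g. 'scrape' or 'annot'), or 'None' if there are none and 'badpdf' on a truncated download.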