Mercurial > hg > cc > cirrus_work
changeset 293:12d13a1d387f
extend, then fix, to get it working for crawldiagnostics warc files
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Fri, 18 Apr 2025 13:39:55 +0100 |
| parents | a3d55cc7da18 |
| children | cc2945816b75 |
| files | lib/python/cc/lmh/warc2cdb.py lib/python/cc/test_warc.py lib/python/cc/warc.py |
| diffstat | 3 files changed, 83 insertions(+), 42 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py Wed Apr 09 20:42:29 2025 +0100 +++ b/lib/python/cc/lmh/warc2cdb.py Fri Apr 18 13:39:55 2025 +0100 @@ -3,14 +3,12 @@ Usage: warc2cdb.py CC-date segment output-dir''' -import re,warc,sys,glob,codecs,os.path +import re, warc, sys, glob, codecs, os.path import cython, typing import email.utils from urllib.parse import quote import subprocess -foo = cython.declare(int, 27) - TUPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r?$',re.MULTILINE) DPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Date: (.*?)\r?$',re.MULTILINE) LMPAT: typing.Pattern[bytes] = re.compile(b'^Last-Modified: (.*?)\r?$',re.MULTILINE) @@ -23,8 +21,27 @@ DATE: bytes OUT: typing.BinaryIO -def LMHline(_wtype: int, buf: memoryview , part: int) -> None: - global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT +WIN: int = 0 +LOSE: int = 0 +N: int = 0 +UERRS: int = 0 + +def _u_esc(c): + if c<65536: + return '\\u%04X'%c + else: + return '\\U%08X'%c + +def java_unicode_encode(ude): + '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn''' + return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('java_unicode',java_unicode_encode) + +def LMHline(wtype: int, buf: memoryview , part: int) -> None: + global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT, WIN, LOSE + global N, UERRS m: typing.Match[cython.bytes] | None mm: typing.Match[cython.bytes] | None if part==1: @@ -39,6 +56,7 @@ else: mm=LMPAT.search(buf) if mm: + N += 1 dateTime=mm[1] if dateTime.endswith(b'GMT'): if not dateTime.endswith(b' GMT'): @@ -50,37 +68,50 @@ lmi = 32535215999 except (TypeError,IndexError,ValueError) as e: print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) + LOSE += 1 return DATE=(DATE.translate(DTAB,DDEL)) + WIN += 1 try: URI.decode('ascii') except UnicodeDecodeError: - URI=quote(URI, safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') - ).encode('ascii') + UERRS += 1 + # 
Try just fixing the non-ASCII: + URI = URI.decode('utf-8').encode('ascii', errors='java_unicode') + l = len(lmi) + if wtype == warc.REVISIT: + whereami = b" need to fill this in some how" + l += len(whereami) OUT.write(b'+') OUT.write(b'%d'%(len(DATE)+len(URI))) OUT.write(b',') - OUT.write(b'%d'%len(lmi)) + OUT.write(b'%d'%l) OUT.write(b':') OUT.write(DATE) OUT.write(URI) OUT.write(b'->') OUT.write(lmi) + if wtype == warc.REVISIT: + OUT.write(whereami) OUT.write(b'\n') -def main(CCdate, segment, outdir, fpat = None): - global OUT +def main(CCdate, segment, outdir, subdir = 'warc', fpat = None): + global OUT, N, WIN, LOSE, UERRS, URI, DATE - infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00%s.warc.gz | sort -k8"'%( - CCdate, segment, ("???" if fpat is None else ( + infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*00%s.warc.gz | sort -k8"'%( + CCdate, segment, subdir, ("???" if fpat is None else ( (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat)))) - with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT: + with open((outfile_name:="%s/%s/%s/lmh.cdb_in"%(outdir, segment, subdir)),'wb') as OUT: for infile_name in subprocess.run(infile_pat, shell=True, stdout=subprocess.PIPE).stdout.decode('utf8').split(): print(infile_name,file=sys.stderr) - warc.warc(infile_name,LMHline,[1],parts=3) + WIN = LOSE = N = UERRS = 0 + warc.warc(infile_name,LMHline,[warc.RESP, warc.REVISIT],parts=3) + print('%d LM headers, %d win, %d lose, %d non-ASCII URIs'%(N,WIN,LOSE,UERRS), + file=sys.stderr) OUT.write(b'\n') + print(outfile_name) if __name__ == '__main__':
--- a/lib/python/cc/test_warc.py Wed Apr 09 20:42:29 2025 +0100
+++ b/lib/python/cc/test_warc.py Fri Apr 18 13:39:55 2025 +0100
@@ -28,6 +28,8 @@
 elif tt==3:
   warc.warc(sys.argv[1],showme,[0],whole=True,debug=debug)
 elif tt==4:
-  warc.warc(sys.argv[1],showme,[1,2,3,0],whole=True,debug=debug)
+  warc.warc(sys.argv[1],showme,[1,2,3,0,4],whole=True,debug=debug)
 elif tt==5:
   warc.warc(sys.argv[1],showme,[1],parts=int(sys.argv[2]),debug=debug)
+elif tt==6:
+  warc.warc(sys.argv[1],showme,[4],parts=int(sys.argv[2]),debug=debug)
--- a/lib/python/cc/warc.py Wed Apr 09 20:42:29 2025 +0100 +++ b/lib/python/cc/warc.py Fri Apr 18 13:39:55 2025 +0100 @@ -13,6 +13,7 @@ RESP: int = 1 REQ: int = 2 META: int = 3 +REVISIT: int = 4 BUFSIZE: int = 16 * 1024 * 1024 BUFMIN: int = 3 * 512 * 1024 # 1.5MiB, will need to be increased @@ -54,7 +55,7 @@ done: bool = bl < BUFSIZE while buf.startswith(b'\r\n',bp): bp+=2 - while not (done and bp <= bl): + while not (done and bp >= bl): start_1: int = bp if not buf.startswith(b'WARC/1.0\r\n',bp): breakpoint() @@ -87,6 +88,8 @@ wtype = META elif buf.startswith(b'w',bp+11): wtype = INFO + elif buf.startswith(b'v',bp+13): + wtype = REVISIT else: raise ValueError("Unknown WARC-Type: %s in %s at %s"%( bytes(bufView[bp+11:eol-2]),filename, @@ -102,32 +105,37 @@ if (wtype in types): # Output whole or part 1 as required if whole: - bp+=length - _out=callback(wtype,bufView[start_1:bp],7) - elif (parts & 1): - _out=callback(wtype,bufView[start_1:eol],1) - bp = eol - while buf.startswith(b'\r\n',bp): - bp+=2 - if whole: - return - if parts!=1: - start_2=bp - eob=bp+length - while buf.startswith(b'\r\n',eob-2): - eob-=2 - # Only output parts (2 = HTTP header, 4 = body) that are wanted - if parts & 2: - if wtype == META or wtype == INFO: - # rest of the part - _out=callback(wtype,bufView[start_2:eob],2) - else: - # request and response have http headers - eo2=buf.index(b'\r\n\r\n',start_2) - _out=callback(wtype,bufView[start_2:eo2+2],2) - if parts & 4: - raise ValueError("Not implemented: body part (4): %s"%parts) - bp += length + _out=callback(wtype,bufView[start_1:bp+length],7) + else: + if (parts & 1): + bp = eol+2 + _out=callback(wtype,bufView[start_1:bp],1) + if parts!=1: + while buf.startswith(b'\r\n',bp): + bp+=2 + start_2=bp + eob=bp+length + while buf.startswith(b'\r\n',eob-2): + eob-=2 + # Only output parts (2 = HTTP header, 4 = body) that are wanted + if parts & 2: + if wtype == RESP or wtype == REQ : + # request and response have http headers + 
eo2=buf.index(b'\r\n\r\n',start_2) + _out=callback(wtype,bufView[start_2:eo2+2],2) + else: + # rest of the part + _out=callback(wtype,bufView[start_2:eob],2) + if parts & 4: + raise ValueError("Not implemented: body part (4): %s"%parts) + #bp += length + #if buf[bp] != 13: + # # Why does this sometimes happen, e.g. when doing + # python3 ~/lib/python/cc/test_warc.py 4 /beegfs/common_crawl/CC-MAIN-2019-35/1566027313501.0/orig/crawldiagnostics/CC-MAIN-20190817222907-20190818004907-00000.warc.gz + # at a point where bp+length is 11018, looking at >\n\r\n + # bp += 1 [doesn't work] + bp = buf.index(b'\r\n',bp+length) + # check if refill needed rl: int if (rl := (bp - start_1)) > RECORDMAX: RECORDMAX = rl
