Mercurial > hg > cc > cirrus_work
changeset 291:70da637d1402
accommodate the change to digits for record type,
minor tweaks,
change the format of input ranges,
there is still a bug in 11/...540.warc.gz
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Wed, 09 Apr 2025 17:15:40 +0100 |
| parents | 52c9d1875608 |
| children | a3d55cc7da18 |
| files | lib/python/cc/lmh/warc2cdb.py lib/python/cc/test_warc.py lib/python/cc/warc.py |
| diffstat | 3 files changed, 25 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py Wed Apr 09 12:57:50 2025 +0100 +++ b/lib/python/cc/lmh/warc2cdb.py Wed Apr 09 17:15:40 2025 +0100 @@ -9,20 +9,22 @@ from urllib.parse import quote import subprocess -TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) -DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) -LMPAT: typing.Pattern[cython.bytes] = re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) -FFPAT: typing.Pattern[cython.bytes] = re.compile(b'([^ ])GMT$') +foo = cython.declare(int, 27) -DTAB: cython.bytes = bytearray(range(256)) -DDEL: cython.bytes = b'TZ-:' +TUPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r?$',re.MULTILINE) +DPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Date: (.*?)\r?$',re.MULTILINE) +LMPAT: typing.Pattern[bytes] = re.compile(b'^Last-Modified: (.*?)\r?$',re.MULTILINE) +FFPAT: typing.Pattern[bytes] = re.compile(b'([^ ])GMT$') -URI: cython.bytes -DATE: cython.bytes +DTAB: bytearray = bytearray(range(256)) +DDEL: bytes = b'TZ-:' + +URI: bytes +DATE: bytes OUT: typing.BinaryIO -def LMHline(wtype: cython.bytes, buf: char[::1] , part: int) -> None: - global URI, DATE +def LMHline(_wtype: int, buf: memoryview , part: int) -> None: + global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT m: typing.Match[cython.bytes] | None mm: typing.Match[cython.bytes] | None if part==1: @@ -39,7 +41,8 @@ if mm: dateTime=mm[1] if dateTime.endswith(b'GMT'): - dateTime = FFPAT.sub(b'\\1 GMT',dateTime) + if not dateTime.endswith(b' GMT'): + dateTime = dateTime[:-3]+b' GMT' # FFPAT.sub(b'\\1 GMT',dateTime) try: try: lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp()) @@ -65,22 +68,20 @@ OUT.write(lmi) OUT.write(b'\n') -def main(CCdate, segment, outdir, fpat="???"): +def main(CCdate, segment, outdir, fpat = None): global OUT infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00%s.warc.gz | sort -k8"'%( - 
CCdate, segment, fpat) + CCdate, segment, ("???" if fpat is None else ( + (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat)))) with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT: for infile_name in subprocess.run(infile_pat, shell=True, stdout=subprocess.PIPE).stdout.decode('utf8').split(): print(infile_name,file=sys.stderr) - warc.warc(infile_name,LMHline,[b'response'],parts=3) + warc.warc(infile_name,LMHline,[1],parts=3) OUT.write(b'\n') print(outfile_name) if __name__ == '__main__': sys.exit(main(*sys.argv[1:])) - - -
--- a/lib/python/cc/test_warc.py Wed Apr 09 12:57:50 2025 +0100 +++ b/lib/python/cc/test_warc.py Wed Apr 09 17:15:40 2025 +0100 @@ -22,12 +22,12 @@ return OUT if tt==1: - warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],parts=int(sys.argv[2]),debug=debug) + warc.warc(sys.argv[1],showme,[1,2,3,0],parts=int(sys.argv[2]),debug=debug) elif tt==2: - warc.warc(sys.argv[1],showme,[b'warcinfo'],parts=int(sys.argv[2]),debug=debug) + warc.warc(sys.argv[1],showme,[0],parts=int(sys.argv[2]),debug=debug) elif tt==3: - warc.warc(sys.argv[1],showme,[b'warcinfo'],whole=True,debug=debug) + warc.warc(sys.argv[1],showme,[0],whole=True,debug=debug) elif tt==4: - warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],whole=True,debug=debug) + warc.warc(sys.argv[1],showme,[1,2,3,0],whole=True,debug=debug) elif tt==5: - warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug) + warc.warc(sys.argv[1],showme,[1],parts=int(sys.argv[2]),debug=debug)
--- a/lib/python/cc/warc.py Wed Apr 09 12:57:50 2025 +0100 +++ b/lib/python/cc/warc.py Wed Apr 09 17:15:40 2025 +0100 @@ -103,12 +103,13 @@ if whole: bp+=length _out=callback(wtype,bufView[start_1:bp],7) - continue elif (parts & 1): _out=callback(wtype,bufView[start_1:eol],1) bp = eol while buf.startswith(b'\r\n',bp): bp+=2 + if whole: + return if parts!=1: start_2=bp eob=bp+length
