# HG changeset patch # User Henry S. Thompson # Date 1698668393 0 # Node ID 72631d4ac30bf649e3f5e2743e7d5eb78cdf9e83 # Parent d0dbfefd6fc00eb268c37b9ecb64a40745e20ed1 make extra file info optional diff -r d0dbfefd6fc0 -r 72631d4ac30b lib/python/cc/lmh/lmh.py --- a/lib/python/cc/lmh/lmh.py Wed Oct 25 23:01:59 2023 +0100 +++ b/lib/python/cc/lmh/lmh.py Mon Oct 30 12:19:53 2023 +0000 @@ -2,7 +2,9 @@ '''Extract identifying info + LastModified header value for all entries that have one - Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' + Usage: lmh.py CC-date segment filetype 3-digit-fileno [1] + Includes input identity columns in output if final arg is 1''' + import re,warc,sys,glob,codecs @@ -30,29 +32,35 @@ if mm: OUT.write(b'\t') OUT.write(DATE.translate(DTAB,DDEL)) - OUT.write(b'\t') - OUT.write(SEGMENT) - OUT.write(b'\t') - OUT.write(FILETYPE) - OUT.write(b'\t') - OUT.write(FILENO) + if EXTRAS: + OUT.write(b'\t') + OUT.write(SEGMENT) + OUT.write(b'\t') + OUT.write(FILETYPE) + OUT.write(b'\t') + OUT.write(FILENO) OUT.write(b'\t') OUT.write(mm[1]) OUT.write(b'\n') -def main(CCdate, segment, filetype, fileno): - global SEGMENT, FILETYPE, FILENO +def main(CCdate, segment, filetype, fileno, extras=False): + global SEGMENT, FILETYPE, FILENO, EXTRAS, OUT OUT=open(sys.stdout.fileno(),'wb') fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( CCdate, segment, filetype, fileno) - SEGMENT=codecs.encode(segment,'ascii') - FILETYPE=codecs.encode(filetype,'ascii') - FILENO=codecs.encode(fileno,'ascii') + if EXTRAS:=bool(extras): + SEGMENT=codecs.encode(segment,'ascii') + FILETYPE=codecs.encode(filetype,'ascii') + FILENO=codecs.encode(fileno,'ascii') + warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) if __name__ == '__main__': - sys.exit(main(**sys.argv[1:])) + sys.exit(main(*sys.argv[1:])) + + +