Mercurial > hg > cc > cirrus_work
changeset 162:72631d4ac30b
make extra file info optional
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 30 Oct 2023 12:19:53 +0000 |
parents | d0dbfefd6fc0 |
children | 348f4a31228f |
files | lib/python/cc/lmh/lmh.py |
diffstat | 1 files changed, 21 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/lib/python/cc/lmh/lmh.py Wed Oct 25 23:01:59 2023 +0100
+++ b/lib/python/cc/lmh/lmh.py Mon Oct 30 12:19:53 2023 +0000
@@ -2,7 +2,9 @@
 '''Extract identifying info + LastModified header value for all entries
    that have one
 
- Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
+ Usage: lmh.py CC-date segment filetype 3-digit-fileno [1]
+ Includes input identity columns in output if final arg is 1'''
+
 
 import re,warc,sys,glob,codecs
 
@@ -30,29 +32,35 @@
   if mm:
     OUT.write(b'\t')
     OUT.write(DATE.translate(DTAB,DDEL))
-    OUT.write(b'\t')
-    OUT.write(SEGMENT)
-    OUT.write(b'\t')
-    OUT.write(FILETYPE)
-    OUT.write(b'\t')
-    OUT.write(FILENO)
+    if EXTRAS:
+      OUT.write(b'\t')
+      OUT.write(SEGMENT)
+      OUT.write(b'\t')
+      OUT.write(FILETYPE)
+      OUT.write(b'\t')
+      OUT.write(FILENO)
     OUT.write(b'\t')
     OUT.write(mm[1])
     OUT.write(b'\n')
 
 
-def main(CCdate, segment, filetype, fileno):
-  global SEGMENT, FILETYPE, FILENO
+def main(CCdate, segment, filetype, fileno, extras=False):
+  global SEGMENT, FILETYPE, FILENO, EXTRAS, OUT
   OUT=open(sys.stdout.fileno(),'wb')
   fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
     CCdate, segment, filetype, fileno)
 
-  SEGMENT=codecs.encode(segment,'ascii')
-  FILETYPE=codecs.encode(filetype,'ascii')
-  FILENO=codecs.encode(fileno,'ascii')
+  if EXTRAS:=bool(extras):
+    SEGMENT=codecs.encode(segment,'ascii')
+    FILETYPE=codecs.encode(filetype,'ascii')
+    FILENO=codecs.encode(fileno,'ascii')
+
 
   warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
 
 
 if __name__ == '__main__':
-  sys.exit(main(**sys.argv[1:]))
+  sys.exit(main(*sys.argv[1:]))
+
+
+
```