Mercurial > hg > cc > cirrus_work
changeset 130:31abd509e365
importable just in case
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 28 Sep 2023 16:35:39 +0100 |
parents | 83a574b570a6 |
children | fd16e8fb9223 |
files | lib/python/cc/lmh/lmh.py |
diffstat | 1 files changed, 13 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/lib/python/cc/lmh/lmh.py Thu Sep 28 16:34:49 2023 +0100
+++ b/lib/python/cc/lmh/lmh.py Thu Sep 28 16:35:39 2023 +0100
@@ -13,8 +13,6 @@
 DTAB=bytearray(range(256))
 DDEL=b'TZ-:'
 
-OUT=open(sys.stdout.fileno(),'wb')
-
 def showmeLMH(wtype,buf,part):
   global URI, DATE, SEGMENT, FILETYPE, FILENO
   if part==1:
@@ -42,13 +40,19 @@
       OUT.write(mm[1])
     OUT.write(b'\n')
 
-(CCdate, segment, filetype, fileno) = sys.argv[1:]
-fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
-  CCdate, segment, filetype, fileno)
+def main(CCdate, segment, filetype, fileno):
+  global SEGMENT, FILETYPE, FILENO
+
+  OUT=open(sys.stdout.fileno(),'wb')
+
+  fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+    CCdate, segment, filetype, fileno)
 
-SEGMENT=codecs.encode(segment,'ascii')
-FILETYPE=codecs.encode(filetype,'ascii')
-FILENO=codecs.encode(fileno,'ascii')
+  SEGMENT=codecs.encode(segment,'ascii')
+  FILETYPE=codecs.encode(filetype,'ascii')
+  FILENO=codecs.encode(fileno,'ascii')
 
-warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+  warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
 
+if __name__ == '__main__':
+  sys.exit(main(**sys.argv[1:]))
```