# HG changeset patch
# User Henry S. Thompson
# Date 1695915339 -3600
# Node ID 31abd509e365237468238574848cf0ee867042bc
# Parent 83a574b570a6b7b6955d034c665651e3ebf87fde
importable just in case

diff -r 83a574b570a6 -r 31abd509e365 lib/python/cc/lmh/lmh.py
--- a/lib/python/cc/lmh/lmh.py	Thu Sep 28 16:34:49 2023 +0100
+++ b/lib/python/cc/lmh/lmh.py	Thu Sep 28 16:35:39 2023 +0100
@@ -13,8 +13,6 @@
 DTAB=bytearray(range(256))
 DDEL=b'TZ-:'
 
-OUT=open(sys.stdout.fileno(),'wb')
-
 def showmeLMH(wtype,buf,part):
   global URI, DATE, SEGMENT, FILETYPE, FILENO
   if part==1:
@@ -42,13 +40,20 @@
       OUT.write(mm[1])
       OUT.write(b'\n')
 
-(CCdate, segment, filetype, fileno) = sys.argv[1:]
-fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
-    CCdate, segment, filetype, fileno)
+def main(CCdate, segment, filetype, fileno):
+  # Glob for the WARC file named by the four arguments and pass the first match to warc.warc with showmeLMH as callback
+  global OUT, SEGMENT, FILETYPE, FILENO  # OUT too: code outside main() calls OUT.write
+
+  OUT=open(sys.stdout.fileno(),'wb')
+
+  fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+      CCdate, segment, filetype, fileno)
 
-SEGMENT=codecs.encode(segment,'ascii')
-FILETYPE=codecs.encode(filetype,'ascii')
-FILENO=codecs.encode(fileno,'ascii')
+  SEGMENT=codecs.encode(segment,'ascii')
+  FILETYPE=codecs.encode(filetype,'ascii')
+  FILENO=codecs.encode(fileno,'ascii')
 
-warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+  warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
 
+if __name__ == '__main__':
+  sys.exit(main(*sys.argv[1:]))  # '*' not '**': argv[1:] is a list, not a mapping