changeset 162:72631d4ac30b

make extra file info optional
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 30 Oct 2023 12:19:53 +0000
parents d0dbfefd6fc0
children 348f4a31228f
files lib/python/cc/lmh/lmh.py
diffstat 1 files changed, 21 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/lmh.py	Wed Oct 25 23:01:59 2023 +0100
+++ b/lib/python/cc/lmh/lmh.py	Mon Oct 30 12:19:53 2023 +0000
@@ -2,7 +2,9 @@
 '''Extract identifying info + LastModified header value for all entries
    that have one
 
-   Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
+   Usage: lmh.py CC-date segment filetype 3-digit-fileno [1]
+   Includes input identity columns in output if final arg is 1'''
+
 
 import re,warc,sys,glob,codecs
 
@@ -30,29 +32,35 @@
     if mm:
       OUT.write(b'\t')
       OUT.write(DATE.translate(DTAB,DDEL))
-      OUT.write(b'\t')
-      OUT.write(SEGMENT)
-      OUT.write(b'\t')
-      OUT.write(FILETYPE)
-      OUT.write(b'\t')
-      OUT.write(FILENO)
+      if EXTRAS:
+        OUT.write(b'\t')
+        OUT.write(SEGMENT)
+        OUT.write(b'\t')
+        OUT.write(FILETYPE)
+        OUT.write(b'\t')
+        OUT.write(FILENO)
       OUT.write(b'\t')
       OUT.write(mm[1])
     OUT.write(b'\n')
 
-def main(CCdate, segment, filetype, fileno):
-  global SEGMENT, FILETYPE, FILENO
+def main(CCdate, segment, filetype, fileno, extras=False):
+  global SEGMENT, FILETYPE, FILENO, EXTRAS, OUT
 
   OUT=open(sys.stdout.fileno(),'wb')
 
   fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
     CCdate, segment, filetype, fileno)
 
-  SEGMENT=codecs.encode(segment,'ascii')
-  FILETYPE=codecs.encode(filetype,'ascii')
-  FILENO=codecs.encode(fileno,'ascii')
+  if EXTRAS:=bool(extras):
+    SEGMENT=codecs.encode(segment,'ascii')
+    FILETYPE=codecs.encode(filetype,'ascii')
+    FILENO=codecs.encode(fileno,'ascii')
+  
 
   warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
 
 if __name__ == '__main__':
-  sys.exit(main(**sys.argv[1:]))
+  sys.exit(main(*sys.argv[1:]))
+
+
+