changeset 286:147f648e4e5e

Trying to recover from a partial, out-of-order run of segments 0--7
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 24 Mar 2025 14:30:32 +0000
parents 0ec17b2aab72
children fe78af4ea7c5
files lib/python/cc/lmh/hack.py lib/python/cc/lmh/warc2cdb.py
diffstat 2 files changed, 33 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/hack.py	Mon Mar 24 14:30:32 2025 +0000
@@ -0,0 +1,27 @@
+#!/usr/bin/python3
+import sys, re
+p = re.compile('([0-9]*)a([0-9]*)(,([0-9]*))?')
+f=True
+for l in sys.stdin:
+  if (m := p.match(l)):
+    if f:
+      sys.stdout.write('{')
+      f=False
+    else:
+      sys.stdout.write(',')
+    if m:
+      b=sys.stdin.readline()[4:]
+      if m[3]:
+        i=int(m[4])-int(m[2])
+        while i:
+          e=sys.stdin.readline()
+          i-=1
+        sys.stdout.write('{%s..%s}'%(b[:-1],e[4:][:-1]))
+      else:
+        sys.stdout.write(b[:-1])
+    else:
+      print('no match',l,m,file=sys.stderr)
+      exit(1)
+sys.stdout.write('}')
+
+
--- a/lib/python/cc/lmh/warc2cdb.py	Sat Mar 08 22:31:14 2025 +0000
+++ b/lib/python/cc/lmh/warc2cdb.py	Mon Mar 24 14:30:32 2025 +0000
@@ -7,6 +7,7 @@
 import cython, typing
 import email.utils
 from urllib.parse import quote
+import subprocess
 
 TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
@@ -64,14 +65,15 @@
       OUT.write(lmi)
       OUT.write(b'\n')
 
-def main(CCdate, segment, nFiles, outdir):
+def main(CCdate, segment, outdir, fpat="???"):
   global OUT
 
-  infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00???.warc.gz'%(
-    CCdate, segment)
+  infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00%s.warc.gz | sort -k8"'%(
+    CCdate, segment, fpat)
   
   with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT:
-    for infile_name in glob.glob(infile_pat):
+    for infile_name in subprocess.run(infile_pat, shell=True,
+                                   stdout=subprocess.PIPE).stdout.decode('utf8').split():
       print(infile_name,file=sys.stderr)
       warc.warc(infile_name,LMHline,[b'response'],parts=3)
     OUT.write(b'\n')