changeset 101:e2e64c3d763e

bug4 fixed, but that created a new, earlier bug
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 11 Sep 2023 22:06:45 +0100
parents 18446a7eeb9e
children e606c609f813
files bin/do_idx.sh bin/merge_date.py
diffstat 2 files changed, 41 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/do_idx.sh	Mon Sep 11 22:06:45 2023 +0100
@@ -0,0 +1,14 @@
+#!/bin/bash
+export res="$1"  # first argument: directory used below as $res (ks_*.tsv inputs, idx, merge_*.log)
+orig="$2"  # second argument: passed through unchanged to merge_date.py
+merge_date.py -d <(LC_ALL=C sort -m -k1,2 -s $res/ks_[0-9]*.tsv) $orig $res/idx # | \
+exit  # NOTE(review): unconditional exit; with the pipe on the previous line commented out, the tee/parallel stage below is never reached
+    tee /dev/stderr | \
+          parallel -j 10 'echo {#} {} >$res/merge_{#}.log
+                          echo $(date) {#} {}
+                          export res
+                          split -l 3000 --filter="igzip -c | \
+                                                  tee >(wc -c >> \
+                                                        $res/merge_{#}.log)" \
+                                    {} > {}.gz && \
+                                    rm {}'
--- a/bin/merge_date.py	Mon Sep 11 12:56:47 2023 +0100
+++ b/bin/merge_date.py	Mon Sep 11 22:06:45 2023 +0100
@@ -63,7 +63,7 @@
       time.sleep(0.1) # so it flushes?
       XN=XPATH%FN
       if not os.path.exists(XN):
-        return
+        return (None, None)
       XF = igzip.IGzipFile(filename=XN)
       NF = open((NN:=NPATH%FN), 'wb')
       xl = XF.readline()
@@ -110,12 +110,13 @@
                                             for xp in (xkey, xdate, xurl))))
     messyU, xkey1, xkey2 = keys(xkey)
     if messyD:
+      noMatch = (ddate != xdate or
+            not dkey.startswith(xkey1) or
+            dkey!=xkey1 or
+            durl!=xurl)
       if messyU:
         # better match
-        if (ddate != xdate or
-            not dkey.startswith(xkey1) or
-            dkey!=xkey1 or
-            durl!=xurl):
+        if noMatch:
           raise ValueError("Fail: xkey: %s\n"
                 "      dkey: %s\n"
                 "      xdate: %s\n"
@@ -128,11 +129,10 @@
                 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
                           (b'\n     '.join(dfq)).decode('ascii'),
                           xkey1, xkey2, FN, XCNT, DCNT, xl))
-        messyD = False
         # fall through to the ordinary (non-messy) match case
       else:
-        # still looking, save this one
-        if DEBUG:
+        # still looking, save if >= date else fall through to write
+        if DEBUG>1:
           print("Diso: xkey: %s\n"
                 "      dkey: %s\n"
                 "      xdate: %s\n"
@@ -141,14 +141,28 @@
                 "      durl: %s\n"
                 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl),
                 file=sys.stderr)
-        xq.append(xl)
-        if DEBUG>1:
-          sys.stderr.write('xpush\n')
-        continue
+        if not noMatch:
+          xq.append(xl)
+          if DEBUG>1:
+            sys.stderr.write('xpush\n')
+          continue
+        # else fall through
     else:
       # Not messyD
       if messyU:
-        raise ValueError("messyD w/o messyU")
+        raise ValueError("messyU w/o messyD:"
+                        "xkey: %s\n"
+                "dkey: %s\n"
+                "xdate: %s\n"
+                "ddate: %s\n"
+                "xurl: %s\n"
+                "durl: %s\n"
+                "dfq: %s\n"
+                "k1, k2: |%s|%s|\n"
+                "FN: %s XCNT: %s DCNT: %s\n"
+                "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
+                          (b'\n     '.join(dfq)).decode('ascii'),
+                          xkey1, xkey2, FN, XCNT, DCNT, xl))
     if (ddate != xdate or
             not dkey.startswith(xkey1) or
             (xkey2 is not None and dkey!=xkey2) or