changeset 149:34562e621f6d

try to get the counts right, particularly when re-merging
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 04 Oct 2023 18:53:55 +0100
parents ded66be0238c
children 4c499fc47ea7 85343fe48f69
files lib/python/cc/lmh/merge_date.py
diffstat 1 files changed, 17 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/merge_date.py	Wed Oct 04 18:51:56 2023 +0100
+++ b/lib/python/cc/lmh/merge_date.py	Wed Oct 04 18:53:55 2023 +0100
@@ -61,7 +61,7 @@
 
 NF = open('/dev/null','w')
 XF = open('/dev/null','rb')
-if False:
+if MERGED:
   MF = open(MERGED,'r')
   PREV_DCNT = 0
 else:
@@ -81,14 +81,18 @@
       if MF and REDOING:
         oo = ML.split()
         oo = [oo[0]]+[int(o) for o in oo[1:]]
-        if oo != (no:=[NN, XCNT, WCNT, DCNT]):
+        if oo != (no:=[NN, XCNT, WCNT, DCNT-1]):
           print(*('%s:\t%s<>%s'%vv for vv in
                   zip(('NN', 'XCNT', 'WCNT', 'DCNT'),oo,no)),
                 sep='\n',file=sys.stderr)
         REDOING=False
       if FN != -1:
         print(NN, flush=True) # so we can compress it
-        print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True)
+        print(NN, XCNT, WCNT,
+              DCNT-1,         # we've read one more date than we've actually
+                              #   used (even if we're finishing up, because
+                              #   we increment DCNT even on EOF)
+              sep='\t',file=sys.stderr,flush=True)
         time.sleep(0.1) # so they flush?
       FN += 1
       if MF:
@@ -106,12 +110,14 @@
             mo = ML.split()
             NN = mo[0]
             (XCNT, WCNT, DCNT) = [int(o) for o in mo[1:]]
-            # file col. 4 is 1 ahead of the game
-            for i in range((DCNT-1)-PREV_DCNT):
+            # file col. 4 is 1 ahead of the game ??
+            for i in range(DCNT-PREV_DCNT):
               dl = DF.readline()
+            DCNT+=1  # So the DCNT-1s above will be correct
             # hack because the first date of the next x file has
             #   already been read and split
-            DKEY, DDATE, DURL, DTIME = dl.split(b'\t')
+            DKEY, DDATE, DURL, DTIME = \
+                  (("", "", "", 0) if dl == b'' else dl.split(b'\t'))
             PREV_DCNT = DCNT
             # We've skipped this one, go around again,
             #  the existing XF will still be at EOF
@@ -139,13 +145,16 @@
 def nextDate(dn):
   global DEBUG, DF, DCNT, XCNT
   dl = DF.readline()
+  DCNT += 1
   if dl == b'':
-    # write out the last of the last index file, if any
+    # Write out the last of the last index file, if any
+    # Note that we increment DCNT even in this case, so that the decrement
+    #  that happens when we write out the summary line in nextLine on EOF
+    #  will give the right answer.
     return "", "", "", 0
   if DEBUG:
     sys.stderr.write("dl%s: %s\n"%(dn,dl))
   dkey, ddate, durl, dtime = dl.split(b'\t')
-  DCNT += 1
   return dkey, ddate, durl, dtime
 
 with open(sys.argv[1], 'rb') as DF: