changeset 88:464d2dfb99c9

new
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 13 Apr 2021 17:02:09 +0000
parents b6a5999d8e06
children 90f8f28b2e51
files .Xauthority bin/ezip.sh bin/intersection bin/ix.sh bin/stest.sh bin/uniq_merge.py lib/python/cdx_segment.py
diffstat 7 files changed, 104 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
Binary file .Xauthority has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ezip.sh	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -n 4 -c 32 --exclusive masterJob.sh ezip
+# For each path listed in ezip/$SLURM_PROCID.txt, run pigz -p 8 on its orig/extracts/*.tar files, up to $SLURM_CPUS_PER_TASK jobs at once
+n=$SLURM_NTASKS   # not used below
+c=$SLURM_CPUS_PER_TASK
+node=$SLURMD_NODENAME
+local=$SLURM_LOCALID   # not used below
+proc=$SLURM_PROCID
+echo $(date) $node:$proc start
+
+module load gnu-parallel
+
+parallel --will-cite -j "$c" -n 1 'pigz -p 8 {}/orig/extracts/*.tar' < "ezip/$proc.txt"
+
+echo $(date) $proc end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/intersection	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,19 @@
+#!/bin/sh
+# Output intersection (or, with -d, difference) of two files, line by line
+# No detection or special treatment of duplicates; lines are compared whole, as fixed strings
+# Usage: intersection [-d] s1 s2
+# In the case of difference, which is ordered, interpretation is s1 - s2
+if [ "$1" = "-?" ]
+ then
+   echo "Usage: intersection [-d] s1 s2"
+   exit 1
+fi
+if [ "$1" = "-d" ]
+ then
+  shift
+  grep -F -x -v -f "$2" "$1"   # lines of s1 equal to no line of s2 (grep -F replaces obsolescent fgrep)
+ else
+  grep -F -x -f "$1" "$2"      # lines of s2 equal to some line of s1
+  exit 0                       # report success regardless of grep's own exit status
+fi
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ix.sh	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,13 @@
+#!/usr/bin/bash
+# Extract records from warc files given filename, length and offset triples
+#  (tab-separated, one per line) from stdin or as command line args; the extracted bytes are decompressed to stdout
+if [ -n "$1" ]
+then
+    printf "%s\t%s\t%s\n" "$1" "$2" "$3"
+else
+    cat
+fi | \
+while { IFS=$'\t' read -r f l o; }
+do
+  dd if="$f" of=/dev/stdout skip="$o" count="$l" iflag=skip_bytes,count_bytes
+done | unpigz -dp 1 -c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/stest.sh	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,7 @@
+#!/bin/bash
+pwd   # report the working directory, then the contents of /dev/shm
+ls /dev/shm
+echo I am node "$SLURM_NODEID", cpu "$PMI_FD", args "$@"
+cat "stest_$PMI_FD.txt"   # print the file stest_<PMI_FD>.txt from the working directory
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/uniq_merge.py	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# Merge counts by key from the output of "uniq -c" and sort in descending order
+# An alternative to sus when the scale is too big for the initial sort, or if uniq -c already does a lot
+#  of the work.  Each input line is a count, whitespace, then the key (which may itself contain whitespace)
+# Usage: ... | uniq -c | uniq_merge.py
+import sys
+s={}
+for l in sys.stdin:
+ (i,d)=(l.rstrip('\n').split(None,1)+[''])[:2]  # split off only the leading count; an absent key becomes ''
+ i=int(i)
+ if d in s:
+  s[d]+=i
+ else:
+  s[d]=i
+for (d,n) in sorted(s.items(),key=lambda j:j[1],reverse=True):
+ print('%5d\t%s'%(n,d))
--- a/lib/python/cdx_segment.py	Tue Mar 16 16:20:02 2021 +0000
+++ b/lib/python/cdx_segment.py	Tue Apr 13 17:02:09 2021 +0000
@@ -53,26 +53,39 @@
       sys.stderr.write("bogus: ",afn,l)
       e+=1
 
-mt=datetime.now()
-print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())),
-                                                              e,(mt-st).seconds),file=sys.stderr)
-# Randomise to try to avoid contention
-for s in sample(segdirs,100):
-  for r in rr:
-    of=ss[r][s]
-    of.flush()
-    o=of.fileno()
-    fsync(o)
-    with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df:
-      d=df.fileno()
-      while True:
-        data = read(o,131072)
-        if data == b'':  # end of file reached
-            break
-        write(d,data)
-    of.close()
+if True:
+  # Python-level copying is disabled (see the note in the else branch): just close the per-segment files; the entire result will have to be copied to /beegfs at shell level
+  for rr in ss.values():
+    for s in rr.values():
+      s.close()
+else:
+  # The following fails, in that there are occasional small gaps in the result
+  #  I've given up trying to figure out why...
+  # Randomise to try to avoid contention
+  mt=datetime.now()
+  print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())),
+                                                               e,(mt-st).seconds),file=sys.stderr)
 
-res=system("rm -r %s"%ifn)
+  for s in sample(segdirs,100):
+    for r in rr:
+      of=ss[r][s]
+      of.flush()
+      o=of.fileno()
+      fsync(o)
+      opos=lseek(o,0,SEEK_SET)
+      with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df:
+        d=df.fileno()
+        dpos=lseek(d,0,SEEK_END)
+        print(of.name,opos,df.name,dpos,file=sys.stderr)
+        while True:
+          data = read(o,131072)
+          if data == b'':  # end of file reached
+              break
+          write(d,data)
+      of.close()
+
+  res=0 #system("rm -r %s"%ifn)
 
 et=datetime.now()
-print(et,"finished",ifn,res,"%d seconds total"%((et-st).seconds),file=sys.stderr)
+print(et,"finished",ifn,"%s ok, %d bogus, %d seconds total"%(':'.join(map(str,n.values())),
+                                                             e,(et-st).seconds),file=sys.stderr)