# HG changeset patch
# User Henry S. Thompson
# Date 1618333329 0
# Node ID 464d2dfb99c99af0b3b27af93c9ad26694e04db2
# Parent  b6a5999d8e068e046a36ec35981fab98eae0fb79
new

diff -r b6a5999d8e06 -r 464d2dfb99c9 .Xauthority
Binary file .Xauthority has changed
diff -r b6a5999d8e06 -r 464d2dfb99c9 bin/ezip.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ezip.sh	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -n 4 -c 32 --exclusive masterJob.sh ezip
+# run pigz -p 8 on extract_...tar files
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+node=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+proc=$SLURM_PROCID
+echo $(date) $node:$proc start
+
+module load gnu-parallel
+
+parallel --will-cite -j $c -n 1 'pigz -p 8 {}/orig/extracts/*.tar' < ezip/$proc.txt
+
+echo $(date) $proc end
+
diff -r b6a5999d8e06 -r 464d2dfb99c9 bin/intersection
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/intersection	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,19 @@
+#!/bin/sh
+# Output intersection (or, with -d, difference) of two files, line by line
+# No detection or special treatment of duplicates
+# Usage: intersection [-d] s1 s2
+# In the case of difference, which is ordered, interpretation is s1 - s2
+if [ "$1" = "-?" ]
+ then
+  echo "Usage: intersection [-d] s1 s2"
+  exit 1
+fi
+if [ "$1" = "-d" ]
+ then
+  shift
+  fgrep -x -v -f "$2" "$1"
+ else
+  fgrep -x -f "$1" "$2"
+  exit 0
+fi
+
diff -r b6a5999d8e06 -r 464d2dfb99c9 bin/ix.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ix.sh	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,13 @@
+#!/usr/bin/bash
+# Extract records from warc files given filename, length and offset triples
+# from stdin or as command line args
+if [ -n "$1" ]
+then
+  printf "%s\t%s\t%s\n" "$1" "$2" "$3"
+else
+  cat
+fi | \
+while { IFS=$'\t' read f l o; }
+do
+  dd if="$f" of=/dev/stdout skip=$o count=$l iflag=skip_bytes,count_bytes
+done | unpigz -dp 1 -c
diff -r b6a5999d8e06 -r 464d2dfb99c9 bin/stest.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/stest.sh	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,7 @@
+#!/bin/bash
+pwd
+ls /dev/shm
+echo I am node "$SLURM_NODEID", cpu "$PMI_FD", args "$@"
+cat stest_$PMI_FD.txt
+
+
diff -r b6a5999d8e06 -r 464d2dfb99c9 bin/uniq_merge.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/uniq_merge.py	Tue Apr 13 17:02:09 2021 +0000
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# Merge counts by key from the output of "uniq -c" and sort in descending order
+# An alternative to sus when the scale is too big for the initial sort, or if uniq -c already does a lot
+# of the work
+# Usage: ... | uniq -c | uniq-merge.py
+import sys
+s={}
+for l in sys.stdin:
+  (i,d)=l.split()
+  i=int(i)
+  if d in s:
+    s[d]+=i
+  else:
+    s[d]=i
+for (d,n) in sorted(s.items(),key=lambda j:j[1],reverse=True):
+  print('%5d\t%s'%(n,d))
diff -r b6a5999d8e06 -r 464d2dfb99c9 lib/python/cdx_segment.py
--- a/lib/python/cdx_segment.py	Tue Mar 16 16:20:02 2021 +0000
+++ b/lib/python/cdx_segment.py	Tue Apr 13 17:02:09 2021 +0000
@@ -53,26 +53,39 @@
         sys.stderr.write("bogus: ",afn,l)
         e+=1
 
-mt=datetime.now()
-print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())),
-      e,(mt-st).seconds),file=sys.stderr)
-# Randomise to try to avoid contention
-for s in sample(segdirs,100):
-  for r in rr:
-    of=ss[r][s]
-    of.flush()
-    o=of.fileno()
-    fsync(o)
-    with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df:
-      d=df.fileno()
-      while True:
-        data = read(o,131072)
-        if data == b'': # end of file reached
-          break
-        write(d,data)
-    of.close()
+if True:
+  # See note below, will have to copy entire result to /beegfs at shell level
+  for rr in ss.values():
+    for s in rr.values():
+      s.close()
+else:
+  # The following fails, in that there are occasional small gaps in the result
+  # I've given up trying to figure out why...
+  # Randomise to try to avoid contention
+  mt=datetime.now()
+  print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())),
+        e,(mt-st).seconds),file=sys.stderr)
 
-res=system("rm -r %s"%ifn)
+  for s in sample(segdirs,100):
+    for r in rr:
+      of=ss[r][s]
+      of.flush()
+      o=of.fileno()
+      fsync(o)
+      opos=lseek(o,0,SEEK_SET)
+      with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df:
+        d=df.fileno()
+        dpos=lseek(d,0,SEEK_END)
+        print(of.name,opos,df.name,dpos,file=sys.stderr)
+        while True:
+          data = read(o,131072)
+          if data == b'': # end of file reached
+            break
+          write(d,data)
+      of.close()
+
+  res=0 #system("rm -r %s"%ifn)
 
 et=datetime.now()
-print(et,"finished",ifn,res,"%d seconds total"%((et-st).seconds),file=sys.stderr)
+print(et,"finished",ifn,"%s ok, %d bogus, %d seconds total"%(':'.join(map(str,n.values())),
+      e,(et-st).seconds),file=sys.stderr)