Mercurial > hg > cc > cirrus_home
changeset 88:464d2dfb99c9
new
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 13 Apr 2021 17:02:09 +0000 |
parents | b6a5999d8e06 |
children | 90f8f28b2e51 |
files | .Xauthority bin/ezip.sh bin/intersection bin/ix.sh bin/stest.sh bin/uniq_merge.py lib/python/cdx_segment.py |
diffstat | 7 files changed, 104 insertions(+), 20 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
# bin/ezip.sh
# Invoke this as e.g. sbatch -n 4 -c 32 --exclusive masterJob.sh ezip
# run pigz -p 8 on extract_...tar files
#
# Input:  ezip/$SLURM_PROCID.txt, read line by line by parallel; every line
#         is substituted for {} in '{}/orig/extracts/*.tar'.
# Output: a start and an end line, each prefixed with the current date.

# Fail early with a clear message when the Slurm variables are missing:
# an empty job count would make parallel take the following option as the
# argument of -j, and an empty task id would read ezip/.txt
cpus=${SLURM_CPUS_PER_TASK:?must be run under Slurm with -c}
proc=${SLURM_PROCID:?must be run under Slurm}
node=$SLURMD_NODENAME

# $(date) is deliberately left unquoted so the message layout is unchanged
echo $(date) "$node:$proc" start

module load gnu-parallel

# -j: as many simultaneous jobs as CPUs allocated to this task
# -n 1: one input line per job
# The command stays single-quoted: {} is filled in by parallel and the glob
# must only be expanded after that substitution.
parallel --will-cite -j "$cpus" -n 1 'pigz -p 8 {}/orig/extracts/*.tar' < "ezip/$proc.txt"

echo $(date) "$proc" end
#!/bin/sh
# bin/intersection
# Output intersection (or, with -d, difference) of two files, line by line
# No detection or special treatment of duplicates
# Usage: intersection [-d] s1 s2
# In the case of difference, which is ordered, interpretation is s1 - s2

usage() {
    echo "Usage: intersection [-d] s1 s2"
}

if [ "$1" = "-?" ]
then
    usage
    exit 1
fi

difference=
if [ "$1" = "-d" ]
then
    difference=yes
    shift
fi

# Both operands are required; without this check grep would be handed
# empty file names and fail with a confusing message
if [ "$#" -lt 2 ]
then
    usage >&2
    exit 1
fi

# grep -F replaces the obsolescent fgrep.  -x matches whole lines only and
# -- protects a file name that starts with a dash.
if [ -n "$difference" ]
then
    # Lines of s1 that do not occur in s2; the exit status is grep's
    grep -F -x -v -f "$2" -- "$1"
else
    # Lines of s2 that also occur in s1
    grep -F -x -f "$1" -- "$2"
    # Always report success here, whatever grep's status was
    exit 0
fi
#!/usr/bin/bash
# bin/ix.sh
# Extract records from warc files given filename, length and offset triples
# from stdin or as command line args
#
# Each stdin line is   filename<TAB>length<TAB>offset
# Length and offset are byte counts (dd is run with skip_bytes,count_bytes).
# The extracted byte ranges are concatenated and decompressed by unpigz.
if [ -n "$1" ]
then
    # Command line form: turn the three arguments into one stdin-style line
    printf "%s\t%s\t%s\n" "$1" "$2" "$3"
else
    cat
fi |
# -r keeps backslashes in file names intact; the || test still processes a
# final line that is not terminated by a newline
while IFS=$'\t' read -r file length offset || [ -n "$file" ]
do
    dd if="$file" of=/dev/stdout skip="$offset" count="$length" iflag=skip_bytes,count_bytes
done | unpigz -dp 1 -c
#!/bin/bash
# bin/stest.sh
# Print the working directory, the contents of /dev/shm, then the values of
# SLURM_NODEID and PMI_FD together with the script arguments, and finally
# the contents of stest_$PMI_FD.txt
pwd
ls /dev/shm
echo I am node "$SLURM_NODEID", cpu "$PMI_FD", args "$@"
# Quoted so that an unexpected value of PMI_FD is neither word-split nor globbed
cat "stest_${PMI_FD}.txt"
#!/usr/bin/env python3
"""Merge counts by key from the output of "uniq -c" and sort in descending order.

An alternative to sus when the scale is too big for the initial sort, or if
uniq -c already does a lot of the work.

Usage: ... | uniq -c | uniq_merge.py
"""
import sys


def merge_counts(lines):
    """Sum the counts of identical keys.

    Each line is expected to look like ``<count> <key>``, optionally with
    leading whitespace.  Only the first whitespace run after the count
    separates it from the key, so keys may themselves contain whitespace.
    Whitespace around the key is stripped.  A line holding only a count is
    booked under the empty key, and lines that are entirely blank are
    skipped.

    Returns a dict mapping key to total count, in order of first appearance.
    Raises ValueError if the first field of a line is not an integer.
    """
    totals = {}
    for line in lines:
        fields = line.split(None, 1)
        if not fields:
            # Nothing at all on this line, not even a count
            continue
        count = int(fields[0])
        key = fields[1].strip() if len(fields) > 1 else ''
        totals[key] = totals.get(key, 0) + count
    return totals


def format_counts(totals):
    """Return the output lines for *totals*, highest count first.

    The sort is stable, so keys with equal counts keep the order in which
    they occur in *totals*.
    """
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    return ['%5d\t%s' % (count, key) for key, count in ranked]


def main():
    """Read "uniq -c" output from stdin and print the merged counts."""
    for row in format_counts(merge_counts(sys.stdin)):
        print(row)


if __name__ == '__main__':
    main()
--- a/lib/python/cdx_segment.py Tue Mar 16 16:20:02 2021 +0000 +++ b/lib/python/cdx_segment.py Tue Apr 13 17:02:09 2021 +0000 @@ -53,26 +53,39 @@ sys.stderr.write("bogus: ",afn,l) e+=1 -mt=datetime.now() -print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())), - e,(mt-st).seconds),file=sys.stderr) -# Randomise to try to avoid contention -for s in sample(segdirs,100): - for r in rr: - of=ss[r][s] - of.flush() - o=of.fileno() - fsync(o) - with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df: - d=df.fileno() - while True: - data = read(o,131072) - if data == b'': # end of file reached - break - write(d,data) - of.close() +if True: + # See note below, will have to copy entire result to /beegfs at shell level + for rr in ss.values(): + for s in rr.values(): + s.close() +else: + # The following fails, in that there are occasional small gaps in the result + # I've given up trying to figure out why... + # Randomise to try to avoid contention + mt=datetime.now() + print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())), + e,(mt-st).seconds),file=sys.stderr) -res=system("rm -r %s"%ifn) + for s in sample(segdirs,100): + for r in rr: + of=ss[r][s] + of.flush() + o=of.fileno() + fsync(o) + opos=lseek(o,0,SEEK_SET) + with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df: + d=df.fileno() + dpos=lseek(d,0,SEEK_END) + print(of.name,opos,df.name,dpos,file=sys.stderr) + while True: + data = read(o,131072) + if data == b'': # end of file reached + break + write(d,data) + of.close() + + res=0 #system("rm -r %s"%ifn) et=datetime.now() -print(et,"finished",ifn,res,"%d seconds total"%((et-st).seconds),file=sys.stderr) +print(et,"finished",ifn,"%s ok, %d bogus, %d seconds total"%(':'.join(map(str,n.values())), + e,(et-st).seconds),file=sys.stderr)