# HG changeset patch # User Henry S. Thompson # Date 1637173593 0 # Node ID 2643a6825f17ef41843c82bbf7ecafef90df9129 # Parent 2b59f3ef229473722e663f710e86f4549bff2ad9 instead of csv diff -r 2b59f3ef2294 -r 2643a6825f17 .lesshst --- a/.lesshst Mon Nov 01 21:23:13 2021 +0000 +++ b/.lesshst Wed Nov 17 18:26:33 2021 +0000 @@ -1,103 +1,102 @@ .less-history-file: .search -"buffer -"^> -"get_ref -"json -"xml -"backends.py -"extractor -"config -"user -"rb -"avail -"krb -"ker -"Ker -"rb -"gssapi -"GSSAPI -"provider -"gssapi -"GSSPAI -"GSSAPI -"kerb -"Kerb -"KERB -"458_5730 -"resolve -"plumpton -"to-command -"650 -"650 -"plumpton -"995 650 -"995 65 -"fno -"plumpton -"news -"\\\\ +" +"unary +"unk +"810 +"unk +"unary +"900 +"902 +" 902 +"unary +"too many +"many +"unk +"revisit +"<<< +"[^Agu]$ +" +" $ +"found 7 +" +" 3 +" +"810 +"910 +"902 +" +"1158584 +" +" 12 +"233935 +"902 +"810 +"unk +"1050691 +"" +" +"" +" +" +" +" +"warc " -"_176 -"17:44 -"Feb -"Redo -"\(S\) -"permission -"rwx -"x -"S -"\bS\b -"setuid -"com, -"application/pdf -""application.pdf" -""application.pdf".*\.20 -"length": -"104800 -"application/pdf.*languages -"post -"x -"hao -"-S -"robots -"ac,e -"uk,ac,ed,i -"acc,ed,inf -",ac,ed,inf -"output -"process -"output -"--to-command -"--check -"stdin -"pipe -"input -"argv -"arg -"shift -"$# -"\$# -"shift -"quiet -"output -"--no-warn -"--no- -"warning -"no-warn -"warn -"sql -"db -"-k -"user -"man: -"robotstxt -"diagnostic -""status" -"Last-Modified -"Modified -"> -"dc007 -""length": "0" -.search -""offset": "0" +"|3 +"|3| +"\|3\| +"\" +"" +"504446 +"" +"task: 1 +"task: 12 +"partition +"task: 3., +"wildcard +"hst +"attach +"database +"cmd +"dump +"file +"csv +" +".mode +"\" +",3, +"" +""" +" 233935 +" +"233935 +"1 +"0 +"1 +"-u +" +"[^2] +"[013-9] +"com,r +"wordpress +"blogspot +"-c +"-c +" +": "[^"]*, +", +"xml, +", +"" +", +"\" +" +"\\ +"0 +" +"^> +"unk, +""application +"text/directory.*text/plain +"'application/xml' diff -r 2b59f3ef2294 -r 2643a6825f17 bin/doS2T.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/doS2T.sh Wed Nov 17 18:26:33 2021 +0000 @@ -0,0 +1,20 @@ +#!/usr/bin/bash +# Usage: doS2T.sh node task cc resdir segs workd dbfile +node=$1 +task=$2 +cc=$3 +resdir=$4 +segs=$5 +workd=$6 +dbfile=$7 + +f=${dbfile%.db} +i=${f#*cdx} + +echo "> $node.$task: $segs $i" + +sqlite3 $dbfile ".mode tabs" ".once $workd/$i.tsv" "select count(*),* from props group by segment,ftype,https,nlangs" ".quit" + +echo "< $node.$task: $segs $i"; + + diff -r 2b59f3ef2294 -r 2643a6825f17 bin/lang_by_seg.py --- a/bin/lang_by_seg.py Mon Nov 01 21:23:13 2021 +0000 +++ b/bin/lang_by_seg.py Wed Nov 17 18:26:33 2021 +0000 @@ -1,26 +1,59 @@ #!/usr/bin/env python3 '''Replicate part of Jingrui MSc, tabulate a single index -file info of language vs segment +file info of language, http[s], mime vs segment for Borrows from cdx2tsv Usage: uz cdx... | lang_by_seg.py outfilename''' import sys, json, pickle -from collections import defaultdict fn=sys.argv[1] +WR=0 +SCHEME=1 +MIME=2 +DETECTED=3 +LANGS=4 + with open(fn,'bw') as f: - segs=[defaultdict(int) for i in range(100)] + segs=[(dict([('r',0),('w',0),('c',0)]),dict([('http',0),('https',0)]), + dict(),dict(),dict()) for i in range(100)] for l in sys.stdin: (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) ja=json.loads(jj) + fnf=ja['filename'].split('/',maxsplit=5) + seg=int(fnf[3].split('.')[1]) + st=segs[seg] + # Record type (w for warc, r for robots.txt) + wr=fnf[4][0] + st[WR][wr]+=1 + # URI scheme + sch=ja['url'].split(':',maxsplit=1)[0] + st[SCHEME][sch]+=1 + # Content-type + m=ja['mime'] + md=st[MIME] + if m in md: + md[m]+=1 + else: + md[m]=1 + # Sniffed content-type + m=ja['mime-detected'] + md=st[DETECTED] + if m in md: + md[m]+=1 + else: + md[m]=1 + # Language(s) lang=ja.get('languages','NA') - seg=int(ja['filename'].split('/')[3].split('.')[1]) - segs[seg][lang]+=1 + ld=st[LANGS] + if lang in ld: + ld[lang]+=1 + else: + ld[lang]=1 #for i in range(100): # print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1])) diff -r 2b59f3ef2294 -r 2643a6825f17 bin/sql2tsv.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/sql2tsv.sh Wed Nov 17 18:26:33 2021 +0000 @@ -0,0 +1,38 @@ +#!/bin/bash +# Invoke this as e.g. sbatch -N 4 --ntasks=3 -c 5 masterJob.sh sql2tsv \ +# CC-MAIN-2019-35 cdx_db 0-299 +# run sql2tsv.py in parallel, taking input directly from .../0-299.n.c.tar.gz +n=$SLURM_NTASKS +c=$SLURM_CPUS_PER_TASK +node=$SLURMD_NODENAME +task=$SLURM_LOCALID +node=$SLURM_NODEID + +cc=$1 +resdir=$2 +segs=$3 + +echo $(date) $nodename:$node:$task start + +export PYTHONPATH=$PYTHONPATH:$HOME/lib/python + +ld=/dev/shm/ht/$task +mkdir -p $ld + +cd $ld +tar --wildcards -xf $HOME/results/$cc/$resdir/$segs.$node.$task.tar.gz '*.db' +cd $HOME + +ls $ld/*.db | \ + parallel --will-cite -j $c doS2T.sh $node $task $cc $resdir $segs $ld '{}' + +if [ "$ld" ] +then + cd $ld + cp *.tsv $HOME/results/$cc/$resdir + rm * +fi + +echo $(date) $nodename:$node:$task end + +