changeset 154:2643a6825f17

instead of csv
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 17 Nov 2021 18:26:33 +0000
parents 2b59f3ef2294
children 58b90cd52c15
files .lesshst bin/doS2T.sh bin/lang_by_seg.py bin/sql2tsv.sh
diffstat 4 files changed, 195 insertions(+), 105 deletions(-) [+]
line wrap: on
line diff
--- a/.lesshst	Mon Nov 01 21:23:13 2021 +0000
+++ b/.lesshst	Wed Nov 17 18:26:33 2021 +0000
@@ -1,103 +1,102 @@
 .less-history-file:
 .search
-"buffer
-"^>
-"get_ref
-"json
-"xml
-"backends.py
-"extractor
-"config
-"user
-"rb
-"avail
-"krb
-"ker
-"Ker
-"rb
-"gssapi
-"GSSAPI
-"provider
-"gssapi
-"GSSPAI
-"GSSAPI
-"kerb
-"Kerb
-"KERB
-"458_5730
-"resolve
-"plumpton
-"to-command
-"650 
-"650	
-"plumpton
-"995	650
-"995	65
-"fno
-"plumpton
-"news
-"\\\\
+"	
+"unary
+"unk
+"810
+"unk
+"unary
+"900
+"902
+" 902
+"unary
+"too many
+"many
+"unk
+"revisit
+"<<<
+"[^Agu]$
+"	
+"	$
+"found 7
+"	
+"	3	
+"	
+"810
+"910
+"902
+"	
+"1158584
+"	
+" 12 
+"233935
+"902
+"810
+"unk
+"1050691
+""
+"	
+""
+"	
+"  
+"	
+"		
+"warc
 "	
-"_176
-"17:44
-"Feb 
-"Redo
-"\(S\)
-"permission
-"rwx
-"x
-"S
-"\bS\b
-"setuid
-"com,
-"application/pdf
-""application.pdf"
-""application.pdf".*\.20
-"length": 
-"104800
-"application/pdf.*languages
-"post
-"x
-"hao
-"-S
-"robots
-"ac,e
-"uk,ac,ed,i
-"acc,ed,inf
-",ac,ed,inf
-"output
-"process
-"output
-"--to-command
-"--check
-"stdin
-"pipe
-"input
-"argv
-"arg
-"shift 
-"$#
-"\$#
-"shift
-"quiet
-"output
-"--no-warn
-"--no-
-"warning
-"no-warn
-"warn
-"sql
-"db
-"-k
-"user
-"man:
-"robotstxt
-"diagnostic
-""status"
-"Last-Modified
-"Modified
-">
-"dc007
-""length": "0"
-.search
-""offset": "0"
+"|3
+"|3|
+"\|3\|
+"\"
+""
+"504446
+""
+"task: 1
+"task: 12
+"partition
+"task: 3.,
+"wildcard
+"hst
+"attach
+"database
+"cmd
+"dump
+"file
+"csv
+"	
+".mode
+"\"
+",3,
+""
+"""
+" 233935
+"	
+"233935	
+"1 
+"0 
+"1 
+"-u
+"	
+"[^2] 
+"[013-9] 
+"com,r
+"wordpress
+"blogspot
+"-c
+"-c 
+" 
+": "[^"]*,
+",
+"xml,
+",
+""
+",
+"\"
+" 
+"\\
+"0			
+"	
+"^>
+"unk,
+""application
+"text/directory.*text/plain
+"'application/xml'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doS2T.sh	Wed Nov 17 18:26:33 2021 +0000
@@ -0,0 +1,20 @@
+#!/usr/bin/bash
+# Usage: doS2T.sh node task cc resdir segs workd dbfile
+# Dump grouped row counts from dbfile's props table to a TSV file in workd
+node=$1
+task=$2
+cc=$3        # not used below
+resdir=$4    # not used below
+segs=$5
+workd=$6
+dbfile=$7
+# Output stem: drop the .db suffix, then everything up to the first "cdx"
+f=${dbfile%.db}
+i=${f#*cdx}
+
+echo "> $node.$task: $segs $i"
+sqlite3 "$dbfile" ".mode tabs" ".once $workd/$i.tsv" "select count(*),* from props group by segment,ftype,https,nlangs" ".quit"
+
+echo "< $node.$task: $segs $i"
+
+
--- a/bin/lang_by_seg.py	Mon Nov 01 21:23:13 2021 +0000
+++ b/bin/lang_by_seg.py	Wed Nov 17 18:26:33 2021 +0000
@@ -1,26 +1,59 @@
 #!/usr/bin/env python3
 '''Replicate part of Jingrui MSc, tabulate a single index
-file info of language vs segment
+file info of language, http[s], mime vs segment
 
 Borrows from cdx2tsv
 
 Usage: uz cdx... | lang_by_seg.py outfilename'''
 
 import sys, json, pickle
-from collections import defaultdict
 
 fn=sys.argv[1]
 
+WR=0
+SCHEME=1
+MIME=2
+DETECTED=3
+LANGS=4
+
 with open(fn,'bw') as f:
 
-  segs=[defaultdict(int) for i in range(100)]
+  segs=[(dict([('r',0),('w',0),('c',0)]),dict([('http',0),('https',0)]),
+         dict(),dict(),dict()) for i in range(100)]
 
   for l in sys.stdin:
     (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
     ja=json.loads(jj)
+    fnf=ja['filename'].split('/',maxsplit=5)
+    seg=int(fnf[3].split('.')[1])
+    st=segs[seg]
+    # Record type (w for warc, r for robotstxt, c for crawldiagnostics)
+    wr=fnf[4][0]
+    st[WR][wr]+=1
+    # URI scheme
+    sch=ja['url'].split(':',maxsplit=1)[0]
+    st[SCHEME][sch]+=1
+    # Content-type
+    m=ja['mime']
+    md=st[MIME]
+    if m in md:
+      md[m]+=1
+    else:
+      md[m]=1
+    # Sniffed content-type
+    m=ja['mime-detected']
+    md=st[DETECTED]
+    if m in md:
+      md[m]+=1
+    else:
+      md[m]=1
+    # Language(s)
     lang=ja.get('languages','NA')
-    seg=int(ja['filename'].split('/')[3].split('.')[1])
-    segs[seg][lang]+=1
+    ld=st[LANGS]
+    if lang in ld:
+      ld[lang]+=1
+    else:
+      ld[lang]=1
 
   #for i in range(100):
   #  print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1]))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/sql2tsv.sh	Wed Nov 17 18:26:33 2021 +0000
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -N 4 --ntasks=3 -c 5 masterJob.sh sql2tsv \
+#                     CC-MAIN-2019-35 cdx_db 0-299
+# Runs doS2T.sh in parallel over the *.db files from .../0-299.<node>.<task>.tar.gz
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME   # host name, used only in the start/end log lines
+task=$SLURM_LOCALID
+node=$SLURM_NODEID
+
+cc=$1
+resdir=$2
+segs=$3
+
+echo $(date) $nodename:$node:$task start
+
+export PYTHONPATH=$PYTHONPATH:$HOME/lib/python
+
+ld=/dev/shm/ht/$task        # per-task scratch directory
+mkdir -p "$ld"
+
+cd "$ld" || exit 1          # never unpack into whatever the cwd happens to be
+tar --wildcards -xf "$HOME/results/$cc/$resdir/$segs.$node.$task.tar.gz" '*.db'
+cd "$HOME"
+
+ls "$ld"/*.db | \
+   parallel --will-cite -j $c doS2T.sh $node $task $cc $resdir $segs $ld '{}'
+# Copy results back, then empty the scratch directory.  Guard on the cd itself:
+# if it fails, rm * must not run in whatever directory we are in (i.e. $HOME)
+if cd "$ld"
+then
+ cp *.tsv "$HOME/results/$cc/$resdir"
+ rm *
+fi
+
+echo $(date) $nodename:$node:$task end
+
+