diff bin/lang_by_seg.py @ 154:2643a6825f17

instead of csv
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 17 Nov 2021 18:26:33 +0000
parents 8af4e9937799
children
line wrap: on
line diff
--- a/bin/lang_by_seg.py	Mon Nov 01 21:23:13 2021 +0000
+++ b/bin/lang_by_seg.py	Wed Nov 17 18:26:33 2021 +0000
@@ -1,26 +1,59 @@
 #!/usr/bin/env python3
 '''Replicate part of Jingrui MSc, tabulate a single index
-file info of language vs segment
+file info of language, http[s], mime vs segment
 
 Borrows from cdx2tsv
 
 Usage: uz cdx... | lang_by_seg.py outfilename'''
 
 import sys, json, pickle
-from collections import defaultdict
 
 fn=sys.argv[1]
 
+WR=0
+SCHEME=1
+MIME=2
+DETECTED=3
+LANGS=4
+
 with open(fn,'bw') as f:
 
-  segs=[defaultdict(int) for i in range(100)]
+  segs=[(dict([('r',0),('w',0),('c',0)]),dict([('http',0),('https',0)]),
+         dict(),dict(),dict()) for i in range(100)]
 
   for l in sys.stdin:
     (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
     ja=json.loads(jj)
+    fnf=ja['filename'].split('/',maxsplit=5)
+    seg=int(fnf[3].split('.')[1])
+    st=segs[seg]
+    # Record type, first letter of path component (w for warc, r for robots.txt, c presumably crawldiagnostics)
+    wr=fnf[4][0]
+    st[WR][wr]+=1
+    # URI scheme
+    sch=ja['url'].split(':',maxsplit=1)[0]
+    st[SCHEME][sch]+=1
+    # Content-type
+    m=ja['mime']
+    md=st[MIME]
+    if m in md:
+      md[m]+=1
+    else:
+      md[m]=1
+    # Sniffed content-type
+    m=ja['mime-detected']
+    md=st[DETECTED]
+    if m in md:
+      md[m]+=1
+    else:
+      md[m]=1
+    # Language(s)
     lang=ja.get('languages','NA')
-    seg=int(ja['filename'].split('/')[3].split('.')[1])
-    segs[seg][lang]+=1
+    ld=st[LANGS]
+    if lang in ld:
+      ld[lang]+=1
+    else:
+      ld[lang]=1
 
   #for i in range(100):
   #  print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1]))