Mercurial > hg > cc > cirrus_home
diff bin/lang_by_seg.py @ 154:2643a6825f17
instead of csv
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 17 Nov 2021 18:26:33 +0000 |
parents | 8af4e9937799 |
children |
line wrap: on
line diff
--- a/bin/lang_by_seg.py Mon Nov 01 21:23:13 2021 +0000
+++ b/bin/lang_by_seg.py Wed Nov 17 18:26:33 2021 +0000
@@ -1,26 +1,59 @@
 #!/usr/bin/env python3
 '''Replicate part of Jingrui MSc, tabulate a single index
-file info of language vs segment
+file info of language, http[s], mime vs segment for
 Borrows from cdx2tsv
 Usage: uz cdx... | lang_by_seg.py outfilename'''
 import sys, json, pickle
-from collections import defaultdict
 fn=sys.argv[1]
+WR=0
+SCHEME=1
+MIME=2
+DETECTED=3
+LANGS=4
+
 with open(fn,'bw') as f:
-  segs=[defaultdict(int) for i in range(100)]
+  segs=[(dict([('r',0),('w',0),('c',0)]),dict([('http',0),('https',0)]),
+         dict(),dict(),dict()) for i in range(100)]
   for l in sys.stdin:
     (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
     ja=json.loads(jj)
+    fnf=ja['filename'].split('/',maxsplit=5)
+    seg=int(fnf[3].split('.')[1])
+    st=segs[seg]
+    # Record type (w for warc, r for robots.txt)
+    wr=fnf[4][0]
+    st[WR][wr]+=1
+    # URI scheme
+    sch=ja['url'].split(':',maxsplit=1)[0]
+    st[SCHEME][sch]+=1
+    # Content-type
+    m=ja['mime']
+    md=st[MIME]
+    if m in md:
+      md[m]+=1
+    else:
+      md[m]=1
+    # Sniffed content-type
+    m=ja['mime-detected']
+    md=st[DETECTED]
+    if m in md:
+      md[m]+=1
+    else:
+      md[m]=1
+    # Language(s)
     lang=ja.get('languages','NA')
-    seg=int(ja['filename'].split('/')[3].split('.')[1])
-    segs[seg][lang]+=1
+    ld=st[LANGS]
+    if lang in ld:
+      ld[lang]+=1
+    else:
+      ld[lang]=1
   #for i in range(100):
   # print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1]))