Mercurial > hg > cc > cirrus_home
view bin/lang_by_seg.py @ 154:2643a6825f17
instead of csv
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 17 Nov 2021 18:26:33 +0000 |
parents | 8af4e9937799 |
children |
line wrap: on
line source
#!/usr/bin/env python3
'''Replicate part of Jingrui MSc: tabulate, for a single index file,
 record type, http[s], mime, sniffed mime and language(s) vs segment.
 Borrows from cdx2tsv

 Input (stdin): one index line per record, of the form
   <key> <timestamp> <JSON object>
 Output: a pickle of a list of N_SEGMENTS 5-tuples of dicts, indexed by
 segment number; the tuple slots are WR, SCHEME, MIME, DETECTED, LANGS.

 Usage: uz cdx... | lang_by_seg.py outfilename'''

import sys
import json
import pickle

# Number of per-segment tables allocated; a segment number outside
# 0..N_SEGMENTS-1 raises IndexError.
N_SEGMENTS = 100

# Positions of the individual count tables within each segment's tuple
WR = 0        # record type, first letter of the filename's 5th path component
SCHEME = 1    # URI scheme
MIME = 2      # 'mime' field
DETECTED = 3  # 'mime-detected' field
LANGS = 4     # 'languages' field, or 'NA' when absent


def _new_tables():
    '''Return a fresh, zeroed set of count tables for one segment.'''
    # WR and SCHEME are pre-seeded with the only values expected; anything
    # else raises KeyError in _tabulate, which acts as a sanity check.
    return ({'r': 0, 'w': 0, 'c': 0},
            {'http': 0, 'https': 0},
            {}, {}, {})


def _bump(table, key):
    '''Add one to table[key], creating the entry if necessary.'''
    table[key] = table.get(key, 0) + 1


def _tabulate(lines, n_segments=N_SEGMENTS):
    '''Count record type, scheme, mime types and languages per segment.

    lines is an iterable of index lines '<key> <timestamp> <json>'.
    Returns a list of n_segments tuples as built by _new_tables().
    Raises ValueError for a line without three space-separated fields or
    with invalid JSON, KeyError for a missing field or an unexpected
    record type/scheme, IndexError for an out-of-range segment number.
    '''
    segs = [_new_tables() for _ in range(n_segments)]
    for line in lines:
        # The JSON itself may contain spaces, so split at most twice
        (_key, _stamp, jj) = line.rstrip().split(' ', maxsplit=2)
        ja = json.loads(jj)
        # The code relies on the filename having the segment as 4th path
        # component, '<digits>.<segment number>', and the record-type
        # directory as 5th.
        fnf = ja['filename'].split('/', maxsplit=5)
        st = segs[int(fnf[3].split('.')[1])]
        # Record type (w for warc, r for robots.txt; 'c' is presumably
        # crawldiagnostics -- confirm against the index)
        st[WR][fnf[4][0]] += 1
        # URI scheme
        st[SCHEME][ja['url'].split(':', maxsplit=1)[0]] += 1
        # Content-type
        _bump(st[MIME], ja['mime'])
        # Sniffed content-type
        _bump(st[DETECTED], ja['mime-detected'])
        # Language(s): the field is optional, 'NA' stands in when missing
        _bump(st[LANGS], ja.get('languages', 'NA'))
    return segs


def main(argv=None, stdin=None):
    '''Tabulate index lines from stdin and pickle the result to argv[1].

    argv defaults to sys.argv and stdin to sys.stdin; both can be passed
    explicitly.  Returns 0 on success, 2 if no output file name was given.
    '''
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    if len(argv) < 2:
        print('Usage: uz cdx... | lang_by_seg.py outfilename',
              file=sys.stderr)
        return 2
    # Read everything before opening the output file, so that a failure
    # part-way through the input does not truncate an existing result.
    segs = _tabulate(stdin)
    with open(argv[1], 'wb') as f:
        pickle.dump(segs, f)
    return 0


if __name__ == '__main__':
    sys.exit(main())