comparison bin/lang_by_seg.py @ 154:2643a6825f17

instead of csv
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 17 Nov 2021 18:26:33 +0000
parents 8af4e9937799
children
comparison
legend: equal | deleted | inserted | replaced
left (old): 153:2b59f3ef2294 | right (new): 154:2643a6825f17
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Replicate part of Jingrui MSc, tabulate a single index 2 '''Replicate part of Jingrui MSc, tabulate a single index
3 file info of language vs segment 3 file info of language, http[s], mime vs segment for
4 4
5 Borrows from cdx2tsv 5 Borrows from cdx2tsv
6 6
7 Usage: uz cdx... | lang_by_seg.py outfilename''' 7 Usage: uz cdx... | lang_by_seg.py outfilename'''
8 8
9 import sys, json, pickle 9 import sys, json, pickle
10 from collections import defaultdict
11 10
12 fn=sys.argv[1] 11 fn=sys.argv[1]
13 12
13 WR=0
14 SCHEME=1
15 MIME=2
16 DETECTED=3
17 LANGS=4
18
14 with open(fn,'bw') as f: 19 with open(fn,'bw') as f:
15 20
16 segs=[defaultdict(int) for i in range(100)] 21 segs=[(dict([('r',0),('w',0),('c',0)]),dict([('http',0),('https',0)]),
22 dict(),dict(),dict()) for i in range(100)]
17 23
18 for l in sys.stdin: 24 for l in sys.stdin:
19 (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) 25 (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
20 ja=json.loads(jj) 26 ja=json.loads(jj)
27 fnf=ja['filename'].split('/',maxsplit=5)
28 seg=int(fnf[3].split('.')[1])
29 st=segs[seg]
30 # Record type (w for warc, r for robots.txt)
31 wr=fnf[4][0]
32 st[WR][wr]+=1
33 # URI scheme
34 sch=ja['url'].split(':',maxsplit=1)[0]
35 st[SCHEME][sch]+=1
36 # Content-type
37 m=ja['mime']
38 md=st[MIME]
39 if m in md:
40 md[m]+=1
41 else:
42 md[m]=1
43 # Sniffed content-type
44 m=ja['mime-detected']
45 md=st[DETECTED]
46 if m in md:
47 md[m]+=1
48 else:
49 md[m]=1
50 # Language(s)
21 lang=ja.get('languages','NA') 51 lang=ja.get('languages','NA')
22 seg=int(ja['filename'].split('/')[3].split('.')[1]) 52 ld=st[LANGS]
23 segs[seg][lang]+=1 53 if lang in ld:
54 ld[lang]+=1
55 else:
56 ld[lang]=1
24 57
25 #for i in range(100): 58 #for i in range(100):
26 # print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1])) 59 # print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1]))
27 #print([dict(seg) for seg in segs]) 60 #print([dict(seg) for seg in segs])
28 pickle.dump(segs,f) 61 pickle.dump(segs,f)