142
|
1 #!/usr/bin/env python3
|
|
2 '''Replicate part of Jingrui MSc, tabulate a single index
|
154
|
3 file info of language, http[s], mime vs segment for
|
142
|
4
|
|
5 Borrows from cdx2tsv
|
|
6
|
|
7 Usage: uz cdx... | lang_by_seg.py outfilename'''
|
|
8
|
|
9 import sys, json, pickle
|
|
10
|
|
# Positions of the count tables inside each segment's tuple
WR = 0        # record type: first letter of the 5th '/'-separated filename part
SCHEME = 1    # URI scheme of the 'url' field
MIME = 2      # 'mime' field
DETECTED = 3  # 'mime-detected' field
LANGS = 4     # 'languages' field, or 'NA' when the field is absent

N_SEGS = 100  # number of per-segment table sets allocated


def tabulate(lines, n_segs=N_SEGS):
    '''Count record type, URI scheme, mime, sniffed mime and language(s)
    per segment.

    lines  -- iterable of index lines, each "key stamp json", where the
              JSON object has 'filename', 'url', 'mime' and 'mime-detected'
              fields and optionally 'languages'
    n_segs -- number of segments to allocate tables for

    Returns a list of n_segs tuples of plain dicts, indexed by WR, SCHEME,
    MIME, DETECTED and LANGS.  The WR and SCHEME tables are pre-seeded with
    zero counts for 'r', 'w', 'c' and 'http', 'https' respectively, and
    deliberately reject any other key.

    Raises ValueError for a line without three space-separated fields or
    with invalid JSON, KeyError for a missing required field or an
    unexpected record-type letter or scheme, and IndexError for a filename
    with too few path components or a segment number >= n_segs.
    '''
    segs = [({'r': 0, 'w': 0, 'c': 0},
             {'http': 0, 'https': 0},
             {}, {}, {}) for _ in range(n_segs)]

    for line in lines:
        # Only the JSON payload is used; the key and timestamp are ignored
        _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
        entry = json.loads(payload)
        parts = entry['filename'].split('/', maxsplit=5)
        # The segment number is the part after the '.' in the 4th component
        tables = segs[int(parts[3].split('.')[1])]
        # Record type (w for warc, r for robots.txt;
        #  c presumably crawldiagnostics -- TODO confirm)
        tables[WR][parts[4][0]] += 1
        # URI scheme
        tables[SCHEME][entry['url'].split(':', maxsplit=1)[0]] += 1
        # Content-type, sniffed content-type, language(s): open-ended keys
        for which, value in ((MIME, entry['mime']),
                             (DETECTED, entry['mime-detected']),
                             (LANGS, entry.get('languages', 'NA'))):
            counts = tables[which]
            counts[value] = counts.get(value, 0) + 1
    return segs


def main():
    '''Tabulate the index lines on stdin and pickle the resulting tables
    to the file named by the first command-line argument.'''
    fn = sys.argv[1]
    # Tabulate before opening the output, so that a failure on bad input
    #  cannot leave a truncated file in place of an earlier good one
    segs = tabulate(sys.stdin)
    with open(fn, 'bw') as f:
        pickle.dump(segs, f)


if __name__ == '__main__':
    main()
|
|
62
|