Mercurial > hg > cc > cirrus_home
comparison bin/lang_by_seg.py @ 154:2643a6825f17
instead of csv
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 17 Nov 2021 18:26:33 +0000 |
parents | 8af4e9937799 |
children |
comparison
legend: equal | deleted | inserted | replaced
153:2b59f3ef2294 | 154:2643a6825f17 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 '''Replicate part of Jingrui MSc, tabulate a single index | 2 '''Replicate part of Jingrui MSc, tabulate a single index |
3 file info of language vs segment | 3 file info of language, http[s], mime vs segment for |
4 | 4 |
5 Borrows from cdx2tsv | 5 Borrows from cdx2tsv |
6 | 6 |
7 Usage: uz cdx... | lang_by_seg.py outfilename''' | 7 Usage: uz cdx... | lang_by_seg.py outfilename''' |
8 | 8 |
9 import sys, json, pickle | 9 import sys, json, pickle |
10 from collections import defaultdict | |
11 | 10 |
12 fn=sys.argv[1] | 11 fn=sys.argv[1] |
13 | 12 |
13 WR=0 | |
14 SCHEME=1 | |
15 MIME=2 | |
16 DETECTED=3 | |
17 LANGS=4 | |
18 | |
14 with open(fn,'bw') as f: | 19 with open(fn,'bw') as f: |
15 | 20 |
16 segs=[defaultdict(int) for i in range(100)] | 21 segs=[(dict([('r',0),('w',0),('c',0)]),dict([('http',0),('https',0)]), |
22 dict(),dict(),dict()) for i in range(100)] | |
17 | 23 |
18 for l in sys.stdin: | 24 for l in sys.stdin: |
19 (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) | 25 (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) |
20 ja=json.loads(jj) | 26 ja=json.loads(jj) |
27 fnf=ja['filename'].split('/',maxsplit=5) | |
28 seg=int(fnf[3].split('.')[1]) | |
29 st=segs[seg] | |
30 # Record type (w for warc, r for robots.txt) | |
31 wr=fnf[4][0] | |
32 st[WR][wr]+=1 | |
33 # URI scheme | |
34 sch=ja['url'].split(':',maxsplit=1)[0] | |
35 st[SCHEME][sch]+=1 | |
36 # Content-type | |
37 m=ja['mime'] | |
38 md=st[MIME] | |
39 if m in md: | |
40 md[m]+=1 | |
41 else: | |
42 md[m]=1 | |
43 # Sniffed content-type | |
44 m=ja['mime-detected'] | |
45 md=st[DETECTED] | |
46 if m in md: | |
47 md[m]+=1 | |
48 else: | |
49 md[m]=1 | |
50 # Language(s) | |
21 lang=ja.get('languages','NA') | 51 lang=ja.get('languages','NA') |
22 seg=int(ja['filename'].split('/')[3].split('.')[1]) | 52 ld=st[LANGS] |
23 segs[seg][lang]+=1 | 53 if lang in ld: |
54 ld[lang]+=1 | |
55 else: | |
56 ld[lang]=1 | |
24 | 57 |
25 #for i in range(100): | 58 #for i in range(100): |
26 # print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1])) | 59 # print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1])) |
27 #print([dict(seg) for seg in segs]) | 60 #print([dict(seg) for seg in segs]) |
28 pickle.dump(segs,f) | 61 pickle.dump(segs,f) |