annotate bin/lang_by_seg.py @ 159:c3c3dd60b8a8

demo of slurm usage using cdx2tsv.py
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Jul 2022 18:07:34 +0100
parents 2643a6825f17
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/usr/bin/env python3
'''Replicate part of Jingrui MSc: for a single index file, tabulate
record type, http[s], mime and language info by segment.

Borrows from cdx2tsv

Usage: uz cdx... | lang_by_seg.py outfilename'''

import json
import pickle
import sys

# Positions of the five count tables inside each per-segment tuple
WR = 0
SCHEME = 1
MIME = 2
DETECTED = 3
LANGS = 4


def _tally(table, key):
    '''Add one to table[key], starting from 0 if key is not there yet.'''
    table[key] = table.get(key, 0) + 1


def tabulate(lines, n_segs=100):
    '''Count record type, URI scheme, mime, sniffed mime and language(s)
    per segment.

    lines: iterable of index lines of the form "key stamp {json}".
    n_segs: number of segments to allocate tables for (default 100).

    Returns a list of n_segs tuples, each holding five plain dicts of
    counts, indexed by WR, SCHEME, MIME, DETECTED and LANGS.

    Raises IndexError if a record's segment number is >= n_segs, and
    KeyError if a record lacks 'filename', 'url' or 'mime'.'''
    # Plain dicts (not Counter) so the pickled structure stays what
    # existing readers of the output file expect.
    segs = [({'r': 0, 'w': 0, 'c': 0}, {'http': 0, 'https': 0}, {}, {}, {})
            for _ in range(n_segs)]
    for line in lines:
        line = line.rstrip()
        if not line:
            # A blank line carries no record; skip it rather than fail
            # the three-way unpack below.
            continue
        _key, _stamp, payload = line.split(' ', maxsplit=2)
        rec = json.loads(payload)
        # NOTE(review): assumes the segment directory is the 4th path
        # component, named <digits>.<segment number>, and that the 5th
        # is the record-type subdirectory -- confirm against the index.
        parts = rec['filename'].split('/', maxsplit=5)
        st = segs[int(parts[3].split('.')[1])]
        # Record type (w for warc, r for robots.txt); 'c' is also
        # pre-seeded.  Any other initial gets its own entry instead of
        # raising KeyError.
        _tally(st[WR], parts[4][0])
        # URI scheme: http/https are pre-seeded, any other scheme is
        # counted under its own name instead of raising KeyError.
        _tally(st[SCHEME], rec['url'].split(':', maxsplit=1)[0])
        # Content-type
        _tally(st[MIME], rec['mime'])
        # Sniffed content-type; 'NA' when absent, as for languages
        _tally(st[DETECTED], rec.get('mime-detected', 'NA'))
        # Language(s)
        _tally(st[LANGS], rec.get('languages', 'NA'))
    return segs


def main(argv=None):
    '''Read index lines from stdin and pickle the per-segment tables
    to the file named by the first command-line argument.'''
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('Usage: uz cdx... | lang_by_seg.py outfilename')
    # Open the output first so an unwritable path fails before any
    # input is consumed.
    with open(argv[1], 'wb') as f:
        pickle.dump(tabulate(sys.stdin), f)


if __name__ == '__main__':
    main()