view bin/lang_by_seg.py @ 154:2643a6825f17

instead of csv
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 17 Nov 2021 18:26:33 +0000
parents 8af4e9937799
children
line wrap: on
line source

#!/usr/bin/env python3
'''Replicate part of Jingrui MSc, tabulate info of language,
http[s], mime vs segment for a single index file

Borrows from cdx2tsv

Usage: uz cdx... | lang_by_seg.py outfilename'''

import sys, json, pickle

# Positions of the five count tables inside each per-segment tuple
WR=0        # record type, keyed by first letter of the filename's 5th path component
SCHEME=1    # URI scheme of the 'url' field
MIME=2      # 'mime' field
DETECTED=3  # 'mime-detected' field
LANGS=4     # 'languages' field, 'NA' when absent

# Number of segments a table is allocated for
N_SEGS=100


def _bump(table,key):
  '''Add one to the count for key in table, starting from 0 if unseen'''
  table[key]=table.get(key,0)+1


def tabulate(lines,nsegs=N_SEGS):
  '''Count record type, URI scheme, mime, sniffed mime and language(s)
  per segment over an iterable of index lines.

  Each non-blank line is "key timestamp json", split on the first two
  spaces.  The JSON object must have 'filename', 'url', 'mime' and
  'mime-detected' entries (KeyError otherwise); 'languages' is optional
  and is counted as 'NA' when missing.

  The segment number is the integer after the '.' in the 4th
  '/'-separated component of 'filename'; it must be < nsegs
  (IndexError otherwise).

  Returns a list of nsegs tuples, each holding five plain dicts of
  counts, indexed by WR, SCHEME, MIME, DETECTED and LANGS.  The WR
  table starts with 'r', 'w' and 'c' at 0 and the SCHEME table with
  'http' and 'https' at 0; any other value encountered is added as a
  new key rather than raising KeyError.'''
  segs=[({'r':0,'w':0,'c':0},{'http':0,'https':0},{},{},{})
        for _ in range(nsegs)]

  for l in lines:
    l=l.rstrip()
    if not l:
      # Tolerate blank lines, e.g. a stray newline at end of input
      continue
    (_key,_stamp,jj)=l.split(' ',maxsplit=2)
    ja=json.loads(jj)
    fnf=ja['filename'].split('/',maxsplit=5)
    st=segs[int(fnf[3].split('.')[1])]
    # Record type (w for warc, r for robots.txt;
    #  c is presumably crawldiagnostics -- TODO confirm)
    _bump(st[WR],fnf[4][0])
    # URI scheme
    _bump(st[SCHEME],ja['url'].split(':',maxsplit=1)[0])
    # Content-type
    _bump(st[MIME],ja['mime'])
    # Sniffed content-type
    _bump(st[DETECTED],ja['mime-detected'])
    # Language(s)
    _bump(st[LANGS],ja.get('languages','NA'))

  return segs


def main():
  '''Tabulate index lines from stdin and pickle the result to the
  file named by the first command-line argument.'''
  # Look at argv first so a missing argument fails before stdin is consumed
  fn=sys.argv[1]
  segs=tabulate(sys.stdin)
  # Only open (and so truncate) the output once tabulation has succeeded,
  #  so a failure part-way through the input cannot leave an empty or
  #  clobbered pickle behind
  with open(fn,'bw') as f:
    pickle.dump(segs,f)


if __name__=='__main__':
  main()