Mercurial > hg > cc > cirrus_home
changeset 146:c8e41c543c0b
works for 0--9
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 22 Oct 2021 12:36:15 +0000 |
parents | a6d2b299ccdd |
children | 11d973ecff4e |
files | bin/cdx2sql.py |
diffstat | 1 files changed, 15 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/cdx2sql.py	Thu Oct 21 19:18:47 2021 +0000
+++ b/bin/cdx2sql.py	Fri Oct 22 12:36:15 2021 +0000
@@ -14,11 +14,22 @@
 dir=sys.argv[1]
 index=int(sys.argv[2])
 
+def process_mime(m):
+  m=m.strip() # Should be handled by CC :-(
+  if '"' in m:
+    # Handle obscure "-escaping conventions of sqlite3
+    m=m.replace('"','""')
+    return ('"%s"'%m,'')
+  elif '\t' in m or '\n' in m:
+    return ('"%s"'%m,'')
+  else:
+    m=m.split('/',maxsplit=1)
+    return (m[0],m[1] if len(m)>1 else '')
+
 with igzip.open("%s/cdx-%05.0f.gz"%(dir,index),'rt') as f:
   for l in f:
     (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
-    ja=json.loads(jj.replace('\\','\\\\')) # some values observed
-                                           # to include \t :-(
+    ja=json.loads(jj)
     fnf=ja['filename'].split('/',maxsplit=5)
     # Segment number
     seg=int(fnf[3].split('.')[1])
@@ -27,12 +38,9 @@
     # URI scheme
     sch=int((ja['url'].split(':',maxsplit=1)[0])=='https')
     # Content-type
-    m=ja['mime']
-    m=m.split('/',maxsplit=1)
-    m=(m[0],m[1] if len(m)>1 else '')
+    m=process_mime(ja['mime'])
     # Sniffed content-type
-    md=ja.get('mime-detected','/').split('/',maxsplit=1)
-    md=(md[0],md[1] if len(md)>1 else '')
+    md=process_mime(ja.get('mime-detected','/'))
     # Language(s)
     langs=ja.get('languages',None)
     if langs is None:
```