# HG changeset patch # User Henry S. Thompson # Date 1634906175 0 # Node ID c8e41c543c0b19969268b2ade10f41be048ecdca # Parent a6d2b299ccdd4b33d22d52db1dcd34c2c0caaf59 works for 0--9 diff -r a6d2b299ccdd -r c8e41c543c0b bin/cdx2sql.py --- a/bin/cdx2sql.py Thu Oct 21 19:18:47 2021 +0000 +++ b/bin/cdx2sql.py Fri Oct 22 12:36:15 2021 +0000 @@ -14,11 +14,22 @@ dir=sys.argv[1] index=int(sys.argv[2]) +def process_mime(m): + m=m.strip() # Should be handled by CC :-( + if '"' in m: + # Handle obscure "-escaping conventions of sqlite3 + m=m.replace('"','""') + return ('"%s"'%m,'') + elif '\t' in m or '\n' in m: + return ('"%s"'%m,'') + else: + m=m.split('/',maxsplit=1) + return (m[0],m[1] if len(m)>1 else '') + with igzip.open("%s/cdx-%05.0f.gz"%(dir,index),'rt') as f: for l in f: (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) - ja=json.loads(jj.replace('\\','\\\\')) # some values observed - # to include \t :-( + ja=json.loads(jj) fnf=ja['filename'].split('/',maxsplit=5) # Segment number seg=int(fnf[3].split('.')[1]) @@ -27,12 +38,9 @@ # URI scheme sch=int((ja['url'].split(':',maxsplit=1)[0])=='https') # Content-type - m=ja['mime'] - m=m.split('/',maxsplit=1) - m=(m[0],m[1] if len(m)>1 else '') + m=process_mime(ja['mime']) # Sniffed content-type - md=ja.get('mime-detected','/').split('/',maxsplit=1) - md=(md[0],md[1] if len(md)>1 else '') + md=process_mime(ja.get('mime-detected','/')) # Language(s) langs=ja.get('languages',None) if langs is None: