changeset 146:c8e41c543c0b

works for 0--9
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 22 Oct 2021 12:36:15 +0000
parents a6d2b299ccdd
children 11d973ecff4e
files bin/cdx2sql.py
diffstat 1 files changed, 15 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/bin/cdx2sql.py	Thu Oct 21 19:18:47 2021 +0000
+++ b/bin/cdx2sql.py	Fri Oct 22 12:36:15 2021 +0000
@@ -14,11 +14,22 @@
 dir=sys.argv[1]
 index=int(sys.argv[2])
 
+def process_mime(m): # m: a mime string; returns a 2-tuple of strings
+  m=m.strip() # Drop leading/trailing whitespace; should be handled by CC :-(
+  if '"' in m:
+    # sqlite3 "-escaping convention: double each embedded " and wrap the whole value in "..."
+    m=m.replace('"','""')
+    return ('"%s"'%m,'') # quoted value is not split: all of it in slot 0, slot 1 empty
+  elif '\t' in m or '\n' in m: # embedded tab or newline left after strip()
+    return ('"%s"'%m,'') # wrap in "..." unsplit, slot 1 empty
+  else:
+    m=m.split('/',maxsplit=1) # split on the first '/' only
+    return (m[0],m[1] if len(m)>1 else '') # (part before '/', remainder or '' if no '/')
+
 with igzip.open("%s/cdx-%05.0f.gz"%(dir,index),'rt') as f:
   for l in f:
     (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
-    ja=json.loads(jj.replace('\\','\\\\')) # some values observed
-                                           # to include \t :-(
+    ja=json.loads(jj)
     fnf=ja['filename'].split('/',maxsplit=5)
     # Segment number
     seg=int(fnf[3].split('.')[1])
@@ -27,12 +38,9 @@
     # URI scheme
     sch=int((ja['url'].split(':',maxsplit=1)[0])=='https')
     # Content-type
-    m=ja['mime']
-    m=m.split('/',maxsplit=1)
-    m=(m[0],m[1] if len(m)>1 else '')
+    m=process_mime(ja['mime'])
     # Sniffed content-type
-    md=ja.get('mime-detected','/').split('/',maxsplit=1)
-    md=(md[0],md[1] if len(md)>1 else '')
+    md=process_mime(ja.get('mime-detected','/'))
     # Language(s)
     langs=ja.get('languages',None)
     if langs is None: