cc/cirrus_work: bin/ix.py comparison

comparison bin/ix.py @ 21:cbac7dfe2f24

interpolate process0, support permanent subproc

author	Henry S. Thompson <ht@inf.ed.ac.uk>
date	Thu, 29 Sep 2022 16:33:42 +0100
parents	046dbe557911
children	fa43c318749b

comparison

equal deleted inserted replaced

-:a5dafc1364ed
+:cbac7dfe2f24
 FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
 return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
 FOO)
 def process(options,buf,filename,offset,length,whole):
-global CMD_PROC, BINOUT, TMPFILENAME, TMPFILE
+try:
+process0(options,buf,filename,offset,length,whole)
+except Exception as e:
+if options.debug:
+import traceback
+traceback.print_exc(file=sys.stderr)
+else:
+print("Process fail: %s, input line:\n %s"%(e,l),
+file=sys.stderr,end='')
+exit(3)
def process0(options,buf,filename,offset,length,whole):
    """Process one record, managing the temp file and per-record subprocess.

    options  -- parsed command-line options; this function reads .save,
                .cmd and .process
    buf, filename, offset, length, whole -- passed through to process1

    Steps:
      1. With options.save, open a fresh temporary file (TMPFILE /
         TMPFILENAME) for the output.
      2. With options.cmd but not options.process, launch a subprocess
         for this record only.
      3. Call process1 to do the actual extraction.
      4. With options.save, close the temp file and emit its name plus a
         newline, to the subprocess if options.cmd, else to BINOUT.
      5. With a per-record subprocess, wind it up; with options.cmd and
         options.save, delete the temp file.  With options.save alone,
         tell the user the file is left behind.
    """
    global TMPFILENAME, TMPFILE
    if options.save:
        (tf,TMPFILENAME)=tempfile.mkstemp()
        TMPFILE=open(tf,mode='wb')
    if options.cmd and not options.process:
        # Per-record subprocess; when options.process is set the
        # subprocess is expected to be running already.
        launch(options.cmd)
    process1(options,buf,filename,offset,length,whole)
    if options.save:
        TMPFILE.close()
        if options.cmd:
            _output_subproc(bytes(TMPFILENAME,'utf-8'))
            _output_subproc(b"\n")
        else:
            BINOUT.write(bytes(TMPFILENAME,'utf-8'))
            BINOUT.write(b"\n")
    if options.cmd:
        if not options.process:
            # Argument order must match windup(length,offset,filename)
            windup(length,offset,filename)
        if options.save:
            # NOTE(review): with options.process the long-lived subprocess
            # may not have read the file yet when it is unlinked here --
            # confirm whether deletion should be left to the command.
            os.unlink(TMPFILENAME)
            TMPFILENAME=None
    elif options.save:
        print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr)
        TMPFILENAME=None
def launch(cmd):
    """Start cmd as a subprocess with a piped, unbuffered stdin.

    cmd is a single command-line string, split into arguments with
    shlex.split.  The process object is stored in the global CMD_PROC and
    its stdin pipe in the global BINOUT.
    """
    global CMD_PROC, BINOUT
    argv=shlex.split(cmd)
    CMD_PROC=Popen(argv,stdin=PIPE,bufsize=0)
    BINOUT=CMD_PROC.stdin
def windup(length,offset,filename):
    """Close the subprocess's input, wait for it, and report failure.

    length, offset and filename are used only to label the message written
    to stderr when the subprocess exits with a non-zero status.
    """
    # Wind up subproc
    BINOUT.close()
    status=CMD_PROC.wait()    # could/should be async?
    if status==0:
        return
    print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
                                                status),
          file=sys.stderr)
 def _output_tmpfile(buf):
 TMPFILE.write(buf)
 def _output_stdout(buf):
 BINOUT.write(buf)
 return
 # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
 state=0
 tr=None # Was this record truncated?
 bl=None # for HTTP Content-Length for the length of the body?
+# Could we make this faster by working purely within the cb memoryview?
+# It would be messy, but avoid copying huge amounts
+# The outer loop would just be something like
+#   clbv=memoryview(bytearray(b"Content-Length: "))
+#   i=s=0
+#   while i<ll:
+#     if cb[i]==10: # need to handle \r\n
+#       L=cb[s:i]
+#       s=i=i+1
+#       if L[:16]==clbv:
+#         wl=int(L[16:])
+#     else:
+#       i+=1
+#
 with io.BytesIO(cb) as clear_text:
 for L in clear_text:
 if state==0:
 # WARC header
 if L.startswith(b"Content-Length: "):
 if state==1:
 # HTTP header
 wl -= len(L)
 if not (L==b"" or L.startswith(b"\r")):
 # Non-blank, it's a header
-if bl is None and L.startswith(b"Content-Length: "):
+(h,_,v)=L.partition(b": ")
-bl=int(L[16:].rstrip())
+if bl is None and (h==b"Content-Length"):
+bl=int(v)
 if options.headers:
-_output(L)
+if isinstance(options.headers,dict):
+if h in options.headers:
+options.headers[h]=v
+else:
+_output(L)
 else:
 # Blank line, HTTP header is finished
+if isinstance(options.headers,dict):
+_output(bytes(str(options.headers),'utf-8'))
 if not options.body:
 return
 if options.headers:
 _output(L)
 state=2
 fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
 parser.add_argument('--help',help='Show help',action='help')
 parser.add_argument('-d','--debug',help='Debug output',action='store_true')
 parser.add_argument('-w','--warc',help='output WARC headers',
 action='store_true')
-parser.add_argument('-h','--headers',help='output HTTP headers',
+parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output',
-action='store_true')
+nargs='?',default=None,const=True)
 parser.add_argument('-b','--body',help='output HTTP body',
 action='store_true')
 parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
+parser.add_argument('-p','--process',help='with -c, launches CMD only once',
+action='store_true')
 parser.add_argument('-m','--module.function',help='module.function to call with a stream'),
 parser.add_argument('-s','--save',action='store_true',
 help="write to a temporary file and output the name")
 parser.add_argument('-f','--fpath',
 help=fphelp,
 #print(pa,file=sys.stderr)
 if pa.length is not None:
 # We have to enforce our own check..
 if pa.offset is None or pa.filename is None:
 parser.error("length, offset and filename must all be supplied together")
+if isinstance(pa.headers,str):
+pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(','))
 buf=bytearray(128*1024*1024)
 whole=not (pa.warc or pa.headers or pa.body)
 if pa.save:
 import tempfile
 elif pa.cmd:
 _output = _output_subproc
 else:
 _output = _output_stdout
+if pa.cmd and pa.process:
+launch(pa.cmd)
+# three different ways to process
 if pa.index:
 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
 for l in sys.stdin:
 m=CDX.search(l)
 if m is None:
 if l.find('/robotstxt/')>-1:
 continue
 print("index line problem: \"%s\""%l,file=sys.stderr,end='')
 exit(2)
-f=pa.fpath%(m[3:7])
+filename=pa.fpath%(m[3:7])
-try:
+process(pa,buf,filename,
-process(pa,buf,f,
+int(offset:=m[2]),int(length:=m[1]),whole)
-int(m[2]),int(m[1]),whole)
-except Exception as e:
-if pa.debug:
-import traceback
-traceback.print_exc(file=sys.stderr)
-else:
-print("Process fail: %s, input line:\n %s"%(e,l),
-file=sys.stderr,end='')
-exit(3)
 elif pa.length is not None:
 print(pa.filename,file=sys.stderr)
 process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
 pa.offset,pa.length,whole)
 else:
 offset=int(offset)
 except ValueError as e:
 parser.error('Invalid input line: %s\n "%s"'%(e,l))
 process(pa,buf,pa.fpath%tuple(filename.split('/')),
 offset,length,whole)
+# processing done one way or another
+if pa.cmd and pa.process:
+windup(length,offset,filename)
+# if pa.save and pa.process, deleting temp files is down to cmd
 if __name__ == "__main__":
 main()

Mercurial > hg > cc > cirrus_work

comparison bin/ix.py @ 21:cbac7dfe2f24