comparison bin/ix.py @ 110:f148c2366faa

back to IGzipFile
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 22 Apr 2021 21:31:03 +0000
parents 15abf4aab307
children 3119bca71181
comparison
equal deleted inserted replaced
109:15abf4aab307 110:f148c2366faa
7 In all cases by 'filename' is meant crawlid/segmentid/filename 7 In all cases by 'filename' is meant crawlid/segmentid/filename
8 8
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' 9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
10 10
11 import sys, argparse, regex, os, shutil, io, gzip, time 11 import sys, argparse, regex, os, shutil, io, gzip, time
12 #from isal import igzip 12 from isal import igzip
13 from subprocess import Popen, PIPE 13 #from subprocess import Popen, PIPE
14 #import asyncio 14 #import asyncio
15 15
16 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') 16 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
17 BINOUT=sys.stdout.buffer 17 BINOUT=sys.stdout.buffer
18 FPAT="/%s/%s/orig/warc/%s" 18 FPAT="/%s/%s/orig/warc/%s"
48 if nb!=length: 48 if nb!=length:
49 print("losing",file.name,length,nb,file=sys.stderr) 49 print("losing",file.name,length,nb,file=sys.stderr)
50 if options.zipped: 50 if options.zipped:
51 BINOUT.write(bv) 51 BINOUT.write(bv)
52 else: 52 else:
53 #gzip_chunk = io.BytesIO(bv) 53 gzip_chunk = io.BytesIO(bv)
54 uv=memoryview(buf)[length:] 54 uv=memoryview(buf)[length:]
55 #clear_bytes=io.BytesIO(uv) 55 #clear_bytes=io.BytesIO(uv)
56 p = Popen(["/lustre/home/dc007/hst/gentoo/usr/bin/igzip",
57 "-dc"],
58 stdin=PIPE,
59 stdout=None)
60 fout, ferr = p.communicate(bv)
61 res=p.wait()
62 if res!=0:
63 print('pipe failed',res,ferr.decode())
64 exit(2)
65 return
66 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: 56 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
67 while True: 57 while True:
68 l=gzip_fin.readinto(uv) 58 l=gzip_fin.readinto(uv)
69 if not l: 59 if not l:
70 break 60 break