From 49e98e0cdc3bab540d912fbf1e067bc73d335ed0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Nov 2014 01:29:14 -0800 Subject: [PATCH] archiveiterator/cdxindexer: cleaner load path for compressed and uncompressed, ability to distinguish between chunked and non-chunked warcs/arcs Raise error for non-chunked gzip warcs as they can not be indexed for replay, addressing #48 add 'bad' non-chunked gzip file for testing, using custom ext --- pywb/warc/archiveiterator.py | 82 ++++++++++++++----- pywb/warc/test/test_indexing.py | 12 ++- sample_archive/warcs/example-bad.warc.gz.bad | Bin 0 -> 1950 bytes 3 files changed, 74 insertions(+), 20 deletions(-) create mode 100644 sample_archive/warcs/example-bad.warc.gz.bad diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index e1408432..e0994a7f 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -21,6 +21,25 @@ class ArchiveIterator(object): """ + GZIP_ERR_MSG = """ + ERROR: Non-chunked gzip file detected, gzip block continues + beyond single record. + + This file is probably not a multi-chunk gzip but a single gzip file. + + To allow seek, a gzipped {1} must have each record compressed into + a single gzip chunk and concatenated together. + + This file is likely still valid and you can use it by decompressing it: + + gunzip myfile.{0}.gz + + You can then also use the 'warc2warc' tool from the 'warc-tools' + package which will create a properly chunked gzip file: + + warc2warc -Z myfile.{0} > myfile.{0}.gz + """ + def __init__(self, fileobj): self.fh = fileobj @@ -42,27 +61,34 @@ class ArchiveIterator(object): block_size=block_size) self.offset = self.fh.tell() - next_line = None + self.next_line = None + + is_valid = True while True: try: - record = self._next_record(next_line) + record = self._next_record(self.next_line) + if not is_valid: + self._raise_err() + yield record except EOFError: break self.read_to_end(record) - # for non-compressed, consume blank lines here - if not self.reader.decompressor: - next_line = self._consume_blanklines() - if next_line is None: - # at end of file - break + if self.reader.decompressor: + is_valid = self.reader.read_next_member() - # reset reader for next member - else: - self.reader.read_next_member() + def _raise_err(self): + frmt = 'warc/arc' + if self.known_format: + frmt = self.known_format + + frmt_up = frmt.upper() + + msg = self.GZIP_ERR_MSG.format(frmt, frmt_up) + raise Exception(msg) def _consume_blanklines(self): """ Consume blank lines that are between records @@ -72,25 +98,31 @@ class ArchiveIterator(object): and are included in record length which is the full gzip envelope - For uncompressed, they are between records and so are NOT part of the record length + + count empty_size so that it can be substracted from + the record length for uncompressed """ + empty_size = 0 while True: line = self.reader.readline() if len(line) == 0: - return None + return None, empty_size if line.rstrip() == '': - self.offset = self.fh.tell() - self.reader.rem_length() + empty_size += len(line) continue - return line + return line, empty_size def read_to_end(self, record, compute_digest=False): """ Read remainder of the stream If a digester is included, update it with the data read """ + + # already at end of this record, don't read until it is consumed if self.member_info: - return self.member_info + return None if compute_digest: digester = hashlib.sha1() @@ -114,19 +146,29 @@ class ArchiveIterator(object): - For uncompressed files, blank lines are read later, and not included in the record length """ - if self.reader.decompressor: - self._consume_blanklines() + #if self.reader.decompressor: + self.next_line, empty_size = self._consume_blanklines() self.offset = self.fh.tell() - self.reader.rem_length() + #if self.offset < 0: + # raise Exception('Not Gzipped Properly') + + if self.next_line: + self.offset -= len(self.next_line) + length = self.offset - curr_offset + if not self.reader.decompressor: + length -= empty_size + if compute_digest: digest = base64.b32encode(digester.digest()) else: digest = None self.member_info = (curr_offset, length, digest) - return self.member_info + #return self.member_info + #return next_line def _next_record(self, next_line): """ Use loader to parse the record from the reader stream @@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options): entry.post_query = post_query - entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) + #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) + arcv_iter.read_to_end(record, compute_digest) + entry.set_rec_info(*arcv_iter.member_info) entry.record = record yield entry diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index cb8dc4bb..c8584c8d 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -144,7 +144,6 @@ Total: 4 from pywb import get_test_dir -#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename from io import BytesIO @@ -154,6 +153,9 @@ import os import shutil import tempfile +from pytest import raises + + TEST_CDX_DIR = get_test_dir() + 'cdx/' TEST_WARC_DIR = get_test_dir() + 'warcs/' @@ -231,3 +233,11 @@ def cli_lines_with_dir(input_): print('Total: ' + str(len(lines))) +def test_non_chunked_gzip_err(): + with raises(Exception): + print_cdx_index('example-bad.warc.gz.bad') + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/sample_archive/warcs/example-bad.warc.gz.bad b/sample_archive/warcs/example-bad.warc.gz.bad new file mode 100644 index 0000000000000000000000000000000000000000..95d2c415b4d792cd7fab3f0141dd224c7d9b08f6 GIT binary patch literal 1950 zcmV;P2VwXhiwFo2Ia^c!17&z&ZE$R5E_Y#aV*u?~S##US5q@VC|HFiNu{RuGa33O> z&=Pg5B};3OmhBhU02q)H;W7@2+SLB{^Z?)?i;`pItR$j}9L&+v{q>ET@%uwrkw~|5 z10y`W&zOtuXzueQOxwx#%un+GpB=eqQ00kRRXlJ*FEIU(P;4qD!Mb4(>}k4&6;-kA zz$b>P>7$mw5iNnLN)#Ov1ru#bG#CC=^>ep#m?kAlN_@_edAV?r;+WgXh8>z^5%+1y z(?k}m;gGW^C|csjJYorrVPC|T42xTcLNxJF6?}Jh=V2gVNL3-h*#+#8(tIunLel)# zZl~KR(y)X_pl}YWH_Ssfu!wGQUa}n3;4IRj1YDjV*6r}(ist2I*f4R&yly8<^O(XR z!6XWSNwj2n0Wvfc{WK9fifQViMOkL9EPLEbs#x-pPzrNdoU6aSoLygjQEh{eFD(V- zO9Y4(WsuXbl(HZcQZ#Tp(q+zyEKLgboG#6P7&g%@%mSujohlmknB`!H_=@G5ni2%x z)upmem!?9VQ>Kvn0L8_;gr(s(oLB!)o13fI<)`tfH9Ncd z#rizE`bGb6@#gx&p=OTPqSJiNN<4diwgH!|Xq-h%^3(W_?RcoP+izNXf0>DQJEzm> z6#%aQECgM??RLh#&$1G~rwQ=xBJUow1wc@L!LWcMPa_w_bcN}h4Naok{Vrm?H-axJBz|RF#B5Z<*>JAPh8tW ze)BZ!ZB%ivA+?W^2tvc&lC!%k&C8whcRVN;!+xAQ!$k^uW@BoO|6NZ_L$Y_`PU2OLl(+o_BO0Z--+ z=0e~UhcxEVJ($pC#u9W*lcLv0y;Bw~SqTn{-m!|o%x0pG#yLpPM+I!f1DXl=}dvOSrK)ErzmU^;1b;rv^T9qivIZ*affPyErJ`p_A;ub`&FU?pU^FriW zX&`4Ct_Kh=auyDIZBl`PK;kr^q6m5VSZoFSexg@7O>0WmtN2@d2Mse#<$ zCO6^jnKb^j0+61|;ttiqkWm9WhGAmeatNlT7h+$5M5Tm*={eR*vbd&dKa#|E`Giw` zmFM-r^raQYzc`tnOr}WQQbp_JzBf-biGC+r_jjRLiJ3pFzMX|c@^wfgr_W=^|1wmI zS^p>br+U)f6c6W6JXEmmB9`FUb)O~c4KoO~csvJGU38KD3JyYLLn7$aM@2b+8aAm` zXl0vUO#L+)!^NVcealPP&@4$a(W|$o(~I*yRI0ZOoiP74eT@zmc^WfW1GAG9(;$*z zK?(?I&Nru)5ZrF)y@*$^1ip9$%t78itxMw_jXYb7)&ci9E7nZK2@Q|n016qb>mI{) zsOs14rD(%#o{Dy3b6vv93BN*dIXZuAsF-vJ){qrz+tJ{NEPz4f#H^7>i-Pp+T;o=oB!PN-=b*qEv-SPy-Q zZ39AdVA_@+*oNna-iv(jpO8s@BwqYKll&KDlJCFPPy*A^Je6XwXASE_1=DaW4Qq}; z0^<3gKW)DmtM-%_u41^Fv&W2eOZms9lAk}DsyZ91+KKUbW`5E>j^Cc@=V$LO-_0gx zhm(`1&G)$i^m!|0PH2Ue?qQL)UXKE&4({HG@_zo(Wq==*f+qU80JrNEx@>Du#R5q06vSLM`=5L*adde&{d6@!!t~MA?9KVvA?jgS zKATLBW%+1&)U2oyk>$y|-ulf&`{JT~X|cg_^S+|(q}F{Dv`X!|XqxJViETO_w4eow kQ$QDOI+91g_eKkrGtqEJZO2iQ-t9F03f;arO8pf80CkVP_5c6? literal 0 HcmV?d00001