1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warc: simplify recordloader a bit more, only response and request records

get parsed as http (excluding dns: and whois: uris)
All others have a '-' status and no header parsing
tests: add test for zero-length revisits
This commit is contained in:
Ilya Kreymer 2014-06-25 12:11:26 -07:00
parent 6761f5697f
commit 913a1e9f31
6 changed files with 145 additions and 39 deletions

View File

@ -54,7 +54,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = ['!=urlkey:com,example)/?example=1', '!=urlkey:com,example)/?example=2'])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
@ -65,7 +65,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

View File

@ -46,6 +46,8 @@ class ArcWarcRecordLoader:
HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
'OPTIONS', 'CONNECT', 'PATCH']
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
if not loader:
loader = BlockLoader(cookie_maker)
@ -94,25 +96,22 @@ class ArcWarcRecordLoader:
known_format))
if the_format == 'arc':
rec_type = 'response'
uri = rec_headers.get_header('uri')
length = rec_headers.get_header('length')
content_type = rec_headers.get_header('content-type')
sub_len = rec_headers.total_len
if uri and uri.startswith('filedesc://'):
rec_type = 'arc_header'
else:
rec_type = 'response'
elif the_format == 'warc':
rec_type = rec_headers.get_header('WARC-Type')
uri = rec_headers.get_header('WARC-Target-URI')
length = rec_headers.get_header('Content-Length')
content_type = rec_headers.get_header('Content-Type')
sub_len = 0
if rec_type == 'response' and uri:
if uri.startswith('filedesc://'):
rec_type = 'arc_header'
elif uri.startswith('dns:'):
rec_type = 'dns_response'
elif uri.startswith('whois:'):
rec_type = 'whois_response'
is_err = False
try:
@ -124,39 +123,28 @@ class ArcWarcRecordLoader:
# err condition
if is_err:
status_headers = StatusAndHeaders('-', [])
length = 0
# special case: empty w/arc record (hopefully a revisit)
elif length == 0:
status_headers = StatusAndHeaders('204 No Content', [])
# limit stream to the length for all valid records
stream = LimitReader.wrap_stream(stream, length)
# if empty record (error or otherwise) set status to -
if length == 0:
# already handled error case above
pass
status_headers = StatusAndHeaders('- None', [])
# ================================================================
# handle different types of records
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif rec_type == 'metadata' or rec_type == 'resource':
content_type = [('Content-Type',
rec_headers.get_header('Content-Type'))]
# response record or non-empty revisit: parse HTTP status and headers!
elif (rec_type in ('response', 'revisit') and
not uri.startswith(('dns:', 'whois:'))):
status_headers = self.http_parser.parse(stream)
status_headers = StatusAndHeaders('200 OK', content_type)
elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')):
# no extra parsing of body for these
status_headers = StatusAndHeaders('204 No Content', [])
elif (rec_type == 'request'):
# request record: parse request
elif ((rec_type == 'request') and
not uri.startswith(('dns:', 'whois:'))):
status_headers = self.http_req_parser.parse(stream)
# response record: parse HTTP status and headers!
# everything else: create a no-status entry, set content-type
else:
status_headers = self.http_parser.parse(stream)
content_type_header = [('Content-Type', content_type)]
status_headers = StatusAndHeaders('- OK', content_type_header)
return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, status_headers)

View File

@ -47,9 +47,9 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain 200 SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain 200 UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain 200 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# bad arcs -- test error edge cases
>>> print_cdx_index('bad.arc')
@ -104,13 +104,13 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
204
206
# test sort, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
395
398
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])

View File

@ -0,0 +1,3 @@
CDX N b a m s k r M S V g
com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 example-extra.warc
com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 2701 example-extra.warc

View File

@ -0,0 +1,107 @@
WARC/1.0
WARC-Type: response
WARC-Record-ID: <urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>
WARC-Date: 2014-01-03T03:03:21Z
Content-Length: 1610
Content-Type: application/http; msgtype=response
WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
WARC-Target-URI: http://example.com?example=2
WARC-Warcinfo-ID: <urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>
HTTP/1.1 200 OK
Accept-Ranges: bytes
Cache-Control: max-age=604800
Content-Type: text/html
Date: Fri, 03 Jan 2014 03:03:21 GMT
Etag: "359670651"
Expires: Fri, 10 Jan 2014 03:03:21 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (sjc/4FCE)
X-Cache: HIT
x-ec-custom-error: 1
Content-Length: 1270
Connection: close
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 50px;
background-color: #fff;
border-radius: 1em;
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
body {
background-color: #fff;
}
div {
width: auto;
margin: 0 auto;
border-radius: 0;
padding: 1em;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
WARC/1.0
WARC-Type: request
WARC-Record-ID: <urn:uuid:9a3ffea5-9556-4790-a6bf-c15231fd6b97>
WARC-Date: 2014-01-03T03:03:21Z
Content-Length: 323
Content-Type: application/http; msgtype=request
WARC-Concurrent-To: <urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>
WARC-Target-URI: http://example.com?example=2
WARC-Warcinfo-ID: <urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>
GET /?example=2 HTTP/1.1
Connection: close
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.8
User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)
Host: example.com
WARC/1.0
WARC-Type: revisit
WARC-Record-ID: <urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>
WARC-Date: 2014-06-03T03:03:41Z
WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
WARC-Target-URI: http://example.com?example=2
WARC-Warcinfo-ID: <urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>
WARC-Profile: http://netpreserve.org/warc/0.18/revisit/identical-payload-digest
WARC-Refers-To-Target-URI: http://example.com?example=2
WARC-Refers-To-Date: 2014-01-03T03:03:21Z
Content-Length: 0

View File

@ -115,6 +115,14 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb-nosurt/20140103030321mp_/http://www.iana.org/domains/example' in resp.body
def test_zero_len_revisit(self):
    """Replay the 20140603030341 capture of http://example.com?example=2.

    The test name marks this capture as a zero-length revisit. The replayed
    page must pass the basic HTML checks and contain the formatted capture
    time, a reference to wb.js, and the iana.org link rewritten under the
    same /pywb/ timestamp prefix.
    """
    replay_path = '/pywb/20140603030341mp_/http://example.com?example=2'
    response = self.testapp.get(replay_path)
    self._assert_basic_html(response)

    # Each expected fragment is checked in the same order as before.
    assert 'Tue, Jun 03 2014 03:03:41' in response.body
    assert 'wb.js' in response.body
    rewritten_link = (
        '/pywb/20140603030341mp_/http://www.iana.org/domains/example'
    )
    assert rewritten_link in response.body
def test_replay_url_agnostic_revisit(self):
resp = self.testapp.get('/pywb/20130729195151mp_/http://www.example.com/')
self._assert_basic_html(resp)