mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
warc: simplify recordloader a bit more; only response and request records
get parsed as HTTP (excluding dns: and whois: URIs). All others have a '-' status and no header parsing. tests: add test for zero-length revisits
This commit is contained in:
parent
6761f5697f
commit
913a1e9f31
@ -54,7 +54,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter exact invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = ['!=urlkey:com,example)/?example=1', '!=urlkey:com,example)/?example=2'])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
@ -65,7 +65,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter contains invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
@ -46,6 +46,8 @@ class ArcWarcRecordLoader:
|
||||
HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
|
||||
'OPTIONS', 'CONNECT', 'PATCH']
|
||||
|
||||
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
|
||||
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker)
|
||||
@ -94,25 +96,22 @@ class ArcWarcRecordLoader:
|
||||
known_format))
|
||||
|
||||
if the_format == 'arc':
|
||||
rec_type = 'response'
|
||||
uri = rec_headers.get_header('uri')
|
||||
length = rec_headers.get_header('length')
|
||||
content_type = rec_headers.get_header('content-type')
|
||||
sub_len = rec_headers.total_len
|
||||
if uri and uri.startswith('filedesc://'):
|
||||
rec_type = 'arc_header'
|
||||
else:
|
||||
rec_type = 'response'
|
||||
|
||||
elif the_format == 'warc':
|
||||
rec_type = rec_headers.get_header('WARC-Type')
|
||||
uri = rec_headers.get_header('WARC-Target-URI')
|
||||
length = rec_headers.get_header('Content-Length')
|
||||
content_type = rec_headers.get_header('Content-Type')
|
||||
sub_len = 0
|
||||
|
||||
if rec_type == 'response' and uri:
|
||||
if uri.startswith('filedesc://'):
|
||||
rec_type = 'arc_header'
|
||||
elif uri.startswith('dns:'):
|
||||
rec_type = 'dns_response'
|
||||
elif uri.startswith('whois:'):
|
||||
rec_type = 'whois_response'
|
||||
|
||||
is_err = False
|
||||
|
||||
try:
|
||||
@ -124,39 +123,28 @@ class ArcWarcRecordLoader:
|
||||
|
||||
# err condition
|
||||
if is_err:
|
||||
status_headers = StatusAndHeaders('-', [])
|
||||
length = 0
|
||||
# special case: empty w/arc record (hopefully a revisit)
|
||||
elif length == 0:
|
||||
status_headers = StatusAndHeaders('204 No Content', [])
|
||||
|
||||
# limit stream to the length for all valid records
|
||||
stream = LimitReader.wrap_stream(stream, length)
|
||||
|
||||
# if empty record (error or otherwise) set status to -
|
||||
if length == 0:
|
||||
# already handled error case above
|
||||
pass
|
||||
status_headers = StatusAndHeaders('- None', [])
|
||||
|
||||
# ================================================================
|
||||
# handle different types of records
|
||||
# special case: warc records that are not expected to have http headers
|
||||
# attempt to add 200 status and content-type
|
||||
elif rec_type == 'metadata' or rec_type == 'resource':
|
||||
content_type = [('Content-Type',
|
||||
rec_headers.get_header('Content-Type'))]
|
||||
# response record or non-empty revisit: parse HTTP status and headers!
|
||||
elif (rec_type in ('response', 'revisit') and
|
||||
not uri.startswith(('dns:', 'whois:'))):
|
||||
status_headers = self.http_parser.parse(stream)
|
||||
|
||||
status_headers = StatusAndHeaders('200 OK', content_type)
|
||||
|
||||
elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')):
|
||||
# no extra parsing of body for these
|
||||
status_headers = StatusAndHeaders('204 No Content', [])
|
||||
|
||||
elif (rec_type == 'request'):
|
||||
# request record: parse request
|
||||
elif ((rec_type == 'request') and
|
||||
not uri.startswith(('dns:', 'whois:'))):
|
||||
status_headers = self.http_req_parser.parse(stream)
|
||||
|
||||
# response record: parse HTTP status and headers!
|
||||
# everything else: create a no-status entry, set content-type
|
||||
else:
|
||||
status_headers = self.http_parser.parse(stream)
|
||||
content_type_header = [('Content-Type', content_type)]
|
||||
status_headers = StatusAndHeaders('- OK', content_type_header)
|
||||
|
||||
return ArcWarcRecord(the_format, rec_type,
|
||||
rec_headers, stream, status_headers)
|
||||
|
@ -47,9 +47,9 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain 200 SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain 200 UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain 200 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||
|
||||
# bad arcs -- test error edge cases
|
||||
>>> print_cdx_index('bad.arc')
|
||||
@ -104,13 +104,13 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||
204
|
||||
206
|
||||
|
||||
# test sort, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||
395
|
||||
398
|
||||
|
||||
# test writing to stdout
|
||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
|
3
sample_archive/cdx/example-extra.cdx
Normal file
3
sample_archive/cdx/example-extra.cdx
Normal file
@ -0,0 +1,3 @@
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 example-extra.warc
|
||||
com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 2701 example-extra.warc
|
107
sample_archive/warcs/example-extra.warc
Normal file
107
sample_archive/warcs/example-extra.warc
Normal file
@ -0,0 +1,107 @@
|
||||
WARC/1.0
|
||||
WARC-Type: response
|
||||
WARC-Record-ID: <urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>
|
||||
WARC-Date: 2014-01-03T03:03:21Z
|
||||
Content-Length: 1610
|
||||
Content-Type: application/http; msgtype=response
|
||||
WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
|
||||
WARC-Target-URI: http://example.com?example=2
|
||||
WARC-Warcinfo-ID: <urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>
|
||||
|
||||
HTTP/1.1 200 OK
|
||||
Accept-Ranges: bytes
|
||||
Cache-Control: max-age=604800
|
||||
Content-Type: text/html
|
||||
Date: Fri, 03 Jan 2014 03:03:21 GMT
|
||||
Etag: "359670651"
|
||||
Expires: Fri, 10 Jan 2014 03:03:21 GMT
|
||||
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
|
||||
Server: ECS (sjc/4FCE)
|
||||
X-Cache: HIT
|
||||
x-ec-custom-error: 1
|
||||
Content-Length: 1270
|
||||
Connection: close
|
||||
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Example Domain</title>
|
||||
|
||||
<meta charset="utf-8" />
|
||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<style type="text/css">
|
||||
body {
|
||||
background-color: #f0f0f2;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
|
||||
|
||||
}
|
||||
div {
|
||||
width: 600px;
|
||||
margin: 5em auto;
|
||||
padding: 50px;
|
||||
background-color: #fff;
|
||||
border-radius: 1em;
|
||||
}
|
||||
a:link, a:visited {
|
||||
color: #38488f;
|
||||
text-decoration: none;
|
||||
}
|
||||
@media (max-width: 700px) {
|
||||
body {
|
||||
background-color: #fff;
|
||||
}
|
||||
div {
|
||||
width: auto;
|
||||
margin: 0 auto;
|
||||
border-radius: 0;
|
||||
padding: 1em;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div>
|
||||
<h1>Example Domain</h1>
|
||||
<p>This domain is established to be used for illustrative examples in documents. You may use this
|
||||
domain in examples without prior coordination or asking for permission.</p>
|
||||
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
||||
WARC/1.0
|
||||
WARC-Type: request
|
||||
WARC-Record-ID: <urn:uuid:9a3ffea5-9556-4790-a6bf-c15231fd6b97>
|
||||
WARC-Date: 2014-01-03T03:03:21Z
|
||||
Content-Length: 323
|
||||
Content-Type: application/http; msgtype=request
|
||||
WARC-Concurrent-To: <urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>
|
||||
WARC-Target-URI: http://example.com?example=2
|
||||
WARC-Warcinfo-ID: <urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>
|
||||
|
||||
GET /?example=2 HTTP/1.1
|
||||
Connection: close
|
||||
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
|
||||
Accept-Language: en-US,en;q=0.8
|
||||
User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)
|
||||
Host: example.com
|
||||
|
||||
|
||||
WARC/1.0
|
||||
WARC-Type: revisit
|
||||
WARC-Record-ID: <urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>
|
||||
WARC-Date: 2014-06-03T03:03:41Z
|
||||
WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
|
||||
WARC-Target-URI: http://example.com?example=2
|
||||
WARC-Warcinfo-ID: <urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>
|
||||
WARC-Profile: http://netpreserve.org/warc/0.18/revisit/identical-payload-digest
|
||||
WARC-Refers-To-Target-URI: http://example.com?example=2
|
||||
WARC-Refers-To-Date: 2014-01-03T03:03:21Z
|
||||
Content-Length: 0
|
||||
|
||||
|
@ -115,6 +115,14 @@ class TestWb:
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb-nosurt/20140103030321mp_/http://www.iana.org/domains/example' in resp.body
|
||||
|
||||
def test_zero_len_revisit(self):
|
||||
resp = self.testapp.get('/pywb/20140603030341mp_/http://example.com?example=2')
|
||||
self._assert_basic_html(resp)
|
||||
|
||||
assert 'Tue, Jun 03 2014 03:03:41' in resp.body
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb/20140603030341mp_/http://www.iana.org/domains/example' in resp.body
|
||||
|
||||
def test_replay_url_agnostic_revisit(self):
|
||||
resp = self.testapp.get('/pywb/20130729195151mp_/http://www.example.com/')
|
||||
self._assert_basic_html(resp)
|
||||
|
Loading…
x
Reference in New Issue
Block a user