mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'master' into develop, merging changes from old release
This commit is contained in:
commit
056aed085c
10
CHANGES.rst
10
CHANGES.rst
@ -1,3 +1,13 @@
|
||||
pywb 0.33.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Minor fixes from pull requests:
|
||||
- Better handling of exceptions in wsgi_wrapper
|
||||
- Fix CommonCrawl tests
|
||||
- Fix broken links in README
|
||||
- Fix travis build (requires certauth<1.2)
|
||||
|
||||
|
||||
pywb 0.33.1 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
PyWb 0.33.1
|
||||
PyWb 0.33.2
|
||||
===========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
|
||||
|
@ -16,19 +16,19 @@ which will fetch a live url and apply the registered rewriting rules to that url
|
||||
|
||||
Run:
|
||||
|
||||
`python ./pywb.rewrite/rewrite_live.py http://example.com`
|
||||
`python ./pywb/rewrite/rewrite_live.py http://example.com`
|
||||
|
||||
To specify custom timestamp and prefix:
|
||||
|
||||
```
|
||||
python ./pywb.rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html
|
||||
python ./pywb/rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html
|
||||
```
|
||||
|
||||
This will print to stdout the content of `http://example.com` with all urls rewritten relative to
|
||||
`/mycoll/20141026000102/http://mysite.example.com/path.html`.
|
||||
|
||||
Headers are also rewritten; for further details, consult the `get_rewritten` function in
|
||||
[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py)
|
||||
[rewrite_live.py](rewrite_live.py)
|
||||
|
||||
|
||||
#### Tests
|
||||
|
@ -169,12 +169,13 @@ class CDXFilter(object):
|
||||
if string.startswith('='):
|
||||
string = string[1:]
|
||||
self.compare_func = self.exact
|
||||
# contains match
|
||||
# regex match
|
||||
elif string.startswith('~'):
|
||||
string = string[1:]
|
||||
self.compare_func = self.contains
|
||||
else:
|
||||
self.compare_func = self.rx_match
|
||||
# contains match
|
||||
else:
|
||||
self.compare_func = self.contains
|
||||
|
||||
parts = string.split(':', 1)
|
||||
# no field set, apply filter to entire cdx
|
||||
|
@ -40,8 +40,32 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ
|
||||
# No matching -- limit=1
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
|
||||
|
||||
# Filter cdx (default: regex)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
|
||||
# Filter cdx (default: contains)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:html'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
|
||||
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
# Filter cdx (regex)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['~mimetype:.*/html$'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
|
||||
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
# Filter cdx (exact)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['=mimetype:text/html'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
@ -59,8 +83,8 @@ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
|
||||
# Filter -- no field specified, match regex on entire line
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
|
||||
# Filter -- no field specified, check whether filter query contained in entire line
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'screen.css 20140126200625')
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
|
||||
# Filter -- no such field, no matches
|
||||
@ -78,12 +102,23 @@ com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Filter contains
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = {'dir': test_cdx_dir}, matchType = 'prefix', filter = '~urlkey:example=1')
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = {'dir': test_cdx_dir}, matchType = 'prefix', filter = 'urlkey:example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter contains invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = {'dir': test_cdx_dir}, matchType = 'prefix', filter = '!~urlkey:example=')
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = {'dir': test_cdx_dir}, matchType = 'prefix', filter = '!urlkey:example=')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Filter regex
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = {'dir': test_cdx_dir}, matchType = 'prefix', filter = '~urlkey:.*example=1$')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter regex invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = {'dir': test_cdx_dir}, matchType = 'prefix', filter = '!~urlkey:.*example=[0-9]$')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
Loading…
x
Reference in New Issue
Block a user