From 29da5033216e7114114c768c0561d7ccdbdced12 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 17 Jun 2017 11:32:48 +0100 Subject: [PATCH 1/6] travis: use certauth<1.2 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 716ebdb7..e49990ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ sudo: false install: - "pip install 'argparse>=1.2.1' --allow-all-external" - - pip install boto certauth + - pip install boto 'certauth<1.2' - pip install git+https://github.com/esnme/ultrajson.git - python setup.py -q install - pip install coverage pytest-cov coveralls From 3e8e590c1b750f071f7390abfcb1ffbeac9cf318 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 17 Jun 2017 12:41:52 +0200 Subject: [PATCH 2/6] Improve handling of exceptions in wsgi_wrappers, fixes #219 (#220) * Improve handling of exceptions in wsgi_wrappers, fixes #219 * Update Common Crawl public data set location --- pywb/framework/wsgi_wrappers.py | 18 +++++++++++++++--- pywb/utils/test/test_loaders.py | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index e4bbd1b2..569cf81a 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -86,7 +86,19 @@ class WSGIApp(object): error_view = self.wb_router.error_view if hasattr(exc, 'status'): - status = exc.status() + if callable(exc.status): + status = exc.status() + else: + status = exc.status + # wsgi requires status + # - to have at least 4 characters and + # - to start with a number / integer + if type(status) == int: + status = '{} Exception {}'.format(status, type(exc).__name__) + elif type(status) == str and status[0].isdigit(): + pass + else: + status = '500 Internal Server Error' else: status = '500 Internal Server Error' @@ -96,7 +108,7 @@ class WSGIApp(object): err_url = None if len(exc.args): - err_msg = exc.args[0] + err_msg = str(exc.args[0]) if 
print_trace: import traceback @@ -125,7 +137,7 @@ class WSGIApp(object): #msg = msg.encode('utf-8', 'ignore') return WbResponse.text_response(msg, - status=status) + status=status) #================================================================= DEFAULT_CONFIG_FILE = 'config.yaml' diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 4b755726..dd5c3861 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -166,7 +166,7 @@ def seek_read_full(seekable_reader, offset): def test_s3_read_1(): pytest.importorskip('boto') - res = BlockLoader().load('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz', + res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz', offset=53235662, length=2526) From 4efb876d53221d5662e929615237ac4497af994b Mon Sep 17 00:00:00 2001 From: Anastasia Aizman Date: Sat, 17 Jun 2017 11:42:41 +0100 Subject: [PATCH 3/6] fix - some broken paths (#212) --- pywb/rewrite/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pywb/rewrite/README.md b/pywb/rewrite/README.md index 0e459ce0..1e7e7203 100644 --- a/pywb/rewrite/README.md +++ b/pywb/rewrite/README.md @@ -16,19 +16,19 @@ which will fetch a live url and apply the registered rewriting rules to that url Run: -`python ./pywb.rewrite/rewrite_live.py http://example.com` +`python ./pywb/rewrite/rewrite_live.py http://example.com` To specify custom timestamp and prefix: ``` -python ./pywb.rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html +python ./pywb/rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html ``` This will print to stdout the content of `http://example.com` with all urls rewritten relative 
to `/mycoll/20141026000102/http://mysite.example.com/path.html`. Headers are also rewritten, for further details, consult the `get_rewritten` function in -[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py) +[rewrite_live.py](rewrite_live.py) #### Tests From 897d7d2075da810feef6eec5b4c5dd04b722d7da Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 17 Jun 2017 11:43:41 +0100 Subject: [PATCH 4/6] bump version to 0.33.2 --- pywb/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/__init__.py b/pywb/__init__.py index 5ed899aa..f2b22664 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.33.1' +__version__ = '0.33.2' DEFAULT_CONFIG = 'pywb/default_config.yaml' From 24981eb04b7551224c578829575574a94eac62af Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 17 Jun 2017 13:17:23 +0100 Subject: [PATCH 5/6] Update CHANGES and README for 0.33.2 --- CHANGES.rst | 10 ++++++++++ README.rst | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index b1205143..f6cdb5d9 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,13 @@ +pywb 0.33.2 changelist +~~~~~~~~~~~~~~~~~~~~~~ + +* Minor fixes from pull requests: + - Better handling of exceptions in wsgi_wrappers + - Fix CommonCrawl tests + - Fix broken links in README + - Fix travis build (requires certauth<1.2) + + pywb 0.33.1 changelist ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index f06e8c52..c06830e0 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.33.1 +PyWb 0.33.2 =========== .. 
image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master From c5a3f06e83945591dae100ef553b3359c6e0f515 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 13 Oct 2017 20:21:28 +0200 Subject: [PATCH 6/6] CDX-API "filter" param: swap operators for regex and contains match (fixes #249) (#250) - the operator `~` now triggers regex matches - contains match is performed when no operator is specified (default) --- pywb/cdx/cdxops.py | 7 +++--- pywb/cdx/test/test_cdxops.py | 47 +++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 18c420c5..63f5546f 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -179,12 +179,13 @@ def cdx_filter(cdx_iter, filter_strings): if string.startswith('='): string = string[1:] self.compare_func = self.exact - # contains match + # regex match elif string.startswith('~'): string = string[1:] - self.compare_func = self.contains - else: self.compare_func = self.regex + # contains match + else: + self.compare_func = self.contains parts = string.split(':', 1) # no field set, apply filter to entire cdx diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py index 8c550ece..4594173e 100644 --- a/pywb/cdx/test/test_cdxops.py +++ b/pywb/cdx/test/test_cdxops.py @@ -45,8 +45,32 @@ NotFoundException: No Captures found for: http://iana.org/dont_have_this Traceback (most recent call last): NotFoundException: No Captures found for: http://iana.org/dont_have_this -# Filter cdx (default: regex) ->>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) +# Filter cdx (default: contains) +>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:html']) +org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa 
text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +# Filter cdx (regex) +>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['~mimetype:.*/html$']) +org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 
R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +# Filter cdx (exact) +>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['=mimetype:text/html']) org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz @@ -64,8 +88,8 @@ org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/ >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200') org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz -# Filter -- no field specified, match regex on entire line ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625') +# Filter -- no field specified, check whether the filter query is contained in the entire line +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'screen.css 20140126200625') org,iana)/_css/2013.1/screen.css 
20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz # Filter -- no such field, no matches @@ -85,12 +109,23 @@ com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz # Filter contains ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = 'urlkey:example=1') com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz # Filter contains invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=') +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!urlkey:example=') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + +# Filter regex +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:.*example=1$') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - 
B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter regex invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:.*example=[0-9]$') com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz