From 4aa6512b05d5083d2d841da81e14adba14e8ad61 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 Mar 2015 15:38:10 -0700 Subject: [PATCH] rewrite: fix WbUrl parsing for urls that start with a digit, e.g. 1234.example.com; split latest replay url from timestamped replay regex; add additional rewrite tests --- .coveragerc | 3 ++- pywb/rewrite/test/test_wburl.py | 12 ++++++++++++ pywb/rewrite/wburl.py | 12 ++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.coveragerc b/.coveragerc index 92e7a178..6c30b88e 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,10 +1,11 @@ [run] -omit = +omit = */test/* */tests/* *.html *.js *.css + pywb/__init__.py [report] exclude_lines = diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 077bba6d..feabc3f1 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -20,6 +20,18 @@ ur""" >>> repr(WbUrl('cs_/example.com')) "('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')" +>>> repr(WbUrl('im_/20130102.org')) +"('latest_replay', '', 'im_', 'http://20130102.org', 'im_/http://20130102.org')" + +>>> repr(WbUrl('20130102.example.com')) +"('latest_replay', '', '', 'http://20130102.example.com', 'http://20130102.example.com')" + +>>> repr(WbUrl('20130102.org/1')) +"('latest_replay', '', '', 'http://20130102.org/1', 'http://20130102.org/1')" + +>>> repr(WbUrl('20130102/1.com')) +"('replay', '20130102', '', 'http://1.com', '20130102/http://1.com')" + >>> repr(WbUrl('https://example.com/xyz')) "('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 87e6c982..5efe9e45 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -86,7 +86,8 @@ class WbUrl(BaseWbUrl): # Regexs # ====================== QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.+)$') - REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$') + REPLAY_REGEX 
= re.compile('^(\d*)([a-z]+_)?/{1,3}(.+)$') + #LATEST_REPLAY_REGEX = re.compile('^\w_)') DEFAULT_SCHEME = 'http://' @@ -221,7 +222,14 @@ class WbUrl(BaseWbUrl): def _init_replay(self, url): replay = self.REPLAY_REGEX.match(url) if not replay: - return None + if not url: + return None + + self.timestamp = '' + self.mod = '' + self.url = url + self.type = self.LATEST_REPLAY + return True res = replay.groups('')