1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fuzzy/rules improvements:

- rewriter: remove 'force_type'; if a mixin is present, ensure the text type is set (using the 'mixin_type' property, which defaults to 'json')
- rules: add more fuzzy match rules for Facebook photos
- tests: add tests for find_all
This commit is contained in:
Ilya Kreymer 2017-11-01 10:55:32 -07:00
parent bcbc00a89b
commit 9023fb531e
3 changed files with 28 additions and 7 deletions

View File

@ -172,9 +172,8 @@ class BaseContentRewriter(object):
rule = self.get_rule(cdx)
force_type = rule.get('force_type')
if force_type:
rwinfo.text_type = force_type
if rule.get('mixin') and not rwinfo.text_type:
rwinfo.text_type = rule.get('mixin_type', 'json')
if rwinfo.should_rw_content():
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)

View File

@ -71,14 +71,18 @@ rules:
mixin_params:
rx: '"ssid":([\d]+)'
force_type: 'json'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(?:.*?(?:[&]|(query_type|fbid|v|cursor|data)[^,]+))'
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/photoviewerpagelet'
fuzzy_lookup:
match: '("(?:cursor|cursorindex)":["\d\w]+)'
find_all: true
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
#fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(?:.*?(?:[&]|(query_type|fbid|v|cursor|data)[^,]+))'
- url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php'
@ -119,6 +123,13 @@ rules:
fuzzy_lookup:
- __user
- url_prefix: 'com,facebook)/ajax/photos/'
fuzzy_lookup:
- __spin_r
- __spin_t
- __dyn
# fallback for all /ajax/
- url_prefix: 'com,facebook)/ajax/'

View File

@ -122,6 +122,17 @@ class TestFuzzy(object):
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
# Exercise a fuzzy lookup on a photoviewerpagelet url: the requested url and the
# stored url share the "cursor" and "cursorindex" fields but differ in their other
# params, and the expected result is built from one urlkey filter per shared field.
def test_fuzzy_find_all_rule(self):
# Requested url: carries extra params ("food", "A", "B") around the cursor fields.
url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerpagelet?data={"cursor":"ABC","food":"bar","cursorindex":6,"A":12345,"B":"foo"}'
# Stored url: same cursor fields, surrounded by a different set of params.
actual_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerpagelet?data={"some":data","cursor":"ABC","foo":"bar","cursorindex":6}'
params = self.get_params(url, actual_url)
cdx_iter, errs = self.fuzzy(self.source, params)
# Two separate filters, one per cursor field; note the "ABC" value is lowercase here.
# NOTE(review): presumably lowercased because urlkeys are canonicalized -- confirm.
filters = {'urlkey:"cursor":"abc"',
'urlkey:"cursorindex":6'}
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
def test_no_fuzzy_custom_rule_video_id_diff(self):
url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'