mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
fuzzy/rules improvements:
- remove 'force_type'; if a mixin is present, ensure the text type is set (use 'mixin_type' prop, defaulting to 'json')
- rules: add more fuzzy match rules for fb photos
- tests: add tests for find_all
This commit is contained in:
parent
bcbc00a89b
commit
9023fb531e
@ -172,9 +172,8 @@ class BaseContentRewriter(object):
|
||||
|
||||
rule = self.get_rule(cdx)
|
||||
|
||||
force_type = rule.get('force_type')
|
||||
if force_type:
|
||||
rwinfo.text_type = force_type
|
||||
if rule.get('mixin') and not rwinfo.text_type:
|
||||
rwinfo.text_type = rule.get('mixin_type', 'json')
|
||||
|
||||
if rwinfo.should_rw_content():
|
||||
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
||||
|
@ -71,14 +71,18 @@ rules:
|
||||
mixin_params:
|
||||
rx: '"ssid":([\d]+)'
|
||||
|
||||
force_type: 'json'
|
||||
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(?:.*?(?:[&]|(query_type|fbid|v|cursor|data)[^,]+))'
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/photoviewerpagelet'
|
||||
|
||||
fuzzy_lookup:
|
||||
match: '("(?:cursor|cursorindex)":["\d\w]+)'
|
||||
find_all: true
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
|
||||
|
||||
#fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(?:.*?(?:[&]|(query_type|fbid|v|cursor|data)[^,]+))'
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php'
|
||||
|
||||
@ -119,6 +123,13 @@ rules:
|
||||
fuzzy_lookup:
|
||||
- __user
|
||||
|
||||
- url_prefix: 'com,facebook)/ajax/photos/'
|
||||
|
||||
fuzzy_lookup:
|
||||
- __spin_r
|
||||
- __spin_t
|
||||
- __dyn
|
||||
|
||||
# fallback for all /ajax/
|
||||
- url_prefix: 'com,facebook)/ajax/'
|
||||
|
||||
|
@ -122,6 +122,17 @@ class TestFuzzy(object):
|
||||
|
||||
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||
|
||||
def test_fuzzy_find_all_rule(self):
|
||||
url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerpagelet?data={"cursor":"ABC","food":"bar","cursorindex":6,"A":12345,"B":"foo"}'
|
||||
actual_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerpagelet?data={"some":data","cursor":"ABC","foo":"bar","cursorindex":6}'
|
||||
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
filters = {'urlkey:"cursor":"abc"',
|
||||
'urlkey:"cursorindex":6'}
|
||||
|
||||
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||
|
||||
def test_no_fuzzy_custom_rule_video_id_diff(self):
|
||||
url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
|
||||
actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'
|
||||
|
Loading…
x
Reference in New Issue
Block a user