1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00
pywb/pywb/rewrite/rewrite_live.py
Ilya Kreymer 5345459298 pywb 0.2!
move to distinct packages: pywb.utils, pywb.cdx, pywb.warc, pywb.util, pywb.rewrite!
each package will have its own README and tests
shared sample_data and install
2014-02-17 10:01:09 -08:00

69 lines
1.9 KiB
Python

import urllib2
import os
import sys
import datetime
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
"""
Fetch a URL from the live web and apply rewriting rules
"""
#=================================================================
def get_status_and_stream(url):
    """Open ``url`` with urllib2 and return its headers and body stream.

    Returns a ``(status_headers, stream)`` tuple.  ``status_headers`` is a
    ``StatusAndHeaders`` built from the response's header name/value pairs;
    ``stream`` is the urllib2 response object itself, returned still open.

    Note: the status line is always ``'200 OK'``; the response's actual
    status code is not consulted here.
    """
    response = urllib2.urlopen(url)

    # Flatten the header dict into a list of (name, value) tuples.
    header_pairs = list(response.info().dict.iteritems())

    return (StatusAndHeaders('200 OK', header_pairs), response)
#=================================================================
def get_rewritten(url, urlrewriter):
    """Fetch ``url`` and return its content after rewriting.

    The response obtained from ``get_status_and_stream(url)`` is passed,
    together with ``urlrewriter``, to ``RewriteContent().rewrite_content``.

    Returns a ``(status_headers, buff)`` tuple, where ``status_headers`` is
    the value returned by ``rewrite_content`` and ``buff`` is a single string
    made by concatenating every chunk produced by the returned generator.
    """
    (status_headers, stream) = get_status_and_stream(url)

    status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)

    # Join all chunks in one pass; repeated ``+=`` on a string can copy the
    # accumulated buffer on every iteration.
    buff = ''.join(gen)

    return (status_headers, buff)
#=================================================================
def main():
    """Command-line entry point: fetch a url, rewrite it, write it to stdout.

    Arguments are read from ``sys.argv``:

    * ``sys.argv[1]`` -- the url to fetch (required).
    * ``sys.argv[2]`` -- optional rewrite target of the form
      ``/<prefix>/<wb-url>``.  A leading ``/`` is dropped, the first path
      segment becomes the rewriter prefix (wrapped in slashes) and the
      remainder is used as the wb-url string.

    When no target is given, the wb-url string is the current time converted
    by ``datetime_to_timestamp`` followed by a sample example.com url, and
    the prefix is ``/pywb_rewrite/``.

    Exits with a non-zero status when the url argument is missing or when
    the target has no ``/`` separating the prefix from the wb-url.

    NOTE(review): the usage text advertises ``[extra-prefix]`` but no third
    argument is read here.
    """
    if len(sys.argv) < 2:
        print('Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'.format(sys.argv[0]))
        # sys.exit() is always available; the bare exit() helper is only
        # installed by the site module and may be missing (e.g. python -S).
        sys.exit(1)

    url = sys.argv[1]

    if len(sys.argv) >= 3:
        wburl_str = sys.argv[2]
        if wburl_str.startswith('/'):
            wburl_str = wburl_str[1:]

        # Without a separator the split below cannot produce both parts;
        # report that clearly instead of failing on tuple unpacking.
        if '/' not in wburl_str:
            sys.exit('Invalid wb-url-target {0!r}: expected /<prefix>/<wb-url>'.format(sys.argv[2]))

        prefix, wburl_str = wburl_str.split('/', 1)
        prefix = '/' + prefix + '/'
    else:
        wburl_str = datetime_to_timestamp(datetime.datetime.now()) + '/http://example.com/path/sample.html'
        prefix = '/pywb_rewrite/'

    urlrewriter = UrlRewriter(wburl_str, prefix)

    # Only the rewritten body is emitted; the returned headers are not used.
    _, buff = get_rewritten(url, urlrewriter)

    sys.stdout.write(buff)
#=================================================================
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if '__main__' == __name__:
    main()