From 30ab27bb1ceea72564a173e02bbbed2c138fd037 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 30 Mar 2015 17:21:17 -0700 Subject: [PATCH] indexing: support indexing (and even replay of) records where target-uri is a 'urn:' identifier (#91) for canonicalization, treat urns as is, already canonical; for wburl, don't add http:// prefix if urn: prefix is present; add example-wpull warc for testing --- pywb/rewrite/test/test_wburl.py | 4 ++++ pywb/rewrite/wburl.py | 3 +++ pywb/utils/canonicalize.py | 7 +++++++ pywb/warc/test/test_indexing.py | 22 +++++++++++++-------- sample_archive/warcs/example-wpull.warc.gz | Bin 0 -> 3738 bytes 5 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 sample_archive/warcs/example-wpull.warc.gz diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index feabc3f1..88b57f75 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -44,6 +44,10 @@ ur""" >>> repr(WbUrl('http://example.com?example=2')) "('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')" +# support urn: prefix +>>> repr(WbUrl('urn:X-wpull:log')) +"('latest_replay', '', '', 'urn:X-wpull:log', 'urn:X-wpull:log')" + # Test scheme partially encoded urls >>> repr(WbUrl('https%3A//example.com/')) "('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 5efe9e45..57967d37 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -178,6 +178,9 @@ class WbUrl(BaseWbUrl): self.url = new_uri + if self.url.startswith('urn:'): + return + # protocol agnostic url -> http:// # no protocol -> http:// inx = self.url.find(':/') diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index f8630284..7bbbf7ed 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -33,10 +33,17 @@ def canonicalize(url, surt_ordered=True): >>> 
canonicalize('http://example.com/path/file.html', surt_ordered=False) 'example.com/path/file.html' + + >>> canonicalize('urn:some:id') + 'urn:some:id' """ try: key = surt.surt(url) except Exception as e: + # urn is already canonical, so just use as-is + if url.startswith('urn:'): + return url + raise UrlCanonicalizeException('Invalid Url: ' + url) # if not surt, unsurt the surt to get canonicalized non-surt url diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index de64e513..864270c0 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -83,6 +83,12 @@ metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu. metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz +# wpull warc, includes metadata by default +>>> print_cdx_index('example-wpull.warc.gz') + CDX N b a m s k r M S V g +com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz + # bad arcs -- test error edge cases >>> print_cdx_index('bad.arc', include_all=True) CDX N b a m s k r M S V g @@ -135,20 +141,20 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar # test sort, multiple inputs >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 
200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -Total: 206 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz +Total: 208 # test sort, multiple inputs, recursive, from base test dir >>> cli_lines(['--sort', '-r', '-', get_test_dir()]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz -Total: 206 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz +Total: 208 # test sort, 9-field, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz -Total: 398 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz +Total: 401 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) @@ -171,8 +177,8 @@ Total: 4 # test custom root dir for cdx filenames, dir input >>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 ../warcs/example-url-agnostic-orig.warc.gz -Total: 206 +urn:X-wpull:log 20150330235046 
urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz +Total: 208 # test writing to temp dir, also use unicode filename >>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz')) diff --git a/sample_archive/warcs/example-wpull.warc.gz b/sample_archive/warcs/example-wpull.warc.gz new file mode 100644 index 0000000000000000000000000000000000000000..227eb0192a738ff6a9073addc9566e3f5c03ba9a GIT binary patch literal 3738 zcmai#Wm^-B0zhe`QyND1=x#~8fb%@(01}@)Gu@y!#d->O=jY}I46yTd5P}8^{f`*AW-A_y(&S;%>+dI5npAvK znY79YIlW*EZ(;rZX+^54hHXfJ*N}9!A5HMuQ{pcnCaYiABalAYq|ouz@`+lH-!FDy zOE*iT1fw!q1;=9S^RT2UlZdZHLOW*6r^f_U(VP2R@zb z+-{ZG*w{Qw|2ShT?>?RXw(T+%cqe;%^YlVaxoz%;7R%(}L}_|@Ak%}_g?uqoHb}`L zC%CYPhE9qbX&LkH{YF@1wZ|dBI!>REQWS)MrJ)+XP!Ssf+*+h=!vF>ms)y?Zi z*4QGpc^u|f@%T{re2cuh_@Te&NXCx2-_=@jT5RSL;ZBK_>ZcSBzrgQe%g(S3Ohm%Z zRRsd2Dsi0)Bh;{+t~1VwNe3CN8iPhkI3P-Yzd7Ir<~e=-=xUaPnIUQwWYU;m#g8N2 zG9kUvq<}JuEqPr?#nM0;KyKiUd%$+N4v97%H(nL;)>T~OG&$R!m6he1)iY5#0k-M5 zgi(?@b%PEX6~hCf=R+8_B$>MvN<*+uSy5!fEy(8^#-7c#76f#sRZCT-uPqXzJ$F=% z-(a2>;h_Ge&)107r8A9P?WELu#N zHnE9s<>9x&{0O+xk@|9~veL%si52D!*TXs1ro7YRW}RC9fFx^FUez&yD=DmoNgM^6 z+Y8{HLQ!h`2;ya5R+r6qXQy(hBKhO?qOAD9nv&U6!Ryi~CW?tfk5oN0XYlV6G@gam z35C23lAv&yjj^!>JFoC;8*tw%fbT!Ap7R=XYgEC-Oh6#O*EEEj1c^A76mIU9nEtxP zi1p%QVF<*~RESuuI#YvVWm}s)yi}b<3ziX~cEFV9(mJ`{ zo=$GC{UrpF3`C4yMpyqWS1tQftLw-8aw_6E^s?d^{GL3iBv49WRM`3QL(KBl*awVi zoGeh7{Tfq>q16wHmvXQ4E-0KLomO?e@^yH#vf?0MZ{f-r(th#-f3E{@AJ7}(CIzfbpd9>`eO4hyKm6k%P|e=WK%Okp$2EF}y@Dy6tNSm5D>*bdc$ z`%B~=LJ|`gn`a)|Vzh+nQVq0WRfL6f&F%@Bj^vh9!?ZJ4LtcQOB)B zFvAW89`tzfD}!rUI3<6)FE@{TH(i#bZ7neKY>z>6%e7=h+q`BH5iZD131UjFU;MZd zN!36&ggC3e#6Zyy+lIIXieGNoZsb)q3lp=vqB$FrGBk5Z+R>R{rKSQA?h;fXr?F%w zaxe#%!^SPs=K13oaVHw%Ds%!;&AG)IBJfNtiIEVaYt$4}h6ER3kP6$>P zMVz?o3lsaYY@8xL8n5azcZbye!@(E5S$M1DQs1LEcZ1&Y3L0gSHEN*xErTVr1`giz zDrRI{st^c=$`s%cP`S035K{MV#H(uYcII^PGx4KwSIsqC(RuU5zq5{C+8RB#+U>`- z4EAD?KL0YI$#w~yMePYV{&Hq<%E7;cb$S)DP%PQ8LO+>uAJjyijZ}$psTBT6A2@S} 
zj?h5^-i4T9!*lr)^oO#T3hz* zVik`lOmll5#>$>p-3mYG2)?{HpmXCXbvJMIKaykWF#TmjA`Fz!UEp>qEHLabe`%W6 z2Xb<6qkiDIs$Msx! z;SJ_~NocV7N5}95m1Z5?jnY+Sv^l4U{pahgRIpK5!C83QdbxDB9+h=p<8ar<5rk|V zjVqJ3h1$$J3HI1ylb-ra)xC(+bFuLL8NuT!pIJ!sSTkco=G5KH!LD6@_cojP33P*v za42w$<;p|Et)|b?n()T!i(t7pk33Cm0OPJ*fJ{)?z?`hiDK}4mxRUTwNf4Pvr4W;- z=cIgs+;~FqJ$1ac;EwWA1bsb0DJS}D5FJW9ioesE0>Ji7+gm=80&!<)O|^!URJ-2c zxiA%@fGH`a*}D&MQ(rC=5uFFi?MI4FwC2K73&MaPJ3eBpe{qv+YU#5nMI8=}eJW@b zZWE_ysu%6YEdA5Kg{osCSdzHoE_RsgZGz}<+sWOn39LYn3e?VSH;|t`bLV>-)BcwF zNVJ=NVAD7oi1MY7817hq$)JQ|uo_QL-aB0h+RTnwIc0-65aYOQt)aya_GRs6=H@77 zvXJqrV-04Wzfl0jNc;kjSg6nQ!f-ZZIj>A&=+BjX<1?=beA+wshlWMT$ zQR}IddOR|MSQ^`V48Ok!eOTzb=zEeoOp?wrY0$^91Ng2qBX@WAXHX+=R>N8|e+Rr* z&^vuNg(VK0ez6u=Uid@P5M!-5czm&%+y`ano3KZ-Avyg&ygR`nRaofhqz&+@MUjZ& zu;JwD%NJvC=QukkWb5fkx0L#mw%#qfCGo~Uw(!&GKT>4XO7ccUOme}OW{f#0bcf`* ziRf@r0CM94rBi1$ddN9Egn@d40?!2oAV6rZM0IGu5yh2eRk@iLD(2!IgEF2nuOlz> zIzgZ&k{M$)>l~y^OeXlb@mJMq)7*TDA-9-}#H|)#64gaT&W0WwB{n<}$0c90{uKXb z_ry16Fv^hydQ^VG69_QgV)6;N`CZ~hRuq19T&WRfntA81n3^0;pG9wg?L9LHzWy>L zQJnRK3<|CxN9+yejLvhPM))w{ zlD-3h6zZg;w^vpy6Jbf;qallp%8eaL)f?*CD%)J_Gs9X4r=&DHr#erw%!8He_NB{U z0qM2(Tv4IbI?w*4J|*c-E-NDB15acZ%oiAGz?(R?L{aTce@WCT@U71NBA_x>8 zq-*#HB-Y$mK$klri=Qpt+)Mq8=ecnl?Svn-ItoAC5|2$+%jkY6BOHxMtF+iNlTF06 z+h@id1u<;$LO5lJ(y6h*k$qmDbR(2@VO?NRa;H1{-kz_b(NjuZr!^*ixTQa@TN-$k zFUtt%%4l@2=F~I9SaYSW2l08rTwQIhCg!y=@Ddr0$=x#ie!qv&RK_ll^VNlV)#pi^ zSV3q$K|Nf*eeqT#lV?(cjs)985X0<6r%LpA$;ET=6Xl2odJQ+Px6eG#iQij3Q@ Y2IV&75