11# This file comes from an adaptation of rules present in
2- # https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js
2+ # https://github.com/webrecorder/wabac.js/blob/main/src/fuzzymatcher.ts
33#
44# Syncing rules is done manually, based on expert knowledge, especially because in
55# scraperlib we are not really fuzzy matching (searching for the best entry among existing
66# ones) but just rewriting to the proper path.
77#
8- # This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815
9- # from June 9, 2024
8+ # This file is in sync with content at commit
9+ # https://github.com/webrecorder/wabac.js/commit/1c3acfce39e0dc127acf455b04237e9a82062730
10+ # from October 17, 2024
1011#
1112# This file should be updated at every release of scraperlib
1213#
@@ -27,12 +28,12 @@ fuzzyRules:
2728 - raw_url : foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value
2829 fuzzified_url : youtube.fuzzy.replayweb.page/videoplayback?id=1576
2930 - raw_url : foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value
30- unchanged : true # videoplayback is not followed by `?`
31+ unchanged : true # videoplayback is not followed by `?`
3132 - raw_url : foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value
32- unchanged : true # No googlevideo.com in url
33+ unchanged : true # No googlevideo.com in url
3334 - name : youtube_video_info
3435 pattern : (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
35- replace : youtube.fuzzy.replayweb.page/\1\2
36+ replace : youtube.fuzzy.replayweb.page/\1\2
3637 tests :
3738 - raw_url : www.youtube.com/get_video_info?video_id=123ah
3839 fuzzified_url : youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
@@ -52,7 +53,7 @@ fuzzyRules:
5253 unchanged : true # improper hostname
5354 - name : youtube_thumbnails
5455 pattern : i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
55- replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
56+ replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
5657 tests :
5758 - raw_url : i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
5859 fuzzified_url : i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
@@ -64,7 +65,7 @@ fuzzyRules:
6465 fuzzified_url : i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
6566 - name : trim_digits_only
6667 pattern : ([^?]+)\?[\d]+$
67- replace : \1
68+ replace : \1
6869 tests :
6970 - raw_url : www.example.com/page?1234
7071 fuzzified_url : www.example.com/page
@@ -80,7 +81,7 @@ fuzzyRules:
8081 unchanged : true
8182 - name : youtubei
8283 pattern : (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
83- replace : youtube.fuzzy.replayweb.page/\1?\2
84+ replace : youtube.fuzzy.replayweb.page/\1?\2
8485 tests :
8586 - raw_url : www.youtube-nocookie.com/youtubei/page/?videoId=123ah
8687 fuzzified_url : youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
@@ -104,7 +105,7 @@ fuzzyRules:
104105 unchanged : true
105106 - name : youtube_embed
106107 pattern : (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
107- replace : youtube.fuzzy.replayweb.page/embed/\1
108+ replace : youtube.fuzzy.replayweb.page/embed/\1
108109 tests :
109110 - raw_url : www.youtube-nocookie.com/embed/foo
110111 fuzzified_url : youtube.fuzzy.replayweb.page/embed/foo
@@ -123,7 +124,7 @@ fuzzyRules:
123124
124125 - name : vimeo_cdn_fix # custom warc2zim rule intended to fix Vimeo support
125126 pattern : .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*\/(.+?.mp4)\?.*range=(.*?)(?:&.*|$)
126- replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
127+ replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
127128 tests :
128129 - raw_url : gcs-vimeo.akamaized.net/123.mp4?range=123-456
129130 fuzzified_url : vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
@@ -151,7 +152,7 @@ fuzzyRules:
151152 unchanged : true
152153 - name : vimeo_cdn
153154 pattern : .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?\/([\d/]+.mp4)$
154- replace : vimeo-cdn.fuzzy.replayweb.page/\1
155+ replace : vimeo-cdn.fuzzy.replayweb.page/\1
155156 tests :
156157 - raw_url : vod.akamaized.net/23.mp4
157158 fuzzified_url : vimeo-cdn.fuzzy.replayweb.page/23.mp4
@@ -161,7 +162,7 @@ fuzzyRules:
161162 fuzzified_url : vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
162163 - name : vimeo_player
163164 pattern : .*player.vimeo.com\/(video\/[\d]+)\?.*
164- replace : vimeo.fuzzy.replayweb.page/\1
165+ replace : vimeo.fuzzy.replayweb.page/\1
165166 tests :
166167 - raw_url : player.vimeo.com/video/1234?foo=bar
167168 fuzzified_url : vimeo.fuzzy.replayweb.page/video/1234
@@ -177,15 +178,15 @@ fuzzyRules:
177178 unchanged : true
178179 - name : i_vimeo_cdn
179180 pattern : .*i\.vimeocdn\.com\/(.*)\?.*
180- replace : i.vimeocdn.fuzzy.replayweb.page/\1
181+ replace : i.vimeocdn.fuzzy.replayweb.page/\1
181182 tests :
182183 - raw_url : i.vimeocdn.com/image/1234?foo=bar
183184 fuzzified_url : i.vimeocdn.fuzzy.replayweb.page/image/1234
184185 - raw_url : i.vimeocdn.com/something/a456?foo
185186 fuzzified_url : i.vimeocdn.fuzzy.replayweb.page/something/a456
186187 - name : cheatography_com
187188 pattern : cheatography\.com\/scripts\/(.*).js.*[?&](v=[^&]+).*
188- replace : cheatography.com.fuzzy.replayweb.page/scripts/\1.js?\2
189+ replace : cheatography.com.fuzzy.replayweb.page/scripts/\1.js?\2
189190 tests :
190191 - raw_url : cheatography.com/scripts/useful.min.js?v=2&q=1719438924
191192 fuzzified_url : cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2
0 commit comments