Skip to content

Commit b613fca

Browse files
committed
fixup! Move rewriting stuff from warc2zim to zimscraperlib
1 parent 39ab439 commit b613fca

File tree

5 files changed

+33
-25
lines changed

5 files changed

+33
-25
lines changed

.github/workflows/Publish.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Build and publish to PyPI / NPM
1+
name: Build and publish to PyPI
22

33
on:
44
release:

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
exclude: ^tests/files # these are raw test files, no need to mess with them
44
repos:
55
- repo: https://github.com/pre-commit/pre-commit-hooks
6-
rev: v4.5.0
6+
rev: v5.0.0
77
hooks:
88
- id: trailing-whitespace
99
- id: end-of-file-fixer
@@ -12,11 +12,11 @@ repos:
1212
hooks:
1313
- id: black
1414
- repo: https://github.com/astral-sh/ruff-pre-commit
15-
rev: v0.4.9
15+
rev: v0.7.0
1616
hooks:
1717
- id: ruff
1818
- repo: https://github.com/RobertCraigie/pyright-python
19-
rev: v1.1.368
19+
rev: v1.1.385
2020
hooks:
2121
- id: pyright
2222
name: pyright (system)

openzim.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,14 @@ action="get_file"
99
source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.8.2/dist/wombat.js"
1010
target_file="wombat.js"
1111

12-
[files.assets.actions."wombatSetup.js"] # fallback if this script has not been properly built
12+
# wombatSetup.js is supposed to be built locally from files in javascript folder.
13+
# Should someone not have proper skills / tooling / knowledge, or simply install from
14+
# sdist / Github repo directly, without any advanced knowledge of this specificity, the
15+
# configuration below ensures that wombatSetup.js is downloaded from dev.kiwix.org,
16+
# where we have the latest version from the `main` branch. The wheel contains the wombatSetup.js
17+
# which was built at the same time as the wheel. (reminder: get_file action does not
18+
# overwrite a file which already exists)
19+
[files.assets.actions."wombatSetup.js"]
1320
action="get_file"
1421
source="https://dev.kiwix.org/zimscraperlib/wombatSetup.js"
1522
target_file="wombatSetup.js"

pyproject.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,20 +52,20 @@ scripts = [
5252
]
5353
lint = [
5454
"black==24.10.0",
55-
"ruff==0.4.9",
55+
"ruff==0.7.0",
5656
]
5757
check = [
58-
"pyright==1.1.368",
59-
"pytest==8.2.2",
58+
"pyright==1.1.385",
59+
"pytest==8.3.3",
6060
]
6161
test = [
62-
"pytest==8.2.2",
62+
"pytest==8.3.3",
6363
"pytest-mock==3.14.0",
6464
"coverage==7.5.3",
6565
]
6666
dev = [
6767
"ipython==8.25.0",
68-
"pre-commit==3.7.1",
68+
"pre-commit==4.0.1",
6969
"zimscraperlib[scripts]",
7070
"zimscraperlib[lint]",
7171
"zimscraperlib[test]",

rules/rules.yaml

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
# This file comes from an adaptation of rules present in
2-
# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js
2+
# https://github.com/webrecorder/wabac.js/blob/main/src/fuzzymatcher.ts
33
#
44
# Syncing rules is done manually, based on expert knowledge, especially because in
55
# scraperlib we are not really fuzzy matching (searching the best entry among existing
66
# ones) but just rewriting to the proper path.
77
#
8-
# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815
9-
# from June 9, 2024
8+
# This file is in sync with content at commit
9+
# https://github.com/webrecorder/wabac.js/commit/1c3acfce39e0dc127acf455b04237e9a82062730
10+
# from October 17, 2024
1011
#
1112
# This file should be updated at every release of scraperlib
1213
#
@@ -27,12 +28,12 @@ fuzzyRules:
2728
- raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value
2829
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
2930
- raw_url: foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value
30-
unchanged: true # videoplayback is not followed by `?`
31+
unchanged: true # videoplayback is not followed by `?`
3132
- raw_url: foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value
32-
unchanged: true # No googlevideo.com in url
33+
unchanged: true # No googlevideo.com in url
3334
- name: youtube_video_info
3435
pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
35-
replace : youtube.fuzzy.replayweb.page/\1\2
36+
replace: youtube.fuzzy.replayweb.page/\1\2
3637
tests:
3738
- raw_url: www.youtube.com/get_video_info?video_id=123ah
3839
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
@@ -52,7 +53,7 @@ fuzzyRules:
5253
unchanged: true # improper hostname
5354
- name: youtube_thumbnails
5455
pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
55-
replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
56+
replace: i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
5657
tests:
5758
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
5859
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
@@ -64,7 +65,7 @@ fuzzyRules:
6465
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
6566
- name: trim_digits_only
6667
pattern: ([^?]+)\?[\d]+$
67-
replace : \1
68+
replace: \1
6869
tests:
6970
- raw_url: www.example.com/page?1234
7071
fuzzified_url: www.example.com/page
@@ -80,7 +81,7 @@ fuzzyRules:
8081
unchanged: true
8182
- name: youtubei
8283
pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
83-
replace : youtube.fuzzy.replayweb.page/\1?\2
84+
replace: youtube.fuzzy.replayweb.page/\1?\2
8485
tests:
8586
- raw_url: www.youtube-nocookie.com/youtubei/page/?videoId=123ah
8687
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
@@ -104,7 +105,7 @@ fuzzyRules:
104105
unchanged: true
105106
- name: youtube_embed
106107
pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
107-
replace : youtube.fuzzy.replayweb.page/embed/\1
108+
replace: youtube.fuzzy.replayweb.page/embed/\1
108109
tests:
109110
- raw_url: www.youtube-nocookie.com/embed/foo
110111
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
@@ -123,7 +124,7 @@ fuzzyRules:
123124

124125
- name: vimeo_cdn_fix # custom warc2zim rule intended to fix Vimeo support
125126
pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*\/(.+?.mp4)\?.*range=(.*?)(?:&.*|$)
126-
replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
127+
replace: vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
127128
tests:
128129
- raw_url: gcs-vimeo.akamaized.net/123.mp4?range=123-456
129130
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
@@ -151,7 +152,7 @@ fuzzyRules:
151152
unchanged: true
152153
- name: vimeo_cdn
153154
pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?\/([\d/]+.mp4)$
154-
replace : vimeo-cdn.fuzzy.replayweb.page/\1
155+
replace: vimeo-cdn.fuzzy.replayweb.page/\1
155156
tests:
156157
- raw_url: vod.akamaized.net/23.mp4
157158
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4
@@ -161,7 +162,7 @@ fuzzyRules:
161162
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
162163
- name: vimeo_player
163164
pattern: .*player.vimeo.com\/(video\/[\d]+)\?.*
164-
replace : vimeo.fuzzy.replayweb.page/\1
165+
replace: vimeo.fuzzy.replayweb.page/\1
165166
tests:
166167
- raw_url: player.vimeo.com/video/1234?foo=bar
167168
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
@@ -177,15 +178,15 @@ fuzzyRules:
177178
unchanged: true
178179
- name: i_vimeo_cdn
179180
pattern: .*i\.vimeocdn\.com\/(.*)\?.*
180-
replace : i.vimeocdn.fuzzy.replayweb.page/\1
181+
replace: i.vimeocdn.fuzzy.replayweb.page/\1
181182
tests:
182183
- raw_url: i.vimeocdn.com/image/1234?foo=bar
183184
fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/image/1234
184185
- raw_url: i.vimeocdn.com/something/a456?foo
185186
fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/something/a456
186187
- name: cheatography_com
187188
pattern: cheatography\.com\/scripts\/(.*).js.*[?&](v=[^&]+).*
188-
replace : cheatography.com.fuzzy.replayweb.page/scripts/\1.js?\2
189+
replace: cheatography.com.fuzzy.replayweb.page/scripts/\1.js?\2
189190
tests:
190191
- raw_url: cheatography.com/scripts/useful.min.js?v=2&q=1719438924
191192
fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2

0 commit comments

Comments
 (0)