Skip to content

Commit 7ab3fcd

Browse files
authored
Merge pull request #175 from openzim/tags_utilities
Enhance tags manipulation
2 parents 07cb331 + 4402698 commit 7ab3fcd

File tree

5 files changed

+100
-6
lines changed

5 files changed

+100
-6
lines changed

src/zimscraperlib/inputs.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pathlib
77
import shutil
88
import tempfile
9+
from collections.abc import Iterable
910

1011
from zimscraperlib import logger
1112
from zimscraperlib.constants import DEFAULT_USER_AGENT
@@ -111,3 +112,26 @@ def compute_descriptions(
111112
user_description = user_description[:-1] + "…"
112113

113114
return (user_description, user_long_description)
115+
116+
117+
def compute_tags(
    default_tags: Iterable[str],
    user_tags: str | None,
) -> set[str]:
    """Compute the set of tags to use as ZIM metadata.

    Merges the default tags (set by the scraper) with the user-provided tags
    (usually retrieved from the CLI arguments). Leading and trailing
    whitespace is stripped from every tag, and duplicates and empty values
    are removed.

    Args:
        default_tags: the default tags always set for a given scraper
        user_tags: the tags, separated by semi-colons, as given by the user
            in CLI args; None (or an empty string) means no user tags

    Returns:
        A set of cleaned-up tags, ready to be passed to the creator.
    """
    candidates = [*default_tags, *(user_tags or "").split(";")]
    # Strip *before* testing for emptiness: a whitespace-only entry such as
    # the middle segment of "a; ;b" is truthy as-is but empty once stripped,
    # and must not end up in the result as an empty tag.
    return {stripped for tag in candidates if (stripped := tag.strip())}

src/zimscraperlib/zim/metadata.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,17 @@ def validate_tags(name: str, value: Iterable[str] | str):
103103
or not all(isinstance(tag, str) for tag in value)
104104
):
105105
raise ValueError(f"Invalid type(s) for {name}")
106+
if (
107+
name == "Tags"
108+
and not isinstance(value, str)
109+
and isinstance(value, Iterable)
110+
and len(set(value)) != len(list(value))
111+
):
112+
raise ValueError(f"Duplicate tags are not valid: {value}")
113+
if name == "Tags" and isinstance(value, str):
114+
values = value.split(";")
115+
if len(set(values)) != len(list(values)):
116+
raise ValueError(f"Duplicate tags are not valid: {value}")
106117

107118

108119
def validate_illustrations(name: str, value: bytes):

tests/download/test_download.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,8 @@ def test_large_download_https(tmp_path, valid_https_url):
167167
@pytest.mark.parametrize(
168168
"url,video_id",
169169
[
170-
("https://vimeo.com/619427082", "619427082"),
171-
("https://vimeo.com/619427082", "619427082"),
170+
("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"),
171+
("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"),
172172
],
173173
)
174174
def test_youtube_download_serial(url, video_id, tmp_path):
@@ -186,7 +186,7 @@ def test_youtube_download_serial(url, video_id, tmp_path):
186186
def test_youtube_download_nowait(tmp_path):
187187
with YoutubeDownloader(threads=1) as yt_downloader:
188188
future = yt_downloader.download(
189-
"https://vimeo.com/619427082",
189+
"https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL",
190190
BestMp4.get_options(target_dir=tmp_path),
191191
wait=False,
192192
)
@@ -212,10 +212,11 @@ def test_youtube_download_error():
212212
def test_youtube_download_contextmanager(tmp_path):
213213
with YoutubeDownloader(threads=1) as yt_downloader:
214214
yt_downloader.download(
215-
"https://vimeo.com/619427082", BestWebm.get_options(target_dir=tmp_path)
215+
"https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL",
216+
BestWebm.get_options(target_dir=tmp_path),
216217
)
217218
assert yt_downloader.executor._shutdown
218-
assert tmp_path.joinpath("video.mp4").exists() # videmo doesn't offer webm
219+
assert tmp_path.joinpath("video.mp4").exists() # jeena doesn't offer webm
219220

220221

221222
@pytest.fixture

tests/inputs/test_inputs.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@
1616
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH,
1717
)
1818
from zimscraperlib.constants import NAME as PROJECT_NAME
19-
from zimscraperlib.inputs import compute_descriptions, handle_user_provided_file
19+
from zimscraperlib.inputs import (
20+
compute_descriptions,
21+
compute_tags,
22+
handle_user_provided_file,
23+
)
2024

2125

2226
def test_with_none():
@@ -296,3 +300,28 @@ def test_description(
296300

297301
assert description == expected_description
298302
assert long_description == expected_long_description
303+
304+
305+
@pytest.mark.parametrize(
    "default_tags, user_tags, expected_tags",
    [
        pytest.param(
            {"tag1", "tag2"},
            "tag3;tag4",
            {"tag1", "tag2", "tag3", "tag4"},
            id="distinct_default_and_user_tags",
        ),
        pytest.param(
            {" tag1", " tag2 "},
            " ta:g,4;tag2 ",
            {"tag1", "tag2", "ta:g,4"},
            id="whitespace_stripped_and_duplicates_merged",
        ),
        pytest.param(
            {"tag1", "tag2"},
            None,
            {"tag1", "tag2"},
            id="no_user_tags",
        ),
        pytest.param(
            {"tag1"},
            "tag2;;tag3;",
            {"tag1", "tag2", "tag3"},
            id="empty_user_segments_dropped",
        ),
    ],
)
def test_compute_tags(
    default_tags: set[str],
    user_tags: str | None,
    expected_tags: set[str],
):
    """Check that compute_tags merges default and user tags as expected.

    Only ";" separates user tags: ":" and "," are kept as part of a tag.
    """
    assert compute_tags(default_tags, user_tags) == expected_tags

tests/zim/test_metadata.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,32 @@ def test_validate_language_valid(name: str, value: Iterable[str] | str):
3232
def test_validate_language_invalid(name: str, value: Iterable[str] | str):
3333
with pytest.raises(ValueError, match=re.escape("is not ISO-639-3")):
3434
metadata.validate_language(name, value)
35+
36+
37+
@pytest.mark.parametrize(
    "tags, is_valid",
    [
        pytest.param("", True, id="empty_string"),
        # this id used to be a second "empty_string", which mislabelled the case
        pytest.param("tag1", True, id="single_tag"),
        pytest.param("taaaag1", True, id="many_letters"),
        pytest.param("tag1;tag2", True, id="semi_colon_distinct_1"),
        pytest.param("tag2;tag2", False, id="semi_colon_identical"),
        pytest.param("tag,1;tug,1", True, id="semi_colon_distinct_2"),
        pytest.param(
            "tag1,tag2", True, id="comma"
        ),  # we cannot say that this ought to be a tags separator
        pytest.param({"tag1"}, True, id="one_tag_in_set"),
        pytest.param({"tag1", "tag2"}, True, id="two_tags_in_set"),
        pytest.param(1, False, id="one_int"),
        pytest.param(None, False, id="none_value"),
        pytest.param(["tag1", "tag2"], True, id="two_distinct"),
        pytest.param(["tag1", "tag1"], False, id="two_identical"),
        pytest.param(["tag1", 1], False, id="int_in_list"),
    ],
)
def test_validate_tags(tags, is_valid):
    """Check which "Tags" values validate_tags accepts and which it rejects.

    Valid values must pass without raising; invalid ones must raise
    ValueError.
    """
    if is_valid:
        metadata.validate_tags("Tags", tags)
    else:
        with pytest.raises(ValueError):
            metadata.validate_tags("Tags", tags)

0 commit comments

Comments
 (0)