Skip to content

Commit 6aa458b

Browse files
authored
Merge pull request #176 from openzim/graphemes
Metadata length validation: count graphemes
2 parents 7ab3fcd + a2c1892 commit 6aa458b

File tree

4 files changed

+23
-8
lines changed

4 files changed

+23
-8
lines changed

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ dependencies = [
1818
"beautifulsoup4>=4.9.3,<5.0",
1919
"lxml>=4.6.3,<6.0",
2020
"optimize-images>=1.3.6,<2.0",
21+
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
22+
# promise is that they will never (or always) break the API, and the API is very
23+
# limited and we use only a very small subset of it.
24+
"regex>=2020.7.14",
2125
# youtube-dl should be updated as frequently as possible
2226
"yt-dlp"
2327
]

src/zimscraperlib/image/probing.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import io
88
import pathlib
99
import re
10+
from typing import IO
1011

1112
import colorthief
1213
import PIL.Image
@@ -52,7 +53,7 @@ def is_hex_color(text: str) -> bool:
5253

5354

5455
def format_for(
55-
src: pathlib.Path | io.BytesIO,
56+
src: pathlib.Path | IO[bytes],
5657
from_suffix: bool = True, # noqa: FBT001, FBT002
5758
) -> str:
5859
"""Pillow format of a given filename, either Pillow-detected or from suffix"""
@@ -70,7 +71,7 @@ def format_for(
7071

7172

7273
def is_valid_image(
73-
image: pathlib.Path | io.IOBase | bytes,
74+
image: pathlib.Path | IO[bytes] | bytes,
7475
imformat: str,
7576
size: tuple[int, int] | None = None,
7677
) -> bool:

src/zimscraperlib/zim/metadata.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from collections.abc import Iterable
66
from typing import Any
77

8+
import regex
9+
810
from zimscraperlib.constants import (
911
ILLUSTRATIONS_METADATA_RE,
1012
MANDATORY_ZIM_METADATA_KEYS,
@@ -16,6 +18,11 @@
1618
from zimscraperlib.image.probing import is_valid_image
1719

1820

21+
def nb_grapheme_for(value: str) -> int:
22+
"""Number of graphemes (visually perceived characters) in a given string"""
23+
return len(regex.findall(r"\X", value))
24+
25+
1926
def validate_required_values(name: str, value: Any):
2027
"""ensures required ones have a value (spec doesnt requires it but makes sense)"""
2128
if name in MANDATORY_ZIM_METADATA_KEYS and not value:
@@ -43,7 +50,7 @@ def validate_standard_str_types(name: str, value: str):
4350

4451
def validate_title(name: str, value: str):
4552
"""ensures Title metadata is within recommended length"""
46-
if name == "Title" and len(value) > RECOMMENDED_MAX_TITLE_LENGTH:
53+
if name == "Title" and nb_grapheme_for(value) > RECOMMENDED_MAX_TITLE_LENGTH:
4754
raise ValueError(f"{name} is too long.")
4855

4956

@@ -83,15 +90,18 @@ def validate_counter(name: str, value: str): # noqa: ARG001
8390

8491
def validate_description(name: str, value: str):
8592
"""ensures Description metadata is with required length"""
86-
if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
93+
if (
94+
name == "Description"
95+
and nb_grapheme_for(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH
96+
):
8797
raise ValueError(f"{name} is too long.")
8898

8999

90100
def validate_longdescription(name: str, value: str):
91101
"""ensures LongDescription metadata is with required length"""
92102
if (
93103
name == "LongDescription"
94-
and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
104+
and nb_grapheme_for(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
95105
):
96106
raise ValueError(f"{name} is too long.")
97107

tests/zim/test_zim_creator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -739,7 +739,7 @@ def test_config_metadata(tmp_path, png_image, tags):
739739
("Flavour", 4, False),
740740
("Source", 4, False),
741741
("Scraper", 4, False),
742-
("Title", "X" * 30, True),
742+
("Title", "में" * 30, True),
743743
("Title", "X" * 31, False),
744744
("Date", 4, False),
745745
("Date", datetime.datetime.now(), True), # noqa: DTZ005
@@ -762,9 +762,9 @@ def test_config_metadata(tmp_path, png_image, tags):
762762
("Language", "eng,", False),
763763
("Language", "eng, fra", False),
764764
("Counter", "1", False),
765-
("Description", "X" * 80, True),
765+
("Description", "में" * 80, True),
766766
("Description", "X" * 81, False),
767-
("LongDescription", "X" * 4000, True),
767+
("LongDescription", "में" * 4000, True),
768768
("LongDescription", "X" * 4001, False),
769769
("Tags", 4, False),
770770
("Tags", ["wikipedia", 4, "football"], False),

0 commit comments

Comments
 (0)