Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.22.7

### Enhancements
- **Trim langdetect profiles**: Load only 15 common language profiles instead of all 55, reducing n-gram probability map memory by ~77% (58 MiB -> 14 MiB).

## 0.22.6

### Fixes
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.6" # pragma: no cover
__version__ = "0.22.7" # pragma: no cover
46 changes: 46 additions & 0 deletions unstructured/partition/common/lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Callable, Iterable, Iterator, Optional

import iso639 # pyright: ignore[reportMissingTypeStubs]
import langdetect.detector_factory as _ldf
from langdetect import ( # pyright: ignore[reportMissingTypeStubs]
DetectorFactory,
detect_langs, # pyright: ignore[reportUnknownVariableType]
Expand All @@ -18,6 +19,51 @@
TESSERACT_LANGUAGES_SPLITTER,
)

# Patch langdetect to load only 15 common language profiles instead of all 55.
# Cuts n-gram probability map memory by ~77% (58 MiB -> 14 MiB). Documents in
# excluded languages still get a result — the closest loaded profile matches.
LANGDETECT_LANGUAGES = frozenset(
{
"en",
"es",
"ar",
"fr",
"de",
"it",
"pt",
"ru",
"ja",
"ko",
"zh-cn",
"zh-tw",
"hi",
"bn",
"id",
}
)


def init_langdetect_with_subset():
"""Load only common language profiles into langdetect's DetectorFactory."""
if _ldf._factory is not None:
return

import json
from pathlib import Path

from langdetect.utils.lang_profile import LangProfile

factory = _ldf.DetectorFactory()
profile_dir = Path(_ldf.PROFILES_DIRECTORY)
files = sorted(f.name for f in profile_dir.iterdir() if f.name in LANGDETECT_LANGUAGES)
for index, filename in enumerate(files):
with open(profile_dir / filename, encoding="utf-8") as fh:
factory.add_profile(LangProfile(**json.load(fh)), index, len(files))
_ldf._factory = factory


_ldf.init_factory = init_langdetect_with_subset

_ASCII_RE = re.compile(r"^[\x00-\x7F]+$")

# pytesseract.get_languages(config="") only shows user installed language packs,
Expand Down