Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,11 @@ RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y -
##### utils for python and TESSERACT
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections

RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
RUN apt-get update && apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \
ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \
fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \
libpcre3 libpcre3-dev \
libpcre3 libpcre3-dev libxml2 libxml2-dev libxslt1.1 libxslt-dev \
mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \
imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5

Expand Down
2 changes: 1 addition & 1 deletion ocr_service/app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def create_app() -> FastAPI:
global _started

try:
app = FastAPI(title="OCR Service",
app = FastAPI(title="OCR_Service",
description="OCR Service API",
version=settings.OCR_SERVICE_VERSION,
default_response_class=ORJSONResponse,
Expand Down
25 changes: 21 additions & 4 deletions ocr_service/processor/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import atexit
import multiprocessing
import os
import re
import time
import traceback
import uuid
Expand All @@ -25,6 +26,10 @@


class DocumentConverter:

MULTI_WHITESPACE = re.compile(r"[ \t]+")
MULTI_NEWLINES = re.compile(r"\n{3,}")

def __init__(self, log, loffice_process_list: dict[str, Any]) -> None:
self.log = log
self.loffice_process_list = loffice_process_list
Expand All @@ -45,8 +50,15 @@ def resolve_content_type(file_type: object | None) -> str:

@staticmethod
def finalize_output_text(output_text: str) -> str:
output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'}) # type: ignore
return str(output_text).encode("utf-8", errors="replace").decode("utf-8")

# normalize line endings
output_text = output_text.replace("\r\n", "\n").replace("\r", "\n")
# remove multiple whitespaces
output_text = DocumentConverter.MULTI_WHITESPACE.sub(" ", output_text)
# remove multiple new-lines
output_text = DocumentConverter.MULTI_NEWLINES.sub("\n\n", output_text)

return output_text.encode("utf-8", errors="replace").decode("utf-8").strip()

def _extract_text_fallback(self,
stream: bytes, *,
Expand All @@ -57,7 +69,7 @@ def _extract_text_fallback(self,
text = ""

if is_html or is_xml:
parser = "html.parser" if is_html else "xml"
parser = "html.parser" if is_html else "lxml-xml"
try:
soup = BeautifulSoup(stream, parser)
except Exception:
Expand All @@ -70,6 +82,11 @@ def _extract_text_fallback(self,
else:
text = soup.get_text(separator="\n")

# remove XML-ish self-closing tags
text = re.sub(r"<[^>]+/>", "", text)
# remove empty XML tags
text = re.sub(r"</?[\w:.-]+>", "", text)

if not text and is_rtf:
try:
text = rtf_to_text(stream.decode("utf-8", "ignore"))
Expand All @@ -79,7 +96,7 @@ def _extract_text_fallback(self,
if not text:
text = stream.decode("utf-8", "ignore")

return text.strip()
return text

@staticmethod
def initialize_pdf_worker(stream: bytes) -> None:
Expand Down
4 changes: 4 additions & 0 deletions ocr_service/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def validate_lo_port_range(cls, value: str | None) -> str | None:
return value

def model_post_init(self, __context: Any) -> None:
"""
Performs additional actions after the model is instantiated and all field validators are applied.
"""

default_lo_python = "/Applications/LibreOffice.app/Contents/Resources/python"
default_lo_exec = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
tessdata_prefix = self.OCR_TESSDATA_PREFIX
Expand Down
Loading
Loading