CogStack · vladd-bit · May 28, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -84,11 +84,11 @@ RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y -
 ##### utils for python and TESSERACT
 RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
 
-RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
+RUN apt-get update && apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
     libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \
     ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \
     fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \
-    libpcre3 libpcre3-dev \
+    libpcre3 libpcre3-dev libxml2 libxml2-dev libxslt1.1 libxslt-dev \
     mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \
     imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5
 

diff --git a/ocr_service/app/app.py b/ocr_service/app/app.py
@@ -141,7 +141,7 @@ def create_app() -> FastAPI:
     global _started
 
     try:
-        app = FastAPI(title="OCR Service",
+        app = FastAPI(title="OCR_Service",
                       description="OCR Service API",
                       version=settings.OCR_SERVICE_VERSION,
                       default_response_class=ORJSONResponse,

diff --git a/ocr_service/processor/converter.py b/ocr_service/processor/converter.py
@@ -3,6 +3,7 @@
 import atexit
 import multiprocessing
 import os
+import re
 import time
 import traceback
 import uuid
@@ -25,6 +26,10 @@
 
 
 class DocumentConverter:
+
+    MULTI_WHITESPACE = re.compile(r"[ \t]+")
+    MULTI_NEWLINES = re.compile(r"\n{3,}")
+
     def __init__(self, log, loffice_process_list: dict[str, Any]) -> None:
         self.log = log
         self.loffice_process_list = loffice_process_list
@@ -45,8 +50,15 @@ def resolve_content_type(file_type: object | None) -> str:
 
     @staticmethod
     def finalize_output_text(output_text: str) -> str:
-        output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'})  # type: ignore
-        return str(output_text).encode("utf-8", errors="replace").decode("utf-8")
+
+        # normalize line endings
+        output_text = output_text.replace("\r\n", "\n").replace("\r", "\n")
+        # collapse every run of spaces/tabs into a single space (a lone tab becomes a space too)
+        output_text = DocumentConverter.MULTI_WHITESPACE.sub(" ", output_text)
+        # cap runs of three or more newlines at two, i.e. at most one blank line
+        output_text = DocumentConverter.MULTI_NEWLINES.sub("\n\n", output_text)
+
+        return output_text.encode("utf-8", errors="replace").decode("utf-8").strip()
 
     def _extract_text_fallback(self, 
                                stream: bytes, *,
@@ -57,7 +69,7 @@ def _extract_text_fallback(self,
         text = ""
 
         if is_html or is_xml:
-            parser = "html.parser" if is_html else "xml"
+            parser = "html.parser" if is_html else "lxml-xml"
             try:
                 soup = BeautifulSoup(stream, parser)
             except Exception:
@@ -70,6 +82,11 @@ def _extract_text_fallback(self,
             else:
                 text = soup.get_text(separator="\n")
 
+            # remove any self-closing tag markup left in the text, e.g. <br/>
+            text = re.sub(r"<[^>]+/>", "", text)
+            # remove leftover attribute-less opening/closing tags, e.g. <w:p> or </w:p>
+            text = re.sub(r"</?[\w:.-]+>", "", text)        
+
         if not text and is_rtf:
             try:
                 text = rtf_to_text(stream.decode("utf-8", "ignore"))
@@ -79,7 +96,7 @@ def _extract_text_fallback(self,
         if not text:
             text = stream.decode("utf-8", "ignore")
 
-        return text.strip()
+        return text
 
     @staticmethod
     def initialize_pdf_worker(stream: bytes) -> None:

diff --git a/ocr_service/settings.py b/ocr_service/settings.py
@@ -89,6 +89,10 @@ def validate_lo_port_range(cls, value: str | None) -> str | None:
         return value
 
     def model_post_init(self, __context: Any) -> None:
+        """
+        Perform additional actions after the model is instantiated and all field validators are applied.
+        """
+
         default_lo_python = "/Applications/LibreOffice.app/Contents/Resources/python"
         default_lo_exec = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
         tessdata_prefix = self.OCR_TESSDATA_PREFIX