diff --git a/Dockerfile b/Dockerfile index 0de944e..8deab5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -84,11 +84,11 @@ RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y - ##### utils for python and TESSERACT RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections -RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \ +RUN apt-get update && apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \ libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \ ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \ fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \ - libpcre3 libpcre3-dev \ + libpcre3 libpcre3-dev libxml2 libxml2-dev libxslt1.1 libxslt-dev \ mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \ imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5 diff --git a/ocr_service/app/app.py b/ocr_service/app/app.py index dc032e3..f913311 100755 --- a/ocr_service/app/app.py +++ b/ocr_service/app/app.py @@ -141,7 +141,7 @@ def create_app() -> FastAPI: global _started try: - app = FastAPI(title="OCR Service", + app = FastAPI(title="OCR_Service", description="OCR Service API", version=settings.OCR_SERVICE_VERSION, default_response_class=ORJSONResponse, diff --git a/ocr_service/processor/converter.py b/ocr_service/processor/converter.py index 35b596d..3d3b2f6 100755 --- a/ocr_service/processor/converter.py +++ b/ocr_service/processor/converter.py @@ -3,6 +3,7 @@ import atexit import multiprocessing import os +import re import time import traceback import uuid @@ -25,6 +26,10 @@ class DocumentConverter: + + MULTI_WHITESPACE = re.compile(r"[ \t]+") + MULTI_NEWLINES = re.compile(r"\n{3,}") + def __init__(self, log, loffice_process_list: dict[str, Any]) -> None: self.log = log self.loffice_process_list = loffice_process_list @@ -45,8 +50,15 @@ def resolve_content_type(file_type: object | None) -> str: @staticmethod def finalize_output_text(output_text: str) -> str: - output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'}) # type: ignore - return str(output_text).encode("utf-8", errors="replace").decode("utf-8") + + # normalize line endings + output_text = output_text.replace("\r\n", "\n").replace("\r", "\n") + # remove multiple whitespaces + output_text = DocumentConverter.MULTI_WHITESPACE.sub(" ", output_text) + # remove multiple new-lines + output_text = DocumentConverter.MULTI_NEWLINES.sub("\n\n", output_text) + + return output_text.encode("utf-8", errors="replace").decode("utf-8").strip() def _extract_text_fallback(self, stream: bytes, *, @@ -57,7 +69,7 @@ def _extract_text_fallback(self, text = "" if is_html or is_xml: - parser = "html.parser" if is_html else "xml" + parser = "html.parser" if is_html else "lxml-xml" try: soup = BeautifulSoup(stream, parser) except Exception: @@ -70,6 +82,11 @@ def _extract_text_fallback(self, else: text = soup.get_text(separator="\n") + # remove XML-ish self-closing tags + text = re.sub(r"<[^>]+/>", "", text) + # remove empty XML tags + text = re.sub(r"", "", text) + if not text and is_rtf: try: text = rtf_to_text(stream.decode("utf-8", "ignore")) @@ -79,7 +96,7 @@ def _extract_text_fallback(self, if not text: text = stream.decode("utf-8", "ignore") - return text.strip() + return text @staticmethod def initialize_pdf_worker(stream: bytes) -> None: diff --git a/ocr_service/settings.py b/ocr_service/settings.py index c095c23..bab7228 100755 --- a/ocr_service/settings.py +++ b/ocr_service/settings.py @@ -89,6 +89,10 @@ def validate_lo_port_range(cls, value: str | None) -> str | None: return value def model_post_init(self, __context: Any) -> None: + """ + Performs additional actions after the model is instantiated and all field validators are applied. + """ + default_lo_python = "/Applications/LibreOffice.app/Contents/Resources/python" default_lo_exec = "/Applications/LibreOffice.app/Contents/MacOS/soffice" tessdata_prefix = self.OCR_TESSDATA_PREFIX diff --git a/ocr_service/tests/resources/docs/generic/pat_id_1_openofficexml.odt b/ocr_service/tests/resources/docs/generic/pat_id_1_openofficexml.odt new file mode 100644 index 0000000..b5be945 --- /dev/null +++ b/ocr_service/tests/resources/docs/generic/pat_id_1_openofficexml.odt @@ -0,0 +1,335 @@ + + + + + + Clinical Test Document + OpenAI Synthetic Generator + Example NHS-like clinical XML document for OCR/NLP testing + + + + + + + Example NHS Foundation Trust + + + + Community Mental Health Team + + + + Document Type: + Outpatient Review Letter + + + + Date Created: + 28 May 2026 + + + + + + Patient Demographics + + + + NHS Number: + 485 777 3456 + + + + Hospital Number: + HN0092847 + + + + The patient’s name is + + Bart Davidson + . + + + + Date of Birth: + 14 February 1978 + + + + Gender: + Male + + + + His telephone number is + + 07754828992 + + + + + Address + + + + His Address is: + + + + + 61 Basildon Way + + + East Croyhurst + + + Angelton + + + AL64 9HT + + + + + Family and Carer Information + + + + His mother’s name is + + Pauline Smith + . + + + + His carer’s Name is + + Paul Wayne + . + + + + Relationship to patient: + Primary Carer + + + + Clinical Summary + + + + This is an example of a clinical document. + + + + Bart Davidson attended clinic today for a routine follow-up review. + + + + Mood appeared stable and no acute distress was observed. + + + + The patient denied suicidal ideation or recent self-harm. + + + + Sleep pattern described as variable with intermittent insomnia. + + + + Appetite reported as normal. + + + + No visual or auditory hallucinations reported during assessment. + + + + Medication + + + + He is on + 100mg + Paracetamol, + and + 20 milligrams + clozapine. + + + + + + + Medication + + + + Dose + + + + Frequency + + + + + + Paracetamol + + + + 100mg + + + + PRN + + + + + + Clozapine + + + + 20mg + + + + OD + + + + + + + Risk Assessment + + + + + Risk to self: Low + + + + Risk to others: Low + + + + Safeguarding concerns: None identified + + + + + Plan + + + + Continue current medication regime. + + + + Arrange routine blood monitoring for clozapine. + + + + Follow-up appointment in 4 weeks. + + + + + + This is a synthetic note for XML parsing tests. + + + + + + NLP / OCR Structured Entities + + + + <patient> + <name>Bart Davidson</name> + <dob>1978-02-14</dob> + <nhs_number>4857773456</nhs_number> + </patient> + + + + <carer> + <name>Paul Wayne</name> + <relationship>Primary Carer</relationship> + </carer> + + + + <medication name="Paracetamol" dose="100mg" frequency="PRN" /> + + + + <medication name="Clozapine" dose="20mg" frequency="OD" /> + + + + <address postcode="AL64 9HT"> + 61 Basildon Way, East Croyhurst, Angelton + </address> + + + + Additional XML Features + + + + + Extra spacing test. + + + + Line break test + + second line + + third line. + + + + Hyperlink test: + + NHS Website + + + + + Timestamp: + 14:32 + + + + + + Confidentiality Notice + + + + This document contains confidential clinical information intended + for authorised healthcare professionals only. + + + + + + diff --git a/requirements.txt b/requirements.txt index cb47035..dec43db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ httpx==0.28.1 beautifulsoup4==4.12.3 striprtf==0.0.29 python-multipart==0.0.27 +lxml==6.1.1 # Pillow package dependencies defusedxml==0.7.1