diff --git a/CHANGELOG.md b/CHANGELOG.md index 7482c1e6a..395e94c1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.0.93 +* Refactored the Dockerfile to use the chainguard/wolfi-base image instead of the unstructured/base-image. This is to align with the recent change in the unstructured repo where the same change was made. +* Upgraded dependencies to address CVEs + ## 0.0.92 * Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation diff --git a/Dockerfile b/Dockerfile index 48b468468..81091a7a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:experimental -FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base +FROM cgr.dev/chainguard/wolfi-base:latest # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html @@ -10,25 +10,65 @@ ARG PIPELINE_PACKAGE ARG PYTHON_VERSION="3.12" # Set up environment -ENV PYTHON python${PYTHON_VERSION} -ENV PIP ${PYTHON} -m pip +ENV PYTHON=python${PYTHON_VERSION} +ENV PIP="${PYTHON} -m pip" + +USER root + +COPY ./docker/packages/*.apk /tmp/packages/ + +RUN apk update && \ + apk add libxml2 python-3.12 python-3.12-base py3.12-pip glib \ + mesa-gl mesa-libgallium cmake bash libmagic wget git openjpeg \ + poppler poppler-utils poppler-glib libreoffice tesseract && \ + apk add --allow-untrusted /tmp/packages/pandoc-3.1.8-r0.apk && \ + rm -rf /tmp/packages && \ + git clone --depth 1 https://github.com/tesseract-ocr/tessdata.git /tmp/tessdata && \ + mkdir -p /usr/local/share/tessdata && \ + cp /tmp/tessdata/*.traineddata /usr/local/share/tessdata && \ + rm -rf /tmp/tessdata && \ + git clone --depth 1 https://github.com/tesseract-ocr/tessconfigs /tmp/tessconfigs && \ + cp -r /tmp/tessconfigs/configs /usr/local/share/tessdata && \ + cp -r /tmp/tessconfigs/tessconfigs /usr/local/share/tessdata && \ + rm -rf /tmp/tessconfigs && \ + apk cache clean && \ + ln -s 
/usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \ + ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \ + chmod +x /usr/lib/libreoffice/program/soffice.bin && \ + apk add --no-cache font-ubuntu fontconfig && \ + apk upgrade --no-cache py3.12-pip && \ + fc-cache -fv && \ + ln -sf /usr/bin/$PYTHON /usr/bin/python3 && \ + addgroup --gid ${NB_UID} ${NB_USER} && \ + adduser --disabled-password --gecos "" --uid ${NB_UID} -G ${NB_USER} ${NB_USER} && \ + rm -rf /usr/lib/python3.10 && \ + rm -rf /usr/lib/python3.11 && \ + rm -rf /usr/lib/python3.13 && \ + rm -f /usr/bin/python3.13 + +ENV USER=${NB_USER} +ENV HOME=/home/${NB_USER} +COPY --chown=${NB_USER} scripts/initialize-libreoffice.sh ${HOME}/initialize-libreoffice.sh -WORKDIR ${HOME} USER ${NB_USER} +WORKDIR ${HOME} + +# Initialize libreoffice config as non-root user (required for soffice to work properly) +# See: https://github.com/Unstructured-IO/unstructured/issues/3105 +RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh ENV PYTHONPATH="${PYTHONPATH}:${HOME}" ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" +ENV TESSDATA_PREFIX=/usr/local/share/tessdata -FROM base as python-deps COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt -RUN ${PIP} install pip==${PIP_VERSION} -RUN ${PIP} install --no-cache -r requirements-base.txt +RUN ${PIP} install pip==${PIP_VERSION} && \ + ${PIP} install --no-cache-dir -r requirements-base.txt -FROM python-deps as model-deps RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ - ${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" + ${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" && \ + ${PYTHON} -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); 
model.initialize('microsoft/table-transformer-structure-recognition')" -FROM model-deps as code COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ diff --git a/docker/packages/pandoc-3.1.8-r0.apk b/docker/packages/pandoc-3.1.8-r0.apk new file mode 100644 index 000000000..769644353 Binary files /dev/null and b/docker/packages/pandoc-3.1.8-r0.apk differ diff --git a/prepline_general/api/__version__.py b/prepline_general/api/__version__.py index 9666db13e..54d4b587b 100644 --- a/prepline_general/api/__version__.py +++ b/prepline_general/api/__version__.py @@ -1 +1 @@ -__version__ = "0.0.92" # pragma: no cover +__version__ = "0.0.93" # pragma: no cover diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index ad2c506a7..54ec25889 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.92 +version: 0.0.93 diff --git a/requirements/base.txt b/requirements/base.txt index ae52127d5..52079892a 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -10,7 +10,7 @@ annotated-types==0.7.0 # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf -anyio==4.12.0 +anyio==4.12.1 # via # httpx # starlette @@ -20,9 +20,7 @@ backoff==2.2.1 # unstructured beautifulsoup4==4.14.3 # via unstructured -cachetools==6.2.4 - # via google-auth -certifi==2025.11.12 +certifi==2026.1.4 # via # httpcore # httpx @@ -39,6 +37,7 @@ click==8.3.1 # -r ./requirements/base.in # nltk # python-oxmsg + # typer-slim # uvicorn coloredlogs==15.0.1 # via onnxruntime @@ -46,6 +45,7 @@ contourpy==1.3.3 # via matplotlib cryptography==46.0.3 # via + # google-auth # msoffcrypto-tool # pdfminer-six # unstructured-client @@ -63,7 +63,7 @@ et-xmlfile==2.0.0 # via openpyxl fastapi==0.128.0 # via -r ./requirements/base.in 
-filelock==3.20.1 +filelock==3.20.3 # via # huggingface-hub # torch @@ -74,17 +74,17 @@ flatbuffers==25.12.19 # via onnxruntime fonttools==4.61.1 # via matplotlib -fsspec==2025.12.0 +fsspec==2026.1.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.28.1 +google-api-core[grpc]==2.29.0 # via google-cloud-vision -google-auth==2.45.0 +google-auth==2.48.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.11.0 +google-cloud-vision==3.12.0 # via unstructured googleapis-common-protos==1.72.0 # via @@ -110,8 +110,10 @@ httpcore==1.0.9 # httpx # unstructured-client httpx==0.28.1 - # via unstructured-client -huggingface-hub==0.36.0 + # via + # huggingface-hub + # unstructured-client +huggingface-hub==1.3.4 # via # accelerate # timm @@ -133,13 +135,15 @@ kiwisolver==1.4.9 # via matplotlib langdetect==1.0.9 # via unstructured +llvmlite==0.46.0 + # via numba lxml==6.0.2 # via # pikepdf # python-docx # python-pptx # unstructured -markdown==3.10 +markdown==3.10.1 # via unstructured markupsafe==3.0.3 # via jinja2 @@ -151,7 +155,7 @@ ml-dtypes==0.5.4 # via onnx mpmath==1.3.0 # via sympy -msoffcrypto-tool==5.4.2 +msoffcrypto-tool==6.0.0 # via unstructured mypy-extensions==1.1.0 # via typing-inspect @@ -161,6 +165,8 @@ networkx==3.6.1 # unstructured nltk==3.9.2 # via unstructured +numba==0.63.1 + # via unstructured numpy==1.26.4 # via # -c ./requirements/constraints.in @@ -168,6 +174,7 @@ numpy==1.26.4 # contourpy # matplotlib # ml-dtypes + # numba # onnx # onnxruntime # opencv-python @@ -184,7 +191,7 @@ olefile==0.47 # python-oxmsg omegaconf==2.3.0 # via effdet -onnx==1.20.0 +onnx==1.20.1 # via # unstructured # unstructured-inference @@ -196,7 +203,7 @@ opencv-python==4.11.0.86 # via unstructured-inference openpyxl==3.1.5 # via unstructured -packaging==25.0 +packaging==26.0 # via # accelerate # huggingface-hub @@ -206,7 +213,7 @@ packaging==25.0 # pikepdf # transformers # unstructured-pytesseract -pandas==2.3.3 +pandas==3.0.0 # via # unstructured # 
unstructured-inference @@ -217,11 +224,11 @@ pdfminer-six==20260107 # -c ./requirements/constraints.in # unstructured # unstructured-inference -pi-heif==1.1.1 +pi-heif==1.2.0 # via unstructured -pikepdf==10.1.0 +pikepdf==10.2.0 # via unstructured -pillow==12.0.0 +pillow==12.1.0 # via # matplotlib # pdf2image @@ -234,7 +241,7 @@ proto-plus==1.27.0 # via # google-api-core # google-cloud-vision -protobuf==6.33.2 +protobuf==6.33.4 # via # google-api-core # google-cloud-vision @@ -248,7 +255,7 @@ psutil==7.2.1 # -r ./requirements/base.in # accelerate # unstructured -pyasn1==0.6.1 +pyasn1==0.6.2 # via # pyasn1-modules # rsa @@ -256,7 +263,7 @@ pyasn1-modules==0.4.2 # via google-auth pycocotools==2.0.11 # via effdet -pycparser==2.23 +pycparser==3.0 # via cffi pycryptodome==3.23.0 # via -r ./requirements/base.in @@ -268,14 +275,14 @@ pydantic-core==2.41.5 # via pydantic pypandoc==1.16.2 # via unstructured -pyparsing==3.3.1 +pyparsing==3.3.2 # via matplotlib -pypdf==6.5.0 +pypdf==6.6.2 # via # -r ./requirements/base.in # unstructured # unstructured-client -pypdfium2==5.2.0 +pypdfium2==5.3.0 # via unstructured-inference python-dateutil==2.9.0.post0 # via @@ -287,14 +294,12 @@ python-iso639==2025.11.16 # via unstructured python-magic==0.4.27 # via unstructured -python-multipart==0.0.21 +python-multipart==0.0.22 # via unstructured-inference python-oxmsg==0.0.2 # via unstructured python-pptx==1.0.2 # via unstructured -pytz==2025.2 - # via pandas pyyaml==6.0.3 # via # accelerate @@ -308,7 +313,7 @@ rapidfuzz==3.14.3 # unstructured-inference ratelimit==2.2.1 # via -r ./requirements/base.in -regex==2025.11.3 +regex==2026.1.15 # via # nltk # transformers @@ -316,9 +321,7 @@ requests==2.32.5 # via # -r ./requirements/base.in # google-api-core - # huggingface-hub # requests-toolbelt - # transformers # unstructured requests-toolbelt==1.0.0 # via unstructured-client @@ -329,14 +332,16 @@ safetensors==0.7.0 # accelerate # timm # transformers -scipy==1.16.3 +scipy==1.17.0 # via 
unstructured-inference +shellingham==1.5.4 + # via huggingface-hub six==1.17.0 # via # html5lib # langdetect # python-dateutil -soupsieve==2.8.1 +soupsieve==2.8.3 # via beautifulsoup4 starlette==0.41.2 # via @@ -346,20 +351,20 @@ sympy==1.14.0 # via # onnxruntime # torch -timm==1.0.22 +timm==1.0.24 # via # effdet # unstructured-inference -tokenizers==0.22.1 +tokenizers==0.22.2 # via transformers -torch==2.9.1 +torch==2.10.0 # via # accelerate # effdet # timm # torchvision # unstructured-inference -torchvision==0.24.1 +torchvision==0.25.0 # via # effdet # timm @@ -369,8 +374,12 @@ tqdm==4.67.1 # nltk # transformers # unstructured -transformers==4.57.3 +transformers==5.0.0 # via unstructured-inference +typer-slim==0.21.1 + # via + # huggingface-hub + # transformers typing-extensions==4.15.0 # via # anyio @@ -385,6 +394,7 @@ typing-extensions==4.15.0 # python-oxmsg # python-pptx # torch + # typer-slim # typing-inspect # typing-inspection # unstructured @@ -392,17 +402,15 @@ typing-inspect==0.9.0 # via dataclasses-json typing-inspection==0.4.2 # via pydantic -tzdata==2025.3 - # via pandas -unstructured[all-docs]==0.18.24 +unstructured[all-docs]==0.18.31 # via -r ./requirements/base.in -unstructured-client==0.42.6 +unstructured-client==0.42.8 # via unstructured unstructured-inference==1.1.1 # via unstructured unstructured-pytesseract==0.3.15 # via unstructured -urllib3==2.6.2 +urllib3==2.6.3 # via requests uvicorn==0.40.0 # via -r ./requirements/base.in diff --git a/requirements/constraints.txt b/requirements/constraints.txt index 3945ba58e..df13896cc 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -1,6 +1,6 @@ # This file was autogenerated by uv via the following command: # uv pip compile --python-version 3.12 --no-strip-extras ./requirements/constraints.in -o ./requirements/constraints.txt --no-emit-package pip --no-emit-package setuptools -anyio==4.12.0 +anyio==4.12.1 # via starlette cffi==2.0.0 # via cryptography @@ -14,7 +14,7 @@ 
numpy==1.26.4 # via -r ./requirements/constraints.in pdfminer-six==20260107 # via -r ./requirements/constraints.in -pycparser==2.23 +pycparser==3.0 # via cffi starlette==0.41.2 # via -r ./requirements/constraints.in diff --git a/requirements/test.txt b/requirements/test.txt index 87e63184c..a39f5cdbe 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,6 +1,6 @@ # This file was autogenerated by uv via the following command: # uv pip compile --python-version 3.12 --no-strip-extras ./requirements/test.in -o ./requirements/test.txt --no-emit-package pip --no-emit-package setuptools -anyio==4.12.0 +anyio==4.12.1 # via # httpx # jupyter-server @@ -18,7 +18,7 @@ asttokens==3.0.1 # stack-data astunparse==1.6.3 # via nbdev -async-lru==2.0.5 +async-lru==2.1.0 # via jupyterlab attrs==25.4.0 # via @@ -28,13 +28,13 @@ babel==2.17.0 # via jupyterlab-server beautifulsoup4==4.14.3 # via nbconvert -black==25.12.0 +black==26.1.0 # via -r ./requirements/test.in bleach[css]==6.3.0 # via nbconvert -build==1.3.0 +build==1.4.0 # via nbdev -certifi==2025.11.12 +certifi==2026.1.4 # via # httpcore # httpx @@ -51,7 +51,7 @@ comm==0.2.3 # via # ipykernel # ipywidgets -coverage[toml]==7.13.1 +coverage[toml]==7.13.2 # via pytest-cov debugpy==1.8.19 # via ipykernel @@ -61,24 +61,26 @@ deepdiff==8.6.1 # via -r ./requirements/test.in defusedxml==0.7.1 # via nbconvert -execnb==0.1.16 +execnb==0.1.18 # via nbdev execnet==2.1.2 # via pytest-xdist executing==2.2.1 # via stack-data -fastcore==1.10.0 +fastcore==1.12.6 # via # execnb # ghapi # nbdev +fastgit==0.0.2 + # via nbdev fastjsonschema==2.21.2 # via nbformat flake8==7.3.0 # via -r ./requirements/test.in fqdn==1.5.1 # via jsonschema -ghapi==1.0.8 +ghapi==1.0.10 # via nbdev h11==0.16.0 # via httpcore @@ -101,7 +103,7 @@ ipykernel==7.1.0 # jupyter # jupyter-console # jupyterlab -ipython==9.8.0 +ipython==9.9.0 # via # execnb # ipykernel @@ -121,11 +123,11 @@ jinja2==3.1.6 # jupyterlab # jupyterlab-server # nbconvert -json5==0.12.1 
+json5==0.13.0 # via jupyterlab-server jsonpointer==3.0.0 # via jsonschema -jsonschema[format-nongpl]==4.25.1 +jsonschema[format-nongpl]==4.26.0 # via # jupyter-events # jupyterlab-server @@ -134,7 +136,7 @@ jsonschema-specifications==2025.9.1 # via jsonschema jupyter==1.1.1 # via -r ./requirements/test.in -jupyter-client==8.7.0 +jupyter-client==8.8.0 # via # ipykernel # jupyter-console @@ -163,9 +165,9 @@ jupyter-server==2.17.0 # jupyterlab-server # notebook # notebook-shim -jupyter-server-terminals==0.5.3 +jupyter-server-terminals==0.5.4 # via jupyter-server -jupyterlab==4.5.1 +jupyterlab==4.5.3 # via # jupyter # notebook @@ -179,7 +181,7 @@ jupyterlab-widgets==3.0.16 # via ipywidgets lark==1.3.1 # via rfc3987-syntax -librt==0.7.5 +librt==0.7.8 # via mypy markupsafe==3.0.3 # via @@ -205,7 +207,7 @@ nbconvert==7.16.6 # via # jupyter # jupyter-server -nbdev==2.4.7 +nbdev==2.4.14 # via -r ./requirements/test.in nbformat==5.10.4 # via @@ -214,7 +216,7 @@ nbformat==5.10.4 # nbconvert nest-asyncio==1.6.0 # via ipykernel -notebook==7.5.1 +notebook==7.5.3 # via jupyter notebook-shim==0.2.4 # via @@ -222,7 +224,7 @@ notebook-shim==0.2.4 # notebook orderly-set==5.5.0 # via deepdiff -packaging==25.0 +packaging==26.0 # via # black # build @@ -236,11 +238,12 @@ packaging==25.0 # nbconvert # nbdev # pytest + # wheel pandocfilters==1.5.1 # via nbconvert parso==0.8.5 # via jedi -pathspec==0.12.1 +pathspec==1.0.4 # via # black # mypy @@ -254,7 +257,7 @@ pluggy==1.6.0 # via # pytest # pytest-cov -prometheus-client==0.23.1 +prometheus-client==0.24.1 # via jupyter-server prompt-toolkit==3.0.52 # via @@ -270,7 +273,7 @@ pure-eval==0.2.3 # via stack-data pycodestyle==2.14.0 # via flake8 -pycparser==2.23 +pycparser==3.0 # via cffi pyflakes==3.4.0 # via flake8 @@ -300,7 +303,7 @@ python-dateutil==2.9.0.post0 # jupyter-client python-json-logger==4.0.0 # via jupyter-events -pytokens==0.3.0 +pytokens==0.4.0 # via black pyyaml==6.0.3 # via @@ -333,14 +336,14 @@ rpds-py==0.30.0 # via # 
jsonschema # referencing -send2trash==2.0.0 +send2trash==2.1.0 # via jupyter-server six==1.17.0 # via # astunparse # python-dateutil # rfc3339-validator -soupsieve==2.8.1 +soupsieve==2.8.3 # via beautifulsoup4 stack-data==0.6.3 # via ipython @@ -383,11 +386,11 @@ tzdata==2025.3 # via arrow uri-template==1.3.0 # via jsonschema -urllib3==2.6.2 +urllib3==2.6.3 # via requests watchdog==6.0.0 # via nbdev -wcwidth==0.2.14 +wcwidth==0.5.0 # via prompt-toolkit webcolors==25.10.0 # via jsonschema @@ -397,7 +400,7 @@ webencodings==0.5.1 # tinycss2 websocket-client==1.9.0 # via jupyter-server -wheel==0.45.1 +wheel==0.46.3 # via astunparse widgetsnbextension==4.0.15 # via ipywidgets diff --git a/scripts/initialize-libreoffice.sh b/scripts/initialize-libreoffice.sh new file mode 100755 index 000000000..6bb0a2e83 --- /dev/null +++ b/scripts/initialize-libreoffice.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +/usr/bin/soffice --headless || [ $? -eq 81 ] || exit 1