Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.93
* Refactored the Dockerfile to use the chainguard/wolfi-base image instead of the unstructured/base-image. This is to align with the recent change in the unstructured repo where the same change was made.
* Upgraded dependencies to address CVEs

## 0.0.92
* Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation

Expand Down
60 changes: 50 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# syntax=docker/dockerfile:experimental
# NOTE(review): two consecutive FROM lines with nothing between them. This looks like the
# before/after pair of a rendered diff rather than two real stages; confirm which one is live.
FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
# NOTE(review): this FROM declares no `base` alias, yet a later instruction reads
# `FROM base as python-deps`; verify that the alias still resolves. The `:latest` tag is
# also unpinned, so a rebuild may pick up a different base image.
FROM cgr.dev/chainguard/wolfi-base:latest

# NOTE(crag): NB_USER ARG for mybinder.org compat:
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
Expand All @@ -10,25 +10,65 @@ ARG PIPELINE_PACKAGE
# Python version used to build the interpreter name below; can be overridden with --build-arg.
ARG PYTHON_VERSION="3.12"

# Set up environment
# NOTE(review): PYTHON and PIP are each assigned twice below, first in the legacy
# space-separated ENV form and then in key=value form with the same values. This looks like
# a before/after diff pair; confirm that only one pair is meant to remain.
ENV PYTHON python${PYTHON_VERSION}
ENV PIP ${PYTHON} -m pip
# PYTHON is the versioned interpreter name (python3.12 with the default ARG value);
# PIP runs pip as a module of that same interpreter.
ENV PYTHON=python${PYTHON_VERSION}
ENV PIP="${PYTHON} -m pip"

# Package installation and account creation below are performed as root.
USER root

# Local .apk files from the build context; the RUN below installs pandoc from this directory
# and then deletes it.
COPY ./docker/packages/*.apk /tmp/packages/

# Single-layer system setup. Steps, in order:
#   1. Refresh the apk index and install Python 3.12, pip and the listed native packages
#      (poppler, libreoffice, tesseract, ...).
#   2. Install the locally copied pandoc 3.1.8 package (--allow-untrusted permits a package
#      without a trusted signature), then remove /tmp/packages.
#   3. Shallow-clone tessdata and tessconfigs from GitHub, copy the *.traineddata files and
#      the configs/ and tessconfigs/ directories into /usr/local/share/tessdata, then delete
#      both clones.
#   4. Clean the apk cache, link soffice.bin to /usr/bin/libreoffice and /usr/bin/soffice,
#      and mark it executable.
#   5. Install font-ubuntu and fontconfig, upgrade the py3.12-pip package, rebuild the font cache.
#   6. Point /usr/bin/python3 at /usr/bin/$PYTHON.
#   7. Create group ${NB_USER} and user ${NB_USER}, both with id ${NB_UID}.
#   8. Remove the python3.10 / 3.11 / 3.13 library trees and the python3.13 binary.
# NOTE(review): both git clones take the default-branch head with no pinned ref, so the
# downloaded data can differ between builds.
# NOTE(review): NB_USER and NB_UID are not defined in the visible part of the file;
# presumably they are ARGs declared earlier. Confirm.
RUN apk update && \
apk add libxml2 python-3.12 python-3.12-base py3.12-pip glib \
mesa-gl mesa-libgallium cmake bash libmagic wget git openjpeg \
poppler poppler-utils poppler-glib libreoffice tesseract && \
apk add --allow-untrusted /tmp/packages/pandoc-3.1.8-r0.apk && \
rm -rf /tmp/packages && \
git clone --depth 1 https://github.com/tesseract-ocr/tessdata.git /tmp/tessdata && \
mkdir -p /usr/local/share/tessdata && \
cp /tmp/tessdata/*.traineddata /usr/local/share/tessdata && \
rm -rf /tmp/tessdata && \
git clone --depth 1 https://github.com/tesseract-ocr/tessconfigs /tmp/tessconfigs && \
cp -r /tmp/tessconfigs/configs /usr/local/share/tessdata && \
cp -r /tmp/tessconfigs/tessconfigs /usr/local/share/tessdata && \
rm -rf /tmp/tessconfigs && \
apk cache clean && \
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \
chmod +x /usr/lib/libreoffice/program/soffice.bin && \
apk add --no-cache font-ubuntu fontconfig && \
apk upgrade --no-cache py3.12-pip && \
fc-cache -fv && \
ln -sf /usr/bin/$PYTHON /usr/bin/python3 && \
addgroup --gid ${NB_UID} ${NB_USER} && \
adduser --disabled-password --gecos "" --uid ${NB_UID} -G ${NB_USER} ${NB_USER} && \
rm -rf /usr/lib/python3.10 && \
rm -rf /usr/lib/python3.11 && \
rm -rf /usr/lib/python3.13 && \
rm -f /usr/bin/python3.13

# Export the notebook user's name and home directory as environment variables.
ENV USER=${NB_USER}
ENV HOME=/home/${NB_USER}
# Place the LibreOffice init script in the user's home directory, owned by ${NB_USER}.
COPY --chown=${NB_USER} scripts/initialize-libreoffice.sh ${HOME}/initialize-libreoffice.sh

# NOTE(review): WORKDIR ${HOME} appears both before and after the USER switch. This is
# likely a before/after diff pair; a single WORKDIR should be enough. Confirm.
WORKDIR ${HOME}
USER ${NB_USER}
WORKDIR ${HOME}

# Initialize libreoffice config as non-root user (required for soffice to work properly)
# See: https://github.com/Unstructured-IO/unstructured/issues/3105
# The script is deleted in the same layer once it has run.
RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh

# Append HOME to PYTHONPATH and put the user's ~/.local/bin first on PATH.
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"
# Same directory that the earlier root RUN step copies the traineddata and config files into.
ENV TESSDATA_PREFIX=/usr/local/share/tessdata

# Stage python-deps: install a specific pip version and the base Python requirements.
FROM base as python-deps
# Only the requirements file is copied here (renamed to requirements-base.txt).
COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt
# NOTE(review): the same two pip commands appear twice below, once as two separate RUN
# instructions and once as a single chained RUN. This looks like a before/after diff pair;
# confirm that only one variant is meant to remain.
# NOTE(review): PIP_VERSION is not defined in the visible part of the file; presumably it is
# an ARG declared earlier. Confirm.
RUN ${PIP} install pip==${PIP_VERSION}
RUN ${PIP} install --no-cache -r requirements-base.txt
RUN ${PIP} install pip==${PIP_VERSION} && \
${PIP} install --no-cache -r requirements-base.txt

# Stage model-deps: at build time, call download_nltk_packages(), the partition model
# initialize(), and UnstructuredTableTransformerModel.initialize() for
# 'microsoft/table-transformer-structure-recognition'. Presumably this bakes the downloaded
# assets into the image so they are not fetched at runtime; confirm.
# NOTE(review): the two `model_init ... initialize()` lines below differ only by the trailing
# `&& \`. This looks like a before/after diff pair; as displayed, the first variant ends the
# RUN and would leave the following lines outside any instruction. Confirm which one is live.
FROM python-deps as model-deps
RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()"
${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" && \
${PYTHON} -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

FROM model-deps as code
COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml
COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
Expand Down
Binary file added docker/packages/pandoc-3.1.8-r0.apk
Binary file not shown.
2 changes: 1 addition & 1 deletion prepline_general/api/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.92" # pragma: no cover
__version__ = "0.0.93" # pragma: no cover
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.92
version: 0.0.93
Loading
Loading