From 4069830c67cde556329258cebb78d893f94614e2 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 11 Dec 2025 06:02:42 +0000 Subject: [PATCH 01/19] fix: config --- build-script/doc-parser-build.config | 4 ++-- build-script/paddle-ocr-build.config | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build-script/doc-parser-build.config b/build-script/doc-parser-build.config index eb54693edd..fd38ddb1a5 100644 --- a/build-script/doc-parser-build.config +++ b/build-script/doc-parser-build.config @@ -6,10 +6,10 @@ DOCKER_REGISTRY=mncregistry:30500 IMAGE_NAME=doc-parser-preprocessor # 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능) -IMAGE_VERSION=1.3.0 +IMAGE_VERSION=1.3.3-komipo # 실제 Dockerfile 위치 (루트 기준) DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile # 빌드 후 push 할지 여부 -PUSH_IMAGE=true +PUSH_IMAGE=false diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config index 8c9ced262e..9a31ed49bb 100644 --- a/build-script/paddle-ocr-build.config +++ b/build-script/paddle-ocr-build.config @@ -6,7 +6,7 @@ DOCKERFILE=genon/serving/paddle/docker/Dockerfile # 이미지 이름/태그 IMAGE_NAME=doc-parser-ocr -IMAGE_TAG=0.0.0 +IMAGE_TAG=1.3.3-komipo # 푸시할 레지스트리 (없으면 빈값) REGISTRY=mncregistry:30500 From 316d532572d3e2a2b338438fe516e3598ecdf655 Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:26:54 +0900 Subject: [PATCH 02/19] fix: config --- genon/preprocessor/scripts/register.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genon/preprocessor/scripts/register.config b/genon/preprocessor/scripts/register.config index 5961c9085d..f41c99c5e7 100644 --- a/genon/preprocessor/scripts/register.config +++ b/genon/preprocessor/scripts/register.config @@ -1,7 +1,7 @@ # 필수 REGISTRY_NAME="mncregistry:30500/" IMAGE_NAME="mnc/doc-parser-preprocessor" -IMAGE_TAG="1.3.0" +IMAGE_TAG="1.3.3-komipo" DESCRIPTION="unified-preprocessor" # K8s / DB 파드 위치 From 2601b2a19d812e2d1c2601e49813f7b15437a927 Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:29:55 +0900 Subject: [PATCH 03/19] fix: script --- genon/preprocessor/scripts/register_image.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/genon/preprocessor/scripts/register_image.sh b/genon/preprocessor/scripts/register_image.sh index 613fa2738a..325fb5c18b 100644 --- a/genon/preprocessor/scripts/register_image.sh +++ b/genon/preprocessor/scripts/register_image.sh @@ -57,14 +57,14 @@ else read -srp "MySQL 비밀번호: " MYSQL_PASS; echo fi -# ── 로컬 이미지 확인 ──────────────────────────────────────── -step "로컬 Docker 이미지 확인" -if docker images | awk '{print $1":"$2}' | grep -qx "${FULL_IMAGE_NAME}"; then - ok "로컬 이미지 존재" -else - fail "로컬에 ${FULL_IMAGE_NAME} 없음. 먼저 build/push 하세요." - exit 1 -fi +# # ── 로컬 이미지 확인 ──────────────────────────────────────── +# step "로컬 Docker 이미지 확인" +# if docker images | awk '{print $1":"$2}' | grep -qx "${FULL_IMAGE_NAME}"; then +# ok "로컬 이미지 존재" +# else +# fail "로컬에 ${FULL_IMAGE_NAME} 없음. 먼저 build/push 하세요." 
+# exit 1 +# fi # ── docker push (포그라운드 / 재시도) ─────────────────────── step "docker push" @@ -109,7 +109,7 @@ if [ -z "${EXISTING_ID}" ]; then INSERT INTO llmops.resource_meta_tb (resource_id, resource_type, resource_group_id, is_active, reg_date, mod_date, reg_user_id, mod_user_id) VALUES - (LAST_INSERT_ID(), 'DOCKER_IMAGE', 2, 1, NOW(), NOW(), 1, 1); + (LAST_INSERT_ID(), 'DOCKER_IMAGE', 1, 1, NOW(), NOW(), 1, 1); " 2>/dev/null IMAGE_ID=$( From fa4de8a79e43a9916509490454802db474ac552f Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Wed, 17 Dec 2025 13:08:57 +0900 Subject: [PATCH 04/19] fix:readme --- genon/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/genon/README.md b/genon/README.md index a04ebd0783..4e28533cac 100644 --- a/genon/README.md +++ b/genon/README.md @@ -60,7 +60,7 @@ 6. 사이트 배포 시 ```shell 1. 이미지 저장 -docker save mncregistry:30500/mnc/doc-parser-preprocessor:latest | gzip > doc-parser-preprocessor.tar.gz +docker save mncregistry:30500/mnc/doc-parser-preprocessor:1.3.3-komipo | gzip > doc-parser-preprocessor.tar.gz 2. 사이트에서 이미지 복원 gunzip -c doc-parser-preprocessor.tar.gz | docker load 3. register_image.sh 파일 실행 @@ -75,4 +75,10 @@ gunzip -c doc-parser-preprocessor.tar.gz | docker load ```shell kubectl apply -f doc-parser-ocr-deployment.yaml ``` -5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) \ No newline at end of file +5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) + +사이트에서 +``` +docker save mncregistry:30500/doc-parser-ocr:1.3.3-komipo | gzip > doc-parser-ocr.tar.gz +gunzip -c doc-parser-ocr.tar.gz | docker load +``` From 2913f752aa1da532a241b664d34dd47a2c24cb38 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 26 Jan 2026 16:43:35 +0900 Subject: [PATCH 05/19] fix: miniLM path --- genon/preprocessor/facade/intelligent_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/genon/preprocessor/facade/intelligent_processor.py b/genon/preprocessor/facade/intelligent_processor.py index a2487d05f4..33d04e1ccd 100644 --- a/genon/preprocessor/facade/intelligent_processor.py +++ b/genon/preprocessor/facade/intelligent_processor.py @@ -349,8 +349,10 @@ class HybridChunker(BaseChunker): model_config = ConfigDict(arbitrary_types_allowed=True) + # 해당 경로에 all-MiniLM-L6-v2 위치 시키기 + # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 tokenizer: Union[PreTrainedTokenizerBase, str] = ( - "sentence-transformers/all-MiniLM-L6-v2" + "/nfs-root/all-MiniLM-L6-v2" ) max_tokens: int = 1024 merge_peers: bool = True From d8f91063284801f2f49b60c84991dc55056b9d93 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 12 Feb 2026 09:51:07 +0900 Subject: [PATCH 06/19] =?UTF-8?q?fix:=20html=20=EC=95=84=EB=8B=8C=20?= =?UTF-8?q?=EA=B2=BD=EC=9A=B0=EC=97=90=EB=A7=8C=20ocr=20=EC=88=98=ED=96=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/facade/intelligent_processor_ocr.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/genon/preprocessor/facade/intelligent_processor_ocr.py b/genon/preprocessor/facade/intelligent_processor_ocr.py index 807493fe73..b720d3bf26 100644 --- a/genon/preprocessor/facade/intelligent_processor_ocr.py +++ b/genon/preprocessor/facade/intelligent_processor_ocr.py @@ -1288,8 +1288,11 @@ async def __call__(self, request: Request, file_path: str, 
**kwargs: dict): # OCR이 필요하다고 판단되면 OCR 수행 document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) - # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) - document: DoclingDocument = self.ocr_all_table_cells(document, file_path) + if document.origin.mimetype == "text/html": + pass + else: + # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) + document: DoclingDocument = self.ocr_all_table_cells(document, file_path) output_path, output_file = os.path.split(file_path) filename, _ = os.path.splitext(output_file) From 4ba3747b0e96fbd1a371377cd71ffad4c4157feb Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Fri, 13 Feb 2026 11:13:29 +0900 Subject: [PATCH 07/19] =?UTF-8?q?add:=20=EB=B3=B4=EC=95=88=EC=BB=A8?= =?UTF-8?q?=EC=84=A4=ED=8C=85=20=EC=A1=B0=EC=B9=98=EC=82=AC=ED=95=AD?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20=EC=9D=B8=ED=95=B4=20=EC=B2=A8=EB=B6=80?= =?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EA=B0=80=EB=93=9C=EB=A0=88=EC=9D=BC=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../facade/attachment_processor.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py index 140704a84a..28c65bed96 100644 --- a/genon/preprocessor/facade/attachment_processor.py +++ b/genon/preprocessor/facade/attachment_processor.py @@ -90,6 +90,41 @@ # pdf 변환 대상 확장자 CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] +## 보안컨설팅 조치로 인한 가드레일 추가 + +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = "" +GENOS_URL = "" + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + + for r in result: + url = f"{GENOS_URL}/api/gateway/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {"question": r.text} + + res = requests.post(f"{url}/run/v2", json=body, headers=headers) + + answer = res.json()["data"]["text"] + + if answer.startswith("[UNSAFE]"): + r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." 
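+                # NOTE: despite the message text, the chunk is not dropped here;
+                # its text is replaced with a placeholder so the vector count
+                # stays aligned with the chunk list returned by the wrapped call.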
+ + return result + + return wrapper + def convert_to_pdf(file_path: str) -> str | None: """ @@ -1383,6 +1418,7 @@ def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict return vectors + @guardrail async def __call__(self, request: Request, file_path: str, **kwargs: dict): ext = os.path.splitext(file_path)[-1].lower() if ext in ('.wav', '.mp3', '.m4a'): From 41ba56a2fbafb6ee8251c8dc732dc4c7c13ca574 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 11 Dec 2025 06:02:42 +0000 Subject: [PATCH 08/19] fix: config --- build-script/doc-parser-build.config | 13 ++----------- build-script/paddle-ocr-build.config | 2 +- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/build-script/doc-parser-build.config b/build-script/doc-parser-build.config index f54b799ee3..c909a43497 100644 --- a/build-script/doc-parser-build.config +++ b/build-script/doc-parser-build.config @@ -6,19 +6,10 @@ DOCKER_REGISTRY=mncregistry:30500 IMAGE_NAME=doc-parser-preprocessor # 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능) -IMAGE_VERSION=1.3.6.2 +IMAGE_VERSION=1.3.7-komipo # 실제 Dockerfile 위치 (루트 기준) DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile # 빌드 후 push 할지 여부 -PUSH_IMAGE=true - -# USER, GROUP -APP_UID=3000 -APP_GID=3000 -APP_UNAME=genos -APP_GNAME=genos - -# NLTK packages (comma-separated). Use "all" to download everything. -APP_NLTK_PACKAGES=punkt,stopwords,averaged_perceptron_tagger,averaged_perceptron_tagger_eng,wordnet,omw-1.4 +PUSH_IMAGE=false diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config index 8c9ced262e..9a31ed49bb 100644 --- a/build-script/paddle-ocr-build.config +++ b/build-script/paddle-ocr-build.config @@ -6,7 +6,7 @@ DOCKERFILE=genon/serving/paddle/docker/Dockerfile # 이미지 이름/태그 IMAGE_NAME=doc-parser-ocr -IMAGE_TAG=0.0.0 +IMAGE_TAG=1.3.3-komipo # 푸시할 레지스트리 (없으면 빈값) REGISTRY=mncregistry:30500 From 7afc6f96312fb2d76c2b9640f302e1cfbf7e8bfe Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:26:54 +0900 Subject: [PATCH 09/19] fix: config --- genon/preprocessor/scripts/register.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genon/preprocessor/scripts/register.config b/genon/preprocessor/scripts/register.config index 5961c9085d..f41c99c5e7 100644 --- a/genon/preprocessor/scripts/register.config +++ b/genon/preprocessor/scripts/register.config @@ -1,7 +1,7 @@ # 필수 REGISTRY_NAME="mncregistry:30500/" IMAGE_NAME="mnc/doc-parser-preprocessor" -IMAGE_TAG="1.3.0" +IMAGE_TAG="1.3.3-komipo" DESCRIPTION="unified-preprocessor" # K8s / DB 파드 위치 From 2ebdbd9781467f698e7accdcfdadbf2db7dc0958 Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:29:55 +0900 Subject: [PATCH 10/19] fix: script --- genon/preprocessor/scripts/register_image.sh | 43 ++++++++------------ 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/genon/preprocessor/scripts/register_image.sh b/genon/preprocessor/scripts/register_image.sh index 8c872ef7ed..0610cb54db 100644 --- a/genon/preprocessor/scripts/register_image.sh +++ b/genon/preprocessor/scripts/register_image.sh @@ -58,15 +58,14 @@ else read -srp "MySQL 비밀번호: " MYSQL_PASS; echo fi -# ── 로컬 이미지 확인 ──────────────────────────────────────── -step "로컬 Docker 이미지 확인" -if docker image inspect "${FULL_IMAGE_NAME}" >/dev/null 2>&1; then - ok "로컬 이미지 존재" - HAS_LOCAL_IMAGE="yes" -else - echo "⚠️ 로컬에 ${FULL_IMAGE_NAME} 없음." 
- HAS_LOCAL_IMAGE="no" -fi +# # ── 로컬 이미지 확인 ──────────────────────────────────────── +# step "로컬 Docker 이미지 확인" +# if docker images | awk '{print $1":"$2}' | grep -qx "${FULL_IMAGE_NAME}"; then +# ok "로컬 이미지 존재" +# else +# fail "로컬에 ${FULL_IMAGE_NAME} 없음. 먼저 build/push 하세요." +# exit 1 +# fi # ── docker push (포그라운드 / 재시도) ─────────────────────── step "docker push" @@ -160,22 +159,16 @@ if [ -z "${EXISTING_ID}" ]; then INSERT INTO llmops.resource_meta_tb (resource_id, resource_type, resource_group_id, is_active, reg_date, mod_date, reg_user_id, mod_user_id) VALUES - (LAST_INSERT_ID(), 'DOCKER_IMAGE', 2, 1, NOW(), NOW(), 1, 1); - " - if ! MYSQL_OUT="$(mysql_query "${SQL_INSERT}")"; then - fail "DB 등록 실패. 아래 로그 확인 필요." - echo "${MYSQL_OUT}" - exit 1 - fi - - MYSQL_OUT="" - if ! MYSQL_OUT="$(mysql_query "${SQL_EXISTING}")"; then - fail "DB 조회 실패(등록 후). 아래 로그 확인 필요." - echo "${MYSQL_OUT}" - exit 1 - fi - IMAGE_ID="$(printf '%s' "${MYSQL_OUT}" | tr -d '\r\n' | grep -Eo '^[0-9]+$' || true)" - ok "DB 등록 완료. 이미지 ID: ${IMAGE_ID}" + (LAST_INSERT_ID(), 'DOCKER_IMAGE', 1, 1, NOW(), NOW(), 1, 1); + " 2>/dev/null + + IMAGE_ID=$( + kubectl exec -it "${MARIADB_POD}" -n "${K8S_NAMESPACE}" -- \ + mysql -u "${MYSQL_USER}" -p"${MYSQL_PASS}" llmops -se \ + "SELECT id FROM system_docker_image_tb WHERE name='${IMAGE_NAME}' AND tag='${IMAGE_TAG}';" \ + 2>/dev/null | tr -d '\r\n' | grep -o '[0-9]*' || true + ) + echo "✅ DB 등록 완료. 이미지 ID: ${IMAGE_ID}" else ok "이미 등록된 이미지입니다. ID: ${EXISTING_ID}" IMAGE_ID="${EXISTING_ID}" From 52ecf20a9e2a06b0b90a9186543d5d0581970b7c Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Wed, 17 Dec 2025 13:08:57 +0900 Subject: [PATCH 11/19] fix:readme --- genon/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/genon/README.md b/genon/README.md index a04ebd0783..4e28533cac 100644 --- a/genon/README.md +++ b/genon/README.md @@ -60,7 +60,7 @@ 6. 사이트 배포 시 ```shell 1. 이미지 저장 -docker save mncregistry:30500/mnc/doc-parser-preprocessor:latest | gzip > doc-parser-preprocessor.tar.gz +docker save mncregistry:30500/mnc/doc-parser-preprocessor:1.3.3-komipo | gzip > doc-parser-preprocessor.tar.gz 2. 사이트에서 이미지 복원 gunzip -c doc-parser-preprocessor.tar.gz | docker load 3. register_image.sh 파일 실행 @@ -75,4 +75,10 @@ gunzip -c doc-parser-preprocessor.tar.gz | docker load ```shell kubectl apply -f doc-parser-ocr-deployment.yaml ``` -5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) \ No newline at end of file +5. 
노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) + +사이트에서 +``` +docker save mncregistry:30500/doc-parser-ocr:1.3.3-komipo | gzip > doc-parser-ocr.tar.gz +gunzip -c doc-parser-ocr.tar.gz | docker load +``` From 849a32f23dc5899c2e970de68b291051d8e56698 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 12 Feb 2026 09:51:07 +0900 Subject: [PATCH 12/19] =?UTF-8?q?fix:=20html=20=EC=95=84=EB=8B=8C=20?= =?UTF-8?q?=EA=B2=BD=EC=9A=B0=EC=97=90=EB=A7=8C=20ocr=20=EC=88=98=ED=96=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...4\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git "a/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" "b/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" index df58f2f2a7..c69a67dd5b 100644 --- "a/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" +++ "b/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" @@ -1253,8 +1253,11 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict): # OCR이 필요하다고 판단되면 OCR 수행 document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) - # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) - document: DoclingDocument = self.ocr_all_table_cells(document, file_path) + if document.origin.mimetype == "text/html": + pass + else: + # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) + document: DoclingDocument = self.ocr_all_table_cells(document, file_path) output_path, output_file = os.path.split(file_path) filename, _ = os.path.splitext(output_file) From 055d059751bec917ad82e384ad1dd25b6c0fbad4 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Fri, 13 Feb 2026 11:13:29 +0900 Subject: [PATCH 13/19] =?UTF-8?q?add:=20=EB=B3=B4=EC=95=88=EC=BB=A8?= =?UTF-8?q?=EC=84=A4=ED=8C=85=20=EC=A1=B0=EC=B9=98=EC=82=AC=ED=95=AD?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20=EC=9D=B8=ED=95=B4=20=EC=B2=A8=EB=B6=80?= =?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EA=B0=80=EB=93=9C=EB=A0=88=EC=9D=BC=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../facade/attachment_processor.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py index f5fea12eba..32c8cf2052 100644 --- a/genon/preprocessor/facade/attachment_processor.py +++ b/genon/preprocessor/facade/attachment_processor.py @@ -99,6 +99,41 @@ # pdf 변환 대상 확장자 CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] +## 보안컨설팅 조치로 인한 가드레일 추가 + +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = "" +GENOS_URL = "" + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + + for r in result: + url = f"{GENOS_URL}/api/gateway/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {"question": r.text} + + res = requests.post(f"{url}/run/v2", json=body, headers=headers) + + answer = res.json()["data"]["text"] + + if answer.startswith("[UNSAFE]"): + 
r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." + + return result + + return wrapper + def convert_to_pdf(file_path: str) -> str | None: """ @@ -1432,6 +1467,7 @@ def get_level_name(level_num: int) -> str: # root logger level 적용 logging.getLogger().setLevel(level) + @guardrail async def __call__(self, request: Request, file_path: str, **kwargs: dict): self.setup_logging(kwargs.get('log_level', 4)) From 0f03ec661c3b86be3635212aac6c2cc048e90b63 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Wed, 11 Mar 2026 14:03:17 +0900 Subject: [PATCH 14/19] =?UTF-8?q?chore:=20=EB=A6=AC=EB=B2=A0=EC=9D=B4?= =?UTF-8?q?=EC=8A=A4=20=EC=9E=91=EC=97=85=EC=9D=80=20=EC=84=B1=EB=AF=BC=20?= =?UTF-8?q?=ED=94=84=EB=A1=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/facade/attachment_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py index 32c8cf2052..2adb82a385 100644 --- a/genon/preprocessor/facade/attachment_processor.py +++ b/genon/preprocessor/facade/attachment_processor.py @@ -214,6 +214,7 @@ def _get_pdf_path(file_path: str) -> str: return pdf_path + def install_packages(packages): for package in packages: try: From 76543992a3b37d2749f61fcc920632e1771e92fb Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Wed, 11 Mar 2026 14:22:15 +0900 Subject: [PATCH 15/19] fix: img tag --- build-script/paddle-ocr-build.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config index 9a31ed49bb..ea2f74486d 100644 --- a/build-script/paddle-ocr-build.config +++ b/build-script/paddle-ocr-build.config @@ -6,7 +6,7 @@ DOCKERFILE=genon/serving/paddle/docker/Dockerfile # 이미지 이름/태그 IMAGE_NAME=doc-parser-ocr -IMAGE_TAG=1.3.3-komipo +IMAGE_TAG=1.3.7-komipo # 푸시할 레지스트리 (없으면 빈값) REGISTRY=mncregistry:30500 From 1f7bb47bc3cf0e627ea7f25378f726ff4982c9f6 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 6 Apr 2026 11:27:50 +0900 Subject: [PATCH 16/19] =?UTF-8?q?feat:=20=EC=B2=A8=EB=B6=80=EC=9A=A9=20?= =?UTF-8?q?=EC=A0=84=EC=B2=98=EB=A6=AC=EA=B8=B0=EC=97=90=20=EA=B0=80?= =?UTF-8?q?=EB=93=9C=EB=A0=88=EC=9D=BC=20=EC=B6=94=EA=B0=80.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../facade/attachment_processor_guardrail.py | 1626 +++++++++++++++++ 1 file changed, 1626 insertions(+) create mode 100644 genon/preprocessor/facade/attachment_processor_guardrail.py diff --git a/genon/preprocessor/facade/attachment_processor_guardrail.py b/genon/preprocessor/facade/attachment_processor_guardrail.py new file mode 100644 index 0000000000..0064b76832 --- /dev/null +++ b/genon/preprocessor/facade/attachment_processor_guardrail.py @@ -0,0 +1,1626 @@ +from __future__ import annotations + +from collections import defaultdict + +import asyncio +import fitz +import json +import math +import os +import pandas as pd +import pydub +import requests +import shutil +import subprocess +import sys +import threading +import uuid +import warnings +from datetime import datetime +from fastapi import Request +from glob import glob +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import ( + # TextLoader, # TXT + PyMuPDFLoader, # PDF + DataFrameLoader, # DataFrame + UnstructuredWordDocumentLoader, # DOC and DOCX + UnstructuredPowerPointLoader, # PPT and PPTX + UnstructuredImageLoader, # 
JPG, PNG + UnstructuredMarkdownLoader, # Markdown + UnstructuredFileLoader, # Generic fallback +) +from langchain_core.documents import Document +from markdown2 import markdown +from pandas import DataFrame +from pathlib import Path +from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator +from typing import Any, Iterable, Iterator, List, Optional, Union +from typing_extensions import Self + +try: + import semchunk + from transformers import AutoTokenizer, PreTrainedTokenizerBase +except ImportError: + raise RuntimeError( + "Module requires 'chunking' extra; to install, run: " + "`pip install 'docling-core[chunking]'`" + ) +try: + import chardet +except ImportError: + raise RuntimeError("Module 'chardet' not imported. Run `pip install chardet`.") +try: + from weasyprint import HTML +except ImportError: + print("Warning: WeasyPrint could not be imported. PDF conversion features will be disabled.") + HTML = None + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.document import ConversionResult, InputDocument +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.document_converter import DocumentConverter, HwpxFormatOption, WordFormatOption +from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta +from docling_core.types import DoclingDocument as DLDocument +from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, + PictureItem, SectionHeaderItem, TableItem, TextItem +) +from docling_core.types.doc.document import LevelNumber, ListItem, CodeItem +from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend +# from utils import assert_cancelled +# from genos_utils import upload_files, merge_overlapping_bboxes + +# import platform +from pathlib import Path +import os +import subprocess +import tempfile +import shutil +import unicodedata + +import logging + +for n in ("fontTools", "fontTools.ttLib", "fontTools.ttLib.ttFont"): + lg = logging.getLogger(n) + lg.setLevel(logging.CRITICAL) + lg.propagate = False + logging.getLogger().setLevel(logging.WARNING) +# pdf 변환 대상 확장자 +CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] + + + +### 가드레일 용 ### +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = '23c3898fe3264fd597961af23a68fe7c' +# GENOS_URL = 'https://ai.komipo.co.kr:30908/' +# @@@@ 내부 호출로 변경 +GENOS_URL = 'http://llmops-gateway-api-service:8080' + + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + for r in result: + url = f"{GENOS_URL}/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {'question': r.text} + + res = requests.post(f'{url}/run/v2', json=body, headers=headers) + + answer = res.json()['data']['text'] + + if answer.startswith("[UNSAFE]"): + r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." + + + return result + return wrapper + + +def convert_to_pdf(file_path: str) -> str | None: + """ + LibreOffice로 PDF 변환을 시도한다. + 실패해도 예외를 던지지 않고 None을 반환한다. 
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix('.pdf') + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in ('.ppt', '.pptx'): + convert_arg = "pdf:impress_pdf_Export" + elif ext in ('.doc', '.docx'): + convert_arg = "pdf:writer_pdf_Export" + elif ext in ('.xls', '.xlsx', '.csv'): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode('ascii') + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize('NFKD', in_path.stem).encode('ascii', 'ignore').decode('ascii') or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", "--headless", + "--convert-to", convert_arg, + "--outdir", str(out_dir), + str(cand) + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, '.pdf') + return pdf_path + + +def install_packages(packages): + for package in packages: + try: + __import__(package) + except ImportError: + print(f"[!] {package} 패키지가 없습니다. 
설치를 시도합니다.") + subprocess.run([sys.executable, "-m", "pip", "install", package], check=True) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = 'allow' + + text: str | None = None + n_char: int | None = None + n_word: int | None = None + n_line: int | None = None + i_page: int | None = None + e_page: int | None = None + i_chunk_on_page: int | None = None + n_chunk_of_page: int | None = None + i_chunk_on_doc: int | None = None + n_chunk_of_doc: int | None = None + n_page: int | None = None + reg_date: str | None = None + chunk_bboxes: str | None = None + media_files: str | None = None + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + # self.title: Optional[str] = None + # self.created_date: Optional[int] = None + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + 'l': bbox.l / size.width, + 't': bbox.t / size.height, + 'r': bbox.r / size.width, + 'b': bbox.b / size.height, + 'coord_origin': bbox.coord_origin.value + } + chunk_bboxes.append({ + 'page': page_no, + 'bbox': bbox_data, + 'type': type_, + 'ref': label + }) + self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + if not doc_items: + self.media_files = "" + return self + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + 
i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + ) + + +class HwpLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + subprocess.run(['hwp5html', self.file_path, '--output', self.output_dir], check=True, timeout=600) + converted_file_path = os.path.join(self.output_dir, 'index.xhtml') + pdf_save_path = _get_pdf_path(self.file_path) + HTML(converted_file_path).write_pdf(pdf_save_path) + loader = PyMuPDFLoader(pdf_save_path) + return loader.load() + except Exception as e: + print(f"Failed to convert {self.file_path} to XHTML") + raise e + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TextLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + with open(self.file_path, 'rb') as f: + raw = f.read() + enc = chardet.detect(raw).get('encoding') or '' + encodings = [enc] if enc and enc.lower() not in ('ascii', 'unknown') else [] + encodings += ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + + content = None + for e in encodings: + try: + content = raw.decode(e) # 전체 파일로 디코딩 + break + except UnicodeDecodeError: + continue + if content is None: + content = raw.decode('utf-8', errors='replace') + + # 4) PDF 변환 유지 + html = f"
<pre>{content}</pre>
" + html_path = os.path.join(self.output_dir, 'temp.html') + with open(html_path, 'w', encoding='utf-8') as f: + f.write(html) + # pdf_path = (self.file_path + # .replace('.txt', '.pdf') + # .replace('.json', '.pdf')) + pdf_path = _get_pdf_path(self.file_path) + if HTML: + HTML(html_path).write_pdf(pdf_path) + loader = PyMuPDFLoader(pdf_path) + return loader.load() + # PDF가 불가하면 Document 직접 반환 (원형 스키마 유지) + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + + except Exception: + # 실패 시에도 스키마는 그대로 유지해 반환 + for e in ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1']: + try: + with open(self.file_path, 'r', encoding=e) as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + except UnicodeDecodeError: + continue + with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TabularLoader: + def __init__(self, file_path: str, ext: str): + + packages = ['openpyxl', 'chardet'] + + install_packages(packages) + + self.file_path = file_path + if ext == ".csv": + # convert_to_pdf(file_path) csv는 Pdf 변환 안 함 + self.data_dict = self.load_csv_documents(file_path) + elif ext == ".xlsx": + # convert_to_pdf(file_path) xlsx는 Pdf 변환 안 함 + self.data_dict = self.load_xlsx_documents(file_path) + else: + print(f"[!] Inadequate extension for TabularLoader: {ext}") + return + + def check_sql_dtypes(self, df): + df = df.convert_dtypes() + res = [] + for col in df.columns: + # col_name = col.strip().replace(' ', '_') + dtype = str(df.dtypes[col]).lower() + + if 'int' in dtype: + if '64' in dtype: + sql_dtype = 'BIGINT' + else: + sql_dtype = 'INT' + elif 'float' in dtype: + sql_dtype = 'FLOAT' + elif 'bool' in dtype: + sql_dtype = 'BOOLEAN' + elif 'date' in dtype: + sql_dtype = 'DATE' + df[col] = df[col].astype(str) + elif 'datetime' in dtype: + sql_dtype = 'DATETIME' + df[col] = df[col].astype(str) + # else: + # max_len = df[col].str.len().max().item() + 10 + # sql_dtype = f'VARCHAR({max_len})' + else: + lens = df[col].astype(str).str.len() + max_len_val = lens.max() + max_len = int(0 if pd.isna(max_len_val) else max_len_val) + 10 + sql_dtype = f'VARCHAR({max_len})' + + res.append([col, sql_dtype]) + + return df, res + + def process_data_rows(self, data: dict): + """Arg: data (keys: 'sheet_name', 'page_column', 'page_column_type', 'documents')""" + + rows = [] + for doc in data["documents"]: + row = {} + if 'int' in data["page_column_type"]: + row[data["page_column"]] = int(doc.page_content) + elif 'float' in data["page_column_type"]: + row[data["page_column"]] = float(doc.page_content) + elif 'bool' in data["page_column_type"]: + if doc.page_content.lower() == 'true': + row[data["page_column"]] = True + elif doc.page_content.lower() == 'false': + row[data["page_column"]] = False + else: + raise ValueError(f"Invalid boolean string: {doc.page_content}") + else: + row[data["page_column"]] = doc.page_content + + row.update(doc.metadata) + rows.append(row) + + processed_data = {"sheet_name": data["sheet_name"], "data_rows": rows, "data_types": data["dtypes"]} + return processed_data + + def load_csv_documents(self, file_path: str, **kwargs: dict): + import chardet + + with open(file_path, "rb") as f: + raw_file = f.read(10000) + enc_type = chardet.detect(raw_file)['encoding'] + df = pd.read_csv(file_path, 
encoding=enc_type, index_col=False) + df = df.fillna('null') # csv 파일에서도 xlsx 파일과 동일하게 null로 채움 + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + # col_type = str(type(col)) + col_type = str(df[col].dtype) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into the string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + data = { + "sheet_name": "table_1", + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + data = self.process_data_rows(data) # including only one sheet as it's a csv file + data_dict = {"data": [data]} + return data_dict + + def load_xlsx_documents(self, file_path: str, **kwargs: dict): + dfs = pd.read_excel(file_path, sheet_name=None) + sheets = [] + for sheet_name, df in dfs.items(): + df = df.fillna('null') + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + col_type = str(type(col)) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + sheet = { + "sheet_name": sheet_name, + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + sheets.append(sheet) + + data_dict = {"data": []} + for sheet in sheets: + data = self.process_data_rows(sheet) + data_dict["data"].append(data) + + return data_dict + + def return_vectormeta_format(self): + if not self.data_dict: + return None + + text = "[DA] " + str(self.data_dict) # Add a token to indicate this string is for data analysis + + # @@@@ 성민: 토큰 수 줄이기위한 후처리(임시조치) + text = text.replace("Unnamed: ", "") + text = text[:2000] + + + vectors = [GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." 
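+            # "." placeholders: tabular files are indexed as a single pseudo-chunk,
+            # so per-chunk bbox and media metadata do not apply here.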
+ })] + + + return vectors + + +class AudioLoader: + def __init__(self, + file_path: str, + req_url: str, + req_data: dict, + chunk_sec: int = 29, + tmp_path: str = '.', + ): + self.file_path = file_path + self.tmp_path = tmp_path + self.chunk_sec = chunk_sec + self.req_url = req_url + self.req_data = req_data + + def split_file_as_chunks(self) -> list: + audio = pydub.AudioSegment.from_file(self.file_path) + chunk_len = self.chunk_sec * 1000 + n_chunks = math.ceil(len(audio) / chunk_len) + + for i in range(n_chunks): + start_ms = i * chunk_len + overlap_start_ms = start_ms - 300 if start_ms > 0 else start_ms + end_ms = start_ms + chunk_len + audio_chunk = audio[overlap_start_ms:end_ms] + audio_chunk.export(os.path.join(self.tmp_path, "tmp_{}.wav".format(str(i))), format="wav") + tmp_files = glob(os.path.join(self.tmp_path, "*.wav")) + return tmp_files + + def transcribe_audio(self, file_path_lst: list): + transcribed_text_chunks = [] + + def _send_request(filepath: str): + """Send a request to 'whisper' model served""" + files = { + 'file': (filepath, open(filepath, 'rb'), 'audio/mp3'), + } + + response = requests.post(self.req_url, data=self.req_data, files=files) + text = response.json().get('text', ', ') + transcribed_text_chunks.append({ + 'file_name': os.path.basename(filepath), + 'text': text + }) + + # Send parallel requests + threads = [threading.Thread(target=_send_request, args=(f,)) for f in file_path_lst] + for t in threads: t.start() + for t in threads: t.join() + + # Merge transcribed text snippets in order + transcribed_text_chunks.sort(key=lambda x: x['file_name']) + transcribed_text = "[AUDIO]" + ' '.join([t['text'] for t in transcribed_text_chunks]) + return transcribed_text + + def return_vectormeta_format(self): + audio_chunks = self.split_file_as_chunks() + transcribed_text = self.transcribe_audio(audio_chunks) + res = [GenOSVectorMeta.model_validate({ + 'text': transcribed_text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." + })] + return res + + +### for HWPX from 지능형 전처리기 ### +# * GenOSVectorMetaBuilder # +# * HierarchicalChunker # +# * HybridChunker # +# * HwpxProcessor # +# * GenosServiceException # + +class HierarchicalChunker(BaseChunker): + r""" Chunker implementation leveraging the document layout. + Args: + merge_list_items (bool): Whether to merge successive list items. + Defaults to True. + delim (str): Delimiter to use for merging text. Defaults to "\n". + """ + merge_list_items: bool = True + + @classmethod + def _triplet_serialize(cls, table_df: DataFrame) -> str: + # copy header as first row and shift all rows by one + table_df.loc[-1] = table_df.columns # type: ignore[call-overload] + table_df.index = table_df.index + 1 + table_df = table_df.sort_index() + + rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()] + cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()] + + nrows = table_df.shape[0] + ncols = table_df.shape[1] + texts = [ + f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}" + for i in range(1, nrows) + for j in range(1, ncols) + ] + output_text = ". ".join(texts) + + return output_text + + def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + heading_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + for item, level in dl_doc.iterate_items(): + captions = None + if isinstance(item, DocItem): + # first handle any merging needed + if self.merge_list_items: + if isinstance( + item, ListItem + ) or ( # TODO remove when all captured as ListItem: + isinstance(item, TextItem) + and item.label == DocItemLabel.LIST_ITEM + ): + list_items.append(item) + continue + elif list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + list_items = [] # reset + + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]): + level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + heading_by_level[level] = item.text + text = ''.join(str(value) for value in heading_by_level.values()) + + # remove headings of higher level as they just went out of scope + keys_to_del = [k for k in heading_by_level if k > level] + for k in keys_to_del: + heading_by_level.pop(k, None) + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin + ), + ) + yield c + continue + + if isinstance(item, TextItem) or ( + (not self.merge_list_items) and isinstance(item, ListItem)) or isinstance(item, CodeItem): + text = item.text + + elif isinstance(item, TableItem): + text = item.export_to_markdown(dl_doc) + # dataframe으로 추출할 때 사용되는 코드 + # if table_df.shape[0] < 1 or table_df.shape[1] < 2: + # # at least two cols needed, as first column contains row headers + # continue + # text = self._triplet_serialize(table_df=table_df) + captions = [c.text for c in [r.resolve(dl_doc) for r in item.captions]] or None + + elif isinstance(item, PictureItem): + text = ''.join(str(value) for value in heading_by_level.values()) + else: + continue + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin, + ), + ) + yield c + + if self.merge_list_items and list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + + +class HybridChunker(BaseChunker): + r"""Chunker doing tokenization-aware refinements on top of document layout chunking. + Args: + tokenizer: The tokenizer to use; either instantiated object or name or path of + respective pretrained model + max_tokens: The maximum number of tokens per chunk. 
If not set, limit is + resolved from the tokenizer + merge_peers: Whether to merge undersized chunks sharing same relevant metadata + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + tokenizer: Union[PreTrainedTokenizerBase, str] = ( + "/nfs-root/all-MiniLM-L6-v2" + ) + max_tokens: int = int(1e30) # type: ignore[assignment] + merge_peers: bool = True + _inner_chunker: HierarchicalChunker = HierarchicalChunker() + + @model_validator(mode="after") + def _patch_tokenizer_and_max_tokens(self) -> Self: + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + if self.max_tokens is None: + self.max_tokens = TypeAdapter(PositiveInt).validate_python( + self._tokenizer.model_max_length + ) + return self + + def _count_text_tokens(self, text: Optional[Union[str, list[str]]]): + if text is None: + return 0 + elif isinstance(text, list): + total = 0 + for t in text: + total += self._count_text_tokens(t) + return total + return len(self._tokenizer.tokenize(text)) + + class _ChunkLengthInfo(BaseModel): + total_len: int + text_len: int + other_len: int + + def _count_chunk_tokens(self, doc_chunk: DocChunk): + ser_txt = self.serialize(chunk=doc_chunk) + return len(self._tokenizer.tokenize(text=ser_txt)) + + def _doc_chunk_length(self, doc_chunk: DocChunk): + text_length = self._count_text_tokens(doc_chunk.text) + total = self._count_chunk_tokens(doc_chunk=doc_chunk) + return self._ChunkLengthInfo( + total_len=total, + text_len=text_length, + other_len=total - text_length, + ) + + def _make_chunk_from_doc_items( + self, doc_chunk: DocChunk, window_start: int, window_end: int + ): + doc_items = doc_chunk.meta.doc_items[window_start: window_end + 1] + meta = DocMeta( + doc_items=doc_items, + headings=doc_chunk.meta.headings, + captions=doc_chunk.meta.captions, + origin=doc_chunk.meta.origin, + ) + window_text = ( + doc_chunk.text + if len(doc_chunk.meta.doc_items) == 1 + else self.delim.join( + [ + doc_item.text + for doc_item in doc_items + if isinstance(doc_item, TextItem) + ] + ) + ) + new_chunk = DocChunk(text=window_text, meta=meta) + return new_chunk + + def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]: + chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_items = len(doc_chunk.meta.doc_items) + while window_end < num_items: + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end, + ) + if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens: + if window_end < num_items - 1: + window_end += 1 + # 아직 청크에 여유가 있고, 남은 아이템도 있으므로 계속 추가 시도 + continue + else: + # 현재 윈도우의 모든 아이템이 청크에 들어갔고, 더 이상 아이템이 없음 + window_end = num_items # signalizing the last loop + elif window_start == window_end: + # 아이템 1개도 청크에 안 들어감 → 단독 청크로 처리, 이후 재분할 + window_end += 1 + window_start = window_end + else: + # 마지막 아이템 빼고 청크 생성 → 남은 아이템으로 새 윈도우 시작 + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end - 1, + ) + window_start = window_end + chunks.append(new_chunk) + return chunks + + def _split_using_plain_text(self, doc_chunk: DocChunk) -> list[DocChunk]: + lengths = self._doc_chunk_length(doc_chunk) + if lengths.total_len <= self.max_tokens: + return [doc_chunk] + else: + # 헤더/캡션을 제외하고 본문 텍스트에 할당 가능한 토큰 수 계산 + available_length = self.max_tokens - lengths.other_len + sem_chunker = semchunk.chunkerify( + self._tokenizer, 
chunk_size=available_length + ) + if available_length <= 0: + warnings.warn( + f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}" + # noqa + ) + return [] + text = doc_chunk.text + segments = sem_chunker.chunk(text) + chunks = [type(doc_chunk)(text=s, meta=doc_chunk.meta) for s in segments] + return chunks + + def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): + output_chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_chunks = len(chunks) + + while window_end < num_chunks: + chunk = chunks[window_end] + headings_and_captions = (chunk.meta.headings, chunk.meta.captions) + ready_to_append = False + + if window_start == window_end: + current_headings_and_captions = headings_and_captions + window_end += 1 + first_chunk_of_window = chunk + + else: + chks = chunks[window_start: window_end + 1] + doc_items = [it for chk in chks for it in chk.meta.doc_items] + candidate = DocChunk( + text=self.delim.join([chk.text for chk in chks]), + meta=DocMeta( + doc_items=doc_items, + headings=current_headings_and_captions[0], + captions=current_headings_and_captions[1], + origin=chunk.meta.origin, + ), + ) + + if (headings_and_captions == current_headings_and_captions + and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens + ): + # 토큰 수 여유 있음 → 청크 확장 계속 + window_end += 1 + new_chunk = candidate + else: + ready_to_append = True + + if ready_to_append or window_end == num_chunks: + # no more room OR the start of new metadata. + if window_start + 1 == window_end: + output_chunks.append(first_chunk_of_window) + else: + output_chunks.append(new_chunk) + window_start = window_end + + return output_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + res: Iterable[DocChunk] + res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs) # type: ignore + res = [x for c in res for x in self._split_by_doc_items(c)] + res = [x for c in res for x in self._split_using_plain_text(c)] + + if self.merge_peers: + res = self._merge_chunks_with_matching_metadata(res) + return iter(res) + + +class DocxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.converter = DocumentConverter( + format_options={ + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimplePipeline, backend=GenosMsWordDocumentBackend + ), + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = 
document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + return vectors + + +class HwpxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.pipeline_options.save_images = False + self.converter = DocumentConverter( + format_options={ + InputFormat.XML_HWPX: HwpxFormatOption( + pipeline_options=self.pipeline_options + ) + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + save_images = kwargs.get('save_images', False) + + if self.pipeline_options.save_images != save_images: + self.pipeline_options.save_images = save_images + # self._create_converters() + + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = 
self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + + text = "" + for vector in vectors: + if len(text) + len(vector.text) > 8192: + break + text += vector.text + + return [vectors[0]] + + +class GenosServiceException(Exception): + """GenOS 와의 의존성 부분 제거를 위해 추가""" + + def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None: + self.code = 1 + self.error_code = error_code + self.error_msg = error_msg or "GenOS Service Exception" + self.msg_params = msg_params or {} + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})" + + +# async def assert_cancelled(request: Request): +# """GenOS 와의 의존성 제거를 위해 추가""" +# if await request.is_disconnected(): +# raise GenosServiceException(1, f"Cancelled") + + +# @@@@ 성민: OCR을 위해서 추가 +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + # OcrEngine, + # PdfBackend, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.document_converter import PdfFormatOption + +class DocumentProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.hwpx_processor = HwpxProcessor() + self.docx_processor = DocxProcessor() + + + + # @@@@ 성민: OCR을 위해서 추가 + self.ocr_endpoint = "http://doc-parser-ocr-service:8080/ocr" + ocr_options = PaddleOcrOptions( + force_full_page_ocr=False, + lang=['korean'], + ocr_endpoint=self.ocr_endpoint, + text_score=0.3) + + + device = AcceleratorDevice.AUTO + num_threads = 8 + accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) + + # PDF 파이프라인 옵션 설정 + self.pipe_line_options = PdfPipelineOptions() + self.pipe_line_options.generate_page_images = True + self.pipe_line_options.generate_picture_images = True + self.pipe_line_options.do_ocr = False + self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.ocr_options.lang = ["ko", 'en'] + # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model" + # self.pipe_line_options.ocr_options.force_full_page_ocr = True + # ocr_options = TesseractOcrOptions() + # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert'] + # ocr_options.path = './.tesseract/tessdata' + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.artifacts_path = Path("/models/") + self.pipe_line_options.do_table_structure = True + self.pipe_line_options.images_scale = 2 + self.pipe_line_options.table_structure_options.do_cell_matching = True + self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE + self.pipe_line_options.accelerator_options = accelerator_options + + # Simple 파이프라인 옵션을 인스턴스 변수로 저장 + self.simple_pipeline_options = PipelineOptions() + self.simple_pipeline_options.save_images = False + + # ocr 파이프라인 옵션 + self.ocr_pipe_line_options = PdfPipelineOptions() + self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True) + 
self.ocr_pipe_line_options.do_ocr = True + self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True) + self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True + + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, + backend=DoclingParseV4DocumentBackend + ), + } + ) + + def get_loader(self, file_path: str): + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + return PyMuPDFLoader(file_path) + elif ext != real_type and real_type in ['txt', 'json', 'md']: + return TextLoader(file_path) + # 원래 확장자 기반 로직 + elif ext == '.pdf': + return PyMuPDFLoader(file_path) + elif ext == '.doc': + convert_to_pdf(file_path) + return UnstructuredWordDocumentLoader(file_path) + elif ext in ['.ppt', '.pptx']: + convert_to_pdf(file_path) + return UnstructuredPowerPointLoader(file_path) + elif ext in ['.jpg', '.jpeg', '.png']: + convert_to_pdf(file_path) + # 한국어 OCR 지원을 위한 언어 설정 + return UnstructuredImageLoader( + file_path, + languages=["kor", "eng"], # 한국어 + 영어 OCR + ) + elif ext in ['.txt', '.json', '.md']: + return TextLoader(file_path) + elif ext == '.hwp': + return HwpLoader(file_path) + elif ext == '.md': + return UnstructuredMarkdownLoader(file_path) + else: + return UnstructuredFileLoader(file_path) + + def get_real_file_type(self, file_path: str) -> str: + """파일 확장자가 아닌 실제 내용으로 파일 타입 판단""" + with open(file_path, 'rb') as f: + header = f.read(8) + if header.startswith(b'%PDF-'): + return 'pdf' + elif header.startswith(b'\x89PNG'): + return 'png' + elif header.startswith(b'\xff\xd8\xff'): + return 'jpg' + + # 매직 헤더로 판단할 수 없으면 확장자 사용 + return os.path.splitext(file_path)[-1].lower() + + def convert_md_to_pdf(self, md_path): + """Markdown 파일을 PDF로 변환""" + install_packages(['chardet']) + import chardet + + pdf_path = md_path.replace('.md', '.pdf') + with open(md_path, 'rb') as f: + raw_file = f.read() + candidates = ['utf-8', 'utf-8-sig'] + try: + det = (chardet.detect(raw_file) or {}).get('encoding') or '' + # chardet가 ascii/unknown이면 무시. 
그 외면 후보에 추가
+            if det and det.lower() not in ('ascii', 'unknown'):
+                if det.lower() not in [c.lower() for c in candidates]:
+                    candidates.append(det)
+        except Exception:
+            pass
+        candidates += ['cp949', 'euc-kr', 'iso-8859-1', 'latin-1']
+        md_content = None
+        for enc in candidates:
+            try:
+                md_content = raw_file.decode(enc)
+                break
+            except UnicodeDecodeError:
+                continue
+        if md_content is None:
+            md_content = raw_file.decode('utf-8', errors='replace')
+
+        html_content = markdown(md_content)
+        if HTML:
+            HTML(string=html_content).write_pdf(pdf_path)
+        return pdf_path
+
+    def _create_converters(self):
+        """Helper method that (re)creates the document converters."""
+        self.ocr_converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=self.ocr_pipe_line_options,
+                    backend=DoclingParseV4DocumentBackend
+                ),
+            }
+        )
+
+    def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+        # Read save_images from kwargs and update the options accordingly
+        save_images = kwargs.get('save_images', True)
+        include_wmf = kwargs.get('include_wmf', False)
+
+        # Recreate the converters only when an option actually changed
+        if (self.simple_pipeline_options.save_images != save_images or
+                getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf):
+            self.simple_pipeline_options.save_images = save_images
+            self.simple_pipeline_options.include_wmf = include_wmf
+            self._create_converters()
+
+        try:
+            conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True)
+        except Exception as e:
+            # conv_result is unbound on this path, so log and re-raise instead of
+            # falling through to the return below (which raised NameError before).
+            print("@@@@ OCR conversion failed:", e)
+            # conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True)
+            raise
+
+        return conv_result.document
+
+    def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
+        loader = self.get_loader(file_path)
+        documents = loader.load()
+
+        # @@@@ Seongmin: previous fallback kept below as a comment
+        # (for image files, provide a placeholder text when nothing was extracted)
+        # ext = os.path.splitext(file_path)[-1].lower()
+        # if ext in ['.jpg', '.jpeg', '.png']:
+        #     # documents is empty, or every page_content is blank
+        #     if not documents or not any(doc.page_content.strip() for doc in documents):
+        #         documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})]
+
+        # @@@@ Seongmin, rewritten: run OCR when no text could be extracted
+        if not documents or not any(doc.page_content.strip() for doc in documents):
+            document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
+            documents = [Document(page_content=document.export_to_markdown(), metadata={})]
+
+        return documents
+
+    def split_documents(self, documents, **kwargs: dict) -> list[Document]:
+        # @@@@ Seongmin: changing this in GenOS does not seem to take effect?
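+        # setdefault below only fills chunk_size when it is absent from kwargs,
+        # so a chunk_size that actually arrives through **kwargs (e.g. one set in
+        # the GenOS UI) still overrides this 20_000 fallback.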
+ print("@@@@ kwargs", kwargs) + + kwargs.setdefault("chunk_size", 20_000) + + text_splitter = RecursiveCharacterTextSplitter(**kwargs) + + chunks = text_splitter.split_documents(documents) + chunks = [chunk for chunk in chunks if chunk.page_content] + + if not chunks: + raise Exception('Empty document') + + for chunk in chunks: + page = chunk.metadata.get('page', 0) + self.page_chunk_counts[page] += 1 + return chunks + + def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict) -> list[dict]: + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + pdf_path = file_path + elif ext != real_type and real_type in ['txt', 'json', 'md']: + pdf_path = _get_pdf_path(file_path) + # 원래 확장자 기반 로직 + elif file_path.endswith('.md'): + pdf_path = self.convert_md_to_pdf(file_path) + elif file_path.endswith(('.ppt', '.pptx')): + pdf_path = _get_pdf_path(file_path) + else: + pdf_path = _get_pdf_path(file_path) + + # doc = fitz.open(pdf_path) if (pdf_path and os.path.exists(pdf_path)) else None + + if file_path.endswith(('.ppt', '.pptx')): + if os.path.exists(pdf_path): + subprocess.run(["rm", pdf_path], check=True) + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=max([chunk.metadata.get('page', 0) for chunk in chunks]), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z' + ) + current_page = None + chunk_index_on_page = 0 + + vectors = [] + for chunk_idx, chunk in enumerate(chunks): + page = chunk.metadata.get('page', 0) + text = chunk.page_content + + if page != current_page: + current_page = page + chunk_index_on_page = 0 + + # 첨부용에서는 bbox 정보 추출 X + # if doc: + # fitz_page = doc.load_page(page) + # global_metadata['chunk_bboxes'] = json.dumps(merge_overlapping_bboxes([{ + # 'page': page + 1, + # 'type': 'text', + # 'bbox': { + # 'l': rect[0] / fitz_page.rect.width, + # 't': rect[1] / fitz_page.rect.height, + # 'r': rect[2] / fitz_page.rect.width, + # 'b': rect[3] / fitz_page.rect.height, + # } + # } for rect in fitz_page.search_for(text)], x_tolerance=1 / fitz_page.rect.width, + # y_tolerance=1 / fitz_page.rect.height)) + + vectors.append(GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': len(text), + 'n_word': len(text.split()), + 'n_line': len(text.splitlines()), + 'i_page': page, + 'e_page': page, + 'i_chunk_on_page': chunk_index_on_page, + 'n_chunk_of_page': self.page_chunk_counts[page], + 'i_chunk_on_doc': chunk_idx, + **global_metadata + })) + chunk_index_on_page += 1 + + return vectors + + @guardrail + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + ext = os.path.splitext(file_path)[-1].lower() + if ext in ('.wav', '.mp3', '.m4a'): + # Generate a temporal path saving audio chunks: the audio file is supposed to be splited to several chunks due to limitted length by the model + tmp_path = "./tmp_audios_{}".format(os.path.basename(file_path).split('.')[0]) + if not os.path.exists(tmp_path): + os.makedirs(tmp_path) + + # Use 'Whisper' model served in-house + # [!] 
Modify the request parameters to change a STT model to be used + loader = AudioLoader( + file_path=file_path, + req_url="http://192.168.74.164:30100/v1/audio/transcriptions", + req_data={ + 'model': 'model', + 'language': 'ko', + 'response_format': 'json', + 'temperature': '0', + 'stream': 'false', + 'timestamp_granularities[]': 'word' + }, + chunk_sec=29, # length(sec) of a chunk from the uploaded audio + tmp_path=tmp_path + ) + vectors = loader.return_vectormeta_format() + # await assert_cancelled(request) + + # Remove the temporal chunks + try: + subprocess.run(['rm', '-r', tmp_path], check=True) + except: + pass + # await assert_cancelled(request) + return vectors + + elif ext in ('.csv', '.xlsx'): + loader = TabularLoader(file_path, ext) + vectors = loader.return_vectormeta_format() + # pdf_path = _get_pdf_path(file_path) + # await assert_cancelled(request) + return vectors + + elif ext == '.hwp': + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + return vectors + + elif ext == '.hwpx': + return await self.hwpx_processor(request, file_path, **kwargs) + + elif ext == '.docx': + return await self.docx_processor(request, file_path, **kwargs) + + else: + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + return vectors \ No newline at end of file From acb33e0014be0a014b8cb07180f96fd39f4cb0dd Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 6 Apr 2026 11:28:24 +0900 Subject: [PATCH 17/19] =?UTF-8?q?feat:=20json=20=EC=A0=84=EC=B2=98?= =?UTF-8?q?=EB=A6=AC=EA=B8=B0=20=ED=95=98=EB=82=98=EB=A1=9C=20=ED=86=B5?= =?UTF-8?q?=ED=95=A9.(PMS=20-=20=EA=B2=BD=EC=83=81=EC=98=A4=EB=8D=94,=20TM?= =?UTF-8?q?,=20=EB=B0=9C=EC=A0=84=20=EC=A0=95=EC=A7=80)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/facade/json_processor.py | 344 ++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 genon/preprocessor/facade/json_processor.py diff --git a/genon/preprocessor/facade/json_processor.py b/genon/preprocessor/facade/json_processor.py new file mode 100644 index 0000000000..8616aab79b --- /dev/null +++ b/genon/preprocessor/facade/json_processor.py @@ -0,0 +1,344 @@ +from datetime import datetime +from typing import Optional, Iterable, Any, List, Dict, Tuple +from collections import defaultdict +from fastapi import Request +from pydantic import BaseModel, ConfigDict +from collections import Counter + +import re +import asyncio +import json +import ast +import pdb + +import pandas as pd + +from docling_core.types.doc import ( + BoundingBox, + #CoordOrigin, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + GroupLabel, + #ImageRef, + #ProvenanceItem, + #Size, + #TableCell, + #TableData, + #GroupItem, + DocItem, + PictureItem, + SectionHeaderItem, + TableItem, + TextItem, + PageItem +) + +from docling.document_converter import DocumentConverter, PdfFormatOption, HTMLFormatOption +from docling.datamodel.document import ConversionResult, InputDocument +from docling_core.types import DoclingDocument + +KV_MAP = { + "url": ["URL"], + "ins_date": [ + "입력일", 
# 경상오더 + "발행일자", # TM + ], + "title": [ + "오더제목", # 경상오더 + "고장내용", # 발전정지 + "TM제목", # TM + ], + "num": [ + "오더번호", # 경상오더 + "번호", # 발전정지 + "TM번호", # TM + ], + "Powersys": ["발전소"], # 발전정지 + "desman": ["설계자"], # 경상오더 + "desdept": ["설계부서"], # 경상오더, TM + "hogi": ["호기"], + "des_date": ["설계일"], + "stopcat": ["정지종별"], # 발전정지 + "stopcat_code": ["정지종별코드"], # 발전정지 + "parcat": ["대분류"], # 발전정지 + "cat": ["분류"], # 발전정지 + "event_date": ["발생일시"], # 발전정지 + "rec_date": ["복구일시"], # 발전정지 + "pubman": ["발행자"], + "pubdept": ["발행부서"], + "status": ["진행상태"] +} + +class GenOSVectorMeta(BaseModel): + model_config = ConfigDict(extra="allow") + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.bboxes: Optional[str] = None + self.url: Optional[str] = None + + self.data = {"text": None, + "n_char": None, + "n_line": None, + "i_page": None, + "i_chunk_on_page": None, + "n_chunk_of_page": None, + "i_chunk_on_doc": None, + "n_chunk_of_doc": None, + "n_page": None, + "reg_date": None, + "bboxes": None, + "url": None + } + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + + self.data["text"] = text + self.data["n_char"] = len(text) + self.data["n_word"] = len(text.split()) + self.data["n_line"] = len(text.splitlines()) + + return self + + def set_page_info( + self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int + ) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + + self.data["i_page"] = i_page + self.data["i_chunk_on_page"] = i_chunk_on_page + self.data["n_chunk_of_page"] = n_chunk_of_page + + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + + self.data["i_chunk_on_doc"] = i_chunk_on_doc + + return self + + def set_bboxes(self, bbox: BoundingBox) -> "GenOSVectorMetaBuilder": + """Bounding Boxes 정보 설정""" + # bboxes.append({ + # 'p1': {'x': rect[0] / fitz_page.rect.width, 'y': rect[1] / fitz_page.rect.height}, + # 'p2': {'x': rect[2] / fitz_page.rect.width, 'y': rect[3] / fitz_page.rect.height}, + # }) + # NOTE: docling은 BOTTOMLEFT인데 해당 좌표 그대로 활용되는지 ? 
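+        # docling keeps coord_origin on each BoundingBox (BOTTOMLEFT for PDF
+        # pages); if TOPLEFT-normalized coordinates were needed downstream, the
+        # box would have to be converted first, e.g. bbox.to_top_left_origin(page_height)
+        # -- assuming the docling-core helper of that name. The placeholder
+        # below emits zeroed boxes either way.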
+ conv = [] + conv.append({ + 'p1': {'x': 0, 'y': 0}, + 'p2': {'x': 0, 'y': 0}, + }) + self.bboxes = json.dumps(conv) + + self.data["bboxes"] = json.dumps(conv) + + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + + for key, value in global_metadata.items(): + setattr(self, key, value) + self.data[key] = value + + + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta(text=self.data.pop("text", "ERROR: no text"), **self.data) + +class DocumentProcessor: + def __init__(self): + ''' + initialize Document Converter + ''' + self.page_chunk_counts = defaultdict(int) + # device = AcceleratorDevice.AUTO + num_threads = 4 + + def preprocess_json(self, jsonf): + + metadata_keys = [] + date_keys = [] + + for jsonf_k, _ in jsonf.items(): + for k, v in KV_MAP.items(): + if jsonf_k in v: + metadata_keys.append(jsonf_k) + if "date" in k: + date_keys.append(jsonf_k) + + # date처리, json확인해보고 빼도됨. + for k in date_keys: + if k in jsonf: + try: + # jsonf[k] = pdf.to_datetime(jsonf[k], errors='coerce').isoformat() + date_value = jsonf[k] + if not date_value: + date_value = 0 + + if date_value: + dt_obj = self._parse_date_string(str(date_value)) + + if dt_obj: + jsonf[k] = int(dt_obj.strftime("%Y%m%d")) + else: + jsonf[k] = "" + else: + jsonf[k] = "" + except Exception: + pass + + # nan 처리 + for key in jsonf.keys(): + try: + if not isinstance(jsonf[key], list) and pd.isna(jsonf[key]): + jsonf[key] = "" + except: + pass + + # 메타데이터 처리 + metadata = {key: jsonf[key] for key in metadata_keys if key in jsonf} + + # formatted text 생성 + formatted_text = "\n ".join([f"{key} : {str(jsonf[key])}" for key in jsonf if not key.startswith('Unnamed')]) + + metadata_key_list = list(metadata.keys()) + for k, v_list in KV_MAP.items(): + for metadata_key in metadata_key_list: + if metadata_key in v_list: + if k in metadata.keys(): + print(f"@@@@ 이미 있는 키: {metadata_key} ---X-->> {k}") + print(f"@@@@ 있는 값: {k} : {metadata[k]}") + metadata.pop(metadata_key) + else: + metadata[k] = metadata.pop(metadata_key) + + chunk = { + "id": 1, + "text": formatted_text, + "metadata": metadata + } + + return chunk + + + def load_documents(self, file_path: str): + with open(file_path, 'r', encoding='utf-8') as f: + jsonfile = json.load(f) + + return jsonfile + + def _parse_date_string(self, date_str:str)-> Optional[datetime]: + formats = [ + "%Y-%m-%d", + "%Y%m%d", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%d %H:%M:%S%z", + "%Y-%m-%dT%H:%M:%S.%fZ" + ] + + if not date_str or date_str.strip() == "": + return 0 + for fmt in formats: + try: + return datetime.strptime(date_str, fmt) + except ValueError: + continue + + return 0 + + + # def split_documents(self, documents: dict, **kwargs: dict) -> List[Dict]: + def split_documents(self, documents: dict, **kwargs: dict) -> Dict: + chunk_size = 1000 + text = documents.get("text", "error") + chunks = [] + chunk = "" + + words = text.split(" ") + + for word in words: + if len(chunk) + len(word) > chunk_size: + chunks.append(chunk) + chunk = word + else: + chunk += (" " + word) if chunk else word + + if chunk: + chunks.append(chunk) + + new_chunks = [] + for chunk in chunks: + documents['text'] = chunk + new_chunks.append(documents.copy()) + + return new_chunks + + + def compose_vectors(self, chunks: list[dict], file_path: str) -> \ + list[dict]: + + first_chunk = chunks[0] + + global_metadata = dict( + n_chunk_of_doc=int(1), + n_page=int(1), + 
reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + **first_chunk['metadata'], + ) + + current_page = 1 + chunk_index_on_page = 0 + + vectors = [] + for chunk in chunks: + vector = (GenOSVectorMetaBuilder() + .set_text(chunk["text"]) + .set_page_info(1, 1, 1) + .set_chunk_index(1) + .set_global_metadata(**global_metadata) + .set_bboxes(None) + ).build() + vectors.append(vector) + + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs): # request: Request + + file: dict = self.load_documents(file_path) + + document: dict = self.preprocess_json(file) + + chunks: list[dict] = self.split_documents(document, **kwargs) + + vectors: list[dict] = self.compose_vectors(chunks=chunks, file_path= file_path) + + return vectors \ No newline at end of file From d35e7714ddb807119f3ff2529cd9d83d01647f6d Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 6 Apr 2026 11:29:02 +0900 Subject: [PATCH 18/19] =?UTF-8?q?feat:=20OneAgent=20=EC=97=B0=EB=8F=99?= =?UTF-8?q?=EC=9A=A9=20=EC=A0=84=EC=B2=98=EB=A6=AC=EA=B8=B0=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80.=20(=EC=B2=A8=EB=B6=80=EC=9A=A9=20=EC=A0=84=EC=B2=98?= =?UTF-8?q?=EB=A6=AC=EA=B8=B0=20=EA=B8=B0=EB=B0=98)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../preprocessor/facade/oneagent_processor.py | 1646 +++++++++++++++++ 1 file changed, 1646 insertions(+) create mode 100644 genon/preprocessor/facade/oneagent_processor.py diff --git a/genon/preprocessor/facade/oneagent_processor.py b/genon/preprocessor/facade/oneagent_processor.py new file mode 100644 index 0000000000..93870bace0 --- /dev/null +++ b/genon/preprocessor/facade/oneagent_processor.py @@ -0,0 +1,1646 @@ +from __future__ import annotations + +from collections import defaultdict + +import asyncio +import fitz +import json +import math +import os +import pandas as pd +import pydub +import requests +import shutil +import subprocess +import sys +import threading +import uuid +import warnings +from datetime import datetime +from fastapi import Request +from glob import glob +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import ( + # TextLoader, # TXT + PyMuPDFLoader, # PDF + DataFrameLoader, # DataFrame + UnstructuredWordDocumentLoader, # DOC and DOCX + UnstructuredPowerPointLoader, # PPT and PPTX + UnstructuredImageLoader, # JPG, PNG + UnstructuredMarkdownLoader, # Markdown + UnstructuredFileLoader, # Generic fallback +) +from langchain_core.documents import Document +from markdown2 import markdown +from pandas import DataFrame +from pathlib import Path +from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator +from typing import Any, Iterable, Iterator, List, Optional, Union +from typing_extensions import Self + +try: + import semchunk + from transformers import AutoTokenizer, PreTrainedTokenizerBase +except ImportError: + raise RuntimeError( + "Module requires 'chunking' extra; to install, run: " + "`pip install 'docling-core[chunking]'`" + ) +try: + import chardet +except ImportError: + raise RuntimeError("Module 'chardet' not imported. Run `pip install chardet`.") +try: + from weasyprint import HTML +except ImportError: + print("Warning: WeasyPrint could not be imported. 
PDF conversion features will be disabled.") + HTML = None + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.document import ConversionResult, InputDocument +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.document_converter import DocumentConverter, HwpxFormatOption, WordFormatOption +from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta +from docling_core.types import DoclingDocument as DLDocument +from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, + PictureItem, SectionHeaderItem, TableItem, TextItem +) +from docling_core.types.doc.document import LevelNumber, ListItem, CodeItem +from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend +# from utils import assert_cancelled +# from genos_utils import upload_files, merge_overlapping_bboxes + +# import platform +from pathlib import Path +import os +import subprocess +import tempfile +import shutil +import unicodedata + +import logging + +for n in ("fontTools", "fontTools.ttLib", "fontTools.ttLib.ttFont"): + lg = logging.getLogger(n) + lg.setLevel(logging.CRITICAL) + lg.propagate = False + logging.getLogger().setLevel(logging.WARNING) +# pdf 변환 대상 확장자 +CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] + + + +### @@@@ 성민: 가드레일 용 ### +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = '23c3898fe3264fd597961af23a68fe7c' +# GENOS_URL = 'https://ai.komipo.co.kr:30908/' +# @@@@ 내부 호출로 변경 +GENOS_URL = 'http://llmops-gateway-api-service:8080' + + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + for r in result: + url = f"{GENOS_URL}/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {'question': r.text} + + res = requests.post(f'{url}/run/v2', json=body, headers=headers) + + answer = res.json()['data']['text'] + + if answer.startswith("[UNSAFE]"): + r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." + + + return result + return wrapper + + +def convert_to_pdf(file_path: str) -> str | None: + """ + LibreOffice로 PDF 변환을 시도한다. + 실패해도 예외를 던지지 않고 None을 반환한다. 
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix('.pdf') + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in ('.ppt', '.pptx'): + convert_arg = "pdf:impress_pdf_Export" + elif ext in ('.doc', '.docx'): + convert_arg = "pdf:writer_pdf_Export" + elif ext in ('.xls', '.xlsx', '.csv'): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode('ascii') + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize('NFKD', in_path.stem).encode('ascii', 'ignore').decode('ascii') or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", "--headless", + "--convert-to", convert_arg, + "--outdir", str(out_dir), + str(cand) + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, '.pdf') + return pdf_path + + +def install_packages(packages): + for package in packages: + try: + __import__(package) + except ImportError: + print(f"[!] {package} 패키지가 없습니다. 
설치를 시도합니다.") + subprocess.run([sys.executable, "-m", "pip", "install", package], check=True) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = 'allow' + + text: str | None = None + n_char: int | None = None + n_word: int | None = None + n_line: int | None = None + i_page: int | None = None + e_page: int | None = None + i_chunk_on_page: int | None = None + n_chunk_of_page: int | None = None + i_chunk_on_doc: int | None = None + n_chunk_of_doc: int | None = None + n_page: int | None = None + reg_date: str | None = None + chunk_bboxes: str | None = None + media_files: str | None = None + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + # self.title: Optional[str] = None + # self.created_date: Optional[int] = None + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + 'l': bbox.l / size.width, + 't': bbox.t / size.height, + 'r': bbox.r / size.width, + 'b': bbox.b / size.height, + 'coord_origin': bbox.coord_origin.value + } + chunk_bboxes.append({ + 'page': page_no, + 'bbox': bbox_data, + 'type': type_, + 'ref': label + }) + self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + if not doc_items: + self.media_files = "" + return self + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + 
i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + ) + + +class HwpLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + subprocess.run(['hwp5html', self.file_path, '--output', self.output_dir], check=True, timeout=600) + converted_file_path = os.path.join(self.output_dir, 'index.xhtml') + pdf_save_path = _get_pdf_path(self.file_path) + HTML(converted_file_path).write_pdf(pdf_save_path) + loader = PyMuPDFLoader(pdf_save_path) + return loader.load() + except Exception as e: + print(f"Failed to convert {self.file_path} to XHTML") + raise e + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TextLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + with open(self.file_path, 'rb') as f: + raw = f.read() + enc = chardet.detect(raw).get('encoding') or '' + encodings = [enc] if enc and enc.lower() not in ('ascii', 'unknown') else [] + encodings += ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + + content = None + for e in encodings: + try: + content = raw.decode(e) # 전체 파일로 디코딩 + break + except UnicodeDecodeError: + continue + if content is None: + content = raw.decode('utf-8', errors='replace') + + # 4) PDF 변환 유지 + html = f"
<html><head><meta charset='utf-8'></head><body><pre>{content}</pre></body></html>
" + html_path = os.path.join(self.output_dir, 'temp.html') + with open(html_path, 'w', encoding='utf-8') as f: + f.write(html) + # pdf_path = (self.file_path + # .replace('.txt', '.pdf') + # .replace('.json', '.pdf')) + pdf_path = _get_pdf_path(self.file_path) + if HTML: + HTML(html_path).write_pdf(pdf_path) + loader = PyMuPDFLoader(pdf_path) + return loader.load() + # PDF가 불가하면 Document 직접 반환 (원형 스키마 유지) + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + + except Exception: + # 실패 시에도 스키마는 그대로 유지해 반환 + for e in ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1']: + try: + with open(self.file_path, 'r', encoding=e) as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + except UnicodeDecodeError: + continue + with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TabularLoader: + def __init__(self, file_path: str, ext: str): + + packages = ['openpyxl', 'chardet'] + + install_packages(packages) + + self.file_path = file_path + if ext == ".csv": + # convert_to_pdf(file_path) csv는 Pdf 변환 안 함 + self.data_dict = self.load_csv_documents(file_path) + elif ext == ".xlsx": + # convert_to_pdf(file_path) xlsx는 Pdf 변환 안 함 + self.data_dict = self.load_xlsx_documents(file_path) + else: + print(f"[!] Inadequate extension for TabularLoader: {ext}") + return + + def check_sql_dtypes(self, df): + df = df.convert_dtypes() + res = [] + for col in df.columns: + # col_name = col.strip().replace(' ', '_') + dtype = str(df.dtypes[col]).lower() + + if 'int' in dtype: + if '64' in dtype: + sql_dtype = 'BIGINT' + else: + sql_dtype = 'INT' + elif 'float' in dtype: + sql_dtype = 'FLOAT' + elif 'bool' in dtype: + sql_dtype = 'BOOLEAN' + elif 'date' in dtype: + sql_dtype = 'DATE' + df[col] = df[col].astype(str) + elif 'datetime' in dtype: + sql_dtype = 'DATETIME' + df[col] = df[col].astype(str) + # else: + # max_len = df[col].str.len().max().item() + 10 + # sql_dtype = f'VARCHAR({max_len})' + else: + lens = df[col].astype(str).str.len() + max_len_val = lens.max() + max_len = int(0 if pd.isna(max_len_val) else max_len_val) + 10 + sql_dtype = f'VARCHAR({max_len})' + + res.append([col, sql_dtype]) + + return df, res + + def process_data_rows(self, data: dict): + """Arg: data (keys: 'sheet_name', 'page_column', 'page_column_type', 'documents')""" + + rows = [] + for doc in data["documents"]: + row = {} + if 'int' in data["page_column_type"]: + row[data["page_column"]] = int(doc.page_content) + elif 'float' in data["page_column_type"]: + row[data["page_column"]] = float(doc.page_content) + elif 'bool' in data["page_column_type"]: + if doc.page_content.lower() == 'true': + row[data["page_column"]] = True + elif doc.page_content.lower() == 'false': + row[data["page_column"]] = False + else: + raise ValueError(f"Invalid boolean string: {doc.page_content}") + else: + row[data["page_column"]] = doc.page_content + + row.update(doc.metadata) + rows.append(row) + + processed_data = {"sheet_name": data["sheet_name"], "data_rows": rows, "data_types": data["dtypes"]} + return processed_data + + def load_csv_documents(self, file_path: str, **kwargs: dict): + import chardet + + with open(file_path, "rb") as f: + raw_file = f.read(10000) + enc_type = chardet.detect(raw_file)['encoding'] + df = pd.read_csv(file_path, 
encoding=enc_type, index_col=False) + df = df.fillna('null') # csv 파일에서도 xlsx 파일과 동일하게 null로 채움 + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + # col_type = str(type(col)) + col_type = str(df[col].dtype) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into the string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + data = { + "sheet_name": "table_1", + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + data = self.process_data_rows(data) # including only one sheet as it's a csv file + data_dict = {"data": [data]} + return data_dict + + def load_xlsx_documents(self, file_path: str, **kwargs: dict): + dfs = pd.read_excel(file_path, sheet_name=None) + sheets = [] + for sheet_name, df in dfs.items(): + df = df.fillna('null') + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + col_type = str(type(col)) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + sheet = { + "sheet_name": sheet_name, + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + sheets.append(sheet) + + data_dict = {"data": []} + for sheet in sheets: + data = self.process_data_rows(sheet) + data_dict["data"].append(data) + + return data_dict + + def return_vectormeta_format(self): + if not self.data_dict: + return None + + text = "[DA] " + str(self.data_dict) # Add a token to indicate this string is for data analysis + + # @@@@ 성민: 토큰 수 줄이기위한 후처리(임시조치) + text = text.replace("Unnamed: ", "") + text = text[:2000] + + vectors = [GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." 
+ })] + + + return vectors + + +class AudioLoader: + def __init__(self, + file_path: str, + req_url: str, + req_data: dict, + chunk_sec: int = 29, + tmp_path: str = '.', + ): + self.file_path = file_path + self.tmp_path = tmp_path + self.chunk_sec = chunk_sec + self.req_url = req_url + self.req_data = req_data + + def split_file_as_chunks(self) -> list: + audio = pydub.AudioSegment.from_file(self.file_path) + chunk_len = self.chunk_sec * 1000 + n_chunks = math.ceil(len(audio) / chunk_len) + + for i in range(n_chunks): + start_ms = i * chunk_len + overlap_start_ms = start_ms - 300 if start_ms > 0 else start_ms + end_ms = start_ms + chunk_len + audio_chunk = audio[overlap_start_ms:end_ms] + audio_chunk.export(os.path.join(self.tmp_path, "tmp_{}.wav".format(str(i))), format="wav") + tmp_files = glob(os.path.join(self.tmp_path, "*.wav")) + return tmp_files + + def transcribe_audio(self, file_path_lst: list): + transcribed_text_chunks = [] + + def _send_request(filepath: str): + """Send a request to 'whisper' model served""" + files = { + 'file': (filepath, open(filepath, 'rb'), 'audio/mp3'), + } + + response = requests.post(self.req_url, data=self.req_data, files=files) + text = response.json().get('text', ', ') + transcribed_text_chunks.append({ + 'file_name': os.path.basename(filepath), + 'text': text + }) + + # Send parallel requests + threads = [threading.Thread(target=_send_request, args=(f,)) for f in file_path_lst] + for t in threads: t.start() + for t in threads: t.join() + + # Merge transcribed text snippets in order + transcribed_text_chunks.sort(key=lambda x: x['file_name']) + transcribed_text = "[AUDIO]" + ' '.join([t['text'] for t in transcribed_text_chunks]) + return transcribed_text + + def return_vectormeta_format(self): + audio_chunks = self.split_file_as_chunks() + transcribed_text = self.transcribe_audio(audio_chunks) + res = [GenOSVectorMeta.model_validate({ + 'text': transcribed_text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." + })] + return res + + +### for HWPX from 지능형 전처리기 ### +# * GenOSVectorMetaBuilder # +# * HierarchicalChunker # +# * HybridChunker # +# * HwpxProcessor # +# * GenosServiceException # + +class HierarchicalChunker(BaseChunker): + r""" Chunker implementation leveraging the document layout. + Args: + merge_list_items (bool): Whether to merge successive list items. + Defaults to True. + delim (str): Delimiter to use for merging text. Defaults to "\n". + """ + merge_list_items: bool = True + + @classmethod + def _triplet_serialize(cls, table_df: DataFrame) -> str: + # copy header as first row and shift all rows by one + table_df.loc[-1] = table_df.columns # type: ignore[call-overload] + table_df.index = table_df.index + 1 + table_df = table_df.sort_index() + + rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()] + cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()] + + nrows = table_df.shape[0] + ncols = table_df.shape[1] + texts = [ + f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}" + for i in range(1, nrows) + for j in range(1, ncols) + ] + output_text = ". ".join(texts) + + return output_text + + def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + heading_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + for item, level in dl_doc.iterate_items(): + captions = None + if isinstance(item, DocItem): + # first handle any merging needed + if self.merge_list_items: + if isinstance( + item, ListItem + ) or ( # TODO remove when all captured as ListItem: + isinstance(item, TextItem) + and item.label == DocItemLabel.LIST_ITEM + ): + list_items.append(item) + continue + elif list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + list_items = [] # reset + + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]): + level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + heading_by_level[level] = item.text + text = ''.join(str(value) for value in heading_by_level.values()) + + # remove headings of higher level as they just went out of scope + keys_to_del = [k for k in heading_by_level if k > level] + for k in keys_to_del: + heading_by_level.pop(k, None) + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin + ), + ) + yield c + continue + + if isinstance(item, TextItem) or ( + (not self.merge_list_items) and isinstance(item, ListItem)) or isinstance(item, CodeItem): + text = item.text + + elif isinstance(item, TableItem): + text = item.export_to_markdown(dl_doc) + # dataframe으로 추출할 때 사용되는 코드 + # if table_df.shape[0] < 1 or table_df.shape[1] < 2: + # # at least two cols needed, as first column contains row headers + # continue + # text = self._triplet_serialize(table_df=table_df) + captions = [c.text for c in [r.resolve(dl_doc) for r in item.captions]] or None + + elif isinstance(item, PictureItem): + text = ''.join(str(value) for value in heading_by_level.values()) + else: + continue + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin, + ), + ) + yield c + + if self.merge_list_items and list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + + +class HybridChunker(BaseChunker): + r"""Chunker doing tokenization-aware refinements on top of document layout chunking. + Args: + tokenizer: The tokenizer to use; either instantiated object or name or path of + respective pretrained model + max_tokens: The maximum number of tokens per chunk. 
If not set, limit is + resolved from the tokenizer + merge_peers: Whether to merge undersized chunks sharing same relevant metadata + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + tokenizer: Union[PreTrainedTokenizerBase, str] = ( + "/nfs-root/all-MiniLM-L6-v2" + ) + max_tokens: int = int(1e30) # type: ignore[assignment] + merge_peers: bool = True + _inner_chunker: HierarchicalChunker = HierarchicalChunker() + + @model_validator(mode="after") + def _patch_tokenizer_and_max_tokens(self) -> Self: + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + if self.max_tokens is None: + self.max_tokens = TypeAdapter(PositiveInt).validate_python( + self._tokenizer.model_max_length + ) + return self + + def _count_text_tokens(self, text: Optional[Union[str, list[str]]]): + if text is None: + return 0 + elif isinstance(text, list): + total = 0 + for t in text: + total += self._count_text_tokens(t) + return total + return len(self._tokenizer.tokenize(text)) + + class _ChunkLengthInfo(BaseModel): + total_len: int + text_len: int + other_len: int + + def _count_chunk_tokens(self, doc_chunk: DocChunk): + ser_txt = self.serialize(chunk=doc_chunk) + return len(self._tokenizer.tokenize(text=ser_txt)) + + def _doc_chunk_length(self, doc_chunk: DocChunk): + text_length = self._count_text_tokens(doc_chunk.text) + total = self._count_chunk_tokens(doc_chunk=doc_chunk) + return self._ChunkLengthInfo( + total_len=total, + text_len=text_length, + other_len=total - text_length, + ) + + def _make_chunk_from_doc_items( + self, doc_chunk: DocChunk, window_start: int, window_end: int + ): + doc_items = doc_chunk.meta.doc_items[window_start: window_end + 1] + meta = DocMeta( + doc_items=doc_items, + headings=doc_chunk.meta.headings, + captions=doc_chunk.meta.captions, + origin=doc_chunk.meta.origin, + ) + window_text = ( + doc_chunk.text + if len(doc_chunk.meta.doc_items) == 1 + else self.delim.join( + [ + doc_item.text + for doc_item in doc_items + if isinstance(doc_item, TextItem) + ] + ) + ) + new_chunk = DocChunk(text=window_text, meta=meta) + return new_chunk + + def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]: + chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_items = len(doc_chunk.meta.doc_items) + while window_end < num_items: + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end, + ) + if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens: + if window_end < num_items - 1: + window_end += 1 + # 아직 청크에 여유가 있고, 남은 아이템도 있으므로 계속 추가 시도 + continue + else: + # 현재 윈도우의 모든 아이템이 청크에 들어갔고, 더 이상 아이템이 없음 + window_end = num_items # signalizing the last loop + elif window_start == window_end: + # 아이템 1개도 청크에 안 들어감 → 단독 청크로 처리, 이후 재분할 + window_end += 1 + window_start = window_end + else: + # 마지막 아이템 빼고 청크 생성 → 남은 아이템으로 새 윈도우 시작 + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end - 1, + ) + window_start = window_end + chunks.append(new_chunk) + return chunks + + def _split_using_plain_text(self, doc_chunk: DocChunk) -> list[DocChunk]: + lengths = self._doc_chunk_length(doc_chunk) + if lengths.total_len <= self.max_tokens: + return [doc_chunk] + else: + # 헤더/캡션을 제외하고 본문 텍스트에 할당 가능한 토큰 수 계산 + available_length = self.max_tokens - lengths.other_len + sem_chunker = semchunk.chunkerify( + self._tokenizer, 
chunk_size=available_length + ) + if available_length <= 0: + warnings.warn( + f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}" + # noqa + ) + return [] + text = doc_chunk.text + segments = sem_chunker.chunk(text) + chunks = [type(doc_chunk)(text=s, meta=doc_chunk.meta) for s in segments] + return chunks + + def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): + output_chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_chunks = len(chunks) + + while window_end < num_chunks: + chunk = chunks[window_end] + headings_and_captions = (chunk.meta.headings, chunk.meta.captions) + ready_to_append = False + + if window_start == window_end: + current_headings_and_captions = headings_and_captions + window_end += 1 + first_chunk_of_window = chunk + + else: + chks = chunks[window_start: window_end + 1] + doc_items = [it for chk in chks for it in chk.meta.doc_items] + candidate = DocChunk( + text=self.delim.join([chk.text for chk in chks]), + meta=DocMeta( + doc_items=doc_items, + headings=current_headings_and_captions[0], + captions=current_headings_and_captions[1], + origin=chunk.meta.origin, + ), + ) + + if (headings_and_captions == current_headings_and_captions + and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens + ): + # 토큰 수 여유 있음 → 청크 확장 계속 + window_end += 1 + new_chunk = candidate + else: + ready_to_append = True + + if ready_to_append or window_end == num_chunks: + # no more room OR the start of new metadata. + if window_start + 1 == window_end: + output_chunks.append(first_chunk_of_window) + else: + output_chunks.append(new_chunk) + window_start = window_end + + return output_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + res: Iterable[DocChunk] + res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs) # type: ignore + res = [x for c in res for x in self._split_by_doc_items(c)] + res = [x for c in res for x in self._split_using_plain_text(c)] + + if self.merge_peers: + res = self._merge_chunks_with_matching_metadata(res) + return iter(res) + + +class DocxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.converter = DocumentConverter( + format_options={ + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimplePipeline, backend=GenosMsWordDocumentBackend + ), + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = 
document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + return vectors + + +class HwpxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.pipeline_options.save_images = False + self.converter = DocumentConverter( + format_options={ + InputFormat.XML_HWPX: HwpxFormatOption( + pipeline_options=self.pipeline_options + ) + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + save_images = kwargs.get('save_images', False) + + if self.pipeline_options.save_images != save_images: + self.pipeline_options.save_images = save_images + # self._create_converters() + + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = 
self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + + + text = "" + for vector in vectors: + if len(text) + len(vector.text) > 8192: + break + text += vector.text + + + return [vectors[0]] + + +class GenosServiceException(Exception): + """GenOS 와의 의존성 부분 제거를 위해 추가""" + + def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None: + self.code = 1 + self.error_code = error_code + self.error_msg = error_msg or "GenOS Service Exception" + self.msg_params = msg_params or {} + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})" + + +# async def assert_cancelled(request: Request): +# """GenOS 와의 의존성 제거를 위해 추가""" +# if await request.is_disconnected(): +# raise GenosServiceException(1, f"Cancelled") + + +# @@@@ 성민: OCR을 위해서 추가 +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + # OcrEngine, + # PdfBackend, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.document_converter import PdfFormatOption + +class DocumentProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.hwpx_processor = HwpxProcessor() + self.docx_processor = DocxProcessor() + + # @@@@ 성민: OCR을 위해서 추가 + self.ocr_endpoint = "http://doc-parser-ocr-service:8080/ocr" + ocr_options = PaddleOcrOptions( + force_full_page_ocr=False, + lang=['korean'], + ocr_endpoint=self.ocr_endpoint, + text_score=0.3) + + + device = AcceleratorDevice.AUTO + num_threads = 8 + accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) + + # PDF 파이프라인 옵션 설정 + self.pipe_line_options = PdfPipelineOptions() + self.pipe_line_options.generate_page_images = True + self.pipe_line_options.generate_picture_images = True + self.pipe_line_options.do_ocr = False + self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.ocr_options.lang = ["ko", 'en'] + # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model" + # self.pipe_line_options.ocr_options.force_full_page_ocr = True + # ocr_options = TesseractOcrOptions() + # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert'] + # ocr_options.path = './.tesseract/tessdata' + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.artifacts_path = Path("/models/") + self.pipe_line_options.do_table_structure = True + self.pipe_line_options.images_scale = 2 + self.pipe_line_options.table_structure_options.do_cell_matching = True + self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE + self.pipe_line_options.accelerator_options = accelerator_options + + # Simple 파이프라인 옵션을 인스턴스 변수로 저장 + self.simple_pipeline_options = PipelineOptions() + self.simple_pipeline_options.save_images = False + + # ocr 파이프라인 옵션 + self.ocr_pipe_line_options = PdfPipelineOptions() + self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True) + 
self.ocr_pipe_line_options.do_ocr = True + self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True) + self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True + + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, + backend=DoclingParseV4DocumentBackend + ), + } + ) + + def get_loader(self, file_path: str): + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + return PyMuPDFLoader(file_path) + elif ext != real_type and real_type in ['txt', 'json', 'md']: + return TextLoader(file_path) + # 원래 확장자 기반 로직 + elif ext == '.pdf': + return PyMuPDFLoader(file_path) + elif ext == '.doc': + convert_to_pdf(file_path) + return UnstructuredWordDocumentLoader(file_path) + elif ext in ['.ppt', '.pptx']: + convert_to_pdf(file_path) + return UnstructuredPowerPointLoader(file_path) + elif ext in ['.jpg', '.jpeg', '.png']: + convert_to_pdf(file_path) + # 한국어 OCR 지원을 위한 언어 설정 + return UnstructuredImageLoader( + file_path, + languages=["kor", "eng"], # 한국어 + 영어 OCR + ) + elif ext in ['.txt', '.json', '.md']: + return TextLoader(file_path) + elif ext == '.hwp': + return HwpLoader(file_path) + elif ext == '.md': + return UnstructuredMarkdownLoader(file_path) + else: + return UnstructuredFileLoader(file_path) + + def get_real_file_type(self, file_path: str) -> str: + """파일 확장자가 아닌 실제 내용으로 파일 타입 판단""" + with open(file_path, 'rb') as f: + header = f.read(8) + if header.startswith(b'%PDF-'): + return 'pdf' + elif header.startswith(b'\x89PNG'): + return 'png' + elif header.startswith(b'\xff\xd8\xff'): + return 'jpg' + + # 매직 헤더로 판단할 수 없으면 확장자 사용 + return os.path.splitext(file_path)[-1].lower() + + def convert_md_to_pdf(self, md_path): + """Markdown 파일을 PDF로 변환""" + install_packages(['chardet']) + import chardet + + pdf_path = md_path.replace('.md', '.pdf') + with open(md_path, 'rb') as f: + raw_file = f.read() + candidates = ['utf-8', 'utf-8-sig'] + try: + det = (chardet.detect(raw_file) or {}).get('encoding') or '' + # chardet가 ascii/unknown이면 무시. 
그 외면 후보에 추가 + if det and det.lower() not in ('ascii', 'unknown'): + if det.lower() not in [c.lower() for c in candidates]: + candidates.append(det) + except Exception: + pass + candidates += ['cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + md_content = None + for enc in candidates: + try: + md_content = raw_file.decode(enc) + break + except UnicodeDecodeError: + continue + if md_content is None: + md_content = raw_file.decode('utf-8', errors='replace') + + html_content = markdown(md_content) + if HTML: + HTML(string=html_content).write_pdf(pdf_path) + return pdf_path + + + + def _create_converters(self): + """컨버터들을 생성하는 헬퍼 메서드""" + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, + backend=DoclingParseV4DocumentBackend + ), + } + ) + + + def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument: + # kwargs에서 save_images 값을 가져와서 옵션 업데이트 + save_images = kwargs.get('save_images', True) + include_wmf = kwargs.get('include_wmf', False) + + # save_images 옵션이 현재 설정과 다르면 컨버터 재생성 + if (self.simple_pipeline_options.save_images != save_images or + getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf): + self.simple_pipeline_options.save_images = save_images + self.simple_pipeline_options.include_wmf = include_wmf + self._create_converters() + + try: + conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True) + except Exception as e: + print("@@@@", e) + # conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True) + + return conv_result.document + + + def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]: + loader = self.get_loader(file_path) + documents = loader.load() + + # @@@@ 성민: 밑에 주석 + # 이미지 파일의 경우 텍스트 추출 안되었을 시 기본 텍스트 제공 + # ext = os.path.splitext(file_path)[-1].lower() + # if ext in ['.jpg', '.jpeg', '.png']: + # # documents가 없거나, 있어도 모든 page_content가 비어있는 경우 + # if not documents or not any(doc.page_content.strip() for doc in documents): + # documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})] + + # @@@@ 성민 새로 작성: 텍스트가 없을 경우 OCR 수행 + if not documents or not any(doc.page_content.strip() for doc in documents): + document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) + + documents = list([Document(page_content=document.export_to_markdown(), metadata={})]) + + return documents + + def split_documents(self, documents, **kwargs: dict) -> list[Document]: + # @@@@ 성민: GenOS에서 바꿔도 안바뀌는듯? 
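+        # A minimal sketch of the kwargs contract assumed here (key names are
+        # illustrative, not a confirmed GenOS schema): only the four splitter
+        # arguments survive the filter below, so unrelated keys such as
+        # save_images are silently dropped before the
+        # RecursiveCharacterTextSplitter is constructed.
+        #
+        #   kwargs = {"chunk_size": 20_000, "chunk_overlap": 200,
+        #             "separators": ["\n\n", "\n", " "], "save_images": True}
+        #   # -> splitter_kwargs keeps only chunk_size / chunk_overlap /
+        #   #    separators / length_function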
+ print("@@@@ kwargs", kwargs) + + kwargs.setdefault("chunk_size", 20_000) + + splitter_kwargs = { + k: v for k, v in kwargs.items() + if k in ["chunk_size", "chunk_overlap", "separators", "length_function"] + } + + text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs) + + chunks = text_splitter.split_documents(documents) + chunks = [chunk for chunk in chunks if chunk.page_content] + + if not chunks: + raise Exception('Empty document') + + for chunk in chunks: + page = chunk.metadata.get('page', 0) + self.page_chunk_counts[page] += 1 + return chunks + + def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict) -> list[dict]: + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + pdf_path = file_path + elif ext != real_type and real_type in ['txt', 'json', 'md']: + pdf_path = _get_pdf_path(file_path) + # 원래 확장자 기반 로직 + elif file_path.endswith('.md'): + pdf_path = self.convert_md_to_pdf(file_path) + elif file_path.endswith(('.ppt', '.pptx')): + pdf_path = _get_pdf_path(file_path) + else: + pdf_path = _get_pdf_path(file_path) + + # doc = fitz.open(pdf_path) if (pdf_path and os.path.exists(pdf_path)) else None + + if file_path.endswith(('.ppt', '.pptx')): + if os.path.exists(pdf_path): + subprocess.run(["rm", pdf_path], check=True) + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=max([chunk.metadata.get('page', 0) for chunk in chunks]), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z' + ) + current_page = None + chunk_index_on_page = 0 + + vectors = [] + for chunk_idx, chunk in enumerate(chunks): + page = chunk.metadata.get('page', 0) + text = chunk.page_content + + if page != current_page: + current_page = page + chunk_index_on_page = 0 + + # 첨부용에서는 bbox 정보 추출 X + # if doc: + # fitz_page = doc.load_page(page) + # global_metadata['chunk_bboxes'] = json.dumps(merge_overlapping_bboxes([{ + # 'page': page + 1, + # 'type': 'text', + # 'bbox': { + # 'l': rect[0] / fitz_page.rect.width, + # 't': rect[1] / fitz_page.rect.height, + # 'r': rect[2] / fitz_page.rect.width, + # 'b': rect[3] / fitz_page.rect.height, + # } + # } for rect in fitz_page.search_for(text)], x_tolerance=1 / fitz_page.rect.width, + # y_tolerance=1 / fitz_page.rect.height)) + + vectors.append(GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': len(text), + 'n_word': len(text.split()), + 'n_line': len(text.splitlines()), + 'i_page': page, + 'e_page': page, + 'i_chunk_on_page': chunk_index_on_page, + 'n_chunk_of_page': self.page_chunk_counts[page], + 'i_chunk_on_doc': chunk_idx, + **global_metadata + })) + chunk_index_on_page += 1 + + return vectors + + @guardrail + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + + # @@@@ 성민: OneAgent 연동용 + if "uploads" in kwargs.keys(): + import base64 + uploads = kwargs.get("uploads", None)[0] + + # @@@@ 전처리기 파일 저장 경로 + folder = "/nfs-root/tmp/uploads" + + decoded = base64.b64decode(uploads['data'].split(",", 1)[1]) + file_path = os.path.join(folder, uploads['name']) + + with open(file_path, 'wb') as f: + f.write(decoded) + + ext = os.path.splitext(file_path)[-1].lower() + if ext in ('.wav', '.mp3', '.m4a'): + # Generate a temporal path saving audio chunks: the audio file is supposed to be splited to several chunks due to limitted length by the model + tmp_path = "./tmp_audios_{}".format(os.path.basename(file_path).split('.')[0]) + if not os.path.exists(tmp_path): + 
os.makedirs(tmp_path) + + # Use 'Whisper' model served in-house + # [!] Modify the request parameters to change a STT model to be used + loader = AudioLoader( + file_path=file_path, + req_url="http://192.168.74.164:30100/v1/audio/transcriptions", + req_data={ + 'model': 'model', + 'language': 'ko', + 'response_format': 'json', + 'temperature': '0', + 'stream': 'false', + 'timestamp_granularities[]': 'word' + }, + chunk_sec=29, # length(sec) of a chunk from the uploaded audio + tmp_path=tmp_path + ) + vectors = loader.return_vectormeta_format() + # await assert_cancelled(request) + + # Remove the temporal chunks + try: + subprocess.run(['rm', '-r', tmp_path], check=True) + except: + pass + # await assert_cancelled(request) + return vectors + + elif ext in ('.csv', '.xlsx'): + loader = TabularLoader(file_path, ext) + vectors = loader.return_vectormeta_format() + # pdf_path = _get_pdf_path(file_path) + # await assert_cancelled(request) + return vectors + + elif ext == '.hwp': + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + return vectors + + elif ext == '.hwpx': + return await self.hwpx_processor(request, file_path, **kwargs) + + elif ext == '.docx': + return await self.docx_processor(request, file_path, **kwargs) + + else: + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + + return vectors \ No newline at end of file From b5877fe12fa2066a9a7fec8441575ad1e0b4ffbb Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 16 Apr 2026 13:23:30 +0900 Subject: [PATCH 19/19] =?UTF-8?q?feat:=20=EB=AA=A8=EB=93=88=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/module/base_processor.py | 175 ++ .../module/intelligent_processor.py | 1675 +++++++++++++++++ genon/preprocessor/module/test.py | 59 + genon/preprocessor/module/test_processor.py | 55 + genon/preprocessor/module/utils/chunkers.py | 836 ++++++++ genon/preprocessor/module/utils/genos_util.py | 17 + genon/preprocessor/module/utils/metadata.py | 352 ++++ genon/preprocessor/module/utils/util.py | 146 ++ 8 files changed, 3315 insertions(+) create mode 100644 genon/preprocessor/module/base_processor.py create mode 100644 genon/preprocessor/module/intelligent_processor.py create mode 100644 genon/preprocessor/module/test.py create mode 100644 genon/preprocessor/module/test_processor.py create mode 100644 genon/preprocessor/module/utils/chunkers.py create mode 100644 genon/preprocessor/module/utils/genos_util.py create mode 100644 genon/preprocessor/module/utils/metadata.py create mode 100644 genon/preprocessor/module/utils/util.py diff --git a/genon/preprocessor/module/base_processor.py b/genon/preprocessor/module/base_processor.py new file mode 100644 index 0000000000..fd277a8b08 --- /dev/null +++ b/genon/preprocessor/module/base_processor.py @@ -0,0 +1,175 @@ +from typing import Any, List + +from fastapi import Request +from langchain_core.documents import Document +from docling.document_converter import DocumentConverter, PdfFormatOption, HwpxFormatOption, WordFormatOption +from 
docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) +from docling.datamodel.base_models import InputFormat +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend +from docling_core.transforms.chunker import BaseChunker, DocChunk +from docling_core.types import DoclingDocument +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docling.pipeline.simple_pipeline import SimplePipeline + + +from utils.chunkers import CHUNKERS +from utils.metadata import GenOSVectorMetaBuilder + + +# load 파이프라인 +# open -> table -> reading order + +""" +모델들 +- detection model +- recognition model +- ocr + - easy + - paddle + +- table 모델 + +- vlm + - 이미지 디스크립션 모델 + - 문서 로테이션 모델 + - TOC 생성 모델 +""" + +""" +컴포넌트 + - base64로 오면 저장 (oneagent용) + - 파일 오픈: 확장자 별로.... + - pdf로 저장(GenOS에서 보여주려고) + - 리딩오더 + - 레이아웃 + - 테이블 디텍션 + - 이미지 디스크립션 + - ocr -> 용도별로 + - 이미지 로테이션 +""" + +# TODO all ext +FORMAT_MAP = { + "pdf": InputFormat.PDF, + # "hwp": InputFormat.HWP, + # "hwpx":InputFormat.HWPX,# TODO + # "doc":InputFormat.DOC, # TODO + "docx": InputFormat.DOCX, + # "ppt": InputFormat.PPT, #TODO + # "pptx": InputFormat.PPTX, + # "xlsx": InputFormat.XLSX, + # "csv": InputFormat.CSV, + # "md": InputFormat.MD, + # "json": InputFormat.JSON, + # "html": InputFormat.HTML, +} + + +# TODO all ext +FORMAT_OPTION_MAP = { + InputFormat.PDF: PdfFormatOption, + # InputFormat.HWP: HwpFormatOption # TODO 왜 HwpFormatOption은 없는지 + # "hwpx":InputFormat.HWPX,# TODO + # "doc":InputFormat.DOC, # TODO + InputFormat.DOCX: WordFormatOption, + # "ppt": InputFormat.PPT, #TODO + # InputFormat.PPTX, + # InputFormat.XLSX, + # InputFormat.CSV, + # InputFormat.MD, + # InputFormat.JSON, + # InputFormat.HTML, +} + +PIPELINE_MAP = { + "pdf": StandardPdfPipeline, + "simple": SimplePipeline, +} + +BACKEND_MAP = { + "pypdf": PyPdfiumDocumentBackend, + "msword": GenosMsWordDocumentBackend, +} + + +class BaseProcessor: + pipeline: list[str] = None + format_options: None + chunker: BaseChunker = None + loaders: list = None + converter: DocumentConverter = None + config: dict = None + + def __init__(self, config: dict) -> None: + # mapping 해주자 + self.config = config + self.allowed_formats = self._build_allowed_formats() + self.format_options = self._build_format_options() + self.converter = DocumentConverter( + allowed_formats=self.allowed_formats, + format_options=self.format_options, + ) + + # self.loaders = LOADERS["pdf"] # 로더 왜 필요하더라 + + self.chunker = CHUNKERS["simple"]() + self.genos_meta_builder = GenOSVectorMetaBuilder() + + def _build_allowed_formats(self): + allowed_formats = [] + for _format in self.config["format_options"].keys(): + format = FORMAT_MAP.get(_format, None) + assert format is not None, f"@@@@ 잘못된 확장자입니다. 
{_format}, 가능한 확장자: {list(FORMAT_MAP.keys())}" + allowed_formats.append(format) + return allowed_formats + + def _build_format_options(self): + format_options = {} + for _format, option in self.config["format_options"].items(): + format = FORMAT_MAP.get(_format, None) + + format_options[format] = FORMAT_OPTION_MAP[format]( + pipeline_cls=PIPELINE_MAP[option["pipeline_options"]], + backend=BACKEND_MAP[option["backend"]], + ) + + # @@@ 성민: pdf 일때만 이미지 저장이 가능하네 + if "generate_picture_images" in option and option["generate_picture_images"] == True: + format_options[format].pipeline_options.generate_picture_images = True + + return format_options + + def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]: + """ + 설명: 확장자에 해당하는 DocumentConverter를 사용하여 ConversionResult 리턴 + """ + # TODO: OneAgent 호출인지 판단. 이게 여기 있어야 할까 __call_ 에 있어야 할까. + + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + + return conv_result.document + + def split_documents(self, documents: list[Document], **kwargs: dict) -> list[Document]: + chunks = list(self.chunker.chunk(documents, **kwargs)) + return chunks + + async def compose_vectors( + self, request: Request, file_path: str, document: DoclingDocument, chunks: List[DocChunk], **kwargs: dict + ) -> list[dict]: + return await self.genos_meta_builder(document, chunks, file_path, request, **kwargs) + + async def __call__(self, request: Request, file_path: str, **kwargs: dict) -> Any: + documents = self.load_documents(file_path, **kwargs) + chunks = self.split_documents(documents, **kwargs) + vectors = await self.compose_vectors(request, file_path, documents, chunks, **kwargs) + return vectors diff --git a/genon/preprocessor/module/intelligent_processor.py b/genon/preprocessor/module/intelligent_processor.py new file mode 100644 index 0000000000..79742ec8ab --- /dev/null +++ b/genon/preprocessor/module/intelligent_processor.py @@ -0,0 +1,1675 @@ +from __future__ import annotations + +import json +import os +import logging +import math, bisect +from pathlib import Path + +from collections import defaultdict +from datetime import datetime +from typing import Optional, Iterable, Any, List, Dict, Tuple + +from fastapi import Request + +_log = logging.getLogger(__name__) + +# docling imports + +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend + +# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.backend.genos_pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.pipeline.simple_pipeline import SimplePipeline + +# from docling.datamodel.document import ConversionStatus +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + # OcrEngine, + # PdfBackend, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) + +from docling.document_converter import DocumentConverter, PdfFormatOption, FormatOption +from docling.datamodel.pipeline_options import DataEnrichmentOptions +from docling.utils.document_enrichment import enrich_document, check_document +from docling.datamodel.document import ConversionResult +from docling_core.transforms.chunker import ( + BaseChunk, + BaseChunker, + DocChunk, + DocMeta, +) +from docling_core.types import DoclingDocument + +from pandas import DataFrame +import asyncio +from docling_core.types import DoclingDocument as DLDocument +from docling_core.types.doc.document import ( + DocumentOrigin, + 
LevelNumber, + ListItem, + CodeItem, + ContentLayer, +) +from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc import ( + BoundingBox, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + DocItem, + PictureItem, + SectionHeaderItem, + TableItem, + TextItem, + PageItem, + ProvenanceItem, +) +from collections import Counter +import re +import json +import warnings +from typing import Iterable, Iterator, Optional, Union + +from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator +from typing_extensions import Self + +try: + import semchunk + from transformers import AutoTokenizer, PreTrainedTokenizerBase +except ImportError: + raise RuntimeError("Module requires 'chunking' extra; to install, run: " "`pip install 'docling-core[chunking]'`") + +try: + from genos_utils import upload_files +except ImportError: + upload_files = None + +# ============================================ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# + +"""Chunker implementation leveraging the document structure.""" + + +class GenosBucketChunker(BaseChunker): + """토큰 제한을 고려하여 섹션별 청크를 분할하고 병합하는 청커 (v2)""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + tokenizer: Union[PreTrainedTokenizerBase, str] = "sentence-transformers/all-MiniLM-L6-v2" + max_tokens: int = 1024 + merge_peers: bool = True + + # _inner_chunker: BaseChunker = None + _tokenizer: PreTrainedTokenizerBase = None + merge_list_items: bool = True + + @model_validator(mode="after") + def _initialize_components(self) -> Self: + # 토크나이저 초기화 + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + return self + + def preprocess(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서의 모든 아이템을 헤더 정보와 함께 청크로 생성 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 문서의 모든 아이템을 포함하는 하나의 청크 + """ + # 모든 아이템과 헤더 정보 수집 + all_items = [] + all_header_info = [] # 각 아이템의 헤더 정보 + current_heading_by_level: dict[LevelNumber, str] = {} + all_header_short_info = [] # 각 아이템의 짧은 헤더 정보 + current_heading_short_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + + # iterate_items()로 수집된 아이템들의 self_ref 추적 + processed_refs = set() + + # 모든 아이템 순회 + for item, level in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}): + if hasattr(item, "self_ref"): + processed_refs.add(item.self_ref) + + if not isinstance(item, DocItem): + continue + + # 리스트 아이템 병합 처리 + if self.merge_list_items: + if isinstance(item, ListItem) or (isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM): + list_items.append(item) + continue + elif list_items: + # 누적된 리스트 아이템들을 추가 + for list_item in list_items: + all_items.append(list_item) + # 리스트 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + list_items = [] + + # 섹션 헤더 처리 + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ): + # 새로운 헤더 레벨 설정 + header_level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + current_heading_by_level[header_level] = item.text + current_heading_short_by_level[header_level] = item.orig # 첫 단어로 짧은 헤더 정보 설정 + + # 더 깊은 레벨의 헤더들 제거 + keys_to_del = [k for k in 
current_heading_by_level if k > header_level] + for k in keys_to_del: + current_heading_by_level.pop(k, None) + keys_to_del_short = [k for k in current_heading_short_by_level if k > header_level] + for k in keys_to_del_short: + current_heading_short_by_level.pop(k, None) + + # 헤더 아이템도 추가 (헤더 자체도 아이템임) + all_items.append(item) + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + continue + + if ( + isinstance(item, TextItem) + or isinstance(item, ListItem) + or isinstance(item, CodeItem) + or isinstance(item, TableItem) + or isinstance(item, PictureItem) + ): + # if item.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]: + # item.text = "" + all_items.append(item) + # 현재 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # 마지막 리스트 아이템들 처리 + if list_items: + for list_item in list_items: + all_items.append(list_item) + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # iterate_items()에서 누락된 테이블들을 별도로 추가 + missing_tables = [] + for table in dl_doc.tables: + table_ref = getattr(table, "self_ref", None) + if table_ref not in processed_refs: + missing_tables.append(table) + + # 누락된 테이블들을 문서 앞부분에 추가 (페이지 1의 테이블들일 가능성이 높음) + if missing_tables: + for missing_table in missing_tables: + # 첫 번째 위치에 삽입 (헤더 테이블일 가능성이 높음) + all_items.insert(0, missing_table) + all_header_info.insert(0, {}) # 빈 헤더 정보 + all_header_short_info.insert(0, {}) # 빈 짧은 헤더 정보 + + # 아이템이 없으면 빈 문서 + if not all_items: + return + + # 모든 아이템을 하나의 청크로 반환 (HybridChunker에서 분할) + # headings는 None으로 설정하고, 헤더 정보는 별도로 관리 + chunk = DocChunk( + text="", # 텍스트는 HybridChunker에서 생성 + meta=DocMeta( + doc_items=all_items, + headings=None, # DocMeta의 원래 형식 유지 + captions=None, + origin=dl_doc.origin, + ), + ) + # 헤더 정보를 별도 속성으로 저장 + chunk._header_info_list = all_header_info + chunk._header_short_info_list = all_header_short_info # 짧은 헤더 정보도 저장 + yield chunk + + def _count_tokens(self, text: str) -> int: + """텍스트의 토큰 수 계산 (안전한 분할 처리)""" + if not text: + return 0 + + # 텍스트를 더 작은 단위로 분할하여 계산 + max_chunk_length = 300 # 더 안전한 길이로 설정 + total_tokens = 0 + + # 텍스트를 줄 단위로 먼저 분할 + lines = text.split("\n") + current_chunk = "" + + for line in lines: + # 현재 청크에 줄을 추가했을 때 길이 확인 + temp_chunk = current_chunk + "\n" + line if current_chunk else line + + if len(temp_chunk) <= max_chunk_length: + current_chunk = temp_chunk + else: + # 현재 청크가 있으면 토큰 계산 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + # 새로운 청크 시작 + current_chunk = line + + # 마지막 청크 처리 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + return total_tokens + + def _generate_text_from_items_with_headers( + self, items: list[DocItem], header_info_list: list[dict], dl_doc: DoclingDocument + ) -> str: + """DocItem 리스트로부터 헤더 정보를 포함한 텍스트 생성""" + text_parts = [] + current_section_headers = {} # 현재 섹션의 헤더 정보 + + for i, item in enumerate(items): + item_headers = header_info_list[i] if i < len(header_info_list) else {} + + # 헤더 정보가 변경된 경우 (새로운 섹션 시작) + if item_headers != 
current_section_headers: + # 변경된 헤더 레벨들만 추가 + headers_to_add = [] + for level in sorted(item_headers.keys()): + # 이전 섹션과 다른 헤더만 추가 + if level not in current_section_headers or current_section_headers[level] != item_headers[level]: + # 해당 레벨까지의 모든 상위 헤더 포함 + for l in sorted(item_headers.keys()): + if l < level: + headers_to_add.append(item_headers[l]) + elif l == level: + headers_to_add.append("") + + break + + # 헤더가 있으면 추가 + if headers_to_add: + header_text = ", ".join(headers_to_add) + if header_text not in text_parts: + text_parts.append(header_text) + + current_section_headers = item_headers.copy() + + # 아이템 텍스트 추가 + if isinstance(item, TableItem): + table_text = self._extract_table_text(item, dl_doc) + if table_text: + text_parts.append(table_text) + elif hasattr(item, "text") and item.text: + # 타이틀과 섹션 헤더 처리 개선 + # is_section_header = ( + # isinstance(item, SectionHeaderItem) or + # (isinstance(item, TextItem) and + # item.label in [DocItemLabel.SECTION_HEADER]) # TITLE은 제외 + # ) + + # 타이틀은 항상 포함, 섹션 헤더는 중복 방지를 위해 스킵 + # if not is_section_header: + # 20250909, shkim, text_parts에 없는 경우만 추가. 섹션헤더가 반복해서 추가되는 것 방지 + if item.text not in text_parts: + text_parts.append(item.text) + elif isinstance(item, PictureItem): + text_parts.append("") # 이미지는 빈 텍스트 + + result_text = self.delim.join(text_parts) + return result_text + + def _extract_table_text(self, table_item: TableItem, dl_doc: DoclingDocument) -> str: + """테이블에서 텍스트를 추출하는 일반화된 메서드""" + try: + # 먼저 export_to_markdown 시도 + table_text = table_item.export_to_markdown(dl_doc) + if table_text and table_text.strip(): + return table_text + except Exception: + pass + + # export_to_markdown 실패 시 테이블 셀 데이터에서 직접 텍스트 추출 + try: + if hasattr(table_item, "data") and table_item.data: + cell_texts = [] + + # table_cells에서 텍스트 추출 + if hasattr(table_item.data, "table_cells"): + for cell in table_item.data.table_cells: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # grid에서 텍스트 추출 (table_cells가 없는 경우) + elif hasattr(table_item.data, "grid") and table_item.data.grid: + for row in table_item.data.grid: + if isinstance(row, list): + for cell in row: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # 추출된 셀 텍스트들을 결합 + if cell_texts: + return " ".join(cell_texts) + except Exception: + pass + + # 모든 방법 실패 시 item.text 사용 (있는 경우) + if hasattr(table_item, "text") and table_item.text: + return table_item.text + + return "" + + def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[str]]: + """헤더 정보 리스트에서 실제 사용되는 모든 헤더들을 level 순서대로 추출하고 ', '로 연결""" + if not header_info_list: + return None + + all_headers = [] # header 순서대로 추가 + seen_headers = set() # 중복 방지용 + + for header_info in header_info_list: + if header_info: + for level in sorted(header_info.keys()): + header_text = header_info[level] + if header_text and header_text not in seen_headers: + all_headers.append(header_text) + seen_headers.add(header_text) + + return all_headers if all_headers else None + + def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]: + """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)""" + if not table_text: + return [table_text] + + # 전체 테이블이 토큰 제한 내인지 확인 + if self._count_tokens(table_text) <= max_tokens: + return [table_text] + + # 단순히 토큰 수 기준으로 텍스트 분할 + # semchunk 사용하여 토큰 제한에 맞게 분할 + chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens) + chunks = chunker(table_text) + return chunks if chunks else [table_text] + + def 
_is_section_header(self, item: DocItem) -> bool: + """아이템이 section header인지 확인""" + return isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ) + + def _get_section_header_level(self, item: DocItem) -> Optional[int]: + """Section header의 level을 반환""" + if isinstance(item, SectionHeaderItem): + return item.level + elif isinstance(item, TextItem): + if item.label == DocItemLabel.TITLE: + return 0 + elif item.label == DocItemLabel.SECTION_HEADER: + return 1 + return None + + def _generate_section_text_with_heading( + self, section_items: list[DocItem], section_header_infos: list[dict], dl_doc: DoclingDocument + ) -> str: + """섹션의 텍스트를 생성하되, 앞에 heading을 붙임""" + # 첫 번째 item의 header_info에서 heading 추출 + if section_header_infos and section_header_infos[0]: + merged_headers = {} + for level, header_text in section_header_infos[0].items(): + if header_text: + merged_headers[level] = header_text + + # level 순서대로 정렬해서 ', '로 연결 + if merged_headers: + sorted_levels = sorted(merged_headers.keys()) + headers = [merged_headers[level] for level in sorted_levels] + heading_text = ", ".join(headers) + else: + heading_text = "" + else: + heading_text = "" + + # 섹션의 일반 텍스트 생성 + section_text = self._generate_text_from_items_with_headers(section_items, section_header_infos, dl_doc) + + # heading이 있으면 앞에 붙이기 + if heading_text: + return heading_text + ", " + section_text + else: + return section_text + + def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument) -> list[DocChunk]: + """문서를 토큰 제한에 맞게 분할 (v2: 섹션 헤더 기준으로 분할 후 max_tokens로 병합)""" + items = doc_chunk.meta.doc_items + header_info_list = getattr(doc_chunk, "_header_info_list", []) + header_short_info_list = getattr(doc_chunk, "_header_short_info_list", []) + + if not items: + return [] + + # ================================================================ + # 헬퍼 함수들 + # ================================================================ + + def get_header_level(header_infos, *, first=False, default=-1): + """header_infos에서 최종 레벨 계산""" + if not header_infos: + return default + info = header_infos[0] if first else header_infos[-1] + return max(info.keys(), default=default) + + def get_current_chunk( + doc_chunk: DocChunk, + merged_texts: list[str], + merged_header_short_infos: list[dict], + merged_items: list[DocItem], + ): + """현재까지 병합된 내용으로 DocChunk 생성""" + if not merged_texts: + return None + chunk_text = "\n".join(merged_texts) + used_headers = self._extract_used_headers(merged_header_short_infos) + + return DocChunk( + text=chunk_text, + meta=DocMeta( + doc_items=merged_items, + headings=used_headers, + captions=None, + origin=doc_chunk.meta.origin, + ), + ) + + def get_text_from_item(item: DocItem) -> str: + """DocItem에서 텍스트 추출""" + if isinstance(item, TableItem): + return self._extract_table_text(item, dl_doc) + elif hasattr(item, "text") and item.text: + return item.text + elif isinstance(item, PictureItem): + text = "" + for annotation in item.annotations: + if hasattr(annotation, "text"): + text += annotation.text + return text + return "" + + def split_items_evenly_by_tokens(item_token_counts, max_tokens): + n = len(item_token_counts) + total = sum(item_token_counts) + if n == 0: + return [] + if total <= max_tokens: + return [(0, n)] # ✅ 항상 (a,b) + + k = math.ceil(total / max_tokens) + target = total / k + + P = [0] + for c in item_token_counts: + P.append(P[-1] + c) + + cuts = [0] + used = {0} + for t in range(1, k): + goal = t * target + 
j = bisect.bisect_left(P, goal) + + cand = [] + if 0 < j < len(P): + cand.append(j) + if 0 <= j - 1 < len(P): + cand.append(j - 1) + + best = None + best_dist = float("inf") + for x in cand: + if x in used: + continue + if x <= cuts[-1]: + continue + if x >= len(P) - 1: # n + continue + dist = abs(P[x] - goal) + if dist < best_dist: + best_dist = dist + best = x + + if best is None: + best = min(max(cuts[-1] + 1, 1), len(P) - 2) + + cuts.append(best) + used.add(best) + + cuts.append(n) + + return [(a, b) for a, b in zip(cuts[:-1], cuts[1:])] + + def adjust_captions(items_group): + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + ref_idx_list = [] + if hasattr(item, "captions") and item.captions: + for cap in item.captions: + cap_ref = cap.cref + cap_idx = -1 + for j, it in enumerate(items_group): + if it is None: + continue + if getattr(it[0][0], "self_ref", None) == cap_ref: + cap_idx = j + break + if cap_idx != -1: + ref_idx_list.append(cap_idx) + if ref_idx_list: + ref_idx_list = sorted(ref_idx_list) + + if not ref_idx_list: + continue + + # caption 아이템들을 부모 아이템 바로 뒤로 이동 + for cap_idx in ref_idx_list: + for g in items_group[cap_idx]: + items_group[idx].append(g) + items_group[cap_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + def adjust_pictures_in_tables(items_group): + # picture in table 처리 + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + pic_idx_list = [] + if isinstance(item, TableItem): + table_bbox = item.prov[0].bbox + table_page_no = item.prov[0].page_no + + for j in range(len(items_group)): + if items_group[j] is None: + continue + pic_item = items_group[j][0][0] + if isinstance(pic_item, PictureItem): + # table 안의 picture인지 확인. iou 사용 + pic_bbox = pic_item.prov[0].bbox + pic_page_no = pic_item.prov[0].page_no + if pic_page_no != table_page_no: + continue + ios = pic_bbox.intersection_over_self(table_bbox) + if ios > 0.5: # picture가 50% 이상 table 안에 포함되면 table 안의 picture로 간주 + pic_idx_list.append(j) + if pic_idx_list: + pic_idx_list = sorted(pic_idx_list) + + if not pic_idx_list: + continue + + for pic_idx in pic_idx_list: + for g in items_group[pic_idx]: + items_group[idx].append(g) + items_group[pic_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + # ================================================================ + # 1단계: 섹션 헤더 기준으로 분할 + # ================================================================ + + sections = [] # [(items, header_infos, header_short_infos), ...] 
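+        # Illustrative walk-through of this stage (hypothetical items, not from
+        # a real document): every section header opens a new
+        # (items, header_infos, header_short_infos) triple and body items
+        # accumulate into the current triple, so
+        #   items    = [H1 "개요", text A, text B, H2 "상세", text C]
+        # becomes
+        #   sections = [([H1, A, B], ...), ([H2, C], ...)]
+        # The header item itself stays inside its section, which is what lets
+        # stage 2 prepend the heading text via
+        # _generate_section_text_with_heading.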
+ cur_items, cur_h_infos, cur_h_short = [], [], [] + + for i, item in enumerate(items): + h_info = header_info_list[i] if i < len(header_info_list) else {} + h_short = header_short_info_list[i] if i < len(header_short_info_list) else {} + + # 섹션 헤더를 만나면 + if self._is_section_header(item): + # 이전 섹션이 있으면 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # 새로운 섹션 시작 + cur_items = [item] + cur_h_infos = [h_info] + cur_h_short = [h_short] + else: + # 섹션 헤더가 아니면 현재 섹션에 추가 + cur_items.append(item) + cur_h_infos.append(h_info) + cur_h_short.append(h_short) + + # 마지막 섹션 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # ================================================================ + # 2단계: 각 섹션의 텍스트에 heading 붙이기 + # ================================================================ + + sections_with_text = [] + for items, header_infos, header_short_infos in sections: + text = self._generate_section_text_with_heading(items, header_short_infos, dl_doc) + sections_with_text.append((text, items, header_infos, header_short_infos)) + + # ================================================================ + # 2.5단계: 너무 긴 청크는 분할 + # ================================================================ + if self.max_tokens > 0: + for i in range(len(sections_with_text)): + text, items, h_infos, h_short = sections_with_text[i] + token_count = self._count_tokens(text) + if token_count < self.max_tokens: + continue + + # caption 및 table 내 그림은 같은 섹션에 있도록 조정 + items_group = [[(item, info, short)] for item, info, short in zip(items, h_infos, h_short)] + items_group = adjust_captions(items_group) + items_group = adjust_pictures_in_tables(items_group) + + # 너무 긴 섹션은 분할 + # 각 아이템 별 token 수 계산 + item_token_counts = [] + for group in items_group: + cur_count = 0 + for g in group: + cur_count += self._count_tokens(get_text_from_item(g[0])) + item_token_counts.append(cur_count) + + # 아이템 그룹들을 토큰 기준으로 균등 분할 + split_info = split_items_evenly_by_tokens(item_token_counts, self.max_tokens) + + # item_groups를 섹션으로 다시 구성 + new_sections = [] + for a, b in split_info: + + # 각 그룹에서 items, h_infos, h_short로 분리 + group_items = [] + group_h_infos = [] + group_h_short = [] + for idx in range(a, b): + for g in items_group[idx]: + group_items.append(g[0]) + group_h_infos.append(g[1]) + group_h_short.append(g[2]) + + new_text = self._generate_section_text_with_heading(group_items, group_h_short, dl_doc) + new_sections.append((new_text, group_items, group_h_infos, group_h_short)) + + # 원래 섹션을 새로 분할된 섹션들로 교체 + sections_with_text.pop(i) + for new_section in reversed(new_sections): + sections_with_text.insert(i, new_section) + + # ================================================================ + # 3단계: 단독 타이틀(1줄만) → 다음 섹션으로 병합 + # ================================================================ + + for i in range(len(sections_with_text) - 2, -1, -1): + text, items, h_infos, h_short = sections_with_text[i] + + # 아이템이 하나인 섹션 헤더만 검사 + if len(items) != 1 or not self._is_section_header(items[0]): + continue + + # 문단이 이미 구성된 것은 제외 (문자 수가 30자 이상이면 문단을 구성했다고 간주) + item_text = "".join(getattr(it, "text", "") for it in items) + if len(item_text) > 30: + continue + + # 현재 섹션헤더 레벨이 다음 섹션헤더 레벨보다 더 높은 경우에만 병합 (높은 레벨이 더 작은 숫자) + n_text, n_items, n_h_infos, n_h_short = sections_with_text[i + 1] + current_level = get_header_level(h_infos, first=False) + next_level = get_header_level(n_h_infos, first=True) + if 0 <= next_level < current_level: + continue + + # 다음 섹션과 병합 + sections_with_text[i] = (text + 
"\n" + n_text, items + n_items, h_infos + n_h_infos, h_short + n_h_short) + sections_with_text.pop(i + 1) + + # ================================================================ + # 4단계: 토큰 기준 병합 + # ================================================================ + + result_chunks = [] + merged_texts, merged_items = [], [] + merged_header_infos, merged_header_short_infos = [], [] + + for text, items, header_infos, header_short_infos in sections_with_text: + + b_new_chunk = False + + # ---------------------------------- + # 병합 가능 여부 판단 + + # 병합 가능 토큰 수 계산 + test_tokens = self._count_tokens("\n".join(merged_texts + [text])) + + # 현재 섹션헤더 레벨과 병합된 섹션헤더 레벨 + section_level = get_header_level(header_infos, first=True) + merged_level = get_header_level(merged_header_infos, first=False) + + # 토큰 수 초과 시 새로운 청크 생성 + if test_tokens > self.max_tokens and len(merged_texts) > 0: + b_new_chunk = True + # 현재 섹션헤더 레벨이 더 높으면 새로운 청크 생성 + elif 0 <= section_level < merged_level: + b_new_chunk = True + # ---------------------------------- + + # 새로운 청크 생성 + if b_new_chunk: + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + # 새로운 병합 시작 + merged_texts = [text] + merged_items = items + merged_header_infos = header_infos + merged_header_short_infos = header_short_infos + else: + # 현재 섹션 병합 + merged_texts.append(text) + merged_items.extend(items) + merged_header_infos.extend(header_infos) + merged_header_short_infos.extend(header_short_infos) + + # 마지막 병합된 items 처리 + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + return result_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서를 청킹하여 반환 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 토큰 제한에 맞게 분할된 청크들 + """ + doc_chunks = list(self.preprocess(dl_doc=dl_doc, **kwargs)) + + if not doc_chunks: + return iter([]) + + doc_chunk = doc_chunks[0] # preprocess는 하나의 청크만 반환 + + final_chunks = self._split_document_by_tokens(doc_chunk, dl_doc) + + return iter(final_chunks) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = "allow" + + text: str = None + n_char: int = None + n_word: int = None + n_line: int = None + e_page: int = None + i_page: int = None + i_chunk_on_page: int = None + n_chunk_of_page: int = None + i_chunk_on_doc: int = None + n_chunk_of_doc: int = None + n_page: int = None + reg_date: str = None + chunk_bboxes: str = None + media_files: str = None + title: str = None + created_date: int = None + appendix: str = None ## !! appendix feature (2025-09-30, geonhee kim) !! + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + self.title: Optional[str] = None + self.created_date: Optional[int] = None + self.appendix: Optional[str] = None # !! appendix feature (2025-09-30, geonhee kim) !! 
+ + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + "l": bbox.l / size.width, + "t": bbox.t / size.height, + "r": bbox.r / size.width, + "b": bbox.b / size.height, + "coord_origin": bbox.coord_origin.value, + } + chunk_bboxes.append({"page": page_no, "bbox": bbox_data, "type": type_, "ref": label}) + self.e_page = max([bbox["page"] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + print(item) + name = path.rsplit("/", 1)[-1] + temp_list.append({"name": name, "type": "image", "ref": item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + title=self.title, + created_date=self.created_date, + appendix=self.appendix or "", # !! appendix feature (2025-09-30, geonhee kim) !! 
+ ) + + +class DocumentProcessor: + + def __init__(self): + """ + initialize Document Converter + """ + self.ocr_endpoint = "http://192.168.73.172:48080/ocr" + ocr_options = PaddleOcrOptions( + force_full_page_ocr=False, lang=["korean"], ocr_endpoint=self.ocr_endpoint, text_score=0.3 + ) + + self.page_chunk_counts = defaultdict(int) + device = AcceleratorDevice.AUTO + num_threads = 8 + accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) + # PDF 파이프라인 옵션 설정 + self.pipe_line_options = PdfPipelineOptions() + self.pipe_line_options.generate_page_images = True + self.pipe_line_options.generate_picture_images = True + self.pipe_line_options.do_ocr = False + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.ocr_options.lang = ["ko", 'en'] + # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model" + # self.pipe_line_options.ocr_options.force_full_page_ocr = True + # ocr_options = TesseractOcrOptions() + # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert'] + # ocr_options.path = './.tesseract/tessdata' + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.artifacts_path = Path("/models/") + # self.pipe_line_options.do_table_structure = True + self.pipe_line_options.do_table_structure = False + self.pipe_line_options.images_scale = 2 + self.pipe_line_options.table_structure_options.do_cell_matching = False + # self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE + # self.pipe_line_options.accelerator_options = accelerator_options + + # Simple 파이프라인 옵션을 인스턴스 변수로 저장 + self.simple_pipeline_options = PipelineOptions() + self.simple_pipeline_options.save_images = False + + # ocr 파이프라인 옵션 + self.ocr_pipe_line_options = PdfPipelineOptions() + self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True) + self.ocr_pipe_line_options.do_ocr = False + self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True) + self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = False + + # 기본 컨버터들 생성 + self._create_converters() + + # enrichment 옵션 설정 + self.enrichment_options = DataEnrichmentOptions( + do_toc_enrichment=False, + # toc_doc_type="law", + # extract_metadata=False, + # toc_api_provider="custom", + # # Mistral-Small-3.1-24B-Instruct-2503, 운영망 + # toc_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/502/v1/chat/completions", + # metadata_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/502/v1/chat/completions", + # toc_api_key="022653a3743849e299f19f19d323490b", + # metadata_api_key="022653a3743849e299f19f19d323490b", + # # Mistral-Small-3.1-24B-Instruct-2503, 한국은행 클러스터 + # # toc_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions", + # # metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions", + # # toc_api_key="9e32423947fd4a5da07a28962fe88487", + # # metadata_api_key="9e32423947fd4a5da07a28962fe88487", + # toc_model="model", + # metadata_model="model", + # toc_temperature=0.0, + # toc_top_p=0.00001, + # toc_seed=33, + # toc_max_tokens=10000, + # toc_system_prompt=toc_system_prompt, + # toc_user_prompt=toc_user_prompt, + ) + + def _create_converters(self): + """컨버터들을 생성하는 헬퍼 메서드""" + self.converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.pipe_line_options, backend=PyPdfiumDocumentBackend + ), + } + ) + self.second_converter = DocumentConverter( + format_options={ 
+ InputFormat.PDF: PdfFormatOption( + pipeline_options=self.pipe_line_options, backend=PyPdfiumDocumentBackend + ), + }, + ) + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, backend=DoclingParseV4DocumentBackend + ), + } + ) + self.ocr_second_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, backend=PyPdfiumDocumentBackend + ), + }, + ) + + def load_documents_with_docling(self, file_path: str, **kwargs: dict) -> DoclingDocument: + # kwargs에서 save_images 값을 가져와서 옵션 업데이트 + save_images = kwargs.get("save_images", True) + include_wmf = kwargs.get("include_wmf", False) + + # save_images 옵션이 현재 설정과 다르면 컨버터 재생성 + if ( + self.simple_pipeline_options.save_images != save_images + or getattr(self.simple_pipeline_options, "include_wmf", False) != include_wmf + ): + self.simple_pipeline_options.save_images = save_images + self.simple_pipeline_options.include_wmf = include_wmf + self._create_converters() + + try: + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + except Exception as e: + conv_result: ConversionResult = self.second_converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument: + # kwargs에서 save_images 값을 가져와서 옵션 업데이트 + save_images = kwargs.get("save_images", True) + include_wmf = kwargs.get("include_wmf", False) + + # save_images 옵션이 현재 설정과 다르면 컨버터 재생성 + if ( + self.simple_pipeline_options.save_images != save_images + or getattr(self.simple_pipeline_options, "include_wmf", False) != include_wmf + ): + self.simple_pipeline_options.save_images = save_images + self.simple_pipeline_options.include_wmf = include_wmf + self._create_converters() + + try: + conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True) + except Exception as e: + conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def load_documents(self, file_path: str, **kwargs) -> DoclingDocument: + return self.load_documents_with_docling(file_path, **kwargs) + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker: GenosBucketChunker = GenosBucketChunker(max_tokens=0, merge_peers=True) + + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return "" + return "".join(map(str, iterable)) + "\n" + + def parse_created_date(self, date_text: str) -> Optional[int]: + """ + 작성일 텍스트를 파싱하여 YYYYMMDD 형식의 정수로 변환 + + Args: + date_text: 작성일 텍스트 (YYYY-MM 또는 YYYY-MM-DD 형식) + + Returns: + YYYYMMDD 형식의 정수, 파싱 실패시 None + """ + if not date_text or not isinstance(date_text, str) or date_text == "None": + return 0 + + # 공백 제거 및 정리 + date_text = date_text.strip() + + # YYYY-MM-DD 형식 매칭 + match_full = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", date_text) + if match_full: + year, month, day = match_full.groups() + try: + # 유효한 날짜인지 검증 + datetime(int(year), int(month), int(day)) + return int(f"{year}{month.zfill(2)}{day.zfill(2)}") + except ValueError: + pass + + # YYYY-MM 형식 매칭 (일자는 01로 설정) + match_month = re.match(r"^(\d{4})-(\d{1,2})$", date_text) + if 
match_month: + year, month = match_month.groups() + try: + # 유효한 월인지 검증 + datetime(int(year), int(month), 1) + return int(f"{year}{month.zfill(2)}01") + except ValueError: + pass + + # YYYY 형식 매칭 (월일은 0101로 설정) + match_year = re.match(r"^(\d{4})$", date_text) + if match_year: + year = match_year.group(1) + try: + datetime(int(year), 1, 1) + return int(f"{year}0101") + except ValueError: + pass + + return 0 + + def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocument: + return document + + # 새로운 enriched result 받기 + document = enrich_document(document, self.enrichment_options, **kwargs) + return document + + async def compose_vectors( + self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, **kwargs: dict + ) -> list[dict]: + title = "" + created_date = 0 + try: + if ( + document.key_value_items + and len(document.key_value_items) > 0 + and hasattr(document.key_value_items[0], "graph") + and hasattr(document.key_value_items[0].graph, "cells") + and len(document.key_value_items[0].graph.cells) > 1 + ): + # 작성일 추출 (cells[1]) + date_text = document.key_value_items[0].graph.cells[1].text + created_date = self.parse_created_date(date_text) + except (AttributeError, IndexError) as e: + pass + + for item, _ in document.iterate_items(): + if hasattr(item, "label"): + if item.label == DocItemLabel.TITLE: + title = item.text.strip() if item.text else "" + break + + # kwargs에서 부록 정보 추출 !! appendix feature (2025-09-30, geonhee kim) !! + appendix_info = kwargs.get("appendix", "") + appendix_list = [] + if isinstance(appendix_info, str): + appendix_list = ( + [item.strip() for item in json.loads(appendix_info) if item.strip()] if appendix_info else [] + ) + elif isinstance(appendix_info, list): + appendix_list = appendix_info + else: + appendix_list = [] + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec="seconds") + "Z", + created_date=created_date, + title=title, + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + # header 앞에 헤더 마커 추가 (HEADER: ) + headers_text = "HEADER: " + ", ".join(chunk.meta.headings) + "\n" if chunk.meta.headings else "" + content = headers_text + chunk.text + + # appendix 추출 !! appendix feature (2025-09-30, geonhee kim) !! + matched_appendices = self.check_appendix_keywords(content, appendix_list) + # print(appendix_list, matched_appendices) + chunk_global_metadata = global_metadata.copy() + chunk_global_metadata["appendix"] = matched_appendices # Only matched ones + ### + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = ( + GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**chunk_global_metadata) #!! appendix feature (2025-09-30, geonhee kim) !! 
+                .set_chunk_bboxes(chunk.meta.doc_items, document)
+                .set_media_files(chunk.meta.doc_items)
+            ).build()
+            vectors.append(vector)
+
+            chunk_index_on_page += 1
+            if upload_files:
+                file_list = self.get_media_files(chunk.meta.doc_items)
+                upload_tasks.append(asyncio.create_task(upload_files(file_list, request=request)))
+
+        if upload_tasks:
+            await asyncio.gather(*upload_tasks)
+
+        return vectors
+
+    def get_media_files(self, doc_items: list):
+        temp_list = []
+        for item in doc_items:
+            if isinstance(item, PictureItem):
+                if item.image is None:
+                    continue
+                path = str(item.image.uri)
+                name = path.rsplit("/", 1)[-1]
+                temp_list.append({"path": path, "name": name})
+        return temp_list
+
+    def check_glyph_text(self, text: str, threshold: int = 1) -> bool:
+        """텍스트에 GLYPH 항목이 있는지 확인하는 메서드"""
+        if not text:
+            return False
+
+        # GLYPH 항목이 있는지 정규식으로 확인
+        matches = re.findall(r"GLYPH\w*", text)
+        if len(matches) >= threshold:
+            # print(f"Text has glyphs. len(matches): {len(matches)}. ")
+            return True
+
+        return False
+
+    def check_glyphs(self, document: DoclingDocument) -> bool:
+        """문서에 글리프가 있는지 확인하는 메서드"""
+        for item, level in document.iterate_items():
+            if isinstance(item, TextItem) and hasattr(item, "prov") and item.prov:
+                page_no = item.prov[0].page_no
+                # page_texts += item.text
+
+                # GLYPH 항목이 있는지 확인. 정규식사용
+                matches = re.findall(r"GLYPH\w*", item.text)
+                if len(matches) > 10:
+                    # print(f"Document has glyphs on page {page_no}. len(matches): {len(matches)}. ")
+                    return True
+
+        return False
+
+    def check_appendix_keywords(
+        self, content: str, appendix_list: list
+    ) -> str:  # !! appendix feature (2025-09-30, geonhee kim) !!
+        if not content or not appendix_list:
+            return ""
+
+        matched_appendices = []
+
+        # 1. Find appendix patterns in content first
+        found_patterns = []
+
+        # Complex patterns: 별지/별표/장부 + numbers (with hyphens, Roman numerals)
+        # Updated regex to capture full patterns like "별지 제 Ⅰ -1 호 서식" by matching until closing delimiters
+        content = re.sub(r"\s+", "", content)
+        complex_patterns = re.findall(r"(별지|별표|장부)(?:제)?([^<>()\[\]]+?)(?=(?:호|서식)|[<>\)\]]|$)", content)
+        for pattern_type, number in complex_patterns:
+            found_patterns.extend(
+                [
+                    f"{pattern_type} {number}",
+                    f"{pattern_type} 제{number}호",
+                    f"{pattern_type}{number}",
+                    f"{pattern_type}제{number}호",
+                ]
+            )
+
+        # Standalone patterns: (별표), (별지), (장부)
+        standalone_patterns = re.findall(r"[\(\[]+(별지|별표|장부)[\)\]]+", content)
+        for pattern_type in set(standalone_patterns):
+            found_patterns.append(pattern_type)
+
+        # 2. Check if found patterns match any appendix in the list
+        for appendix in appendix_list:
+            if not appendix or not isinstance(appendix, str):
+                continue
+
+            appendix_clean = appendix.replace(".pdf", "").lower().strip()
+
+            # If any found pattern exists in appendix filename, it's a match
+            for pattern in found_patterns:
+                if pattern.lower().strip() in appendix_clean:
+                    matched_appendices.append(appendix)
+                    break  # Prevent duplicates
+
+        return ", ".join(matched_appendices) if matched_appendices else ""
+
+    def ocr_all_table_cells(self, document: DoclingDocument, pdf_path) -> DoclingDocument:
+        """
+        글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR을 수행합니다.
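+        Each affected cell is cropped from the page, rendered to PNG, sent to the OCR endpoint, and its text replaced with the recognized result.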
+ Args: + document: DoclingDocument 객체 + pdf_path: PDF 파일 경로 + Returns: + OCR이 완료된 문서의 DoclingDocument 객체 + """ + import fitz + import base64 + import requests + + def post_ocr_bytes(img_bytes: bytes, timeout=60) -> dict: + HEADERS = {"Accept": "application/json", "Content-Type": "application/json"} + payload = {"file": base64.b64encode(img_bytes).decode("ascii"), "fileType": 1, "visualize": False} + r = requests.post(self.ocr_endpoint, json=payload, headers=HEADERS, timeout=timeout) + if not r.ok: + # 진단에 도움되도록 본문 일부 출력 + raise RuntimeError(f"OCR HTTP {r.status_code}: {r.text[:500]}") + return r.json() + + def extract_ocr_fields(resp: dict): + """ + resp: 위와 같은 OCR 응답 JSON(dict) + return: (rec_texts, rec_scores, rec_boxes) — 모두 list + """ + if resp is None: + return [], [], [] + + # 최상위 상태 체크 + if resp.get("errorCode") not in (0, None): + return [], [], [] + + ocr_results = resp.get("result", {}).get("ocrResults", []) + if not ocr_results: + return [], [], [] + + pruned = ocr_results[0].get("prunedResult", {}) + if not pruned: + return [], [], [] + + rec_texts = pruned.get("rec_texts", []) # list[str] + rec_scores = pruned.get("rec_scores", []) # list[float] + rec_boxes = pruned.get("rec_boxes", []) # list[[x1,y1,x2,y2]] + + # 길이 불일치 방어: 최소 길이에 맞춰 자르기 + n = min(len(rec_texts), len(rec_scores), len(rec_boxes)) + return rec_texts[:n], rec_scores[:n], rec_boxes[:n] + + try: + doc = fitz.open(pdf_path) + + for table_idx, table_item in enumerate(document.tables): + if not table_item.data or not table_item.data.table_cells: + continue + + b_ocr = False + for cell_idx, cell in enumerate(table_item.data.table_cells): + if self.check_glyph_text(cell.text, threshold=1): + b_ocr = True + break + + if b_ocr is False: + # 글리프 깨진 텍스트가 없는 경우, OCR을 수행하지 않음 + continue + + for cell_idx, cell in enumerate(table_item.data.table_cells): + + # Provenance 정보에서 위치 정보 추출 + if not table_item.prov: + continue + + page_no = table_item.prov[0].page_no - 1 + bbox = cell.bbox + + page = doc.load_page(page_no) + + # 셀의 바운딩 박스를 사용하여 이미지에서 해당 영역을 잘라냄 + cell_bbox = fitz.Rect(bbox.l, min(bbox.t, bbox.b), bbox.r, max(bbox.t, bbox.b)) + + # bbox 높이 계산 (PDF 좌표계 단위) + bbox_height = cell_bbox.height + + # 목표 픽셀 높이 + target_height = 20 + + # zoom factor 계산 + # (너무 작은 bbox일 경우 0으로 나누는 걸 방지) + zoom_factor = target_height / bbox_height if bbox_height > 0 else 1.0 + zoom_factor = min(zoom_factor, 4.0) # 최대 확대 비율 제한 + zoom_factor = max(zoom_factor, 1) # 최소 확대 비율 제한 + + # 페이지를 이미지로 렌더링 + mat = fitz.Matrix(zoom_factor, zoom_factor) + pix = page.get_pixmap(matrix=mat, clip=cell_bbox) + img_data = pix.tobytes("png") + + result = post_ocr_bytes(img_data, timeout=60) + rec_texts, rec_scores, rec_boxes = extract_ocr_fields(result) + + cell.text = "" + for t in rec_texts: + if len(cell.text) > 0: + cell.text += " " + cell.text += t if t else "" + except Exception as e: + print(f"OCR processing failed: {e}") + pass + + return document + + def setup_logging(self, level_num: int): + """ + 5"DEBUG", 4"INFO", 3"WARNING", 2"ERROR", 1"CRITICAL", 0"NOLOG" 중 하나를 받아서 로깅 레벨을 설정하는 메서드 + """ + + def get_level_name(level_num: int) -> str: + level_map = {5: "DEBUG", 4: "INFO", 3: "WARNING", 2: "ERROR", 1: "CRITICAL", 0: "NOLOG"} + return level_map.get(level_num, "INFO") + + level_name = get_level_name(level_num) + print(f"Setting log level to: {level_name}") + + if level_name == "NOLOG" or not hasattr(logging, level_name): + logging.disable(logging.CRITICAL) # 모든 로그 비활성화 + return + + level = getattr(logging, level_name.upper()) + + # root logger 설정 (핸들러는 
main에서만 설정) + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + handlers=[logging.StreamHandler()], # 콘솔 출력 + ) + + # root logger level 적용 + logging.getLogger().setLevel(level) + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + self.setup_logging(kwargs.get("log_level", 4)) + + _log.info(f"file_path: {file_path}") + _log.info(f"kwargs: {kwargs}") + + document: DoclingDocument = self.load_documents(file_path, **kwargs) + + # @@@@ 성민: 이게....... 여기 있는게 아니라 로드 중간에 있어야 할거같은데. + if not check_document(document, self.enrichment_options) or self.check_glyphs(document): + # OCR이 필요하다고 판단되면 OCR 수행 + document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) + + # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) + document: DoclingDocument = self.ocr_all_table_cells(document, file_path) + + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + + document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path) + + document = self.enrichment(document, **kwargs) + + has_text_items = False + for item, _ in document.iterate_items(): + if ( + isinstance(item, (TextItem, ListItem, CodeItem, SectionHeaderItem)) and item.text and item.text.strip() + ) or (isinstance(item, TableItem) and item.data and len(item.data.table_cells) == 0): + has_text_items = True + break + + if has_text_items: + # Extract Chunk from DoclingDocument + chunks: List[DocChunk] = self.split_documents(document, **kwargs) + else: + # text가 있는 item이 없을 때 document에 임의의 text item 추가 + # 첫 번째 페이지의 기본 정보 사용 (1-based indexing) + page_no = 1 + + # ProvenanceItem 생성 + prov = ProvenanceItem(page_no=page_no, bbox=BoundingBox(l=0, t=0, r=1, b=1), charspan=(0, 1)) # 최소 bbox + + # document에 temp text item 추가 + document.add_text(label=DocItemLabel.TEXT, text=".", prov=prov) + + # split_documents 호출 + chunks: List[DocChunk] = self.split_documents(document, **kwargs) + # await assert_cancelled(request) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + + """ + # 미디어 파일 업로드 방법 + media_files = [ + { 'path': '/tmp/graph.jpg', 'name': 'graph.jpg', 'type': 'image' }, + { 'path': '/result/1/graph.jpg', 'name': '1/graph.jpg', 'type': 'image' }, + ] + + # 업로드 요청 시에는 path, name 필요 + file_list = [{k: v for k, v in file.items() if k != 'type'} for file in media_files] + await upload_files(file_list, request=request) + + # 메타에 저장시에는 name, type 필요 + meta = [{k: v for k, v in file.items() if k != 'path'} for file in media_files] + vectors[0].media_files = meta + """ + + return vectors + + +class GenosServiceException(Exception): + # GenOS 와의 의존성 부분 제거를 위해 추가 + def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None: + self.code = 1 + self.error_code = error_code + self.error_msg = error_msg or "GenOS Service Exception" + self.msg_params = msg_params or {} + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})" + + +# GenOS 와의 의존성 제거를 위해 추가 +async def assert_cancelled(request: Request): + if await request.is_disconnected(): + 
raise GenosServiceException(1, f"Cancelled") + + +# ----------------------------------------------------------------- +# enrichment 프롬프트 +# ----------------------------------------------------------------- + +toc_system_prompt = """You are an expert at generating table of contents (목차) from Korean documents. You specialize in regulatory documents, terms of service, contracts, and mixed-format documents that combine formal regulatory structures with general section headers. +""".strip() +toc_user_prompt = """ +Here is the Korean document you need to analyze: + + +{{raw_text}} + + +Your task is to extract and organize all structural elements from this document into a hierarchical table of contents. Korean documents often have mixed structures where some sections follow formal regulatory patterns (제x장/절/관/조) while others use general section numbering and headers. + +## Analysis Process + +Before generating the final table of contents, work through the document systematically in `` tags. It's OK for this section to be quite long. Follow these steps: + +1. **Document Title Extraction**: Quote the main document title exactly as it appears at the beginning of the document. + +2. **Structural Marker Identification**: Scan through the document and quote all the key structural markers you find, such as: + - Formal regulatory patterns: 제x장, 제x절, 제x관, 제x조 + - General section patterns: numbered headers (1., 2., etc.), lettered headers (가., 나., etc.) + - Special sections: 부칙, 별지, 별표, etc. + +3. **Systematic Section Extraction**: Work through the document from beginning to end, extracting each structural element in order: + - For each main section, quote the exact title as it appears + - For each subsection, quote the exact title and note which main section it belongs under + - For each article/item, quote the exact title and note its parent section + - Include any appendices, attachments, and addenda + +4. **Hierarchy Building**: For each extracted element, explicitly note: + - What level it should be at (main section, subsection, sub-subsection, etc.) + - What its parent section is (if any) + - What numbering it should receive in the final TOC (1., 1.1., 1.1.1., etc.) + +5. **Structure Verification**: Review your extracted elements to ensure: + - All structural elements are captured in document order + - The hierarchy makes logical sense + - No elements are duplicated or missed + +## Output Requirements + +After your analysis, generate the table of contents with this exact format: + +``` + +TITLE: +1. +1.1. +1.1.1. +1.2. +2. +2.1. +3. + +``` + +## Formatting Guidelines + +- Start with `TITLE:` followed by the document title +- Use hierarchical decimal numbering (1, 1.1, 1.1.1, etc.) 
+- Follow each number with a space and the original title exactly as it appears +- Maintain the document's logical hierarchy +- Include appendices, attachments, and addenda as separate top-level items +- Extract titles exactly as they appear - do not include explanatory content +- Handle both formal regulatory structures and general section headers +- Wrap the entire table of contents in `` tags +""".strip() diff --git a/genon/preprocessor/module/test.py b/genon/preprocessor/module/test.py new file mode 100644 index 0000000000..b8b1310f51 --- /dev/null +++ b/genon/preprocessor/module/test.py @@ -0,0 +1,59 @@ +import os +from fastapi import Request +import logging +import asyncio +import json +import time + +import sys + +sys.path.insert(0, "../../../") # 현재 doc_parser의 docling 폴더 참조 + +# 테스트할 전처리기 임포트 +# from attachment_processor import DocumentProcessor # 첨부용 +# from convert_processor import DocumentProcessor # 변환형 +# from intelligent_processor import DocumentProcessor # 지능형 +from test_processor import DocumentProcessor + +# 파일 경로 +file_path = "../sample_files/pdf_sample.pdf" +# file_path = "/home/gamy0315/doc_parser/삼성전자_재무제표.pdf" +# file_path = "/home/gamy0315/doc_parser/genon/preprocessor/sample_files/docx_sample.docx" + +# 파일 존재 여부 확인 +if not os.path.exists(file_path): + print(f"Sample file not found: {file_path}") + print("Please add a file to the sample_files folder.") + exit(1) + +# DocumentProcessor 인스턴스 생성 +doc_processor = DocumentProcessor() + +# FastAPI 요청 예제 +mock_request = Request(scope={"type": "http"}) + + +# 비동기 메서드 실행 +async def process_document(): + # print(file_path) + kwargs = {} + kwargs["org_filename"] = os.path.basename(file_path) + vectors = await doc_processor(mock_request, file_path, **kwargs) + return vectors + + +begin = time.time() +# 메인 루프 실행 +result = asyncio.run(process_document()) + +result_list_as_dict = [item.model_dump() for item in result] + +# 최종적으로 이 리스트를 JSON으로 저장 +with open("result.json", "w", encoding="utf-8") as f: + json.dump(result_list_as_dict, f, ensure_ascii=False, indent=4) + +end = time.time() +print(f"Processing time: {end - begin:.2f} seconds") + + +breakpoint() diff --git a/genon/preprocessor/module/test_processor.py b/genon/preprocessor/module/test_processor.py new file mode 100644 index 0000000000..5b3637166a --- /dev/null +++ b/genon/preprocessor/module/test_processor.py @@ -0,0 +1,55 @@ +# from typing import Any + +# from fastapi import Request +# from langchain_core.documents import Document + +from base_processor import BaseProcessor + + +ocr_config = { + "model": "~~~~~~~~~", # easy, paddle + "end_point": "~~~~", +} + +vlm_layout_config = { + "model": "~~~~~~~~", + "end_point": "~~~~~~~~~", +} + +vlm_toc_config = { + "model": "~~~~~~~~", + "end_point": "~~~~~~~~~", +} + + +pdf_config = { + "pipeline_options": "~~~~~", + "backend": "~~~~", +} + +toc_config = { + "pipeline_options": "~~~~~", + "backend": "~~~~", +} + + +config = { + # TODO: ["pdf", "hwp", "hwpx", "doc", "docx", "xlsx", "csv", "ppt", "pptx", "md", "json", "html"], + image + "format_options": { + "pdf": { + "pipeline_options": "pdf", # simple | pdf + "backend": "pypdf", # + "generate_picture_images": True, # pdf일때만 설정 가능한듯 + }, + "docx": { + "pipeline_options": "simple", + "backend": "msword", + }, + }, + "chunker": "simple", # TODO: static bucket, dynamic bucket +} + + +class DocumentProcessor(BaseProcessor): + def __init__(self): + super().__init__(config) diff --git a/genon/preprocessor/module/utils/chunkers.py b/genon/preprocessor/module/utils/chunkers.py new 
file mode 100644 index 0000000000..ccb124e40a --- /dev/null +++ b/genon/preprocessor/module/utils/chunkers.py @@ -0,0 +1,836 @@ +import math, bisect +from typing import Any, Iterable, Iterator, List, Optional, Union +from typing_extensions import Self +from pydantic import ConfigDict, model_validator + + +from docling_core.types import DoclingDocument +from docling_core.types.doc import DocItem, TextItem, SectionHeaderItem, CodeItem, TableItem, PictureItem +from docling_core.types.doc.document import LevelNumber, ContentLayer, ListItem +from docling_core.transforms.chunker import BaseChunker, BaseChunk, DocChunk, DocMeta +from docling_core.types.doc.labels import DocItemLabel + +import semchunk +from transformers import AutoTokenizer, PreTrainedTokenizerBase + + +class GenosBucketChunker(BaseChunker): + """토큰 제한을 고려하여 섹션별 청크를 분할하고 병합하는 청커 (v2)""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + tokenizer: Union[PreTrainedTokenizerBase, str] = "sentence-transformers/all-MiniLM-L6-v2" + max_tokens: int = 1024 + merge_peers: bool = True + + # _inner_chunker: BaseChunker = None + _tokenizer: PreTrainedTokenizerBase = None + merge_list_items: bool = True + + @model_validator(mode="after") + def _initialize_components(self) -> Self: + # 토크나이저 초기화 + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + return self + + def preprocess(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서의 모든 아이템을 헤더 정보와 함께 청크로 생성 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 문서의 모든 아이템을 포함하는 하나의 청크 + """ + # 모든 아이템과 헤더 정보 수집 + all_items = [] + all_header_info = [] # 각 아이템의 헤더 정보 + current_heading_by_level: dict[LevelNumber, str] = {} + all_header_short_info = [] # 각 아이템의 짧은 헤더 정보 + current_heading_short_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + + # iterate_items()로 수집된 아이템들의 self_ref 추적 + processed_refs = set() + + # 모든 아이템 순회 + for item, level in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}): + if hasattr(item, "self_ref"): + processed_refs.add(item.self_ref) + + if not isinstance(item, DocItem): + continue + + # 리스트 아이템 병합 처리 + if self.merge_list_items: + if isinstance(item, ListItem) or (isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM): + list_items.append(item) + continue + elif list_items: + # 누적된 리스트 아이템들을 추가 + for list_item in list_items: + all_items.append(list_item) + # 리스트 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + list_items = [] + + # 섹션 헤더 처리 + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ): + # 새로운 헤더 레벨 설정 + header_level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + current_heading_by_level[header_level] = item.text + current_heading_short_by_level[header_level] = item.orig # 첫 단어로 짧은 헤더 정보 설정 + + # 더 깊은 레벨의 헤더들 제거 + keys_to_del = [k for k in current_heading_by_level if k > header_level] + for k in keys_to_del: + current_heading_by_level.pop(k, None) + keys_to_del_short = [k for k in current_heading_short_by_level if k > header_level] + for k in keys_to_del_short: + current_heading_short_by_level.pop(k, None) + + # 헤더 아이템도 추가 (헤더 자체도 아이템임) + all_items.append(item) + 
all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + continue + + if ( + isinstance(item, TextItem) + or isinstance(item, ListItem) + or isinstance(item, CodeItem) + or isinstance(item, TableItem) + or isinstance(item, PictureItem) + ): + # if item.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]: + # item.text = "" + all_items.append(item) + # 현재 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # 마지막 리스트 아이템들 처리 + if list_items: + for list_item in list_items: + all_items.append(list_item) + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # iterate_items()에서 누락된 테이블들을 별도로 추가 + missing_tables = [] + for table in dl_doc.tables: + table_ref = getattr(table, "self_ref", None) + if table_ref not in processed_refs: + missing_tables.append(table) + + # 누락된 테이블들을 문서 앞부분에 추가 (페이지 1의 테이블들일 가능성이 높음) + if missing_tables: + for missing_table in missing_tables: + # 첫 번째 위치에 삽입 (헤더 테이블일 가능성이 높음) + all_items.insert(0, missing_table) + all_header_info.insert(0, {}) # 빈 헤더 정보 + all_header_short_info.insert(0, {}) # 빈 짧은 헤더 정보 + + # 아이템이 없으면 빈 문서 + if not all_items: + return + + # 모든 아이템을 하나의 청크로 반환 (HybridChunker에서 분할) + # headings는 None으로 설정하고, 헤더 정보는 별도로 관리 + chunk = DocChunk( + text="", # 텍스트는 HybridChunker에서 생성 + meta=DocMeta( + doc_items=all_items, + headings=None, # DocMeta의 원래 형식 유지 + captions=None, + origin=dl_doc.origin, + ), + ) + # 헤더 정보를 별도 속성으로 저장 + chunk._header_info_list = all_header_info + chunk._header_short_info_list = all_header_short_info # 짧은 헤더 정보도 저장 + yield chunk + + def _count_tokens(self, text: str) -> int: + """텍스트의 토큰 수 계산 (안전한 분할 처리)""" + if not text: + return 0 + + # 텍스트를 더 작은 단위로 분할하여 계산 + max_chunk_length = 300 # 더 안전한 길이로 설정 + total_tokens = 0 + + # 텍스트를 줄 단위로 먼저 분할 + lines = text.split("\n") + current_chunk = "" + + for line in lines: + # 현재 청크에 줄을 추가했을 때 길이 확인 + temp_chunk = current_chunk + "\n" + line if current_chunk else line + + if len(temp_chunk) <= max_chunk_length: + current_chunk = temp_chunk + else: + # 현재 청크가 있으면 토큰 계산 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + # 새로운 청크 시작 + current_chunk = line + + # 마지막 청크 처리 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + return total_tokens + + def _generate_text_from_items_with_headers( + self, items: list[DocItem], header_info_list: list[dict], dl_doc: DoclingDocument + ) -> str: + """DocItem 리스트로부터 헤더 정보를 포함한 텍스트 생성""" + text_parts = [] + current_section_headers = {} # 현재 섹션의 헤더 정보 + + for i, item in enumerate(items): + item_headers = header_info_list[i] if i < len(header_info_list) else {} + + # 헤더 정보가 변경된 경우 (새로운 섹션 시작) + if item_headers != current_section_headers: + # 변경된 헤더 레벨들만 추가 + headers_to_add = [] + for level in sorted(item_headers.keys()): + # 이전 섹션과 다른 헤더만 추가 + if level not in current_section_headers or current_section_headers[level] != item_headers[level]: + # 해당 레벨까지의 모든 상위 헤더 포함 + for l in sorted(item_headers.keys()): + if l < level: + 
headers_to_add.append(item_headers[l]) + elif l == level: + headers_to_add.append("") + + break + + # 헤더가 있으면 추가 + if headers_to_add: + header_text = ", ".join(headers_to_add) + if header_text not in text_parts: + text_parts.append(header_text) + + current_section_headers = item_headers.copy() + + # 아이템 텍스트 추가 + if isinstance(item, TableItem): + table_text = self._extract_table_text(item, dl_doc) + if table_text: + text_parts.append(table_text) + elif hasattr(item, "text") and item.text: + # 타이틀과 섹션 헤더 처리 개선 + # is_section_header = ( + # isinstance(item, SectionHeaderItem) or + # (isinstance(item, TextItem) and + # item.label in [DocItemLabel.SECTION_HEADER]) # TITLE은 제외 + # ) + + # 타이틀은 항상 포함, 섹션 헤더는 중복 방지를 위해 스킵 + # if not is_section_header: + # 20250909, shkim, text_parts에 없는 경우만 추가. 섹션헤더가 반복해서 추가되는 것 방지 + if item.text not in text_parts: + text_parts.append(item.text) + elif isinstance(item, PictureItem): + text_parts.append("") # 이미지는 빈 텍스트 + + result_text = self.delim.join(text_parts) + return result_text + + def _extract_table_text(self, table_item: TableItem, dl_doc: DoclingDocument) -> str: + """테이블에서 텍스트를 추출하는 일반화된 메서드""" + try: + # 먼저 export_to_markdown 시도 + table_text = table_item.export_to_markdown(dl_doc) + if table_text and table_text.strip(): + return table_text + except Exception: + pass + + # export_to_markdown 실패 시 테이블 셀 데이터에서 직접 텍스트 추출 + try: + if hasattr(table_item, "data") and table_item.data: + cell_texts = [] + + # table_cells에서 텍스트 추출 + if hasattr(table_item.data, "table_cells"): + for cell in table_item.data.table_cells: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # grid에서 텍스트 추출 (table_cells가 없는 경우) + elif hasattr(table_item.data, "grid") and table_item.data.grid: + for row in table_item.data.grid: + if isinstance(row, list): + for cell in row: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # 추출된 셀 텍스트들을 결합 + if cell_texts: + return " ".join(cell_texts) + except Exception: + pass + + # 모든 방법 실패 시 item.text 사용 (있는 경우) + if hasattr(table_item, "text") and table_item.text: + return table_item.text + + return "" + + def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[str]]: + """헤더 정보 리스트에서 실제 사용되는 모든 헤더들을 level 순서대로 추출하고 ', '로 연결""" + if not header_info_list: + return None + + all_headers = [] # header 순서대로 추가 + seen_headers = set() # 중복 방지용 + + for header_info in header_info_list: + if header_info: + for level in sorted(header_info.keys()): + header_text = header_info[level] + if header_text and header_text not in seen_headers: + all_headers.append(header_text) + seen_headers.add(header_text) + + return all_headers if all_headers else None + + def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]: + """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)""" + if not table_text: + return [table_text] + + # 전체 테이블이 토큰 제한 내인지 확인 + if self._count_tokens(table_text) <= max_tokens: + return [table_text] + + # 단순히 토큰 수 기준으로 텍스트 분할 + # semchunk 사용하여 토큰 제한에 맞게 분할 + chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens) + chunks = chunker(table_text) + return chunks if chunks else [table_text] + + def _is_section_header(self, item: DocItem) -> bool: + """아이템이 section header인지 확인""" + return isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ) + + def _get_section_header_level(self, item: DocItem) -> Optional[int]: + 
"""Section header의 level을 반환""" + if isinstance(item, SectionHeaderItem): + return item.level + elif isinstance(item, TextItem): + if item.label == DocItemLabel.TITLE: + return 0 + elif item.label == DocItemLabel.SECTION_HEADER: + return 1 + return None + + def _generate_section_text_with_heading( + self, section_items: list[DocItem], section_header_infos: list[dict], dl_doc: DoclingDocument + ) -> str: + """섹션의 텍스트를 생성하되, 앞에 heading을 붙임""" + # 첫 번째 item의 header_info에서 heading 추출 + if section_header_infos and section_header_infos[0]: + merged_headers = {} + for level, header_text in section_header_infos[0].items(): + if header_text: + merged_headers[level] = header_text + + # level 순서대로 정렬해서 ', '로 연결 + if merged_headers: + sorted_levels = sorted(merged_headers.keys()) + headers = [merged_headers[level] for level in sorted_levels] + heading_text = ", ".join(headers) + else: + heading_text = "" + else: + heading_text = "" + + # 섹션의 일반 텍스트 생성 + section_text = self._generate_text_from_items_with_headers(section_items, section_header_infos, dl_doc) + + # heading이 있으면 앞에 붙이기 + if heading_text: + return heading_text + ", " + section_text + else: + return section_text + + def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument) -> list[DocChunk]: + """문서를 토큰 제한에 맞게 분할 (v2: 섹션 헤더 기준으로 분할 후 max_tokens로 병합)""" + items = doc_chunk.meta.doc_items + header_info_list = getattr(doc_chunk, "_header_info_list", []) + header_short_info_list = getattr(doc_chunk, "_header_short_info_list", []) + + if not items: + return [] + + # ================================================================ + # 헬퍼 함수들 + # ================================================================ + + def get_header_level(header_infos, *, first=False, default=-1): + """header_infos에서 최종 레벨 계산""" + if not header_infos: + return default + info = header_infos[0] if first else header_infos[-1] + return max(info.keys(), default=default) + + def get_current_chunk( + doc_chunk: DocChunk, + merged_texts: list[str], + merged_header_short_infos: list[dict], + merged_items: list[DocItem], + ): + """현재까지 병합된 내용으로 DocChunk 생성""" + if not merged_texts: + return None + chunk_text = "\n".join(merged_texts) + used_headers = self._extract_used_headers(merged_header_short_infos) + + return DocChunk( + text=chunk_text, + meta=DocMeta( + doc_items=merged_items, + headings=used_headers, + captions=None, + origin=doc_chunk.meta.origin, + ), + ) + + def get_text_from_item(item: DocItem) -> str: + """DocItem에서 텍스트 추출""" + if isinstance(item, TableItem): + return self._extract_table_text(item, dl_doc) + elif hasattr(item, "text") and item.text: + return item.text + elif isinstance(item, PictureItem): + text = "" + for annotation in item.annotations: + if hasattr(annotation, "text"): + text += annotation.text + return text + return "" + + def split_items_evenly_by_tokens(item_token_counts, max_tokens): + n = len(item_token_counts) + total = sum(item_token_counts) + if n == 0: + return [] + if total <= max_tokens: + return [(0, n)] # ✅ 항상 (a,b) + + k = math.ceil(total / max_tokens) + target = total / k + + P = [0] + for c in item_token_counts: + P.append(P[-1] + c) + + cuts = [0] + used = {0} + for t in range(1, k): + goal = t * target + j = bisect.bisect_left(P, goal) + + cand = [] + if 0 < j < len(P): + cand.append(j) + if 0 <= j - 1 < len(P): + cand.append(j - 1) + + best = None + best_dist = float("inf") + for x in cand: + if x in used: + continue + if x <= cuts[-1]: + continue + if x >= len(P) - 1: # n + continue + dist = abs(P[x] - 
goal) + if dist < best_dist: + best_dist = dist + best = x + + if best is None: + best = min(max(cuts[-1] + 1, 1), len(P) - 2) + + cuts.append(best) + used.add(best) + + cuts.append(n) + + return [(a, b) for a, b in zip(cuts[:-1], cuts[1:])] + + def adjust_captions(items_group): + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + ref_idx_list = [] + if hasattr(item, "captions") and item.captions: + for cap in item.captions: + cap_ref = cap.cref + cap_idx = -1 + for j, it in enumerate(items_group): + if it is None: + continue + if getattr(it[0][0], "self_ref", None) == cap_ref: + cap_idx = j + break + if cap_idx != -1: + ref_idx_list.append(cap_idx) + if ref_idx_list: + ref_idx_list = sorted(ref_idx_list) + + if not ref_idx_list: + continue + + # caption 아이템들을 부모 아이템 바로 뒤로 이동 + for cap_idx in ref_idx_list: + for g in items_group[cap_idx]: + items_group[idx].append(g) + items_group[cap_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + def adjust_pictures_in_tables(items_group): + # picture in table 처리 + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + pic_idx_list = [] + if isinstance(item, TableItem): + table_bbox = item.prov[0].bbox + table_page_no = item.prov[0].page_no + + for j in range(len(items_group)): + if items_group[j] is None: + continue + pic_item = items_group[j][0][0] + if isinstance(pic_item, PictureItem): + # table 안의 picture인지 확인. iou 사용 + pic_bbox = pic_item.prov[0].bbox + pic_page_no = pic_item.prov[0].page_no + if pic_page_no != table_page_no: + continue + ios = pic_bbox.intersection_over_self(table_bbox) + if ios > 0.5: # picture가 50% 이상 table 안에 포함되면 table 안의 picture로 간주 + pic_idx_list.append(j) + if pic_idx_list: + pic_idx_list = sorted(pic_idx_list) + + if not pic_idx_list: + continue + + for pic_idx in pic_idx_list: + for g in items_group[pic_idx]: + items_group[idx].append(g) + items_group[pic_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + # ================================================================ + # 1단계: 섹션 헤더 기준으로 분할 + # ================================================================ + + sections = [] # [(items, header_infos, header_short_infos), ...] 
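+        # Walk the items in order: every section header closes the current section and starts a new one.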
+ cur_items, cur_h_infos, cur_h_short = [], [], [] + + for i, item in enumerate(items): + h_info = header_info_list[i] if i < len(header_info_list) else {} + h_short = header_short_info_list[i] if i < len(header_short_info_list) else {} + + # 섹션 헤더를 만나면 + if self._is_section_header(item): + # 이전 섹션이 있으면 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # 새로운 섹션 시작 + cur_items = [item] + cur_h_infos = [h_info] + cur_h_short = [h_short] + else: + # 섹션 헤더가 아니면 현재 섹션에 추가 + cur_items.append(item) + cur_h_infos.append(h_info) + cur_h_short.append(h_short) + + # 마지막 섹션 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # ================================================================ + # 2단계: 각 섹션의 텍스트에 heading 붙이기 + # ================================================================ + + sections_with_text = [] + for items, header_infos, header_short_infos in sections: + text = self._generate_section_text_with_heading(items, header_short_infos, dl_doc) + sections_with_text.append((text, items, header_infos, header_short_infos)) + + # ================================================================ + # 2.5단계: 너무 긴 청크는 분할 + # ================================================================ + if self.max_tokens > 0: + for i in range(len(sections_with_text)): + text, items, h_infos, h_short = sections_with_text[i] + token_count = self._count_tokens(text) + if token_count < self.max_tokens: + continue + + # caption 및 table 내 그림은 같은 섹션에 있도록 조정 + items_group = [[(item, info, short)] for item, info, short in zip(items, h_infos, h_short)] + items_group = adjust_captions(items_group) + items_group = adjust_pictures_in_tables(items_group) + + # 너무 긴 섹션은 분할 + # 각 아이템 별 token 수 계산 + item_token_counts = [] + for group in items_group: + cur_count = 0 + for g in group: + cur_count += self._count_tokens(get_text_from_item(g[0])) + item_token_counts.append(cur_count) + + # 아이템 그룹들을 토큰 기준으로 균등 분할 + split_info = split_items_evenly_by_tokens(item_token_counts, self.max_tokens) + + # item_groups를 섹션으로 다시 구성 + new_sections = [] + for a, b in split_info: + + # 각 그룹에서 items, h_infos, h_short로 분리 + group_items = [] + group_h_infos = [] + group_h_short = [] + for idx in range(a, b): + for g in items_group[idx]: + group_items.append(g[0]) + group_h_infos.append(g[1]) + group_h_short.append(g[2]) + + new_text = self._generate_section_text_with_heading(group_items, group_h_short, dl_doc) + new_sections.append((new_text, group_items, group_h_infos, group_h_short)) + + # 원래 섹션을 새로 분할된 섹션들로 교체 + sections_with_text.pop(i) + for new_section in reversed(new_sections): + sections_with_text.insert(i, new_section) + + # ================================================================ + # 3단계: 단독 타이틀(1줄만) → 다음 섹션으로 병합 + # ================================================================ + + for i in range(len(sections_with_text) - 2, -1, -1): + text, items, h_infos, h_short = sections_with_text[i] + + # 아이템이 하나인 섹션 헤더만 검사 + if len(items) != 1 or not self._is_section_header(items[0]): + continue + + # 문단이 이미 구성된 것은 제외 (문자 수가 30자 이상이면 문단을 구성했다고 간주) + item_text = "".join(getattr(it, "text", "") for it in items) + if len(item_text) > 30: + continue + + # 현재 섹션헤더 레벨이 다음 섹션헤더 레벨보다 더 높은 경우에만 병합 (높은 레벨이 더 작은 숫자) + n_text, n_items, n_h_infos, n_h_short = sections_with_text[i + 1] + current_level = get_header_level(h_infos, first=False) + next_level = get_header_level(n_h_infos, first=True) + if 0 <= next_level < current_level: + continue + + # 다음 섹션과 병합 + sections_with_text[i] = (text + 
"\n" + n_text, items + n_items, h_infos + n_h_infos, h_short + n_h_short) + sections_with_text.pop(i + 1) + + # ================================================================ + # 4단계: 토큰 기준 병합 + # ================================================================ + + result_chunks = [] + merged_texts, merged_items = [], [] + merged_header_infos, merged_header_short_infos = [], [] + + for text, items, header_infos, header_short_infos in sections_with_text: + + b_new_chunk = False + + # ---------------------------------- + # 병합 가능 여부 판단 + + # 병합 가능 토큰 수 계산 + test_tokens = self._count_tokens("\n".join(merged_texts + [text])) + + # 현재 섹션헤더 레벨과 병합된 섹션헤더 레벨 + section_level = get_header_level(header_infos, first=True) + merged_level = get_header_level(merged_header_infos, first=False) + + # 토큰 수 초과 시 새로운 청크 생성 + if test_tokens > self.max_tokens and len(merged_texts) > 0: + b_new_chunk = True + # 현재 섹션헤더 레벨이 더 높으면 새로운 청크 생성 + elif 0 <= section_level < merged_level: + b_new_chunk = True + # ---------------------------------- + + # 새로운 청크 생성 + if b_new_chunk: + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + # 새로운 병합 시작 + merged_texts = [text] + merged_items = items + merged_header_infos = header_infos + merged_header_short_infos = header_short_infos + else: + # 현재 섹션 병합 + merged_texts.append(text) + merged_items.extend(items) + merged_header_infos.extend(header_infos) + merged_header_short_infos.extend(header_short_infos) + + # 마지막 병합된 items 처리 + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + return result_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서를 청킹하여 반환 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 토큰 제한에 맞게 분할된 청크들 + """ + doc_chunks = list(self.preprocess(dl_doc=dl_doc, **kwargs)) + + if not doc_chunks: + return iter([]) + + doc_chunk = doc_chunks[0] # preprocess는 하나의 청크만 반환 + + final_chunks = self._split_document_by_tokens(doc_chunk, dl_doc) + + return iter(final_chunks) + + +class SimpleChunker(BaseChunker): + chunk_size: int = 1000 + + def chunk(self, dl_doc: DoclingDocument, **kwargs: dict): + if "chunk_size" in kwargs: + print(f"@@@@ 기본 chunk_size를 사용합니다: {self.chunk_size}") + + chunk_size = kwargs.get("chunk_size", self.chunk_size) + + # 모든 아이템 수집 + all_items: list[DocItem] = [] + for item, _ in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}): + if isinstance(item, (TextItem, ListItem, CodeItem, SectionHeaderItem, TableItem, PictureItem)): + all_items.append(item) + + if not all_items: + return iter([]) + + def get_text(item: DocItem) -> str: + if isinstance(item, TableItem): + try: + return item.export_to_markdown(dl_doc) or "" + except Exception: + return getattr(item, "text", "") or "" + return getattr(item, "text", "") or "" + + chunks: list[DocChunk] = [] + current_items: list[DocItem] = [] + current_len = 0 + + for item in all_items: + item_text = get_text(item) + item_len = len(item_text) + + # 현재 청크가 비어있지 않고 추가하면 chunk_size 초과 시 저장 + if current_items and current_len + item_len + 1 > chunk_size: + chunks.append( + DocChunk( + text="\n".join(get_text(it) for it in current_items), + meta=DocMeta( + doc_items=current_items, + headings=None, + captions=None, + origin=dl_doc.origin, + ), + ) + ) + current_items = [] + current_len = 0 + + current_items.append(item) + current_len += item_len + 
(1 if current_len else 0)
+
+        # 마지막 청크 처리
+        if current_items:
+            chunks.append(
+                DocChunk(
+                    text="\n".join(get_text(it) for it in current_items),
+                    meta=DocMeta(
+                        doc_items=current_items,
+                        headings=None,
+                        captions=None,
+                        origin=dl_doc.origin,
+                    ),
+                )
+            )
+
+        return iter(chunks)
+
+
+CHUNKERS = {
+    "bucket": GenosBucketChunker,
+    "simple": SimpleChunker,
+}
diff --git a/genon/preprocessor/module/utils/genos_util.py b/genon/preprocessor/module/utils/genos_util.py
new file mode 100644
index 0000000000..23d808cfe9
--- /dev/null
+++ b/genon/preprocessor/module/utils/genos_util.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+
+class GenosServiceException(Exception):
+    """GenOS 와의 의존성 부분 제거를 위해 추가"""
+
+    def __init__(
+        self,
+        error_code: str,
+        error_msg: Optional[str] = None,
+        msg_params: Optional[dict] = None,
+    ) -> None:
+        self.code = 1
+        self.error_code = error_code
+        self.error_msg = error_msg or "GenOS Service Exception"
+        self.msg_params = msg_params or {}
+
+    def __repr__(self) -> str:
+        class_name = self.__class__.__name__
+        return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})"
diff --git a/genon/preprocessor/module/utils/metadata.py b/genon/preprocessor/module/utils/metadata.py
new file mode 100644
index 0000000000..6c0f96ab49
--- /dev/null
+++ b/genon/preprocessor/module/utils/metadata.py
@@ -0,0 +1,352 @@
+import re
+from collections import defaultdict
+from pydantic import BaseModel
+from typing import Optional, List
+from datetime import datetime
+import json
+
+import asyncio
+from fastapi import Request
+from langchain_core.documents import Document
+from docling_core.types import DoclingDocument
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.transforms.chunker import DocChunk
+from docling_core.types.doc import PictureItem
+
+try:
+    from genos_utils import upload_files  # TODO 이거 확인하기
+except ImportError:
+    upload_files = None
+
+
+class GenOSVectorMeta(BaseModel):
+    class Config:
+        extra = "allow"
+
+    text: str = None
+    n_char: int = None
+    n_word: int = None
+    n_line: int = None
+    e_page: int = None
+    i_page: int = None
+    i_chunk_on_page: int = None
+    n_chunk_of_page: int = None
+    i_chunk_on_doc: int = None
+    n_chunk_of_doc: int = None
+    n_page: int = None
+    reg_date: str = None
+    chunk_bboxes: str = None
+    media_files: str = None
+    title: str = None
+    created_date: int = None
+    appendix: str = None  ## !! appendix feature (2025-09-30, geonhee kim) !!
+
+
+class GenOSVectorMetaBuilder:
+    def __init__(self):
+        """빌더 초기화"""
+        self.text: Optional[str] = None
+        self.n_char: Optional[int] = None
+        self.n_word: Optional[int] = None
+        self.n_line: Optional[int] = None
+        self.i_page: Optional[int] = None
+        self.e_page: Optional[int] = None
+        self.i_chunk_on_page: Optional[int] = None
+        self.n_chunk_of_page: Optional[int] = None
+        self.i_chunk_on_doc: Optional[int] = None
+        self.n_chunk_of_doc: Optional[int] = None
+        self.n_page: Optional[int] = None
+        self.reg_date: Optional[str] = None
+        self.chunk_bboxes: Optional[str] = None
+        self.media_files: Optional[str] = None
+        self.title: Optional[str] = None
+        self.created_date: Optional[int] = None
+        self.appendix: Optional[str] = None  # !! appendix feature (2025-09-30, geonhee kim) !!
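+
+    # parse_created_date normalizes the authoring date to an int: "2024-03" -> 20240301, "2024" -> 20240101, unparseable -> 0.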
+ + def parse_created_date(self, date_text: str) -> Optional[int]: + """ + 작성일 텍스트를 파싱하여 YYYYMMDD 형식의 정수로 변환 + + Args: + date_text: 작성일 텍스트 (YYYY-MM 또는 YYYY-MM-DD 형식) + + Returns: + YYYYMMDD 형식의 정수, 파싱 실패시 None + """ + if not date_text or not isinstance(date_text, str) or date_text == "None": + return 0 + + # 공백 제거 및 정리 + date_text = date_text.strip() + + # YYYY-MM-DD 형식 매칭 + match_full = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", date_text) + if match_full: + year, month, day = match_full.groups() + try: + # 유효한 날짜인지 검증 + datetime(int(year), int(month), int(day)) + return int(f"{year}{month.zfill(2)}{day.zfill(2)}") + except ValueError: + pass + + # YYYY-MM 형식 매칭 (일자는 01로 설정) + match_month = re.match(r"^(\d{4})-(\d{1,2})$", date_text) + if match_month: + year, month = match_month.groups() + try: + # 유효한 월인지 검증 + datetime(int(year), int(month), 1) + return int(f"{year}{month.zfill(2)}01") + except ValueError: + pass + + # YYYY 형식 매칭 (월일은 0101로 설정) + match_year = re.match(r"^(\d{4})$", date_text) + if match_year: + year = match_year.group(1) + try: + datetime(int(year), 1, 1) + return int(f"{year}0101") + except ValueError: + pass + + return 0 + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + "l": bbox.l / size.width, + "t": bbox.t / size.height, + "r": bbox.r / size.width, + "b": bbox.b / size.height, + "coord_origin": bbox.coord_origin.value, + } + chunk_bboxes.append({"page": page_no, "bbox": bbox_data, "type": type_, "ref": label}) + self.e_page = max([bbox["page"] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + if item.image is None: + print("@@@@ item.image is None: pipeline_options - generate_picture_images False!!") + continue + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({"name": name, "type": "image", "ref": item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def get_title(self, document): + title = "" + for item, _ in document.iterate_items(): + if hasattr(item, "label"): + if item.label == DocItemLabel.TITLE: + title = item.text.strip() if item.text else "" + break + + return title + + def get_created_data(self, document: DoclingDocument): + created_date = 0 + try: + if ( + document.key_value_items + 
and len(document.key_value_items) > 0 + and hasattr(document.key_value_items[0], "graph") + and hasattr(document.key_value_items[0].graph, "cells") + and len(document.key_value_items[0].graph.cells) > 1 + ): + # 작성일 추출 (cells[1]) + date_text = document.key_value_items[0].graph.cells[1].text + created_date = self.parse_created_date(date_text) + except (AttributeError, IndexError) as e: + pass + + return created_date + + def get_appendix_keywords( + self, content: str, appendix_list: list + ) -> str: # !! appendix feature (2025-09-30, geonhee kim) !! + if not content or not appendix_list: + return "" + + matched_appendices = [] + + # 1. Find appendix patterns in content first + found_patterns = [] + + # Complex patterns: 별지/별표/장부 + numbers (with hyphens, Roman numerals) + # Updated regex to capture full patterns like "별지 제 Ⅰ -1 호 서식" by matching until closing delimiters + content = re.sub(r"\s+", "", content) + complex_patterns = re.findall(r"(별지|별표|장부)(?:제)?([^<>()\[\]]+?)(?=(?:호|서식)|[<>\)\]]|$)", content) + for pattern_type, number in complex_patterns: + found_patterns.extend( + [ + f"{pattern_type} {number}", + f"{pattern_type} 제{number}호", + f"{pattern_type}{number}", + f"{pattern_type}제{number}호", + ] + ) + + # Standalone patterns: (별표), (별지), (장부) + standalone_patterns = re.findall(r"[\(\[]+(별지|별표|장부)[\)\]]+", content) + for pattern_type in set(standalone_patterns): + found_patterns.extend( + [ + pattern_type, + f"{pattern_type}", + ] + ) + + # 2. Check if found patterns match any appendix in the list + for appendix in appendix_list: + if not appendix or not isinstance(appendix, str): + continue + + appendix_clean = appendix.replace(".pdf", "").lower().strip() + + # If any found pattern exists in appendix filename, it's a match + for pattern in found_patterns: + if pattern.lower().strip() in appendix_clean: + matched_appendices.append(appendix) + break # Prevent duplicates + + return ", ".join(matched_appendices) if matched_appendices else "" + + def get_chunk_count(self, chunks: List[DocChunk]): + page_chunk_counts = defaultdict(int) + + for chunk in chunks: + page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + + return page_chunk_counts + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + title=self.title, + created_date=self.created_date, + appendix=self.appendix or "", # !! appendix feature (2025-09-30, geonhee kim) !! + ) + + async def __call__( + self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, **kwargs: dict + ): + title = self.get_title(document) + created_date = self.get_created_data(document) + page_chunk_counts = self.get_chunk_count(chunks) + + # kwargs에서 부록 정보 추출 !! appendix feature (2025-09-30, geonhee kim) !! 
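+        # "appendix" may arrive as a JSON-encoded string or as an already-parsed list; both are normalized below.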
+ appendix_info = kwargs.get("appendix", "") + appendix_list = [] + if isinstance(appendix_info, str): + appendix_list = ( + [item.strip() for item in json.loads(appendix_info) if item.strip()] if appendix_info else [] + ) + elif isinstance(appendix_info, list): + appendix_list = appendix_info + else: + appendix_list = [] + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec="seconds") + "Z", + created_date=created_date, + title=title, + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + # header 앞에 헤더 마커 추가 (HEADER: ) + headers_text = "HEADER: " + ", ".join(chunk.meta.headings) + "\n" if chunk.meta.headings else "" + content = headers_text + chunk.text + + # appendix 추출 !! appendix feature (2025-09-30, geonhee kim) !! + matched_appendices = self.get_appendix_keywords(content, appendix_list) + # print(appendix_list, matched_appendices) + chunk_global_metadata = global_metadata.copy() + chunk_global_metadata["appendix"] = matched_appendices # Only matched ones + ### + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = ( + GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**chunk_global_metadata) #!! appendix feature (2025-09-30, geonhee kim) !! + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + if upload_files: + file_list = self.get_media_files(chunk.meta.doc_items) + upload_tasks.append(asyncio.create_task(upload_files(file_list, request=request))) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + + return vectors diff --git a/genon/preprocessor/module/utils/util.py b/genon/preprocessor/module/utils/util.py new file mode 100644 index 0000000000..eca19f8b7c --- /dev/null +++ b/genon/preprocessor/module/utils/util.py @@ -0,0 +1,146 @@ +import os +from pathlib import Path + +import shutil +import unicodedata +import tempfile +import subprocess + +from markdown2 import markdown + +try: + from weasyprint import HTML +except ImportError: + print("Warning: WeasyPrint could not be imported. PDF conversion features will be disabled.") + HTML = None + + +def convert_to_pdf(file_path: str) -> str | None: + """ + LibreOffice로 PDF 변환을 시도한다. + 실패해도 예외를 던지지 않고 None을 반환한다. 
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix(".pdf") + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in (".ppt", ".pptx"): + convert_arg = "pdf:impress_pdf_Export" + elif ext in (".doc", ".docx"): + convert_arg = "pdf:writer_pdf_Export" + elif ext in (".xls", ".xlsx", ".csv"): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode("ascii") + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize("NFKD", in_path.stem).encode("ascii", "ignore").decode("ascii") or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", + "--headless", + "--convert-to", + convert_arg, + "--outdir", + str(out_dir), + str(cand), + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str, CONVERTIBLE_EXTENSIONS: list) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, ".pdf") + return pdf_path + + +def get_real_file_type(file_path: str) -> str: + """파일 확장자가 아닌 실제 내용으로 파일 타입 판단""" + with open(file_path, "rb") as f: + header = f.read(8) + if header.startswith(b"%PDF-"): + return "pdf" + elif header.startswith(b"\x89PNG"): + return "png" + elif header.startswith(b"\xff\xd8\xff"): + return "jpg" + + # 매직 헤더로 판단할 수 없으면 확장자 사용 + return os.path.splitext(file_path)[-1].lower() + + +def convert_md_to_pdf(md_path): + """Markdown 파일을 PDF로 변환""" + install_packages(["chardet"]) + import chardet + + pdf_path = md_path.replace(".md", ".pdf") + with open(md_path, "rb") as f: + raw_file = f.read() + candidates = ["utf-8", "utf-8-sig"] + try: + det = (chardet.detect(raw_file) or {}).get("encoding") or "" + # chardet가 ascii/unknown이면 무시. 그 외면 후보에 추가 + if det and det.lower() not in ("ascii", "unknown"): + if det.lower() not in [c.lower() for c in candidates]: + candidates.append(det) + except Exception: + pass + candidates += ["cp949", "euc-kr", "iso-8859-1", "latin-1"] + md_content = None + for enc in candidates: + try: + md_content = raw_file.decode(enc) + break + except UnicodeDecodeError: + continue + if md_content is None: + md_content = raw_file.decode("utf-8", errors="replace") + + html_content = markdown(md_content) + if HTML: + HTML(string=html_content).write_pdf(pdf_path) + return pdf_path