From 4069830c67cde556329258cebb78d893f94614e2 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 11 Dec 2025 06:02:42 +0000 Subject: [PATCH 01/19] fix: config --- build-script/doc-parser-build.config | 4 ++-- build-script/paddle-ocr-build.config | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build-script/doc-parser-build.config b/build-script/doc-parser-build.config index eb54693edd..fd38ddb1a5 100644 --- a/build-script/doc-parser-build.config +++ b/build-script/doc-parser-build.config @@ -6,10 +6,10 @@ DOCKER_REGISTRY=mncregistry:30500 IMAGE_NAME=doc-parser-preprocessor # 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능) -IMAGE_VERSION=1.3.0 +IMAGE_VERSION=1.3.3-komipo # 실제 Dockerfile 위치 (루트 기준) DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile # 빌드 후 push 할지 여부 -PUSH_IMAGE=true +PUSH_IMAGE=false diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config index 8c9ced262e..9a31ed49bb 100644 --- a/build-script/paddle-ocr-build.config +++ b/build-script/paddle-ocr-build.config @@ -6,7 +6,7 @@ DOCKERFILE=genon/serving/paddle/docker/Dockerfile # 이미지 이름/태그 IMAGE_NAME=doc-parser-ocr -IMAGE_TAG=0.0.0 +IMAGE_TAG=1.3.3-komipo # 푸시할 레지스트리 (없으면 빈값) REGISTRY=mncregistry:30500 From 316d532572d3e2a2b338438fe516e3598ecdf655 Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:26:54 +0900 Subject: [PATCH 02/19] fix: config --- genon/preprocessor/scripts/register.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genon/preprocessor/scripts/register.config b/genon/preprocessor/scripts/register.config index 5961c9085d..f41c99c5e7 100644 --- a/genon/preprocessor/scripts/register.config +++ b/genon/preprocessor/scripts/register.config @@ -1,7 +1,7 @@ # 필수 REGISTRY_NAME="mncregistry:30500/" IMAGE_NAME="mnc/doc-parser-preprocessor" -IMAGE_TAG="1.3.0" +IMAGE_TAG="1.3.3-komipo" DESCRIPTION="unified-preprocessor" # K8s / DB 파드 위치 From 2601b2a19d812e2d1c2601e49813f7b15437a927 Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:29:55 +0900 Subject: [PATCH 03/19] fix: script --- genon/preprocessor/scripts/register_image.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/genon/preprocessor/scripts/register_image.sh b/genon/preprocessor/scripts/register_image.sh index 613fa2738a..325fb5c18b 100644 --- a/genon/preprocessor/scripts/register_image.sh +++ b/genon/preprocessor/scripts/register_image.sh @@ -57,14 +57,14 @@ else read -srp "MySQL 비밀번호: " MYSQL_PASS; echo fi -# ── 로컬 이미지 확인 ──────────────────────────────────────── -step "로컬 Docker 이미지 확인" -if docker images | awk '{print $1":"$2}' | grep -qx "${FULL_IMAGE_NAME}"; then - ok "로컬 이미지 존재" -else - fail "로컬에 ${FULL_IMAGE_NAME} 없음. 먼저 build/push 하세요." - exit 1 -fi +# # ── 로컬 이미지 확인 ──────────────────────────────────────── +# step "로컬 Docker 이미지 확인" +# if docker images | awk '{print $1":"$2}' | grep -qx "${FULL_IMAGE_NAME}"; then +# ok "로컬 이미지 존재" +# else +# fail "로컬에 ${FULL_IMAGE_NAME} 없음. 먼저 build/push 하세요." 
+# exit 1 +# fi # ── docker push (포그라운드 / 재시도) ─────────────────────── step "docker push" @@ -109,7 +109,7 @@ if [ -z "${EXISTING_ID}" ]; then INSERT INTO llmops.resource_meta_tb (resource_id, resource_type, resource_group_id, is_active, reg_date, mod_date, reg_user_id, mod_user_id) VALUES - (LAST_INSERT_ID(), 'DOCKER_IMAGE', 2, 1, NOW(), NOW(), 1, 1); + (LAST_INSERT_ID(), 'DOCKER_IMAGE', 1, 1, NOW(), NOW(), 1, 1); " 2>/dev/null IMAGE_ID=$( From fa4de8a79e43a9916509490454802db474ac552f Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Wed, 17 Dec 2025 13:08:57 +0900 Subject: [PATCH 04/19] fix:readme --- genon/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/genon/README.md b/genon/README.md index a04ebd0783..4e28533cac 100644 --- a/genon/README.md +++ b/genon/README.md @@ -60,7 +60,7 @@ 6. 사이트 배포 시 ```shell 1. 이미지 저장 -docker save mncregistry:30500/mnc/doc-parser-preprocessor:latest | gzip > doc-parser-preprocessor.tar.gz +docker save mncregistry:30500/mnc/doc-parser-preprocessor:1.3.3-komipo | gzip > doc-parser-preprocessor.tar.gz 2. 사이트에서 이미지 복원 gunzip -c doc-parser-preprocessor.tar.gz | docker load 3. register_image.sh 파일 실행 @@ -75,4 +75,10 @@ gunzip -c doc-parser-preprocessor.tar.gz | docker load ```shell kubectl apply -f doc-parser-ocr-deployment.yaml ``` -5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) \ No newline at end of file +5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) + +사이트에서 +``` +docker save mncregistry:30500/doc-parser-ocr:1.3.3-komipo | gzip > doc-parser-ocr.tar.gz +gunzip -c doc-parser-ocr.tar.gz | docker load +``` From 2913f752aa1da532a241b664d34dd47a2c24cb38 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 26 Jan 2026 16:43:35 +0900 Subject: [PATCH 05/19] fix: miniLM path --- genon/preprocessor/facade/intelligent_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/genon/preprocessor/facade/intelligent_processor.py b/genon/preprocessor/facade/intelligent_processor.py index a2487d05f4..33d04e1ccd 100644 --- a/genon/preprocessor/facade/intelligent_processor.py +++ b/genon/preprocessor/facade/intelligent_processor.py @@ -349,8 +349,10 @@ class HybridChunker(BaseChunker): model_config = ConfigDict(arbitrary_types_allowed=True) + # 해당 경로에 all-MiniLM-L6-v2 위치 시키기 + # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 tokenizer: Union[PreTrainedTokenizerBase, str] = ( - "sentence-transformers/all-MiniLM-L6-v2" + "/nfs-root/all-MiniLM-L6-v2" ) max_tokens: int = 1024 merge_peers: bool = True From d8f91063284801f2f49b60c84991dc55056b9d93 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 12 Feb 2026 09:51:07 +0900 Subject: [PATCH 06/19] =?UTF-8?q?fix:=20html=20=EC=95=84=EB=8B=8C=20?= =?UTF-8?q?=EA=B2=BD=EC=9A=B0=EC=97=90=EB=A7=8C=20ocr=20=EC=88=98=ED=96=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/facade/intelligent_processor_ocr.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/genon/preprocessor/facade/intelligent_processor_ocr.py b/genon/preprocessor/facade/intelligent_processor_ocr.py index 807493fe73..b720d3bf26 100644 --- a/genon/preprocessor/facade/intelligent_processor_ocr.py +++ b/genon/preprocessor/facade/intelligent_processor_ocr.py @@ -1288,8 +1288,11 @@ async def __call__(self, request: Request, file_path: str, 
**kwargs: dict): # OCR이 필요하다고 판단되면 OCR 수행 document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) - # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) - document: DoclingDocument = self.ocr_all_table_cells(document, file_path) + if document.origin.mimetype == "text/html": + pass + else: + # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) + document: DoclingDocument = self.ocr_all_table_cells(document, file_path) output_path, output_file = os.path.split(file_path) filename, _ = os.path.splitext(output_file) From 4ba3747b0e96fbd1a371377cd71ffad4c4157feb Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Fri, 13 Feb 2026 11:13:29 +0900 Subject: [PATCH 07/19] =?UTF-8?q?add:=20=EB=B3=B4=EC=95=88=EC=BB=A8?= =?UTF-8?q?=EC=84=A4=ED=8C=85=20=EC=A1=B0=EC=B9=98=EC=82=AC=ED=95=AD?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20=EC=9D=B8=ED=95=B4=20=EC=B2=A8=EB=B6=80?= =?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EA=B0=80=EB=93=9C=EB=A0=88=EC=9D=BC=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../facade/attachment_processor.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py index 140704a84a..28c65bed96 100644 --- a/genon/preprocessor/facade/attachment_processor.py +++ b/genon/preprocessor/facade/attachment_processor.py @@ -90,6 +90,41 @@ # pdf 변환 대상 확장자 CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] +## 보안컨설팅 조치로 인한 가드레일 추가 + +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = "" +GENOS_URL = "" + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + + for r in result: + url = f"{GENOS_URL}/api/gateway/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {"question": r.text} + + res = requests.post(f"{url}/run/v2", json=body, headers=headers) + + answer = res.json()["data"]["text"] + + if answer.startswith("[UNSAFE]"): + r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." 
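+                # NOTE: despite the message text, the chunk is not dropped here;
+                # its text is replaced with a placeholder so the vector count
+                # stays aligned with the chunk list returned by the wrapped call.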
+ + return result + + return wrapper + def convert_to_pdf(file_path: str) -> str | None: """ @@ -1383,6 +1418,7 @@ def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict return vectors + @guardrail async def __call__(self, request: Request, file_path: str, **kwargs: dict): ext = os.path.splitext(file_path)[-1].lower() if ext in ('.wav', '.mp3', '.m4a'): From 41ba56a2fbafb6ee8251c8dc732dc4c7c13ca574 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 11 Dec 2025 06:02:42 +0000 Subject: [PATCH 08/19] fix: config --- build-script/doc-parser-build.config | 13 ++----------- build-script/paddle-ocr-build.config | 2 +- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/build-script/doc-parser-build.config b/build-script/doc-parser-build.config index f54b799ee3..c909a43497 100644 --- a/build-script/doc-parser-build.config +++ b/build-script/doc-parser-build.config @@ -6,19 +6,10 @@ DOCKER_REGISTRY=mncregistry:30500 IMAGE_NAME=doc-parser-preprocessor # 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능) -IMAGE_VERSION=1.3.6.2 +IMAGE_VERSION=1.3.7-komipo # 실제 Dockerfile 위치 (루트 기준) DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile # 빌드 후 push 할지 여부 -PUSH_IMAGE=true - -# USER, GROUP -APP_UID=3000 -APP_GID=3000 -APP_UNAME=genos -APP_GNAME=genos - -# NLTK packages (comma-separated). Use "all" to download everything. -APP_NLTK_PACKAGES=punkt,stopwords,averaged_perceptron_tagger,averaged_perceptron_tagger_eng,wordnet,omw-1.4 +PUSH_IMAGE=false diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config index 8c9ced262e..9a31ed49bb 100644 --- a/build-script/paddle-ocr-build.config +++ b/build-script/paddle-ocr-build.config @@ -6,7 +6,7 @@ DOCKERFILE=genon/serving/paddle/docker/Dockerfile # 이미지 이름/태그 IMAGE_NAME=doc-parser-ocr -IMAGE_TAG=0.0.0 +IMAGE_TAG=1.3.3-komipo # 푸시할 레지스트리 (없으면 빈값) REGISTRY=mncregistry:30500 From 7afc6f96312fb2d76c2b9640f302e1cfbf7e8bfe Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:26:54 +0900 Subject: [PATCH 09/19] fix: config --- genon/preprocessor/scripts/register.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genon/preprocessor/scripts/register.config b/genon/preprocessor/scripts/register.config index 5961c9085d..f41c99c5e7 100644 --- a/genon/preprocessor/scripts/register.config +++ b/genon/preprocessor/scripts/register.config @@ -1,7 +1,7 @@ # 필수 REGISTRY_NAME="mncregistry:30500/" IMAGE_NAME="mnc/doc-parser-preprocessor" -IMAGE_TAG="1.3.0" +IMAGE_TAG="1.3.3-komipo" DESCRIPTION="unified-preprocessor" # K8s / DB 파드 위치 From 2ebdbd9781467f698e7accdcfdadbf2db7dc0958 Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Thu, 11 Dec 2025 15:29:55 +0900 Subject: [PATCH 10/19] fix: script --- genon/preprocessor/scripts/register_image.sh | 43 ++++++++------------ 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/genon/preprocessor/scripts/register_image.sh b/genon/preprocessor/scripts/register_image.sh index 8c872ef7ed..0610cb54db 100644 --- a/genon/preprocessor/scripts/register_image.sh +++ b/genon/preprocessor/scripts/register_image.sh @@ -58,15 +58,14 @@ else read -srp "MySQL 비밀번호: " MYSQL_PASS; echo fi -# ── 로컬 이미지 확인 ──────────────────────────────────────── -step "로컬 Docker 이미지 확인" -if docker image inspect "${FULL_IMAGE_NAME}" >/dev/null 2>&1; then - ok "로컬 이미지 존재" - HAS_LOCAL_IMAGE="yes" -else - echo "⚠️ 로컬에 ${FULL_IMAGE_NAME} 없음." 
- HAS_LOCAL_IMAGE="no" -fi +# # ── 로컬 이미지 확인 ──────────────────────────────────────── +# step "로컬 Docker 이미지 확인" +# if docker images | awk '{print $1":"$2}' | grep -qx "${FULL_IMAGE_NAME}"; then +# ok "로컬 이미지 존재" +# else +# fail "로컬에 ${FULL_IMAGE_NAME} 없음. 먼저 build/push 하세요." +# exit 1 +# fi # ── docker push (포그라운드 / 재시도) ─────────────────────── step "docker push" @@ -160,22 +159,16 @@ if [ -z "${EXISTING_ID}" ]; then INSERT INTO llmops.resource_meta_tb (resource_id, resource_type, resource_group_id, is_active, reg_date, mod_date, reg_user_id, mod_user_id) VALUES - (LAST_INSERT_ID(), 'DOCKER_IMAGE', 2, 1, NOW(), NOW(), 1, 1); - " - if ! MYSQL_OUT="$(mysql_query "${SQL_INSERT}")"; then - fail "DB 등록 실패. 아래 로그 확인 필요." - echo "${MYSQL_OUT}" - exit 1 - fi - - MYSQL_OUT="" - if ! MYSQL_OUT="$(mysql_query "${SQL_EXISTING}")"; then - fail "DB 조회 실패(등록 후). 아래 로그 확인 필요." - echo "${MYSQL_OUT}" - exit 1 - fi - IMAGE_ID="$(printf '%s' "${MYSQL_OUT}" | tr -d '\r\n' | grep -Eo '^[0-9]+$' || true)" - ok "DB 등록 완료. 이미지 ID: ${IMAGE_ID}" + (LAST_INSERT_ID(), 'DOCKER_IMAGE', 1, 1, NOW(), NOW(), 1, 1); + " 2>/dev/null + + IMAGE_ID=$( + kubectl exec -it "${MARIADB_POD}" -n "${K8S_NAMESPACE}" -- \ + mysql -u "${MYSQL_USER}" -p"${MYSQL_PASS}" llmops -se \ + "SELECT id FROM system_docker_image_tb WHERE name='${IMAGE_NAME}' AND tag='${IMAGE_TAG}';" \ + 2>/dev/null | tr -d '\r\n' | grep -o '[0-9]*' || true + ) + echo "✅ DB 등록 완료. 이미지 ID: ${IMAGE_ID}" else ok "이미 등록된 이미지입니다. ID: ${EXISTING_ID}" IMAGE_ID="${EXISTING_ID}" From 52ecf20a9e2a06b0b90a9186543d5d0581970b7c Mon Sep 17 00:00:00 2001 From: "seunghyun.nam" Date: Wed, 17 Dec 2025 13:08:57 +0900 Subject: [PATCH 11/19] fix:readme --- genon/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/genon/README.md b/genon/README.md index a04ebd0783..4e28533cac 100644 --- a/genon/README.md +++ b/genon/README.md @@ -60,7 +60,7 @@ 6. 사이트 배포 시 ```shell 1. 이미지 저장 -docker save mncregistry:30500/mnc/doc-parser-preprocessor:latest | gzip > doc-parser-preprocessor.tar.gz +docker save mncregistry:30500/mnc/doc-parser-preprocessor:1.3.3-komipo | gzip > doc-parser-preprocessor.tar.gz 2. 사이트에서 이미지 복원 gunzip -c doc-parser-preprocessor.tar.gz | docker load 3. register_image.sh 파일 실행 @@ -75,4 +75,10 @@ gunzip -c doc-parser-preprocessor.tar.gz | docker load ```shell kubectl apply -f doc-parser-ocr-deployment.yaml ``` -5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) \ No newline at end of file +5. 
노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml) + +사이트에서 +``` +docker save mncregistry:30500/doc-parser-ocr:1.3.3-komipo | gzip > doc-parser-ocr.tar.gz +gunzip -c doc-parser-ocr.tar.gz | docker load +``` From 849a32f23dc5899c2e970de68b291051d8e56698 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 12 Feb 2026 09:51:07 +0900 Subject: [PATCH 12/19] =?UTF-8?q?fix:=20html=20=EC=95=84=EB=8B=8C=20?= =?UTF-8?q?=EA=B2=BD=EC=9A=B0=EC=97=90=EB=A7=8C=20ocr=20=EC=88=98=ED=96=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...4\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git "a/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" "b/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" index df58f2f2a7..c69a67dd5b 100644 --- "a/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" +++ "b/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" @@ -1253,8 +1253,11 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict): # OCR이 필요하다고 판단되면 OCR 수행 document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) - # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) - document: DoclingDocument = self.ocr_all_table_cells(document, file_path) + if document.origin.mimetype == "text/html": + pass + else: + # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) + document: DoclingDocument = self.ocr_all_table_cells(document, file_path) output_path, output_file = os.path.split(file_path) filename, _ = os.path.splitext(output_file) From 055d059751bec917ad82e384ad1dd25b6c0fbad4 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Fri, 13 Feb 2026 11:13:29 +0900 Subject: [PATCH 13/19] =?UTF-8?q?add:=20=EB=B3=B4=EC=95=88=EC=BB=A8?= =?UTF-8?q?=EC=84=A4=ED=8C=85=20=EC=A1=B0=EC=B9=98=EC=82=AC=ED=95=AD?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20=EC=9D=B8=ED=95=B4=20=EC=B2=A8=EB=B6=80?= =?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EA=B0=80=EB=93=9C=EB=A0=88=EC=9D=BC=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../facade/attachment_processor.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py index f5fea12eba..32c8cf2052 100644 --- a/genon/preprocessor/facade/attachment_processor.py +++ b/genon/preprocessor/facade/attachment_processor.py @@ -99,6 +99,41 @@ # pdf 변환 대상 확장자 CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] +## 보안컨설팅 조치로 인한 가드레일 추가 + +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = "" +GENOS_URL = "" + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + + for r in result: + url = f"{GENOS_URL}/api/gateway/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {"question": r.text} + + res = requests.post(f"{url}/run/v2", json=body, headers=headers) + + answer = res.json()["data"]["text"] + + if answer.startswith("[UNSAFE]"): + 
r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." + + return result + + return wrapper + def convert_to_pdf(file_path: str) -> str | None: """ @@ -1432,6 +1467,7 @@ def get_level_name(level_num: int) -> str: # root logger level 적용 logging.getLogger().setLevel(level) + @guardrail async def __call__(self, request: Request, file_path: str, **kwargs: dict): self.setup_logging(kwargs.get('log_level', 4)) From 0f03ec661c3b86be3635212aac6c2cc048e90b63 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Wed, 11 Mar 2026 14:03:17 +0900 Subject: [PATCH 14/19] =?UTF-8?q?chore:=20=EB=A6=AC=EB=B2=A0=EC=9D=B4?= =?UTF-8?q?=EC=8A=A4=20=EC=9E=91=EC=97=85=EC=9D=80=20=EC=84=B1=EB=AF=BC=20?= =?UTF-8?q?=ED=94=84=EB=A1=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/facade/attachment_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py index 32c8cf2052..2adb82a385 100644 --- a/genon/preprocessor/facade/attachment_processor.py +++ b/genon/preprocessor/facade/attachment_processor.py @@ -214,6 +214,7 @@ def _get_pdf_path(file_path: str) -> str: return pdf_path + def install_packages(packages): for package in packages: try: From 76543992a3b37d2749f61fcc920632e1771e92fb Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Wed, 11 Mar 2026 14:22:15 +0900 Subject: [PATCH 15/19] fix: img tag --- build-script/paddle-ocr-build.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config index 9a31ed49bb..ea2f74486d 100644 --- a/build-script/paddle-ocr-build.config +++ b/build-script/paddle-ocr-build.config @@ -6,7 +6,7 @@ DOCKERFILE=genon/serving/paddle/docker/Dockerfile # 이미지 이름/태그 IMAGE_NAME=doc-parser-ocr -IMAGE_TAG=1.3.3-komipo +IMAGE_TAG=1.3.7-komipo # 푸시할 레지스트리 (없으면 빈값) REGISTRY=mncregistry:30500 From 1f7bb47bc3cf0e627ea7f25378f726ff4982c9f6 Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 6 Apr 2026 11:27:50 +0900 Subject: [PATCH 16/19] =?UTF-8?q?feat:=20=EC=B2=A8=EB=B6=80=EC=9A=A9=20?= =?UTF-8?q?=EC=A0=84=EC=B2=98=EB=A6=AC=EA=B8=B0=EC=97=90=20=EA=B0=80?= =?UTF-8?q?=EB=93=9C=EB=A0=88=EC=9D=BC=20=EC=B6=94=EA=B0=80.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../facade/attachment_processor_guardrail.py | 1626 +++++++++++++++++ 1 file changed, 1626 insertions(+) create mode 100644 genon/preprocessor/facade/attachment_processor_guardrail.py diff --git a/genon/preprocessor/facade/attachment_processor_guardrail.py b/genon/preprocessor/facade/attachment_processor_guardrail.py new file mode 100644 index 0000000000..0064b76832 --- /dev/null +++ b/genon/preprocessor/facade/attachment_processor_guardrail.py @@ -0,0 +1,1626 @@ +from __future__ import annotations + +from collections import defaultdict + +import asyncio +import fitz +import json +import math +import os +import pandas as pd +import pydub +import requests +import shutil +import subprocess +import sys +import threading +import uuid +import warnings +from datetime import datetime +from fastapi import Request +from glob import glob +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import ( + # TextLoader, # TXT + PyMuPDFLoader, # PDF + DataFrameLoader, # DataFrame + UnstructuredWordDocumentLoader, # DOC and DOCX + UnstructuredPowerPointLoader, # PPT and PPTX + UnstructuredImageLoader, # 
JPG, PNG + UnstructuredMarkdownLoader, # Markdown + UnstructuredFileLoader, # Generic fallback +) +from langchain_core.documents import Document +from markdown2 import markdown +from pandas import DataFrame +from pathlib import Path +from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator +from typing import Any, Iterable, Iterator, List, Optional, Union +from typing_extensions import Self + +try: + import semchunk + from transformers import AutoTokenizer, PreTrainedTokenizerBase +except ImportError: + raise RuntimeError( + "Module requires 'chunking' extra; to install, run: " + "`pip install 'docling-core[chunking]'`" + ) +try: + import chardet +except ImportError: + raise RuntimeError("Module 'chardet' not imported. Run `pip install chardet`.") +try: + from weasyprint import HTML +except ImportError: + print("Warning: WeasyPrint could not be imported. PDF conversion features will be disabled.") + HTML = None + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.document import ConversionResult, InputDocument +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.document_converter import DocumentConverter, HwpxFormatOption, WordFormatOption +from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta +from docling_core.types import DoclingDocument as DLDocument +from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, + PictureItem, SectionHeaderItem, TableItem, TextItem +) +from docling_core.types.doc.document import LevelNumber, ListItem, CodeItem +from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend +# from utils import assert_cancelled +# from genos_utils import upload_files, merge_overlapping_bboxes + +# import platform +from pathlib import Path +import os +import subprocess +import tempfile +import shutil +import unicodedata + +import logging + +for n in ("fontTools", "fontTools.ttLib", "fontTools.ttLib.ttFont"): + lg = logging.getLogger(n) + lg.setLevel(logging.CRITICAL) + lg.propagate = False + logging.getLogger().setLevel(logging.WARNING) +# pdf 변환 대상 확장자 +CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] + + + +### 가드레일 용 ### +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = '23c3898fe3264fd597961af23a68fe7c' +# GENOS_URL = 'https://ai.komipo.co.kr:30908/' +# @@@@ 내부 호출로 변경 +GENOS_URL = 'http://llmops-gateway-api-service:8080' + + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + for r in result: + url = f"{GENOS_URL}/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {'question': r.text} + + res = requests.post(f'{url}/run/v2', json=body, headers=headers) + + answer = res.json()['data']['text'] + + if answer.startswith("[UNSAFE]"): + r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." + + + return result + return wrapper + + +def convert_to_pdf(file_path: str) -> str | None: + """ + LibreOffice로 PDF 변환을 시도한다. + 실패해도 예외를 던지지 않고 None을 반환한다. 
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix('.pdf') + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in ('.ppt', '.pptx'): + convert_arg = "pdf:impress_pdf_Export" + elif ext in ('.doc', '.docx'): + convert_arg = "pdf:writer_pdf_Export" + elif ext in ('.xls', '.xlsx', '.csv'): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode('ascii') + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize('NFKD', in_path.stem).encode('ascii', 'ignore').decode('ascii') or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", "--headless", + "--convert-to", convert_arg, + "--outdir", str(out_dir), + str(cand) + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, '.pdf') + return pdf_path + + +def install_packages(packages): + for package in packages: + try: + __import__(package) + except ImportError: + print(f"[!] {package} 패키지가 없습니다. 
설치를 시도합니다.") + subprocess.run([sys.executable, "-m", "pip", "install", package], check=True) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = 'allow' + + text: str | None = None + n_char: int | None = None + n_word: int | None = None + n_line: int | None = None + i_page: int | None = None + e_page: int | None = None + i_chunk_on_page: int | None = None + n_chunk_of_page: int | None = None + i_chunk_on_doc: int | None = None + n_chunk_of_doc: int | None = None + n_page: int | None = None + reg_date: str | None = None + chunk_bboxes: str | None = None + media_files: str | None = None + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + # self.title: Optional[str] = None + # self.created_date: Optional[int] = None + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + 'l': bbox.l / size.width, + 't': bbox.t / size.height, + 'r': bbox.r / size.width, + 'b': bbox.b / size.height, + 'coord_origin': bbox.coord_origin.value + } + chunk_bboxes.append({ + 'page': page_no, + 'bbox': bbox_data, + 'type': type_, + 'ref': label + }) + self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + if not doc_items: + self.media_files = "" + return self + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + 
i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + ) + + +class HwpLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + subprocess.run(['hwp5html', self.file_path, '--output', self.output_dir], check=True, timeout=600) + converted_file_path = os.path.join(self.output_dir, 'index.xhtml') + pdf_save_path = _get_pdf_path(self.file_path) + HTML(converted_file_path).write_pdf(pdf_save_path) + loader = PyMuPDFLoader(pdf_save_path) + return loader.load() + except Exception as e: + print(f"Failed to convert {self.file_path} to XHTML") + raise e + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TextLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + with open(self.file_path, 'rb') as f: + raw = f.read() + enc = chardet.detect(raw).get('encoding') or '' + encodings = [enc] if enc and enc.lower() not in ('ascii', 'unknown') else [] + encodings += ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + + content = None + for e in encodings: + try: + content = raw.decode(e) # 전체 파일로 디코딩 + break + except UnicodeDecodeError: + continue + if content is None: + content = raw.decode('utf-8', errors='replace') + + # 4) PDF 변환 유지 + html = f"
<pre>{content}</pre>
" + html_path = os.path.join(self.output_dir, 'temp.html') + with open(html_path, 'w', encoding='utf-8') as f: + f.write(html) + # pdf_path = (self.file_path + # .replace('.txt', '.pdf') + # .replace('.json', '.pdf')) + pdf_path = _get_pdf_path(self.file_path) + if HTML: + HTML(html_path).write_pdf(pdf_path) + loader = PyMuPDFLoader(pdf_path) + return loader.load() + # PDF가 불가하면 Document 직접 반환 (원형 스키마 유지) + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + + except Exception: + # 실패 시에도 스키마는 그대로 유지해 반환 + for e in ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1']: + try: + with open(self.file_path, 'r', encoding=e) as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + except UnicodeDecodeError: + continue + with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TabularLoader: + def __init__(self, file_path: str, ext: str): + + packages = ['openpyxl', 'chardet'] + + install_packages(packages) + + self.file_path = file_path + if ext == ".csv": + # convert_to_pdf(file_path) csv는 Pdf 변환 안 함 + self.data_dict = self.load_csv_documents(file_path) + elif ext == ".xlsx": + # convert_to_pdf(file_path) xlsx는 Pdf 변환 안 함 + self.data_dict = self.load_xlsx_documents(file_path) + else: + print(f"[!] Inadequate extension for TabularLoader: {ext}") + return + + def check_sql_dtypes(self, df): + df = df.convert_dtypes() + res = [] + for col in df.columns: + # col_name = col.strip().replace(' ', '_') + dtype = str(df.dtypes[col]).lower() + + if 'int' in dtype: + if '64' in dtype: + sql_dtype = 'BIGINT' + else: + sql_dtype = 'INT' + elif 'float' in dtype: + sql_dtype = 'FLOAT' + elif 'bool' in dtype: + sql_dtype = 'BOOLEAN' + elif 'date' in dtype: + sql_dtype = 'DATE' + df[col] = df[col].astype(str) + elif 'datetime' in dtype: + sql_dtype = 'DATETIME' + df[col] = df[col].astype(str) + # else: + # max_len = df[col].str.len().max().item() + 10 + # sql_dtype = f'VARCHAR({max_len})' + else: + lens = df[col].astype(str).str.len() + max_len_val = lens.max() + max_len = int(0 if pd.isna(max_len_val) else max_len_val) + 10 + sql_dtype = f'VARCHAR({max_len})' + + res.append([col, sql_dtype]) + + return df, res + + def process_data_rows(self, data: dict): + """Arg: data (keys: 'sheet_name', 'page_column', 'page_column_type', 'documents')""" + + rows = [] + for doc in data["documents"]: + row = {} + if 'int' in data["page_column_type"]: + row[data["page_column"]] = int(doc.page_content) + elif 'float' in data["page_column_type"]: + row[data["page_column"]] = float(doc.page_content) + elif 'bool' in data["page_column_type"]: + if doc.page_content.lower() == 'true': + row[data["page_column"]] = True + elif doc.page_content.lower() == 'false': + row[data["page_column"]] = False + else: + raise ValueError(f"Invalid boolean string: {doc.page_content}") + else: + row[data["page_column"]] = doc.page_content + + row.update(doc.metadata) + rows.append(row) + + processed_data = {"sheet_name": data["sheet_name"], "data_rows": rows, "data_types": data["dtypes"]} + return processed_data + + def load_csv_documents(self, file_path: str, **kwargs: dict): + import chardet + + with open(file_path, "rb") as f: + raw_file = f.read(10000) + enc_type = chardet.detect(raw_file)['encoding'] + df = pd.read_csv(file_path, 
encoding=enc_type, index_col=False) + df = df.fillna('null') # csv 파일에서도 xlsx 파일과 동일하게 null로 채움 + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + # col_type = str(type(col)) + col_type = str(df[col].dtype) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into the string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + data = { + "sheet_name": "table_1", + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + data = self.process_data_rows(data) # including only one sheet as it's a csv file + data_dict = {"data": [data]} + return data_dict + + def load_xlsx_documents(self, file_path: str, **kwargs: dict): + dfs = pd.read_excel(file_path, sheet_name=None) + sheets = [] + for sheet_name, df in dfs.items(): + df = df.fillna('null') + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + col_type = str(type(col)) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + sheet = { + "sheet_name": sheet_name, + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + sheets.append(sheet) + + data_dict = {"data": []} + for sheet in sheets: + data = self.process_data_rows(sheet) + data_dict["data"].append(data) + + return data_dict + + def return_vectormeta_format(self): + if not self.data_dict: + return None + + text = "[DA] " + str(self.data_dict) # Add a token to indicate this string is for data analysis + + # @@@@ 성민: 토큰 수 줄이기위한 후처리(임시조치) + text = text.replace("Unnamed: ", "") + text = text[:2000] + + + vectors = [GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." 
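+            # "." placeholders: tabular files are indexed as a single pseudo-chunk,
+            # so per-chunk bbox and media metadata do not apply here.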
+ })] + + + return vectors + + +class AudioLoader: + def __init__(self, + file_path: str, + req_url: str, + req_data: dict, + chunk_sec: int = 29, + tmp_path: str = '.', + ): + self.file_path = file_path + self.tmp_path = tmp_path + self.chunk_sec = chunk_sec + self.req_url = req_url + self.req_data = req_data + + def split_file_as_chunks(self) -> list: + audio = pydub.AudioSegment.from_file(self.file_path) + chunk_len = self.chunk_sec * 1000 + n_chunks = math.ceil(len(audio) / chunk_len) + + for i in range(n_chunks): + start_ms = i * chunk_len + overlap_start_ms = start_ms - 300 if start_ms > 0 else start_ms + end_ms = start_ms + chunk_len + audio_chunk = audio[overlap_start_ms:end_ms] + audio_chunk.export(os.path.join(self.tmp_path, "tmp_{}.wav".format(str(i))), format="wav") + tmp_files = glob(os.path.join(self.tmp_path, "*.wav")) + return tmp_files + + def transcribe_audio(self, file_path_lst: list): + transcribed_text_chunks = [] + + def _send_request(filepath: str): + """Send a request to 'whisper' model served""" + files = { + 'file': (filepath, open(filepath, 'rb'), 'audio/mp3'), + } + + response = requests.post(self.req_url, data=self.req_data, files=files) + text = response.json().get('text', ', ') + transcribed_text_chunks.append({ + 'file_name': os.path.basename(filepath), + 'text': text + }) + + # Send parallel requests + threads = [threading.Thread(target=_send_request, args=(f,)) for f in file_path_lst] + for t in threads: t.start() + for t in threads: t.join() + + # Merge transcribed text snippets in order + transcribed_text_chunks.sort(key=lambda x: x['file_name']) + transcribed_text = "[AUDIO]" + ' '.join([t['text'] for t in transcribed_text_chunks]) + return transcribed_text + + def return_vectormeta_format(self): + audio_chunks = self.split_file_as_chunks() + transcribed_text = self.transcribe_audio(audio_chunks) + res = [GenOSVectorMeta.model_validate({ + 'text': transcribed_text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." + })] + return res + + +### for HWPX from 지능형 전처리기 ### +# * GenOSVectorMetaBuilder # +# * HierarchicalChunker # +# * HybridChunker # +# * HwpxProcessor # +# * GenosServiceException # + +class HierarchicalChunker(BaseChunker): + r""" Chunker implementation leveraging the document layout. + Args: + merge_list_items (bool): Whether to merge successive list items. + Defaults to True. + delim (str): Delimiter to use for merging text. Defaults to "\n". + """ + merge_list_items: bool = True + + @classmethod + def _triplet_serialize(cls, table_df: DataFrame) -> str: + # copy header as first row and shift all rows by one + table_df.loc[-1] = table_df.columns # type: ignore[call-overload] + table_df.index = table_df.index + 1 + table_df = table_df.sort_index() + + rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()] + cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()] + + nrows = table_df.shape[0] + ncols = table_df.shape[1] + texts = [ + f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}" + for i in range(1, nrows) + for j in range(1, ncols) + ] + output_text = ". ".join(texts) + + return output_text + + def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + heading_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + for item, level in dl_doc.iterate_items(): + captions = None + if isinstance(item, DocItem): + # first handle any merging needed + if self.merge_list_items: + if isinstance( + item, ListItem + ) or ( # TODO remove when all captured as ListItem: + isinstance(item, TextItem) + and item.label == DocItemLabel.LIST_ITEM + ): + list_items.append(item) + continue + elif list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + list_items = [] # reset + + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]): + level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + heading_by_level[level] = item.text + text = ''.join(str(value) for value in heading_by_level.values()) + + # remove headings of higher level as they just went out of scope + keys_to_del = [k for k in heading_by_level if k > level] + for k in keys_to_del: + heading_by_level.pop(k, None) + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin + ), + ) + yield c + continue + + if isinstance(item, TextItem) or ( + (not self.merge_list_items) and isinstance(item, ListItem)) or isinstance(item, CodeItem): + text = item.text + + elif isinstance(item, TableItem): + text = item.export_to_markdown(dl_doc) + # dataframe으로 추출할 때 사용되는 코드 + # if table_df.shape[0] < 1 or table_df.shape[1] < 2: + # # at least two cols needed, as first column contains row headers + # continue + # text = self._triplet_serialize(table_df=table_df) + captions = [c.text for c in [r.resolve(dl_doc) for r in item.captions]] or None + + elif isinstance(item, PictureItem): + text = ''.join(str(value) for value in heading_by_level.values()) + else: + continue + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin, + ), + ) + yield c + + if self.merge_list_items and list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + + +class HybridChunker(BaseChunker): + r"""Chunker doing tokenization-aware refinements on top of document layout chunking. + Args: + tokenizer: The tokenizer to use; either instantiated object or name or path of + respective pretrained model + max_tokens: The maximum number of tokens per chunk. 
If not set, limit is + resolved from the tokenizer + merge_peers: Whether to merge undersized chunks sharing same relevant metadata + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + tokenizer: Union[PreTrainedTokenizerBase, str] = ( + "/nfs-root/all-MiniLM-L6-v2" + ) + max_tokens: int = int(1e30) # type: ignore[assignment] + merge_peers: bool = True + _inner_chunker: HierarchicalChunker = HierarchicalChunker() + + @model_validator(mode="after") + def _patch_tokenizer_and_max_tokens(self) -> Self: + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + if self.max_tokens is None: + self.max_tokens = TypeAdapter(PositiveInt).validate_python( + self._tokenizer.model_max_length + ) + return self + + def _count_text_tokens(self, text: Optional[Union[str, list[str]]]): + if text is None: + return 0 + elif isinstance(text, list): + total = 0 + for t in text: + total += self._count_text_tokens(t) + return total + return len(self._tokenizer.tokenize(text)) + + class _ChunkLengthInfo(BaseModel): + total_len: int + text_len: int + other_len: int + + def _count_chunk_tokens(self, doc_chunk: DocChunk): + ser_txt = self.serialize(chunk=doc_chunk) + return len(self._tokenizer.tokenize(text=ser_txt)) + + def _doc_chunk_length(self, doc_chunk: DocChunk): + text_length = self._count_text_tokens(doc_chunk.text) + total = self._count_chunk_tokens(doc_chunk=doc_chunk) + return self._ChunkLengthInfo( + total_len=total, + text_len=text_length, + other_len=total - text_length, + ) + + def _make_chunk_from_doc_items( + self, doc_chunk: DocChunk, window_start: int, window_end: int + ): + doc_items = doc_chunk.meta.doc_items[window_start: window_end + 1] + meta = DocMeta( + doc_items=doc_items, + headings=doc_chunk.meta.headings, + captions=doc_chunk.meta.captions, + origin=doc_chunk.meta.origin, + ) + window_text = ( + doc_chunk.text + if len(doc_chunk.meta.doc_items) == 1 + else self.delim.join( + [ + doc_item.text + for doc_item in doc_items + if isinstance(doc_item, TextItem) + ] + ) + ) + new_chunk = DocChunk(text=window_text, meta=meta) + return new_chunk + + def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]: + chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_items = len(doc_chunk.meta.doc_items) + while window_end < num_items: + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end, + ) + if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens: + if window_end < num_items - 1: + window_end += 1 + # 아직 청크에 여유가 있고, 남은 아이템도 있으므로 계속 추가 시도 + continue + else: + # 현재 윈도우의 모든 아이템이 청크에 들어갔고, 더 이상 아이템이 없음 + window_end = num_items # signalizing the last loop + elif window_start == window_end: + # 아이템 1개도 청크에 안 들어감 → 단독 청크로 처리, 이후 재분할 + window_end += 1 + window_start = window_end + else: + # 마지막 아이템 빼고 청크 생성 → 남은 아이템으로 새 윈도우 시작 + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end - 1, + ) + window_start = window_end + chunks.append(new_chunk) + return chunks + + def _split_using_plain_text(self, doc_chunk: DocChunk) -> list[DocChunk]: + lengths = self._doc_chunk_length(doc_chunk) + if lengths.total_len <= self.max_tokens: + return [doc_chunk] + else: + # 헤더/캡션을 제외하고 본문 텍스트에 할당 가능한 토큰 수 계산 + available_length = self.max_tokens - lengths.other_len + sem_chunker = semchunk.chunkerify( + self._tokenizer, 
chunk_size=available_length + ) + if available_length <= 0: + warnings.warn( + f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}" + # noqa + ) + return [] + text = doc_chunk.text + segments = sem_chunker.chunk(text) + chunks = [type(doc_chunk)(text=s, meta=doc_chunk.meta) for s in segments] + return chunks + + def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): + output_chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_chunks = len(chunks) + + while window_end < num_chunks: + chunk = chunks[window_end] + headings_and_captions = (chunk.meta.headings, chunk.meta.captions) + ready_to_append = False + + if window_start == window_end: + current_headings_and_captions = headings_and_captions + window_end += 1 + first_chunk_of_window = chunk + + else: + chks = chunks[window_start: window_end + 1] + doc_items = [it for chk in chks for it in chk.meta.doc_items] + candidate = DocChunk( + text=self.delim.join([chk.text for chk in chks]), + meta=DocMeta( + doc_items=doc_items, + headings=current_headings_and_captions[0], + captions=current_headings_and_captions[1], + origin=chunk.meta.origin, + ), + ) + + if (headings_and_captions == current_headings_and_captions + and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens + ): + # 토큰 수 여유 있음 → 청크 확장 계속 + window_end += 1 + new_chunk = candidate + else: + ready_to_append = True + + if ready_to_append or window_end == num_chunks: + # no more room OR the start of new metadata. + if window_start + 1 == window_end: + output_chunks.append(first_chunk_of_window) + else: + output_chunks.append(new_chunk) + window_start = window_end + + return output_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + res: Iterable[DocChunk] + res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs) # type: ignore + res = [x for c in res for x in self._split_by_doc_items(c)] + res = [x for c in res for x in self._split_using_plain_text(c)] + + if self.merge_peers: + res = self._merge_chunks_with_matching_metadata(res) + return iter(res) + + +class DocxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.converter = DocumentConverter( + format_options={ + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimplePipeline, backend=GenosMsWordDocumentBackend + ), + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = 
document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + return vectors + + +class HwpxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.pipeline_options.save_images = False + self.converter = DocumentConverter( + format_options={ + InputFormat.XML_HWPX: HwpxFormatOption( + pipeline_options=self.pipeline_options + ) + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + save_images = kwargs.get('save_images', False) + + if self.pipeline_options.save_images != save_images: + self.pipeline_options.save_images = save_images + # self._create_converters() + + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = 
self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + + text = "" + for vector in vectors: + if len(text) + len(vector.text) > 8192: + break + text += vector.text + + return [vectors[0]] + + +class GenosServiceException(Exception): + """GenOS 와의 의존성 부분 제거를 위해 추가""" + + def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None: + self.code = 1 + self.error_code = error_code + self.error_msg = error_msg or "GenOS Service Exception" + self.msg_params = msg_params or {} + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})" + + +# async def assert_cancelled(request: Request): +# """GenOS 와의 의존성 제거를 위해 추가""" +# if await request.is_disconnected(): +# raise GenosServiceException(1, f"Cancelled") + + +# @@@@ 성민: OCR을 위해서 추가 +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + # OcrEngine, + # PdfBackend, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.document_converter import PdfFormatOption + +class DocumentProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.hwpx_processor = HwpxProcessor() + self.docx_processor = DocxProcessor() + + + + # @@@@ 성민: OCR을 위해서 추가 + self.ocr_endpoint = "http://doc-parser-ocr-service:8080/ocr" + ocr_options = PaddleOcrOptions( + force_full_page_ocr=False, + lang=['korean'], + ocr_endpoint=self.ocr_endpoint, + text_score=0.3) + + + device = AcceleratorDevice.AUTO + num_threads = 8 + accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) + + # PDF 파이프라인 옵션 설정 + self.pipe_line_options = PdfPipelineOptions() + self.pipe_line_options.generate_page_images = True + self.pipe_line_options.generate_picture_images = True + self.pipe_line_options.do_ocr = False + self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.ocr_options.lang = ["ko", 'en'] + # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model" + # self.pipe_line_options.ocr_options.force_full_page_ocr = True + # ocr_options = TesseractOcrOptions() + # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert'] + # ocr_options.path = './.tesseract/tessdata' + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.artifacts_path = Path("/models/") + self.pipe_line_options.do_table_structure = True + self.pipe_line_options.images_scale = 2 + self.pipe_line_options.table_structure_options.do_cell_matching = True + self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE + self.pipe_line_options.accelerator_options = accelerator_options + + # Simple 파이프라인 옵션을 인스턴스 변수로 저장 + self.simple_pipeline_options = PipelineOptions() + self.simple_pipeline_options.save_images = False + + # ocr 파이프라인 옵션 + self.ocr_pipe_line_options = PdfPipelineOptions() + self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True) + 
self.ocr_pipe_line_options.do_ocr = True + self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True) + self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True + + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, + backend=DoclingParseV4DocumentBackend + ), + } + ) + + def get_loader(self, file_path: str): + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + return PyMuPDFLoader(file_path) + elif ext != real_type and real_type in ['txt', 'json', 'md']: + return TextLoader(file_path) + # 원래 확장자 기반 로직 + elif ext == '.pdf': + return PyMuPDFLoader(file_path) + elif ext == '.doc': + convert_to_pdf(file_path) + return UnstructuredWordDocumentLoader(file_path) + elif ext in ['.ppt', '.pptx']: + convert_to_pdf(file_path) + return UnstructuredPowerPointLoader(file_path) + elif ext in ['.jpg', '.jpeg', '.png']: + convert_to_pdf(file_path) + # 한국어 OCR 지원을 위한 언어 설정 + return UnstructuredImageLoader( + file_path, + languages=["kor", "eng"], # 한국어 + 영어 OCR + ) + elif ext in ['.txt', '.json', '.md']: + return TextLoader(file_path) + elif ext == '.hwp': + return HwpLoader(file_path) + elif ext == '.md': + return UnstructuredMarkdownLoader(file_path) + else: + return UnstructuredFileLoader(file_path) + + def get_real_file_type(self, file_path: str) -> str: + """파일 확장자가 아닌 실제 내용으로 파일 타입 판단""" + with open(file_path, 'rb') as f: + header = f.read(8) + if header.startswith(b'%PDF-'): + return 'pdf' + elif header.startswith(b'\x89PNG'): + return 'png' + elif header.startswith(b'\xff\xd8\xff'): + return 'jpg' + + # 매직 헤더로 판단할 수 없으면 확장자 사용 + return os.path.splitext(file_path)[-1].lower() + + def convert_md_to_pdf(self, md_path): + """Markdown 파일을 PDF로 변환""" + install_packages(['chardet']) + import chardet + + pdf_path = md_path.replace('.md', '.pdf') + with open(md_path, 'rb') as f: + raw_file = f.read() + candidates = ['utf-8', 'utf-8-sig'] + try: + det = (chardet.detect(raw_file) or {}).get('encoding') or '' + # chardet가 ascii/unknown이면 무시. 
그 외면 후보에 추가
+            if det and det.lower() not in ('ascii', 'unknown'):
+                if det.lower() not in [c.lower() for c in candidates]:
+                    candidates.append(det)
+        except Exception:
+            pass
+        candidates += ['cp949', 'euc-kr', 'iso-8859-1', 'latin-1']
+        md_content = None
+        for enc in candidates:
+            try:
+                md_content = raw_file.decode(enc)
+                break
+            except UnicodeDecodeError:
+                continue
+        if md_content is None:
+            md_content = raw_file.decode('utf-8', errors='replace')
+
+        html_content = markdown(md_content)
+        if HTML:
+            HTML(string=html_content).write_pdf(pdf_path)
+        return pdf_path
+
+    def _create_converters(self):
+        """Helper method that (re)creates the document converters."""
+        self.ocr_converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=self.ocr_pipe_line_options,
+                    backend=DoclingParseV4DocumentBackend
+                ),
+            }
+        )
+
+    def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+        # Read save_images from kwargs and update the options accordingly
+        save_images = kwargs.get('save_images', True)
+        include_wmf = kwargs.get('include_wmf', False)
+
+        # Recreate the converters only when an option actually changed
+        if (self.simple_pipeline_options.save_images != save_images or
+                getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf):
+            self.simple_pipeline_options.save_images = save_images
+            self.simple_pipeline_options.include_wmf = include_wmf
+            self._create_converters()
+
+        try:
+            conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True)
+        except Exception as e:
+            # conv_result is unbound on this path, so log and re-raise instead of
+            # falling through to the return below (which raised NameError before).
+            print("@@@@ OCR conversion failed:", e)
+            # conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True)
+            raise
+
+        return conv_result.document
+
+    def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
+        loader = self.get_loader(file_path)
+        documents = loader.load()
+
+        # @@@@ Seongmin: previous fallback kept below as a comment
+        # (for image files, provide a placeholder text when nothing was extracted)
+        # ext = os.path.splitext(file_path)[-1].lower()
+        # if ext in ['.jpg', '.jpeg', '.png']:
+        #     # documents is empty, or every page_content is blank
+        #     if not documents or not any(doc.page_content.strip() for doc in documents):
+        #         documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})]
+
+        # @@@@ Seongmin, rewritten: run OCR when no text could be extracted
+        if not documents or not any(doc.page_content.strip() for doc in documents):
+            document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
+            documents = [Document(page_content=document.export_to_markdown(), metadata={})]
+
+        return documents
+
+    def split_documents(self, documents, **kwargs: dict) -> list[Document]:
+        # @@@@ Seongmin: changing this in GenOS does not seem to take effect?
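+        # setdefault below only fills chunk_size when it is absent from kwargs,
+        # so a chunk_size that actually arrives through **kwargs (e.g. one set in
+        # the GenOS UI) still overrides this 20_000 fallback.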
+ print("@@@@ kwargs", kwargs) + + kwargs.setdefault("chunk_size", 20_000) + + text_splitter = RecursiveCharacterTextSplitter(**kwargs) + + chunks = text_splitter.split_documents(documents) + chunks = [chunk for chunk in chunks if chunk.page_content] + + if not chunks: + raise Exception('Empty document') + + for chunk in chunks: + page = chunk.metadata.get('page', 0) + self.page_chunk_counts[page] += 1 + return chunks + + def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict) -> list[dict]: + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + pdf_path = file_path + elif ext != real_type and real_type in ['txt', 'json', 'md']: + pdf_path = _get_pdf_path(file_path) + # 원래 확장자 기반 로직 + elif file_path.endswith('.md'): + pdf_path = self.convert_md_to_pdf(file_path) + elif file_path.endswith(('.ppt', '.pptx')): + pdf_path = _get_pdf_path(file_path) + else: + pdf_path = _get_pdf_path(file_path) + + # doc = fitz.open(pdf_path) if (pdf_path and os.path.exists(pdf_path)) else None + + if file_path.endswith(('.ppt', '.pptx')): + if os.path.exists(pdf_path): + subprocess.run(["rm", pdf_path], check=True) + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=max([chunk.metadata.get('page', 0) for chunk in chunks]), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z' + ) + current_page = None + chunk_index_on_page = 0 + + vectors = [] + for chunk_idx, chunk in enumerate(chunks): + page = chunk.metadata.get('page', 0) + text = chunk.page_content + + if page != current_page: + current_page = page + chunk_index_on_page = 0 + + # 첨부용에서는 bbox 정보 추출 X + # if doc: + # fitz_page = doc.load_page(page) + # global_metadata['chunk_bboxes'] = json.dumps(merge_overlapping_bboxes([{ + # 'page': page + 1, + # 'type': 'text', + # 'bbox': { + # 'l': rect[0] / fitz_page.rect.width, + # 't': rect[1] / fitz_page.rect.height, + # 'r': rect[2] / fitz_page.rect.width, + # 'b': rect[3] / fitz_page.rect.height, + # } + # } for rect in fitz_page.search_for(text)], x_tolerance=1 / fitz_page.rect.width, + # y_tolerance=1 / fitz_page.rect.height)) + + vectors.append(GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': len(text), + 'n_word': len(text.split()), + 'n_line': len(text.splitlines()), + 'i_page': page, + 'e_page': page, + 'i_chunk_on_page': chunk_index_on_page, + 'n_chunk_of_page': self.page_chunk_counts[page], + 'i_chunk_on_doc': chunk_idx, + **global_metadata + })) + chunk_index_on_page += 1 + + return vectors + + @guardrail + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + ext = os.path.splitext(file_path)[-1].lower() + if ext in ('.wav', '.mp3', '.m4a'): + # Generate a temporal path saving audio chunks: the audio file is supposed to be splited to several chunks due to limitted length by the model + tmp_path = "./tmp_audios_{}".format(os.path.basename(file_path).split('.')[0]) + if not os.path.exists(tmp_path): + os.makedirs(tmp_path) + + # Use 'Whisper' model served in-house + # [!] 
Modify the request parameters to change a STT model to be used + loader = AudioLoader( + file_path=file_path, + req_url="http://192.168.74.164:30100/v1/audio/transcriptions", + req_data={ + 'model': 'model', + 'language': 'ko', + 'response_format': 'json', + 'temperature': '0', + 'stream': 'false', + 'timestamp_granularities[]': 'word' + }, + chunk_sec=29, # length(sec) of a chunk from the uploaded audio + tmp_path=tmp_path + ) + vectors = loader.return_vectormeta_format() + # await assert_cancelled(request) + + # Remove the temporal chunks + try: + subprocess.run(['rm', '-r', tmp_path], check=True) + except: + pass + # await assert_cancelled(request) + return vectors + + elif ext in ('.csv', '.xlsx'): + loader = TabularLoader(file_path, ext) + vectors = loader.return_vectormeta_format() + # pdf_path = _get_pdf_path(file_path) + # await assert_cancelled(request) + return vectors + + elif ext == '.hwp': + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + return vectors + + elif ext == '.hwpx': + return await self.hwpx_processor(request, file_path, **kwargs) + + elif ext == '.docx': + return await self.docx_processor(request, file_path, **kwargs) + + else: + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + return vectors \ No newline at end of file From acb33e0014be0a014b8cb07180f96fd39f4cb0dd Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 6 Apr 2026 11:28:24 +0900 Subject: [PATCH 17/19] =?UTF-8?q?feat:=20json=20=EC=A0=84=EC=B2=98?= =?UTF-8?q?=EB=A6=AC=EA=B8=B0=20=ED=95=98=EB=82=98=EB=A1=9C=20=ED=86=B5?= =?UTF-8?q?=ED=95=A9.(PMS=20-=20=EA=B2=BD=EC=83=81=EC=98=A4=EB=8D=94,=20TM?= =?UTF-8?q?,=20=EB=B0=9C=EC=A0=84=20=EC=A0=95=EC=A7=80)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/facade/json_processor.py | 344 ++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 genon/preprocessor/facade/json_processor.py diff --git a/genon/preprocessor/facade/json_processor.py b/genon/preprocessor/facade/json_processor.py new file mode 100644 index 0000000000..8616aab79b --- /dev/null +++ b/genon/preprocessor/facade/json_processor.py @@ -0,0 +1,344 @@ +from datetime import datetime +from typing import Optional, Iterable, Any, List, Dict, Tuple +from collections import defaultdict +from fastapi import Request +from pydantic import BaseModel, ConfigDict +from collections import Counter + +import re +import asyncio +import json +import ast +import pdb + +import pandas as pd + +from docling_core.types.doc import ( + BoundingBox, + #CoordOrigin, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + GroupLabel, + #ImageRef, + #ProvenanceItem, + #Size, + #TableCell, + #TableData, + #GroupItem, + DocItem, + PictureItem, + SectionHeaderItem, + TableItem, + TextItem, + PageItem +) + +from docling.document_converter import DocumentConverter, PdfFormatOption, HTMLFormatOption +from docling.datamodel.document import ConversionResult, InputDocument +from docling_core.types import DoclingDocument + +KV_MAP = { + "url": ["URL"], + "ins_date": [ + "입력일", 
# 경상오더 + "발행일자", # TM + ], + "title": [ + "오더제목", # 경상오더 + "고장내용", # 발전정지 + "TM제목", # TM + ], + "num": [ + "오더번호", # 경상오더 + "번호", # 발전정지 + "TM번호", # TM + ], + "Powersys": ["발전소"], # 발전정지 + "desman": ["설계자"], # 경상오더 + "desdept": ["설계부서"], # 경상오더, TM + "hogi": ["호기"], + "des_date": ["설계일"], + "stopcat": ["정지종별"], # 발전정지 + "stopcat_code": ["정지종별코드"], # 발전정지 + "parcat": ["대분류"], # 발전정지 + "cat": ["분류"], # 발전정지 + "event_date": ["발생일시"], # 발전정지 + "rec_date": ["복구일시"], # 발전정지 + "pubman": ["발행자"], + "pubdept": ["발행부서"], + "status": ["진행상태"] +} + +class GenOSVectorMeta(BaseModel): + model_config = ConfigDict(extra="allow") + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.bboxes: Optional[str] = None + self.url: Optional[str] = None + + self.data = {"text": None, + "n_char": None, + "n_line": None, + "i_page": None, + "i_chunk_on_page": None, + "n_chunk_of_page": None, + "i_chunk_on_doc": None, + "n_chunk_of_doc": None, + "n_page": None, + "reg_date": None, + "bboxes": None, + "url": None + } + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + + self.data["text"] = text + self.data["n_char"] = len(text) + self.data["n_word"] = len(text.split()) + self.data["n_line"] = len(text.splitlines()) + + return self + + def set_page_info( + self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int + ) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + + self.data["i_page"] = i_page + self.data["i_chunk_on_page"] = i_chunk_on_page + self.data["n_chunk_of_page"] = n_chunk_of_page + + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + + self.data["i_chunk_on_doc"] = i_chunk_on_doc + + return self + + def set_bboxes(self, bbox: BoundingBox) -> "GenOSVectorMetaBuilder": + """Bounding Boxes 정보 설정""" + # bboxes.append({ + # 'p1': {'x': rect[0] / fitz_page.rect.width, 'y': rect[1] / fitz_page.rect.height}, + # 'p2': {'x': rect[2] / fitz_page.rect.width, 'y': rect[3] / fitz_page.rect.height}, + # }) + # NOTE: docling은 BOTTOMLEFT인데 해당 좌표 그대로 활용되는지 ? 
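+        # docling keeps coord_origin on each BoundingBox (BOTTOMLEFT for PDF
+        # pages); if TOPLEFT-normalized coordinates were needed downstream, the
+        # box would have to be converted first, e.g. bbox.to_top_left_origin(page_height)
+        # -- assuming the docling-core helper of that name. The placeholder
+        # below emits zeroed boxes either way.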
+ conv = [] + conv.append({ + 'p1': {'x': 0, 'y': 0}, + 'p2': {'x': 0, 'y': 0}, + }) + self.bboxes = json.dumps(conv) + + self.data["bboxes"] = json.dumps(conv) + + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + + for key, value in global_metadata.items(): + setattr(self, key, value) + self.data[key] = value + + + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta(text=self.data.pop("text", "ERROR: no text"), **self.data) + +class DocumentProcessor: + def __init__(self): + ''' + initialize Document Converter + ''' + self.page_chunk_counts = defaultdict(int) + # device = AcceleratorDevice.AUTO + num_threads = 4 + + def preprocess_json(self, jsonf): + + metadata_keys = [] + date_keys = [] + + for jsonf_k, _ in jsonf.items(): + for k, v in KV_MAP.items(): + if jsonf_k in v: + metadata_keys.append(jsonf_k) + if "date" in k: + date_keys.append(jsonf_k) + + # date처리, json확인해보고 빼도됨. + for k in date_keys: + if k in jsonf: + try: + # jsonf[k] = pdf.to_datetime(jsonf[k], errors='coerce').isoformat() + date_value = jsonf[k] + if not date_value: + date_value = 0 + + if date_value: + dt_obj = self._parse_date_string(str(date_value)) + + if dt_obj: + jsonf[k] = int(dt_obj.strftime("%Y%m%d")) + else: + jsonf[k] = "" + else: + jsonf[k] = "" + except Exception: + pass + + # nan 처리 + for key in jsonf.keys(): + try: + if not isinstance(jsonf[key], list) and pd.isna(jsonf[key]): + jsonf[key] = "" + except: + pass + + # 메타데이터 처리 + metadata = {key: jsonf[key] for key in metadata_keys if key in jsonf} + + # formatted text 생성 + formatted_text = "\n ".join([f"{key} : {str(jsonf[key])}" for key in jsonf if not key.startswith('Unnamed')]) + + metadata_key_list = list(metadata.keys()) + for k, v_list in KV_MAP.items(): + for metadata_key in metadata_key_list: + if metadata_key in v_list: + if k in metadata.keys(): + print(f"@@@@ 이미 있는 키: {metadata_key} ---X-->> {k}") + print(f"@@@@ 있는 값: {k} : {metadata[k]}") + metadata.pop(metadata_key) + else: + metadata[k] = metadata.pop(metadata_key) + + chunk = { + "id": 1, + "text": formatted_text, + "metadata": metadata + } + + return chunk + + + def load_documents(self, file_path: str): + with open(file_path, 'r', encoding='utf-8') as f: + jsonfile = json.load(f) + + return jsonfile + + def _parse_date_string(self, date_str:str)-> Optional[datetime]: + formats = [ + "%Y-%m-%d", + "%Y%m%d", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%d %H:%M:%S%z", + "%Y-%m-%dT%H:%M:%S.%fZ" + ] + + if not date_str or date_str.strip() == "": + return 0 + for fmt in formats: + try: + return datetime.strptime(date_str, fmt) + except ValueError: + continue + + return 0 + + + # def split_documents(self, documents: dict, **kwargs: dict) -> List[Dict]: + def split_documents(self, documents: dict, **kwargs: dict) -> Dict: + chunk_size = 1000 + text = documents.get("text", "error") + chunks = [] + chunk = "" + + words = text.split(" ") + + for word in words: + if len(chunk) + len(word) > chunk_size: + chunks.append(chunk) + chunk = word + else: + chunk += (" " + word) if chunk else word + + if chunk: + chunks.append(chunk) + + new_chunks = [] + for chunk in chunks: + documents['text'] = chunk + new_chunks.append(documents.copy()) + + return new_chunks + + + def compose_vectors(self, chunks: list[dict], file_path: str) -> \ + list[dict]: + + first_chunk = chunks[0] + + global_metadata = dict( + n_chunk_of_doc=int(1), + n_page=int(1), + 
reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + **first_chunk['metadata'], + ) + + current_page = 1 + chunk_index_on_page = 0 + + vectors = [] + for chunk in chunks: + vector = (GenOSVectorMetaBuilder() + .set_text(chunk["text"]) + .set_page_info(1, 1, 1) + .set_chunk_index(1) + .set_global_metadata(**global_metadata) + .set_bboxes(None) + ).build() + vectors.append(vector) + + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs): # request: Request + + file: dict = self.load_documents(file_path) + + document: dict = self.preprocess_json(file) + + chunks: list[dict] = self.split_documents(document, **kwargs) + + vectors: list[dict] = self.compose_vectors(chunks=chunks, file_path= file_path) + + return vectors \ No newline at end of file From d35e7714ddb807119f3ff2529cd9d83d01647f6d Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Mon, 6 Apr 2026 11:29:02 +0900 Subject: [PATCH 18/19] =?UTF-8?q?feat:=20OneAgent=20=EC=97=B0=EB=8F=99?= =?UTF-8?q?=EC=9A=A9=20=EC=A0=84=EC=B2=98=EB=A6=AC=EA=B8=B0=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80.=20(=EC=B2=A8=EB=B6=80=EC=9A=A9=20=EC=A0=84=EC=B2=98?= =?UTF-8?q?=EB=A6=AC=EA=B8=B0=20=EA=B8=B0=EB=B0=98)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../preprocessor/facade/oneagent_processor.py | 1646 +++++++++++++++++ 1 file changed, 1646 insertions(+) create mode 100644 genon/preprocessor/facade/oneagent_processor.py diff --git a/genon/preprocessor/facade/oneagent_processor.py b/genon/preprocessor/facade/oneagent_processor.py new file mode 100644 index 0000000000..93870bace0 --- /dev/null +++ b/genon/preprocessor/facade/oneagent_processor.py @@ -0,0 +1,1646 @@ +from __future__ import annotations + +from collections import defaultdict + +import asyncio +import fitz +import json +import math +import os +import pandas as pd +import pydub +import requests +import shutil +import subprocess +import sys +import threading +import uuid +import warnings +from datetime import datetime +from fastapi import Request +from glob import glob +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import ( + # TextLoader, # TXT + PyMuPDFLoader, # PDF + DataFrameLoader, # DataFrame + UnstructuredWordDocumentLoader, # DOC and DOCX + UnstructuredPowerPointLoader, # PPT and PPTX + UnstructuredImageLoader, # JPG, PNG + UnstructuredMarkdownLoader, # Markdown + UnstructuredFileLoader, # Generic fallback +) +from langchain_core.documents import Document +from markdown2 import markdown +from pandas import DataFrame +from pathlib import Path +from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator +from typing import Any, Iterable, Iterator, List, Optional, Union +from typing_extensions import Self + +try: + import semchunk + from transformers import AutoTokenizer, PreTrainedTokenizerBase +except ImportError: + raise RuntimeError( + "Module requires 'chunking' extra; to install, run: " + "`pip install 'docling-core[chunking]'`" + ) +try: + import chardet +except ImportError: + raise RuntimeError("Module 'chardet' not imported. Run `pip install chardet`.") +try: + from weasyprint import HTML +except ImportError: + print("Warning: WeasyPrint could not be imported. 
PDF conversion features will be disabled.") + HTML = None + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.document import ConversionResult, InputDocument +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.document_converter import DocumentConverter, HwpxFormatOption, WordFormatOption +from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta +from docling_core.types import DoclingDocument as DLDocument +from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, + PictureItem, SectionHeaderItem, TableItem, TextItem +) +from docling_core.types.doc.document import LevelNumber, ListItem, CodeItem +from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend +# from utils import assert_cancelled +# from genos_utils import upload_files, merge_overlapping_bboxes + +# import platform +from pathlib import Path +import os +import subprocess +import tempfile +import shutil +import unicodedata + +import logging + +for n in ("fontTools", "fontTools.ttLib", "fontTools.ttLib.ttFont"): + lg = logging.getLogger(n) + lg.setLevel(logging.CRITICAL) + lg.propagate = False + logging.getLogger().setLevel(logging.WARNING) +# pdf 변환 대상 확장자 +CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx'] + + + +### @@@@ 성민: 가드레일 용 ### +import requests +import re +import json + +GUARDRAIL_WORKFLOW_ID = 694 +GUARDRAIL_BEARER_TOKEN = '23c3898fe3264fd597961af23a68fe7c' +# GENOS_URL = 'https://ai.komipo.co.kr:30908/' +# @@@@ 내부 호출로 변경 +GENOS_URL = 'http://llmops-gateway-api-service:8080' + + +from functools import wraps + +def guardrail(func): + @wraps(func) + async def wrapper(*args, **kwargs): + result = await func(*args, **kwargs) + for r in result: + url = f"{GENOS_URL}/workflow/{GUARDRAIL_WORKFLOW_ID}" + headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}") + + if hasattr(r, "text"): + body = {'question': r.text} + + res = requests.post(f'{url}/run/v2', json=body, headers=headers) + + answer = res.json()['data']['text'] + + if answer.startswith("[UNSAFE]"): + r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다." + + + return result + return wrapper + + +def convert_to_pdf(file_path: str) -> str | None: + """ + LibreOffice로 PDF 변환을 시도한다. + 실패해도 예외를 던지지 않고 None을 반환한다. 
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix('.pdf') + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in ('.ppt', '.pptx'): + convert_arg = "pdf:impress_pdf_Export" + elif ext in ('.doc', '.docx'): + convert_arg = "pdf:writer_pdf_Export" + elif ext in ('.xls', '.xlsx', '.csv'): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode('ascii') + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize('NFKD', in_path.stem).encode('ascii', 'ignore').decode('ascii') or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", "--headless", + "--convert-to", convert_arg, + "--outdir", str(out_dir), + str(cand) + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, '.pdf') + return pdf_path + + +def install_packages(packages): + for package in packages: + try: + __import__(package) + except ImportError: + print(f"[!] {package} 패키지가 없습니다. 
설치를 시도합니다.") + subprocess.run([sys.executable, "-m", "pip", "install", package], check=True) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = 'allow' + + text: str | None = None + n_char: int | None = None + n_word: int | None = None + n_line: int | None = None + i_page: int | None = None + e_page: int | None = None + i_chunk_on_page: int | None = None + n_chunk_of_page: int | None = None + i_chunk_on_doc: int | None = None + n_chunk_of_doc: int | None = None + n_page: int | None = None + reg_date: str | None = None + chunk_bboxes: str | None = None + media_files: str | None = None + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + # self.title: Optional[str] = None + # self.created_date: Optional[int] = None + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + 'l': bbox.l / size.width, + 't': bbox.t / size.height, + 'r': bbox.r / size.width, + 'b': bbox.b / size.height, + 'coord_origin': bbox.coord_origin.value + } + chunk_bboxes.append({ + 'page': page_no, + 'bbox': bbox_data, + 'type': type_, + 'ref': label + }) + self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + if not doc_items: + self.media_files = "" + return self + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + 
i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + ) + + +class HwpLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + subprocess.run(['hwp5html', self.file_path, '--output', self.output_dir], check=True, timeout=600) + converted_file_path = os.path.join(self.output_dir, 'index.xhtml') + pdf_save_path = _get_pdf_path(self.file_path) + HTML(converted_file_path).write_pdf(pdf_save_path) + loader = PyMuPDFLoader(pdf_save_path) + return loader.load() + except Exception as e: + print(f"Failed to convert {self.file_path} to XHTML") + raise e + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TextLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + with open(self.file_path, 'rb') as f: + raw = f.read() + enc = chardet.detect(raw).get('encoding') or '' + encodings = [enc] if enc and enc.lower() not in ('ascii', 'unknown') else [] + encodings += ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + + content = None + for e in encodings: + try: + content = raw.decode(e) # 전체 파일로 디코딩 + break + except UnicodeDecodeError: + continue + if content is None: + content = raw.decode('utf-8', errors='replace') + + # 4) PDF 변환 유지 + html = f"
<html><head><meta charset='utf-8'></head><body><pre>{content}</pre></body></html>
" + html_path = os.path.join(self.output_dir, 'temp.html') + with open(html_path, 'w', encoding='utf-8') as f: + f.write(html) + # pdf_path = (self.file_path + # .replace('.txt', '.pdf') + # .replace('.json', '.pdf')) + pdf_path = _get_pdf_path(self.file_path) + if HTML: + HTML(html_path).write_pdf(pdf_path) + loader = PyMuPDFLoader(pdf_path) + return loader.load() + # PDF가 불가하면 Document 직접 반환 (원형 스키마 유지) + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + + except Exception: + # 실패 시에도 스키마는 그대로 유지해 반환 + for e in ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1']: + try: + with open(self.file_path, 'r', encoding=e) as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + except UnicodeDecodeError: + continue + with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})] + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TabularLoader: + def __init__(self, file_path: str, ext: str): + + packages = ['openpyxl', 'chardet'] + + install_packages(packages) + + self.file_path = file_path + if ext == ".csv": + # convert_to_pdf(file_path) csv는 Pdf 변환 안 함 + self.data_dict = self.load_csv_documents(file_path) + elif ext == ".xlsx": + # convert_to_pdf(file_path) xlsx는 Pdf 변환 안 함 + self.data_dict = self.load_xlsx_documents(file_path) + else: + print(f"[!] Inadequate extension for TabularLoader: {ext}") + return + + def check_sql_dtypes(self, df): + df = df.convert_dtypes() + res = [] + for col in df.columns: + # col_name = col.strip().replace(' ', '_') + dtype = str(df.dtypes[col]).lower() + + if 'int' in dtype: + if '64' in dtype: + sql_dtype = 'BIGINT' + else: + sql_dtype = 'INT' + elif 'float' in dtype: + sql_dtype = 'FLOAT' + elif 'bool' in dtype: + sql_dtype = 'BOOLEAN' + elif 'date' in dtype: + sql_dtype = 'DATE' + df[col] = df[col].astype(str) + elif 'datetime' in dtype: + sql_dtype = 'DATETIME' + df[col] = df[col].astype(str) + # else: + # max_len = df[col].str.len().max().item() + 10 + # sql_dtype = f'VARCHAR({max_len})' + else: + lens = df[col].astype(str).str.len() + max_len_val = lens.max() + max_len = int(0 if pd.isna(max_len_val) else max_len_val) + 10 + sql_dtype = f'VARCHAR({max_len})' + + res.append([col, sql_dtype]) + + return df, res + + def process_data_rows(self, data: dict): + """Arg: data (keys: 'sheet_name', 'page_column', 'page_column_type', 'documents')""" + + rows = [] + for doc in data["documents"]: + row = {} + if 'int' in data["page_column_type"]: + row[data["page_column"]] = int(doc.page_content) + elif 'float' in data["page_column_type"]: + row[data["page_column"]] = float(doc.page_content) + elif 'bool' in data["page_column_type"]: + if doc.page_content.lower() == 'true': + row[data["page_column"]] = True + elif doc.page_content.lower() == 'false': + row[data["page_column"]] = False + else: + raise ValueError(f"Invalid boolean string: {doc.page_content}") + else: + row[data["page_column"]] = doc.page_content + + row.update(doc.metadata) + rows.append(row) + + processed_data = {"sheet_name": data["sheet_name"], "data_rows": rows, "data_types": data["dtypes"]} + return processed_data + + def load_csv_documents(self, file_path: str, **kwargs: dict): + import chardet + + with open(file_path, "rb") as f: + raw_file = f.read(10000) + enc_type = chardet.detect(raw_file)['encoding'] + df = pd.read_csv(file_path, 
encoding=enc_type, index_col=False) + df = df.fillna('null') # csv 파일에서도 xlsx 파일과 동일하게 null로 채움 + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + # col_type = str(type(col)) + col_type = str(df[col].dtype) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into the string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + data = { + "sheet_name": "table_1", + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + data = self.process_data_rows(data) # including only one sheet as it's a csv file + data_dict = {"data": [data]} + return data_dict + + def load_xlsx_documents(self, file_path: str, **kwargs: dict): + dfs = pd.read_excel(file_path, sheet_name=None) + sheets = [] + for sheet_name, df in dfs.items(): + df = df.fillna('null') + df, dtypes_str = self.check_sql_dtypes(df) + + for i in range(len(df.columns)): + try: + col = df.columns[0] + col_type = str(type(col)) + df = df.astype({col: 'str'}) + break + except: + raise ValueError( + f"Any columns cannot be converted into string type so that can't load LangChain Documents: {dtypes_str}") + + loader = DataFrameLoader(df, page_content_column=col) + documents = loader.load() + + sheet = { + "sheet_name": sheet_name, + "page_column": col, + "page_column_type": col_type, + "documents": documents, + "dtypes": dtypes_str + } + sheets.append(sheet) + + data_dict = {"data": []} + for sheet in sheets: + data = self.process_data_rows(sheet) + data_dict["data"].append(data) + + return data_dict + + def return_vectormeta_format(self): + if not self.data_dict: + return None + + text = "[DA] " + str(self.data_dict) # Add a token to indicate this string is for data analysis + + # @@@@ 성민: 토큰 수 줄이기위한 후처리(임시조치) + text = text.replace("Unnamed: ", "") + text = text[:2000] + + vectors = [GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." 
+ })] + + + return vectors + + +class AudioLoader: + def __init__(self, + file_path: str, + req_url: str, + req_data: dict, + chunk_sec: int = 29, + tmp_path: str = '.', + ): + self.file_path = file_path + self.tmp_path = tmp_path + self.chunk_sec = chunk_sec + self.req_url = req_url + self.req_data = req_data + + def split_file_as_chunks(self) -> list: + audio = pydub.AudioSegment.from_file(self.file_path) + chunk_len = self.chunk_sec * 1000 + n_chunks = math.ceil(len(audio) / chunk_len) + + for i in range(n_chunks): + start_ms = i * chunk_len + overlap_start_ms = start_ms - 300 if start_ms > 0 else start_ms + end_ms = start_ms + chunk_len + audio_chunk = audio[overlap_start_ms:end_ms] + audio_chunk.export(os.path.join(self.tmp_path, "tmp_{}.wav".format(str(i))), format="wav") + tmp_files = glob(os.path.join(self.tmp_path, "*.wav")) + return tmp_files + + def transcribe_audio(self, file_path_lst: list): + transcribed_text_chunks = [] + + def _send_request(filepath: str): + """Send a request to 'whisper' model served""" + files = { + 'file': (filepath, open(filepath, 'rb'), 'audio/mp3'), + } + + response = requests.post(self.req_url, data=self.req_data, files=files) + text = response.json().get('text', ', ') + transcribed_text_chunks.append({ + 'file_name': os.path.basename(filepath), + 'text': text + }) + + # Send parallel requests + threads = [threading.Thread(target=_send_request, args=(f,)) for f in file_path_lst] + for t in threads: t.start() + for t in threads: t.join() + + # Merge transcribed text snippets in order + transcribed_text_chunks.sort(key=lambda x: x['file_name']) + transcribed_text = "[AUDIO]" + ' '.join([t['text'] for t in transcribed_text_chunks]) + return transcribed_text + + def return_vectormeta_format(self): + audio_chunks = self.split_file_as_chunks() + transcribed_text = self.transcribe_audio(audio_chunks) + res = [GenOSVectorMeta.model_validate({ + 'text': transcribed_text, + 'n_char': 1, + 'n_word': 1, + 'n_line': 1, + 'i_page': 1, + 'e_page': 1, + 'n_page': 1, + 'i_chunk_on_page': 1, + 'n_chunk_of_page': 1, + 'i_chunk_on_doc': 1, + 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z', + 'chunk_bboxes': ".", + 'media_files': "." + })] + return res + + +### for HWPX from 지능형 전처리기 ### +# * GenOSVectorMetaBuilder # +# * HierarchicalChunker # +# * HybridChunker # +# * HwpxProcessor # +# * GenosServiceException # + +class HierarchicalChunker(BaseChunker): + r""" Chunker implementation leveraging the document layout. + Args: + merge_list_items (bool): Whether to merge successive list items. + Defaults to True. + delim (str): Delimiter to use for merging text. Defaults to "\n". + """ + merge_list_items: bool = True + + @classmethod + def _triplet_serialize(cls, table_df: DataFrame) -> str: + # copy header as first row and shift all rows by one + table_df.loc[-1] = table_df.columns # type: ignore[call-overload] + table_df.index = table_df.index + 1 + table_df = table_df.sort_index() + + rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()] + cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()] + + nrows = table_df.shape[0] + ncols = table_df.shape[1] + texts = [ + f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}" + for i in range(1, nrows) + for j in range(1, ncols) + ] + output_text = ". ".join(texts) + + return output_text + + def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + heading_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + for item, level in dl_doc.iterate_items(): + captions = None + if isinstance(item, DocItem): + # first handle any merging needed + if self.merge_list_items: + if isinstance( + item, ListItem + ) or ( # TODO remove when all captured as ListItem: + isinstance(item, TextItem) + and item.label == DocItemLabel.LIST_ITEM + ): + list_items.append(item) + continue + elif list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + list_items = [] # reset + + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]): + level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + heading_by_level[level] = item.text + text = ''.join(str(value) for value in heading_by_level.values()) + + # remove headings of higher level as they just went out of scope + keys_to_del = [k for k in heading_by_level if k > level] + for k in keys_to_del: + heading_by_level.pop(k, None) + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin + ), + ) + yield c + continue + + if isinstance(item, TextItem) or ( + (not self.merge_list_items) and isinstance(item, ListItem)) or isinstance(item, CodeItem): + text = item.text + + elif isinstance(item, TableItem): + text = item.export_to_markdown(dl_doc) + # dataframe으로 추출할 때 사용되는 코드 + # if table_df.shape[0] < 1 or table_df.shape[1] < 2: + # # at least two cols needed, as first column contains row headers + # continue + # text = self._triplet_serialize(table_df=table_df) + captions = [c.text for c in [r.resolve(dl_doc) for r in item.captions]] or None + + elif isinstance(item, PictureItem): + text = ''.join(str(value) for value in heading_by_level.values()) + else: + continue + c = DocChunk( + text=text, + meta=DocMeta( + doc_items=[item], + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + captions=captions, + origin=dl_doc.origin, + ), + ) + yield c + + if self.merge_list_items and list_items: # need to yield + yield DocChunk( + text=self.delim.join([i.text for i in list_items]), + meta=DocMeta( + doc_items=list_items, + headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, + ), + ) + + +class HybridChunker(BaseChunker): + r"""Chunker doing tokenization-aware refinements on top of document layout chunking. + Args: + tokenizer: The tokenizer to use; either instantiated object or name or path of + respective pretrained model + max_tokens: The maximum number of tokens per chunk. 
If not set, limit is + resolved from the tokenizer + merge_peers: Whether to merge undersized chunks sharing same relevant metadata + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + tokenizer: Union[PreTrainedTokenizerBase, str] = ( + "/nfs-root/all-MiniLM-L6-v2" + ) + max_tokens: int = int(1e30) # type: ignore[assignment] + merge_peers: bool = True + _inner_chunker: HierarchicalChunker = HierarchicalChunker() + + @model_validator(mode="after") + def _patch_tokenizer_and_max_tokens(self) -> Self: + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + if self.max_tokens is None: + self.max_tokens = TypeAdapter(PositiveInt).validate_python( + self._tokenizer.model_max_length + ) + return self + + def _count_text_tokens(self, text: Optional[Union[str, list[str]]]): + if text is None: + return 0 + elif isinstance(text, list): + total = 0 + for t in text: + total += self._count_text_tokens(t) + return total + return len(self._tokenizer.tokenize(text)) + + class _ChunkLengthInfo(BaseModel): + total_len: int + text_len: int + other_len: int + + def _count_chunk_tokens(self, doc_chunk: DocChunk): + ser_txt = self.serialize(chunk=doc_chunk) + return len(self._tokenizer.tokenize(text=ser_txt)) + + def _doc_chunk_length(self, doc_chunk: DocChunk): + text_length = self._count_text_tokens(doc_chunk.text) + total = self._count_chunk_tokens(doc_chunk=doc_chunk) + return self._ChunkLengthInfo( + total_len=total, + text_len=text_length, + other_len=total - text_length, + ) + + def _make_chunk_from_doc_items( + self, doc_chunk: DocChunk, window_start: int, window_end: int + ): + doc_items = doc_chunk.meta.doc_items[window_start: window_end + 1] + meta = DocMeta( + doc_items=doc_items, + headings=doc_chunk.meta.headings, + captions=doc_chunk.meta.captions, + origin=doc_chunk.meta.origin, + ) + window_text = ( + doc_chunk.text + if len(doc_chunk.meta.doc_items) == 1 + else self.delim.join( + [ + doc_item.text + for doc_item in doc_items + if isinstance(doc_item, TextItem) + ] + ) + ) + new_chunk = DocChunk(text=window_text, meta=meta) + return new_chunk + + def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]: + chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_items = len(doc_chunk.meta.doc_items) + while window_end < num_items: + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end, + ) + if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens: + if window_end < num_items - 1: + window_end += 1 + # 아직 청크에 여유가 있고, 남은 아이템도 있으므로 계속 추가 시도 + continue + else: + # 현재 윈도우의 모든 아이템이 청크에 들어갔고, 더 이상 아이템이 없음 + window_end = num_items # signalizing the last loop + elif window_start == window_end: + # 아이템 1개도 청크에 안 들어감 → 단독 청크로 처리, 이후 재분할 + window_end += 1 + window_start = window_end + else: + # 마지막 아이템 빼고 청크 생성 → 남은 아이템으로 새 윈도우 시작 + new_chunk = self._make_chunk_from_doc_items( + doc_chunk=doc_chunk, + window_start=window_start, + window_end=window_end - 1, + ) + window_start = window_end + chunks.append(new_chunk) + return chunks + + def _split_using_plain_text(self, doc_chunk: DocChunk) -> list[DocChunk]: + lengths = self._doc_chunk_length(doc_chunk) + if lengths.total_len <= self.max_tokens: + return [doc_chunk] + else: + # 헤더/캡션을 제외하고 본문 텍스트에 할당 가능한 토큰 수 계산 + available_length = self.max_tokens - lengths.other_len + sem_chunker = semchunk.chunkerify( + self._tokenizer, 
chunk_size=available_length + ) + if available_length <= 0: + warnings.warn( + f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}" + # noqa + ) + return [] + text = doc_chunk.text + segments = sem_chunker.chunk(text) + chunks = [type(doc_chunk)(text=s, meta=doc_chunk.meta) for s in segments] + return chunks + + def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): + output_chunks = [] + window_start = 0 + window_end = 0 # an inclusive index + num_chunks = len(chunks) + + while window_end < num_chunks: + chunk = chunks[window_end] + headings_and_captions = (chunk.meta.headings, chunk.meta.captions) + ready_to_append = False + + if window_start == window_end: + current_headings_and_captions = headings_and_captions + window_end += 1 + first_chunk_of_window = chunk + + else: + chks = chunks[window_start: window_end + 1] + doc_items = [it for chk in chks for it in chk.meta.doc_items] + candidate = DocChunk( + text=self.delim.join([chk.text for chk in chks]), + meta=DocMeta( + doc_items=doc_items, + headings=current_headings_and_captions[0], + captions=current_headings_and_captions[1], + origin=chunk.meta.origin, + ), + ) + + if (headings_and_captions == current_headings_and_captions + and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens + ): + # 토큰 수 여유 있음 → 청크 확장 계속 + window_end += 1 + new_chunk = candidate + else: + ready_to_append = True + + if ready_to_append or window_end == num_chunks: + # no more room OR the start of new metadata. + if window_start + 1 == window_end: + output_chunks.append(first_chunk_of_window) + else: + output_chunks.append(new_chunk) + window_start = window_end + + return output_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + r"""Chunk the provided document. 
+ Args: + dl_doc (DLDocument): document to chunk + Yields: + Iterator[Chunk]: iterator over extracted chunks + """ + res: Iterable[DocChunk] + res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs) # type: ignore + res = [x for c in res for x in self._split_by_doc_items(c)] + res = [x for c in res for x in self._split_using_plain_text(c)] + + if self.merge_peers: + res = self._merge_chunks_with_matching_metadata(res) + return iter(res) + + +class DocxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.converter = DocumentConverter( + format_options={ + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimplePipeline, backend=GenosMsWordDocumentBackend + ), + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = 
document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + return vectors + + +class HwpxProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.pipeline_options = PipelineOptions() + self.pipeline_options.save_images = False + self.converter = DocumentConverter( + format_options={ + InputFormat.XML_HWPX: HwpxFormatOption( + pipeline_options=self.pipeline_options + ) + } + ) + + def get_paths(self, file_path: str): + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + return artifacts_dir, reference_path + + def get_media_files(self, doc_items: list): + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'path': path, 'name': name}) + return temp_list + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return '' + return ''.join(map(str, iterable)) + '\n' + + def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument: + save_images = kwargs.get('save_images', False) + + if self.pipeline_options.save_images != save_images: + self.pipeline_options.save_images = save_images + # self._create_converters() + + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True) + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, + **kwargs: dict) -> list[dict]: + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z', + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + content = self.safe_join(chunk.meta.headings) + chunk.text + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = (GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**global_metadata) + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + # file_list = self.get_media_files(chunk.meta.doc_items) + # upload_tasks.append(asyncio.create_task( + # upload_files(file_list, request=request) + # )) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + return vectors + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + document: DoclingDocument = 
self.load_documents(file_path, **kwargs) + artifacts_dir, reference_path = self.get_paths(file_path) + document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path) + + chunks: list[DocChunk] = self.split_documents(document, **kwargs) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + + + text = "" + for vector in vectors: + if len(text) + len(vector.text) > 8192: + break + text += vector.text + + + return [vectors[0]] + + +class GenosServiceException(Exception): + """GenOS 와의 의존성 부분 제거를 위해 추가""" + + def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None: + self.code = 1 + self.error_code = error_code + self.error_msg = error_msg or "GenOS Service Exception" + self.msg_params = msg_params or {} + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})" + + +# async def assert_cancelled(request: Request): +# """GenOS 와의 의존성 제거를 위해 추가""" +# if await request.is_disconnected(): +# raise GenosServiceException(1, f"Cancelled") + + +# @@@@ 성민: OCR을 위해서 추가 +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + # OcrEngine, + # PdfBackend, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.document_converter import PdfFormatOption + +class DocumentProcessor: + def __init__(self): + self.page_chunk_counts = defaultdict(int) + self.hwpx_processor = HwpxProcessor() + self.docx_processor = DocxProcessor() + + # @@@@ 성민: OCR을 위해서 추가 + self.ocr_endpoint = "http://doc-parser-ocr-service:8080/ocr" + ocr_options = PaddleOcrOptions( + force_full_page_ocr=False, + lang=['korean'], + ocr_endpoint=self.ocr_endpoint, + text_score=0.3) + + + device = AcceleratorDevice.AUTO + num_threads = 8 + accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) + + # PDF 파이프라인 옵션 설정 + self.pipe_line_options = PdfPipelineOptions() + self.pipe_line_options.generate_page_images = True + self.pipe_line_options.generate_picture_images = True + self.pipe_line_options.do_ocr = False + self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.ocr_options.lang = ["ko", 'en'] + # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model" + # self.pipe_line_options.ocr_options.force_full_page_ocr = True + # ocr_options = TesseractOcrOptions() + # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert'] + # ocr_options.path = './.tesseract/tessdata' + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.artifacts_path = Path("/models/") + self.pipe_line_options.do_table_structure = True + self.pipe_line_options.images_scale = 2 + self.pipe_line_options.table_structure_options.do_cell_matching = True + self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE + self.pipe_line_options.accelerator_options = accelerator_options + + # Simple 파이프라인 옵션을 인스턴스 변수로 저장 + self.simple_pipeline_options = PipelineOptions() + self.simple_pipeline_options.save_images = False + + # ocr 파이프라인 옵션 + self.ocr_pipe_line_options = PdfPipelineOptions() + self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True) + 
self.ocr_pipe_line_options.do_ocr = True + self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True) + self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True + + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, + backend=DoclingParseV4DocumentBackend + ), + } + ) + + def get_loader(self, file_path: str): + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + return PyMuPDFLoader(file_path) + elif ext != real_type and real_type in ['txt', 'json', 'md']: + return TextLoader(file_path) + # 원래 확장자 기반 로직 + elif ext == '.pdf': + return PyMuPDFLoader(file_path) + elif ext == '.doc': + convert_to_pdf(file_path) + return UnstructuredWordDocumentLoader(file_path) + elif ext in ['.ppt', '.pptx']: + convert_to_pdf(file_path) + return UnstructuredPowerPointLoader(file_path) + elif ext in ['.jpg', '.jpeg', '.png']: + convert_to_pdf(file_path) + # 한국어 OCR 지원을 위한 언어 설정 + return UnstructuredImageLoader( + file_path, + languages=["kor", "eng"], # 한국어 + 영어 OCR + ) + elif ext in ['.txt', '.json', '.md']: + return TextLoader(file_path) + elif ext == '.hwp': + return HwpLoader(file_path) + elif ext == '.md': + return UnstructuredMarkdownLoader(file_path) + else: + return UnstructuredFileLoader(file_path) + + def get_real_file_type(self, file_path: str) -> str: + """파일 확장자가 아닌 실제 내용으로 파일 타입 판단""" + with open(file_path, 'rb') as f: + header = f.read(8) + if header.startswith(b'%PDF-'): + return 'pdf' + elif header.startswith(b'\x89PNG'): + return 'png' + elif header.startswith(b'\xff\xd8\xff'): + return 'jpg' + + # 매직 헤더로 판단할 수 없으면 확장자 사용 + return os.path.splitext(file_path)[-1].lower() + + def convert_md_to_pdf(self, md_path): + """Markdown 파일을 PDF로 변환""" + install_packages(['chardet']) + import chardet + + pdf_path = md_path.replace('.md', '.pdf') + with open(md_path, 'rb') as f: + raw_file = f.read() + candidates = ['utf-8', 'utf-8-sig'] + try: + det = (chardet.detect(raw_file) or {}).get('encoding') or '' + # chardet가 ascii/unknown이면 무시. 
그 외면 후보에 추가 + if det and det.lower() not in ('ascii', 'unknown'): + if det.lower() not in [c.lower() for c in candidates]: + candidates.append(det) + except Exception: + pass + candidates += ['cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + md_content = None + for enc in candidates: + try: + md_content = raw_file.decode(enc) + break + except UnicodeDecodeError: + continue + if md_content is None: + md_content = raw_file.decode('utf-8', errors='replace') + + html_content = markdown(md_content) + if HTML: + HTML(string=html_content).write_pdf(pdf_path) + return pdf_path + + + + def _create_converters(self): + """컨버터들을 생성하는 헬퍼 메서드""" + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, + backend=DoclingParseV4DocumentBackend + ), + } + ) + + + def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument: + # kwargs에서 save_images 값을 가져와서 옵션 업데이트 + save_images = kwargs.get('save_images', True) + include_wmf = kwargs.get('include_wmf', False) + + # save_images 옵션이 현재 설정과 다르면 컨버터 재생성 + if (self.simple_pipeline_options.save_images != save_images or + getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf): + self.simple_pipeline_options.save_images = save_images + self.simple_pipeline_options.include_wmf = include_wmf + self._create_converters() + + try: + conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True) + except Exception as e: + print("@@@@", e) + # conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True) + + return conv_result.document + + + def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]: + loader = self.get_loader(file_path) + documents = loader.load() + + # @@@@ 성민: 밑에 주석 + # 이미지 파일의 경우 텍스트 추출 안되었을 시 기본 텍스트 제공 + # ext = os.path.splitext(file_path)[-1].lower() + # if ext in ['.jpg', '.jpeg', '.png']: + # # documents가 없거나, 있어도 모든 page_content가 비어있는 경우 + # if not documents or not any(doc.page_content.strip() for doc in documents): + # documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})] + + # @@@@ 성민 새로 작성: 텍스트가 없을 경우 OCR 수행 + if not documents or not any(doc.page_content.strip() for doc in documents): + document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) + + documents = list([Document(page_content=document.export_to_markdown(), metadata={})]) + + return documents + + def split_documents(self, documents, **kwargs: dict) -> list[Document]: + # @@@@ 성민: GenOS에서 바꿔도 안바뀌는듯? 
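+        # A minimal sketch of the kwargs contract assumed here (key names are
+        # illustrative, not a confirmed GenOS schema): only the four splitter
+        # arguments survive the filter below, so unrelated keys such as
+        # save_images are silently dropped before the
+        # RecursiveCharacterTextSplitter is constructed.
+        #
+        #   kwargs = {"chunk_size": 20_000, "chunk_overlap": 200,
+        #             "separators": ["\n\n", "\n", " "], "save_images": True}
+        #   # -> splitter_kwargs keeps only chunk_size / chunk_overlap /
+        #   #    separators / length_function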
+ print("@@@@ kwargs", kwargs) + + kwargs.setdefault("chunk_size", 20_000) + + splitter_kwargs = { + k: v for k, v in kwargs.items() + if k in ["chunk_size", "chunk_overlap", "separators", "length_function"] + } + + text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs) + + chunks = text_splitter.split_documents(documents) + chunks = [chunk for chunk in chunks if chunk.page_content] + + if not chunks: + raise Exception('Empty document') + + for chunk in chunks: + page = chunk.metadata.get('page', 0) + self.page_chunk_counts[page] += 1 + return chunks + + def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict) -> list[dict]: + ext = os.path.splitext(file_path)[-1].lower() + real_type = self.get_real_file_type(file_path) + + # 확장자와 실제 파일 타입이 다를 때만 real_type 사용 + if ext != real_type and real_type == 'pdf': + pdf_path = file_path + elif ext != real_type and real_type in ['txt', 'json', 'md']: + pdf_path = _get_pdf_path(file_path) + # 원래 확장자 기반 로직 + elif file_path.endswith('.md'): + pdf_path = self.convert_md_to_pdf(file_path) + elif file_path.endswith(('.ppt', '.pptx')): + pdf_path = _get_pdf_path(file_path) + else: + pdf_path = _get_pdf_path(file_path) + + # doc = fitz.open(pdf_path) if (pdf_path and os.path.exists(pdf_path)) else None + + if file_path.endswith(('.ppt', '.pptx')): + if os.path.exists(pdf_path): + subprocess.run(["rm", pdf_path], check=True) + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=max([chunk.metadata.get('page', 0) for chunk in chunks]), + reg_date=datetime.now().isoformat(timespec='seconds') + 'Z' + ) + current_page = None + chunk_index_on_page = 0 + + vectors = [] + for chunk_idx, chunk in enumerate(chunks): + page = chunk.metadata.get('page', 0) + text = chunk.page_content + + if page != current_page: + current_page = page + chunk_index_on_page = 0 + + # 첨부용에서는 bbox 정보 추출 X + # if doc: + # fitz_page = doc.load_page(page) + # global_metadata['chunk_bboxes'] = json.dumps(merge_overlapping_bboxes([{ + # 'page': page + 1, + # 'type': 'text', + # 'bbox': { + # 'l': rect[0] / fitz_page.rect.width, + # 't': rect[1] / fitz_page.rect.height, + # 'r': rect[2] / fitz_page.rect.width, + # 'b': rect[3] / fitz_page.rect.height, + # } + # } for rect in fitz_page.search_for(text)], x_tolerance=1 / fitz_page.rect.width, + # y_tolerance=1 / fitz_page.rect.height)) + + vectors.append(GenOSVectorMeta.model_validate({ + 'text': text, + 'n_char': len(text), + 'n_word': len(text.split()), + 'n_line': len(text.splitlines()), + 'i_page': page, + 'e_page': page, + 'i_chunk_on_page': chunk_index_on_page, + 'n_chunk_of_page': self.page_chunk_counts[page], + 'i_chunk_on_doc': chunk_idx, + **global_metadata + })) + chunk_index_on_page += 1 + + return vectors + + @guardrail + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + + # @@@@ 성민: OneAgent 연동용 + if "uploads" in kwargs.keys(): + import base64 + uploads = kwargs.get("uploads", None)[0] + + # @@@@ 전처리기 파일 저장 경로 + folder = "/nfs-root/tmp/uploads" + + decoded = base64.b64decode(uploads['data'].split(",", 1)[1]) + file_path = os.path.join(folder, uploads['name']) + + with open(file_path, 'wb') as f: + f.write(decoded) + + ext = os.path.splitext(file_path)[-1].lower() + if ext in ('.wav', '.mp3', '.m4a'): + # Generate a temporal path saving audio chunks: the audio file is supposed to be splited to several chunks due to limitted length by the model + tmp_path = "./tmp_audios_{}".format(os.path.basename(file_path).split('.')[0]) + if not os.path.exists(tmp_path): + 
os.makedirs(tmp_path) + + # Use 'Whisper' model served in-house + # [!] Modify the request parameters to change a STT model to be used + loader = AudioLoader( + file_path=file_path, + req_url="http://192.168.74.164:30100/v1/audio/transcriptions", + req_data={ + 'model': 'model', + 'language': 'ko', + 'response_format': 'json', + 'temperature': '0', + 'stream': 'false', + 'timestamp_granularities[]': 'word' + }, + chunk_sec=29, # length(sec) of a chunk from the uploaded audio + tmp_path=tmp_path + ) + vectors = loader.return_vectormeta_format() + # await assert_cancelled(request) + + # Remove the temporal chunks + try: + subprocess.run(['rm', '-r', tmp_path], check=True) + except: + pass + # await assert_cancelled(request) + return vectors + + elif ext in ('.csv', '.xlsx'): + loader = TabularLoader(file_path, ext) + vectors = loader.return_vectormeta_format() + # pdf_path = _get_pdf_path(file_path) + # await assert_cancelled(request) + return vectors + + elif ext == '.hwp': + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + return vectors + + elif ext == '.hwpx': + return await self.hwpx_processor(request, file_path, **kwargs) + + elif ext == '.docx': + return await self.docx_processor(request, file_path, **kwargs) + + else: + documents: list[Document] = self.load_documents(file_path, **kwargs) + # await assert_cancelled(request) + + chunks: list[Document] = self.split_documents(documents, **kwargs) + # await assert_cancelled(request) + + vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs) + + + return vectors \ No newline at end of file From b5877fe12fa2066a9a7fec8441575ad1e0b4ffbb Mon Sep 17 00:00:00 2001 From: seongmincho315 Date: Thu, 16 Apr 2026 13:23:30 +0900 Subject: [PATCH 19/19] =?UTF-8?q?feat:=20=EB=AA=A8=EB=93=88=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genon/preprocessor/module/base_processor.py | 175 ++ .../module/intelligent_processor.py | 1675 +++++++++++++++++ genon/preprocessor/module/test.py | 59 + genon/preprocessor/module/test_processor.py | 55 + genon/preprocessor/module/utils/chunkers.py | 836 ++++++++ genon/preprocessor/module/utils/genos_util.py | 17 + genon/preprocessor/module/utils/metadata.py | 352 ++++ genon/preprocessor/module/utils/util.py | 146 ++ 8 files changed, 3315 insertions(+) create mode 100644 genon/preprocessor/module/base_processor.py create mode 100644 genon/preprocessor/module/intelligent_processor.py create mode 100644 genon/preprocessor/module/test.py create mode 100644 genon/preprocessor/module/test_processor.py create mode 100644 genon/preprocessor/module/utils/chunkers.py create mode 100644 genon/preprocessor/module/utils/genos_util.py create mode 100644 genon/preprocessor/module/utils/metadata.py create mode 100644 genon/preprocessor/module/utils/util.py diff --git a/genon/preprocessor/module/base_processor.py b/genon/preprocessor/module/base_processor.py new file mode 100644 index 0000000000..fd277a8b08 --- /dev/null +++ b/genon/preprocessor/module/base_processor.py @@ -0,0 +1,175 @@ +from typing import Any, List + +from fastapi import Request +from langchain_core.documents import Document +from docling.document_converter import DocumentConverter, PdfFormatOption, HwpxFormatOption, WordFormatOption +from 
docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) +from docling.datamodel.base_models import InputFormat +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend +from docling_core.transforms.chunker import BaseChunker, DocChunk +from docling_core.types import DoclingDocument +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docling.pipeline.simple_pipeline import SimplePipeline + + +from utils.chunkers import CHUNKERS +from utils.metadata import GenOSVectorMetaBuilder + + +# load 파이프라인 +# open -> table -> reading order + +""" +모델들 +- detection model +- recognition model +- ocr + - easy + - paddle + +- table 모델 + +- vlm + - 이미지 디스크립션 모델 + - 문서 로테이션 모델 + - TOC 생성 모델 +""" + +""" +컴포넌트 + - base64로 오면 저장 (oneagent용) + - 파일 오픈: 확장자 별로.... + - pdf로 저장(GenOS에서 보여주려고) + - 리딩오더 + - 레이아웃 + - 테이블 디텍션 + - 이미지 디스크립션 + - ocr -> 용도별로 + - 이미지 로테이션 +""" + +# TODO all ext +FORMAT_MAP = { + "pdf": InputFormat.PDF, + # "hwp": InputFormat.HWP, + # "hwpx":InputFormat.HWPX,# TODO + # "doc":InputFormat.DOC, # TODO + "docx": InputFormat.DOCX, + # "ppt": InputFormat.PPT, #TODO + # "pptx": InputFormat.PPTX, + # "xlsx": InputFormat.XLSX, + # "csv": InputFormat.CSV, + # "md": InputFormat.MD, + # "json": InputFormat.JSON, + # "html": InputFormat.HTML, +} + + +# TODO all ext +FORMAT_OPTION_MAP = { + InputFormat.PDF: PdfFormatOption, + # InputFormat.HWP: HwpFormatOption # TODO 왜 HwpFormatOption은 없는지 + # "hwpx":InputFormat.HWPX,# TODO + # "doc":InputFormat.DOC, # TODO + InputFormat.DOCX: WordFormatOption, + # "ppt": InputFormat.PPT, #TODO + # InputFormat.PPTX, + # InputFormat.XLSX, + # InputFormat.CSV, + # InputFormat.MD, + # InputFormat.JSON, + # InputFormat.HTML, +} + +PIPELINE_MAP = { + "pdf": StandardPdfPipeline, + "simple": SimplePipeline, +} + +BACKEND_MAP = { + "pypdf": PyPdfiumDocumentBackend, + "msword": GenosMsWordDocumentBackend, +} + + +class BaseProcessor: + pipeline: list[str] = None + format_options: None + chunker: BaseChunker = None + loaders: list = None + converter: DocumentConverter = None + config: dict = None + + def __init__(self, config: dict) -> None: + # mapping 해주자 + self.config = config + self.allowed_formats = self._build_allowed_formats() + self.format_options = self._build_format_options() + self.converter = DocumentConverter( + allowed_formats=self.allowed_formats, + format_options=self.format_options, + ) + + # self.loaders = LOADERS["pdf"] # 로더 왜 필요하더라 + + self.chunker = CHUNKERS["simple"]() + self.genos_meta_builder = GenOSVectorMetaBuilder() + + def _build_allowed_formats(self): + allowed_formats = [] + for _format in self.config["format_options"].keys(): + format = FORMAT_MAP.get(_format, None) + assert format is not None, f"@@@@ 잘못된 확장자입니다. 
{_format}, 가능한 확장자: {list(FORMAT_MAP.keys())}" + allowed_formats.append(format) + return allowed_formats + + def _build_format_options(self): + format_options = {} + for _format, option in self.config["format_options"].items(): + format = FORMAT_MAP.get(_format, None) + + format_options[format] = FORMAT_OPTION_MAP[format]( + pipeline_cls=PIPELINE_MAP[option["pipeline_options"]], + backend=BACKEND_MAP[option["backend"]], + ) + + # @@@ 성민: pdf 일때만 이미지 저장이 가능하네 + if "generate_picture_images" in option and option["generate_picture_images"] == True: + format_options[format].pipeline_options.generate_picture_images = True + + return format_options + + def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]: + """ + 설명: 확장자에 해당하는 DocumentConverter를 사용하여 ConversionResult 리턴 + """ + # TODO: OneAgent 호출인지 판단. 이게 여기 있어야 할까 __call_ 에 있어야 할까. + + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + + return conv_result.document + + def split_documents(self, documents: list[Document], **kwargs: dict) -> list[Document]: + chunks = list(self.chunker.chunk(documents, **kwargs)) + return chunks + + async def compose_vectors( + self, request: Request, file_path: str, document: DoclingDocument, chunks: List[DocChunk], **kwargs: dict + ) -> list[dict]: + return await self.genos_meta_builder(document, chunks, file_path, request, **kwargs) + + async def __call__(self, request: Request, file_path: str, **kwargs: dict) -> Any: + documents = self.load_documents(file_path, **kwargs) + chunks = self.split_documents(documents, **kwargs) + vectors = await self.compose_vectors(request, file_path, documents, chunks, **kwargs) + return vectors diff --git a/genon/preprocessor/module/intelligent_processor.py b/genon/preprocessor/module/intelligent_processor.py new file mode 100644 index 0000000000..79742ec8ab --- /dev/null +++ b/genon/preprocessor/module/intelligent_processor.py @@ -0,0 +1,1675 @@ +from __future__ import annotations + +import json +import os +import logging +import math, bisect +from pathlib import Path + +from collections import defaultdict +from datetime import datetime +from typing import Optional, Iterable, Any, List, Dict, Tuple + +from fastapi import Request + +_log = logging.getLogger(__name__) + +# docling imports + +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend + +# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.backend.genos_pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.pipeline.simple_pipeline import SimplePipeline + +# from docling.datamodel.document import ConversionStatus +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + # OcrEngine, + # PdfBackend, + PdfPipelineOptions, + TableFormerMode, + PipelineOptions, + PaddleOcrOptions, +) + +from docling.document_converter import DocumentConverter, PdfFormatOption, FormatOption +from docling.datamodel.pipeline_options import DataEnrichmentOptions +from docling.utils.document_enrichment import enrich_document, check_document +from docling.datamodel.document import ConversionResult +from docling_core.transforms.chunker import ( + BaseChunk, + BaseChunker, + DocChunk, + DocMeta, +) +from docling_core.types import DoclingDocument + +from pandas import DataFrame +import asyncio +from docling_core.types import DoclingDocument as DLDocument +from docling_core.types.doc.document import ( + DocumentOrigin, + 
LevelNumber, + ListItem, + CodeItem, + ContentLayer, +) +from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc import ( + BoundingBox, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + DocItem, + PictureItem, + SectionHeaderItem, + TableItem, + TextItem, + PageItem, + ProvenanceItem, +) +from collections import Counter +import re +import json +import warnings +from typing import Iterable, Iterator, Optional, Union + +from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator +from typing_extensions import Self + +try: + import semchunk + from transformers import AutoTokenizer, PreTrainedTokenizerBase +except ImportError: + raise RuntimeError("Module requires 'chunking' extra; to install, run: " "`pip install 'docling-core[chunking]'`") + +try: + from genos_utils import upload_files +except ImportError: + upload_files = None + +# ============================================ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# + +"""Chunker implementation leveraging the document structure.""" + + +class GenosBucketChunker(BaseChunker): + """토큰 제한을 고려하여 섹션별 청크를 분할하고 병합하는 청커 (v2)""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + tokenizer: Union[PreTrainedTokenizerBase, str] = "sentence-transformers/all-MiniLM-L6-v2" + max_tokens: int = 1024 + merge_peers: bool = True + + # _inner_chunker: BaseChunker = None + _tokenizer: PreTrainedTokenizerBase = None + merge_list_items: bool = True + + @model_validator(mode="after") + def _initialize_components(self) -> Self: + # 토크나이저 초기화 + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + return self + + def preprocess(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서의 모든 아이템을 헤더 정보와 함께 청크로 생성 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 문서의 모든 아이템을 포함하는 하나의 청크 + """ + # 모든 아이템과 헤더 정보 수집 + all_items = [] + all_header_info = [] # 각 아이템의 헤더 정보 + current_heading_by_level: dict[LevelNumber, str] = {} + all_header_short_info = [] # 각 아이템의 짧은 헤더 정보 + current_heading_short_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + + # iterate_items()로 수집된 아이템들의 self_ref 추적 + processed_refs = set() + + # 모든 아이템 순회 + for item, level in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}): + if hasattr(item, "self_ref"): + processed_refs.add(item.self_ref) + + if not isinstance(item, DocItem): + continue + + # 리스트 아이템 병합 처리 + if self.merge_list_items: + if isinstance(item, ListItem) or (isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM): + list_items.append(item) + continue + elif list_items: + # 누적된 리스트 아이템들을 추가 + for list_item in list_items: + all_items.append(list_item) + # 리스트 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + list_items = [] + + # 섹션 헤더 처리 + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ): + # 새로운 헤더 레벨 설정 + header_level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + current_heading_by_level[header_level] = item.text + current_heading_short_by_level[header_level] = item.orig # 첫 단어로 짧은 헤더 정보 설정 + + # 더 깊은 레벨의 헤더들 제거 + keys_to_del = [k for k in 
current_heading_by_level if k > header_level] + for k in keys_to_del: + current_heading_by_level.pop(k, None) + keys_to_del_short = [k for k in current_heading_short_by_level if k > header_level] + for k in keys_to_del_short: + current_heading_short_by_level.pop(k, None) + + # 헤더 아이템도 추가 (헤더 자체도 아이템임) + all_items.append(item) + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + continue + + if ( + isinstance(item, TextItem) + or isinstance(item, ListItem) + or isinstance(item, CodeItem) + or isinstance(item, TableItem) + or isinstance(item, PictureItem) + ): + # if item.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]: + # item.text = "" + all_items.append(item) + # 현재 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # 마지막 리스트 아이템들 처리 + if list_items: + for list_item in list_items: + all_items.append(list_item) + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # iterate_items()에서 누락된 테이블들을 별도로 추가 + missing_tables = [] + for table in dl_doc.tables: + table_ref = getattr(table, "self_ref", None) + if table_ref not in processed_refs: + missing_tables.append(table) + + # 누락된 테이블들을 문서 앞부분에 추가 (페이지 1의 테이블들일 가능성이 높음) + if missing_tables: + for missing_table in missing_tables: + # 첫 번째 위치에 삽입 (헤더 테이블일 가능성이 높음) + all_items.insert(0, missing_table) + all_header_info.insert(0, {}) # 빈 헤더 정보 + all_header_short_info.insert(0, {}) # 빈 짧은 헤더 정보 + + # 아이템이 없으면 빈 문서 + if not all_items: + return + + # 모든 아이템을 하나의 청크로 반환 (HybridChunker에서 분할) + # headings는 None으로 설정하고, 헤더 정보는 별도로 관리 + chunk = DocChunk( + text="", # 텍스트는 HybridChunker에서 생성 + meta=DocMeta( + doc_items=all_items, + headings=None, # DocMeta의 원래 형식 유지 + captions=None, + origin=dl_doc.origin, + ), + ) + # 헤더 정보를 별도 속성으로 저장 + chunk._header_info_list = all_header_info + chunk._header_short_info_list = all_header_short_info # 짧은 헤더 정보도 저장 + yield chunk + + def _count_tokens(self, text: str) -> int: + """텍스트의 토큰 수 계산 (안전한 분할 처리)""" + if not text: + return 0 + + # 텍스트를 더 작은 단위로 분할하여 계산 + max_chunk_length = 300 # 더 안전한 길이로 설정 + total_tokens = 0 + + # 텍스트를 줄 단위로 먼저 분할 + lines = text.split("\n") + current_chunk = "" + + for line in lines: + # 현재 청크에 줄을 추가했을 때 길이 확인 + temp_chunk = current_chunk + "\n" + line if current_chunk else line + + if len(temp_chunk) <= max_chunk_length: + current_chunk = temp_chunk + else: + # 현재 청크가 있으면 토큰 계산 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + # 새로운 청크 시작 + current_chunk = line + + # 마지막 청크 처리 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + return total_tokens + + def _generate_text_from_items_with_headers( + self, items: list[DocItem], header_info_list: list[dict], dl_doc: DoclingDocument + ) -> str: + """DocItem 리스트로부터 헤더 정보를 포함한 텍스트 생성""" + text_parts = [] + current_section_headers = {} # 현재 섹션의 헤더 정보 + + for i, item in enumerate(items): + item_headers = header_info_list[i] if i < len(header_info_list) else {} + + # 헤더 정보가 변경된 경우 (새로운 섹션 시작) + if item_headers != 
current_section_headers: + # 변경된 헤더 레벨들만 추가 + headers_to_add = [] + for level in sorted(item_headers.keys()): + # 이전 섹션과 다른 헤더만 추가 + if level not in current_section_headers or current_section_headers[level] != item_headers[level]: + # 해당 레벨까지의 모든 상위 헤더 포함 + for l in sorted(item_headers.keys()): + if l < level: + headers_to_add.append(item_headers[l]) + elif l == level: + headers_to_add.append("") + + break + + # 헤더가 있으면 추가 + if headers_to_add: + header_text = ", ".join(headers_to_add) + if header_text not in text_parts: + text_parts.append(header_text) + + current_section_headers = item_headers.copy() + + # 아이템 텍스트 추가 + if isinstance(item, TableItem): + table_text = self._extract_table_text(item, dl_doc) + if table_text: + text_parts.append(table_text) + elif hasattr(item, "text") and item.text: + # 타이틀과 섹션 헤더 처리 개선 + # is_section_header = ( + # isinstance(item, SectionHeaderItem) or + # (isinstance(item, TextItem) and + # item.label in [DocItemLabel.SECTION_HEADER]) # TITLE은 제외 + # ) + + # 타이틀은 항상 포함, 섹션 헤더는 중복 방지를 위해 스킵 + # if not is_section_header: + # 20250909, shkim, text_parts에 없는 경우만 추가. 섹션헤더가 반복해서 추가되는 것 방지 + if item.text not in text_parts: + text_parts.append(item.text) + elif isinstance(item, PictureItem): + text_parts.append("") # 이미지는 빈 텍스트 + + result_text = self.delim.join(text_parts) + return result_text + + def _extract_table_text(self, table_item: TableItem, dl_doc: DoclingDocument) -> str: + """테이블에서 텍스트를 추출하는 일반화된 메서드""" + try: + # 먼저 export_to_markdown 시도 + table_text = table_item.export_to_markdown(dl_doc) + if table_text and table_text.strip(): + return table_text + except Exception: + pass + + # export_to_markdown 실패 시 테이블 셀 데이터에서 직접 텍스트 추출 + try: + if hasattr(table_item, "data") and table_item.data: + cell_texts = [] + + # table_cells에서 텍스트 추출 + if hasattr(table_item.data, "table_cells"): + for cell in table_item.data.table_cells: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # grid에서 텍스트 추출 (table_cells가 없는 경우) + elif hasattr(table_item.data, "grid") and table_item.data.grid: + for row in table_item.data.grid: + if isinstance(row, list): + for cell in row: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # 추출된 셀 텍스트들을 결합 + if cell_texts: + return " ".join(cell_texts) + except Exception: + pass + + # 모든 방법 실패 시 item.text 사용 (있는 경우) + if hasattr(table_item, "text") and table_item.text: + return table_item.text + + return "" + + def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[str]]: + """헤더 정보 리스트에서 실제 사용되는 모든 헤더들을 level 순서대로 추출하고 ', '로 연결""" + if not header_info_list: + return None + + all_headers = [] # header 순서대로 추가 + seen_headers = set() # 중복 방지용 + + for header_info in header_info_list: + if header_info: + for level in sorted(header_info.keys()): + header_text = header_info[level] + if header_text and header_text not in seen_headers: + all_headers.append(header_text) + seen_headers.add(header_text) + + return all_headers if all_headers else None + + def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]: + """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)""" + if not table_text: + return [table_text] + + # 전체 테이블이 토큰 제한 내인지 확인 + if self._count_tokens(table_text) <= max_tokens: + return [table_text] + + # 단순히 토큰 수 기준으로 텍스트 분할 + # semchunk 사용하여 토큰 제한에 맞게 분할 + chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens) + chunks = chunker(table_text) + return chunks if chunks else [table_text] + + def 
_is_section_header(self, item: DocItem) -> bool: + """아이템이 section header인지 확인""" + return isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ) + + def _get_section_header_level(self, item: DocItem) -> Optional[int]: + """Section header의 level을 반환""" + if isinstance(item, SectionHeaderItem): + return item.level + elif isinstance(item, TextItem): + if item.label == DocItemLabel.TITLE: + return 0 + elif item.label == DocItemLabel.SECTION_HEADER: + return 1 + return None + + def _generate_section_text_with_heading( + self, section_items: list[DocItem], section_header_infos: list[dict], dl_doc: DoclingDocument + ) -> str: + """섹션의 텍스트를 생성하되, 앞에 heading을 붙임""" + # 첫 번째 item의 header_info에서 heading 추출 + if section_header_infos and section_header_infos[0]: + merged_headers = {} + for level, header_text in section_header_infos[0].items(): + if header_text: + merged_headers[level] = header_text + + # level 순서대로 정렬해서 ', '로 연결 + if merged_headers: + sorted_levels = sorted(merged_headers.keys()) + headers = [merged_headers[level] for level in sorted_levels] + heading_text = ", ".join(headers) + else: + heading_text = "" + else: + heading_text = "" + + # 섹션의 일반 텍스트 생성 + section_text = self._generate_text_from_items_with_headers(section_items, section_header_infos, dl_doc) + + # heading이 있으면 앞에 붙이기 + if heading_text: + return heading_text + ", " + section_text + else: + return section_text + + def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument) -> list[DocChunk]: + """문서를 토큰 제한에 맞게 분할 (v2: 섹션 헤더 기준으로 분할 후 max_tokens로 병합)""" + items = doc_chunk.meta.doc_items + header_info_list = getattr(doc_chunk, "_header_info_list", []) + header_short_info_list = getattr(doc_chunk, "_header_short_info_list", []) + + if not items: + return [] + + # ================================================================ + # 헬퍼 함수들 + # ================================================================ + + def get_header_level(header_infos, *, first=False, default=-1): + """header_infos에서 최종 레벨 계산""" + if not header_infos: + return default + info = header_infos[0] if first else header_infos[-1] + return max(info.keys(), default=default) + + def get_current_chunk( + doc_chunk: DocChunk, + merged_texts: list[str], + merged_header_short_infos: list[dict], + merged_items: list[DocItem], + ): + """현재까지 병합된 내용으로 DocChunk 생성""" + if not merged_texts: + return None + chunk_text = "\n".join(merged_texts) + used_headers = self._extract_used_headers(merged_header_short_infos) + + return DocChunk( + text=chunk_text, + meta=DocMeta( + doc_items=merged_items, + headings=used_headers, + captions=None, + origin=doc_chunk.meta.origin, + ), + ) + + def get_text_from_item(item: DocItem) -> str: + """DocItem에서 텍스트 추출""" + if isinstance(item, TableItem): + return self._extract_table_text(item, dl_doc) + elif hasattr(item, "text") and item.text: + return item.text + elif isinstance(item, PictureItem): + text = "" + for annotation in item.annotations: + if hasattr(annotation, "text"): + text += annotation.text + return text + return "" + + def split_items_evenly_by_tokens(item_token_counts, max_tokens): + n = len(item_token_counts) + total = sum(item_token_counts) + if n == 0: + return [] + if total <= max_tokens: + return [(0, n)] # ✅ 항상 (a,b) + + k = math.ceil(total / max_tokens) + target = total / k + + P = [0] + for c in item_token_counts: + P.append(P[-1] + c) + + cuts = [0] + used = {0} + for t in range(1, k): + goal = t * target + 
j = bisect.bisect_left(P, goal) + + cand = [] + if 0 < j < len(P): + cand.append(j) + if 0 <= j - 1 < len(P): + cand.append(j - 1) + + best = None + best_dist = float("inf") + for x in cand: + if x in used: + continue + if x <= cuts[-1]: + continue + if x >= len(P) - 1: # n + continue + dist = abs(P[x] - goal) + if dist < best_dist: + best_dist = dist + best = x + + if best is None: + best = min(max(cuts[-1] + 1, 1), len(P) - 2) + + cuts.append(best) + used.add(best) + + cuts.append(n) + + return [(a, b) for a, b in zip(cuts[:-1], cuts[1:])] + + def adjust_captions(items_group): + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + ref_idx_list = [] + if hasattr(item, "captions") and item.captions: + for cap in item.captions: + cap_ref = cap.cref + cap_idx = -1 + for j, it in enumerate(items_group): + if it is None: + continue + if getattr(it[0][0], "self_ref", None) == cap_ref: + cap_idx = j + break + if cap_idx != -1: + ref_idx_list.append(cap_idx) + if ref_idx_list: + ref_idx_list = sorted(ref_idx_list) + + if not ref_idx_list: + continue + + # caption 아이템들을 부모 아이템 바로 뒤로 이동 + for cap_idx in ref_idx_list: + for g in items_group[cap_idx]: + items_group[idx].append(g) + items_group[cap_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + def adjust_pictures_in_tables(items_group): + # picture in table 처리 + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + pic_idx_list = [] + if isinstance(item, TableItem): + table_bbox = item.prov[0].bbox + table_page_no = item.prov[0].page_no + + for j in range(len(items_group)): + if items_group[j] is None: + continue + pic_item = items_group[j][0][0] + if isinstance(pic_item, PictureItem): + # table 안의 picture인지 확인. iou 사용 + pic_bbox = pic_item.prov[0].bbox + pic_page_no = pic_item.prov[0].page_no + if pic_page_no != table_page_no: + continue + ios = pic_bbox.intersection_over_self(table_bbox) + if ios > 0.5: # picture가 50% 이상 table 안에 포함되면 table 안의 picture로 간주 + pic_idx_list.append(j) + if pic_idx_list: + pic_idx_list = sorted(pic_idx_list) + + if not pic_idx_list: + continue + + for pic_idx in pic_idx_list: + for g in items_group[pic_idx]: + items_group[idx].append(g) + items_group[pic_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + # ================================================================ + # 1단계: 섹션 헤더 기준으로 분할 + # ================================================================ + + sections = [] # [(items, header_infos, header_short_infos), ...] 
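+        # Illustrative walk-through of this stage (hypothetical items, not from
+        # a real document): every section header opens a new
+        # (items, header_infos, header_short_infos) triple and body items
+        # accumulate into the current triple, so
+        #   items    = [H1 "개요", text A, text B, H2 "상세", text C]
+        # becomes
+        #   sections = [([H1, A, B], ...), ([H2, C], ...)]
+        # The header item itself stays inside its section, which is what lets
+        # stage 2 prepend the heading text via
+        # _generate_section_text_with_heading.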
+ cur_items, cur_h_infos, cur_h_short = [], [], [] + + for i, item in enumerate(items): + h_info = header_info_list[i] if i < len(header_info_list) else {} + h_short = header_short_info_list[i] if i < len(header_short_info_list) else {} + + # 섹션 헤더를 만나면 + if self._is_section_header(item): + # 이전 섹션이 있으면 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # 새로운 섹션 시작 + cur_items = [item] + cur_h_infos = [h_info] + cur_h_short = [h_short] + else: + # 섹션 헤더가 아니면 현재 섹션에 추가 + cur_items.append(item) + cur_h_infos.append(h_info) + cur_h_short.append(h_short) + + # 마지막 섹션 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # ================================================================ + # 2단계: 각 섹션의 텍스트에 heading 붙이기 + # ================================================================ + + sections_with_text = [] + for items, header_infos, header_short_infos in sections: + text = self._generate_section_text_with_heading(items, header_short_infos, dl_doc) + sections_with_text.append((text, items, header_infos, header_short_infos)) + + # ================================================================ + # 2.5단계: 너무 긴 청크는 분할 + # ================================================================ + if self.max_tokens > 0: + for i in range(len(sections_with_text)): + text, items, h_infos, h_short = sections_with_text[i] + token_count = self._count_tokens(text) + if token_count < self.max_tokens: + continue + + # caption 및 table 내 그림은 같은 섹션에 있도록 조정 + items_group = [[(item, info, short)] for item, info, short in zip(items, h_infos, h_short)] + items_group = adjust_captions(items_group) + items_group = adjust_pictures_in_tables(items_group) + + # 너무 긴 섹션은 분할 + # 각 아이템 별 token 수 계산 + item_token_counts = [] + for group in items_group: + cur_count = 0 + for g in group: + cur_count += self._count_tokens(get_text_from_item(g[0])) + item_token_counts.append(cur_count) + + # 아이템 그룹들을 토큰 기준으로 균등 분할 + split_info = split_items_evenly_by_tokens(item_token_counts, self.max_tokens) + + # item_groups를 섹션으로 다시 구성 + new_sections = [] + for a, b in split_info: + + # 각 그룹에서 items, h_infos, h_short로 분리 + group_items = [] + group_h_infos = [] + group_h_short = [] + for idx in range(a, b): + for g in items_group[idx]: + group_items.append(g[0]) + group_h_infos.append(g[1]) + group_h_short.append(g[2]) + + new_text = self._generate_section_text_with_heading(group_items, group_h_short, dl_doc) + new_sections.append((new_text, group_items, group_h_infos, group_h_short)) + + # 원래 섹션을 새로 분할된 섹션들로 교체 + sections_with_text.pop(i) + for new_section in reversed(new_sections): + sections_with_text.insert(i, new_section) + + # ================================================================ + # 3단계: 단독 타이틀(1줄만) → 다음 섹션으로 병합 + # ================================================================ + + for i in range(len(sections_with_text) - 2, -1, -1): + text, items, h_infos, h_short = sections_with_text[i] + + # 아이템이 하나인 섹션 헤더만 검사 + if len(items) != 1 or not self._is_section_header(items[0]): + continue + + # 문단이 이미 구성된 것은 제외 (문자 수가 30자 이상이면 문단을 구성했다고 간주) + item_text = "".join(getattr(it, "text", "") for it in items) + if len(item_text) > 30: + continue + + # 현재 섹션헤더 레벨이 다음 섹션헤더 레벨보다 더 높은 경우에만 병합 (높은 레벨이 더 작은 숫자) + n_text, n_items, n_h_infos, n_h_short = sections_with_text[i + 1] + current_level = get_header_level(h_infos, first=False) + next_level = get_header_level(n_h_infos, first=True) + if 0 <= next_level < current_level: + continue + + # 다음 섹션과 병합 + sections_with_text[i] = (text + 
"\n" + n_text, items + n_items, h_infos + n_h_infos, h_short + n_h_short) + sections_with_text.pop(i + 1) + + # ================================================================ + # 4단계: 토큰 기준 병합 + # ================================================================ + + result_chunks = [] + merged_texts, merged_items = [], [] + merged_header_infos, merged_header_short_infos = [], [] + + for text, items, header_infos, header_short_infos in sections_with_text: + + b_new_chunk = False + + # ---------------------------------- + # 병합 가능 여부 판단 + + # 병합 가능 토큰 수 계산 + test_tokens = self._count_tokens("\n".join(merged_texts + [text])) + + # 현재 섹션헤더 레벨과 병합된 섹션헤더 레벨 + section_level = get_header_level(header_infos, first=True) + merged_level = get_header_level(merged_header_infos, first=False) + + # 토큰 수 초과 시 새로운 청크 생성 + if test_tokens > self.max_tokens and len(merged_texts) > 0: + b_new_chunk = True + # 현재 섹션헤더 레벨이 더 높으면 새로운 청크 생성 + elif 0 <= section_level < merged_level: + b_new_chunk = True + # ---------------------------------- + + # 새로운 청크 생성 + if b_new_chunk: + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + # 새로운 병합 시작 + merged_texts = [text] + merged_items = items + merged_header_infos = header_infos + merged_header_short_infos = header_short_infos + else: + # 현재 섹션 병합 + merged_texts.append(text) + merged_items.extend(items) + merged_header_infos.extend(header_infos) + merged_header_short_infos.extend(header_short_infos) + + # 마지막 병합된 items 처리 + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + return result_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서를 청킹하여 반환 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 토큰 제한에 맞게 분할된 청크들 + """ + doc_chunks = list(self.preprocess(dl_doc=dl_doc, **kwargs)) + + if not doc_chunks: + return iter([]) + + doc_chunk = doc_chunks[0] # preprocess는 하나의 청크만 반환 + + final_chunks = self._split_document_by_tokens(doc_chunk, dl_doc) + + return iter(final_chunks) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = "allow" + + text: str = None + n_char: int = None + n_word: int = None + n_line: int = None + e_page: int = None + i_page: int = None + i_chunk_on_page: int = None + n_chunk_of_page: int = None + i_chunk_on_doc: int = None + n_chunk_of_doc: int = None + n_page: int = None + reg_date: str = None + chunk_bboxes: str = None + media_files: str = None + title: str = None + created_date: int = None + appendix: str = None ## !! appendix feature (2025-09-30, geonhee kim) !! + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + self.title: Optional[str] = None + self.created_date: Optional[int] = None + self.appendix: Optional[str] = None # !! appendix feature (2025-09-30, geonhee kim) !! 
+ + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + "l": bbox.l / size.width, + "t": bbox.t / size.height, + "r": bbox.r / size.width, + "b": bbox.b / size.height, + "coord_origin": bbox.coord_origin.value, + } + chunk_bboxes.append({"page": page_no, "bbox": bbox_data, "type": type_, "ref": label}) + self.e_page = max([bbox["page"] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + print(item) + name = path.rsplit("/", 1)[-1] + temp_list.append({"name": name, "type": "image", "ref": item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + title=self.title, + created_date=self.created_date, + appendix=self.appendix or "", # !! appendix feature (2025-09-30, geonhee kim) !! 
+ ) + + +class DocumentProcessor: + + def __init__(self): + """ + initialize Document Converter + """ + self.ocr_endpoint = "http://192.168.73.172:48080/ocr" + ocr_options = PaddleOcrOptions( + force_full_page_ocr=False, lang=["korean"], ocr_endpoint=self.ocr_endpoint, text_score=0.3 + ) + + self.page_chunk_counts = defaultdict(int) + device = AcceleratorDevice.AUTO + num_threads = 8 + accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) + # PDF 파이프라인 옵션 설정 + self.pipe_line_options = PdfPipelineOptions() + self.pipe_line_options.generate_page_images = True + self.pipe_line_options.generate_picture_images = True + self.pipe_line_options.do_ocr = False + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.ocr_options.lang = ["ko", 'en'] + # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model" + # self.pipe_line_options.ocr_options.force_full_page_ocr = True + # ocr_options = TesseractOcrOptions() + # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert'] + # ocr_options.path = './.tesseract/tessdata' + # self.pipe_line_options.ocr_options = ocr_options + # self.pipe_line_options.artifacts_path = Path("/models/") + # self.pipe_line_options.do_table_structure = True + self.pipe_line_options.do_table_structure = False + self.pipe_line_options.images_scale = 2 + self.pipe_line_options.table_structure_options.do_cell_matching = False + # self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE + # self.pipe_line_options.accelerator_options = accelerator_options + + # Simple 파이프라인 옵션을 인스턴스 변수로 저장 + self.simple_pipeline_options = PipelineOptions() + self.simple_pipeline_options.save_images = False + + # ocr 파이프라인 옵션 + self.ocr_pipe_line_options = PdfPipelineOptions() + self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True) + self.ocr_pipe_line_options.do_ocr = False + self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True) + self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = False + + # 기본 컨버터들 생성 + self._create_converters() + + # enrichment 옵션 설정 + self.enrichment_options = DataEnrichmentOptions( + do_toc_enrichment=False, + # toc_doc_type="law", + # extract_metadata=False, + # toc_api_provider="custom", + # # Mistral-Small-3.1-24B-Instruct-2503, 운영망 + # toc_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/502/v1/chat/completions", + # metadata_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/502/v1/chat/completions", + # toc_api_key="022653a3743849e299f19f19d323490b", + # metadata_api_key="022653a3743849e299f19f19d323490b", + # # Mistral-Small-3.1-24B-Instruct-2503, 한국은행 클러스터 + # # toc_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions", + # # metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions", + # # toc_api_key="9e32423947fd4a5da07a28962fe88487", + # # metadata_api_key="9e32423947fd4a5da07a28962fe88487", + # toc_model="model", + # metadata_model="model", + # toc_temperature=0.0, + # toc_top_p=0.00001, + # toc_seed=33, + # toc_max_tokens=10000, + # toc_system_prompt=toc_system_prompt, + # toc_user_prompt=toc_user_prompt, + ) + + def _create_converters(self): + """컨버터들을 생성하는 헬퍼 메서드""" + self.converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.pipe_line_options, backend=PyPdfiumDocumentBackend + ), + } + ) + self.second_converter = DocumentConverter( + format_options={ 
+ InputFormat.PDF: PdfFormatOption( + pipeline_options=self.pipe_line_options, backend=PyPdfiumDocumentBackend + ), + }, + ) + self.ocr_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, backend=DoclingParseV4DocumentBackend + ), + } + ) + self.ocr_second_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=self.ocr_pipe_line_options, backend=PyPdfiumDocumentBackend + ), + }, + ) + + def load_documents_with_docling(self, file_path: str, **kwargs: dict) -> DoclingDocument: + # kwargs에서 save_images 값을 가져와서 옵션 업데이트 + save_images = kwargs.get("save_images", True) + include_wmf = kwargs.get("include_wmf", False) + + # save_images 옵션이 현재 설정과 다르면 컨버터 재생성 + if ( + self.simple_pipeline_options.save_images != save_images + or getattr(self.simple_pipeline_options, "include_wmf", False) != include_wmf + ): + self.simple_pipeline_options.save_images = save_images + self.simple_pipeline_options.include_wmf = include_wmf + self._create_converters() + + try: + conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True) + except Exception as e: + conv_result: ConversionResult = self.second_converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument: + # kwargs에서 save_images 값을 가져와서 옵션 업데이트 + save_images = kwargs.get("save_images", True) + include_wmf = kwargs.get("include_wmf", False) + + # save_images 옵션이 현재 설정과 다르면 컨버터 재생성 + if ( + self.simple_pipeline_options.save_images != save_images + or getattr(self.simple_pipeline_options, "include_wmf", False) != include_wmf + ): + self.simple_pipeline_options.save_images = save_images + self.simple_pipeline_options.include_wmf = include_wmf + self._create_converters() + + try: + conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True) + except Exception as e: + conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True) + return conv_result.document + + def load_documents(self, file_path: str, **kwargs) -> DoclingDocument: + return self.load_documents_with_docling(file_path, **kwargs) + + def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]: + chunker: GenosBucketChunker = GenosBucketChunker(max_tokens=0, merge_peers=True) + + chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs)) + for chunk in chunks: + self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + return chunks + + def safe_join(self, iterable): + if not isinstance(iterable, (list, tuple, set)): + return "" + return "".join(map(str, iterable)) + "\n" + + def parse_created_date(self, date_text: str) -> Optional[int]: + """ + 작성일 텍스트를 파싱하여 YYYYMMDD 형식의 정수로 변환 + + Args: + date_text: 작성일 텍스트 (YYYY-MM 또는 YYYY-MM-DD 형식) + + Returns: + YYYYMMDD 형식의 정수, 파싱 실패시 None + """ + if not date_text or not isinstance(date_text, str) or date_text == "None": + return 0 + + # 공백 제거 및 정리 + date_text = date_text.strip() + + # YYYY-MM-DD 형식 매칭 + match_full = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", date_text) + if match_full: + year, month, day = match_full.groups() + try: + # 유효한 날짜인지 검증 + datetime(int(year), int(month), int(day)) + return int(f"{year}{month.zfill(2)}{day.zfill(2)}") + except ValueError: + pass + + # YYYY-MM 형식 매칭 (일자는 01로 설정) + match_month = re.match(r"^(\d{4})-(\d{1,2})$", date_text) + if 
match_month: + year, month = match_month.groups() + try: + # 유효한 월인지 검증 + datetime(int(year), int(month), 1) + return int(f"{year}{month.zfill(2)}01") + except ValueError: + pass + + # YYYY 형식 매칭 (월일은 0101로 설정) + match_year = re.match(r"^(\d{4})$", date_text) + if match_year: + year = match_year.group(1) + try: + datetime(int(year), 1, 1) + return int(f"{year}0101") + except ValueError: + pass + + return 0 + + def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocument: + return document + + # 새로운 enriched result 받기 + document = enrich_document(document, self.enrichment_options, **kwargs) + return document + + async def compose_vectors( + self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, **kwargs: dict + ) -> list[dict]: + title = "" + created_date = 0 + try: + if ( + document.key_value_items + and len(document.key_value_items) > 0 + and hasattr(document.key_value_items[0], "graph") + and hasattr(document.key_value_items[0].graph, "cells") + and len(document.key_value_items[0].graph.cells) > 1 + ): + # 작성일 추출 (cells[1]) + date_text = document.key_value_items[0].graph.cells[1].text + created_date = self.parse_created_date(date_text) + except (AttributeError, IndexError) as e: + pass + + for item, _ in document.iterate_items(): + if hasattr(item, "label"): + if item.label == DocItemLabel.TITLE: + title = item.text.strip() if item.text else "" + break + + # kwargs에서 부록 정보 추출 !! appendix feature (2025-09-30, geonhee kim) !! + appendix_info = kwargs.get("appendix", "") + appendix_list = [] + if isinstance(appendix_info, str): + appendix_list = ( + [item.strip() for item in json.loads(appendix_info) if item.strip()] if appendix_info else [] + ) + elif isinstance(appendix_info, list): + appendix_list = appendix_info + else: + appendix_list = [] + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec="seconds") + "Z", + created_date=created_date, + title=title, + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + # header 앞에 헤더 마커 추가 (HEADER: ) + headers_text = "HEADER: " + ", ".join(chunk.meta.headings) + "\n" if chunk.meta.headings else "" + content = headers_text + chunk.text + + # appendix 추출 !! appendix feature (2025-09-30, geonhee kim) !! + matched_appendices = self.check_appendix_keywords(content, appendix_list) + # print(appendix_list, matched_appendices) + chunk_global_metadata = global_metadata.copy() + chunk_global_metadata["appendix"] = matched_appendices # Only matched ones + ### + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = ( + GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**chunk_global_metadata) #!! appendix feature (2025-09-30, geonhee kim) !! 
+                .set_chunk_bboxes(chunk.meta.doc_items, document)
+                .set_media_files(chunk.meta.doc_items)
+            ).build()
+            vectors.append(vector)
+
+            chunk_index_on_page += 1
+            if upload_files:
+                file_list = self.get_media_files(chunk.meta.doc_items)
+                upload_tasks.append(asyncio.create_task(upload_files(file_list, request=request)))
+
+        if upload_tasks:
+            await asyncio.gather(*upload_tasks)
+
+        return vectors
+
+    def get_media_files(self, doc_items: list):
+        temp_list = []
+        for item in doc_items:
+            if isinstance(item, PictureItem):
+                if item.image is None:
+                    continue
+                path = str(item.image.uri)
+                name = path.rsplit("/", 1)[-1]
+                temp_list.append({"path": path, "name": name})
+        return temp_list
+
+    def check_glyph_text(self, text: str, threshold: int = 1) -> bool:
+        """텍스트에 GLYPH 항목이 있는지 확인하는 메서드"""
+        if not text:
+            return False
+
+        # GLYPH 항목이 있는지 정규식으로 확인
+        matches = re.findall(r"GLYPH\w*", text)
+        if len(matches) >= threshold:
+            # print(f"Text has glyphs. len(matches): {len(matches)}. ")
+            return True
+
+        return False
+
+    def check_glyphs(self, document: DoclingDocument) -> bool:
+        """문서에 글리프가 있는지 확인하는 메서드"""
+        for item, level in document.iterate_items():
+            if isinstance(item, TextItem) and hasattr(item, "prov") and item.prov:
+                page_no = item.prov[0].page_no
+                # page_texts += item.text
+
+                # GLYPH 항목이 있는지 확인. 정규식사용
+                matches = re.findall(r"GLYPH\w*", item.text)
+                if len(matches) > 10:
+                    # print(f"Document has glyphs on page {page_no}. len(matches): {len(matches)}. ")
+                    return True
+
+        return False
+
+    def check_appendix_keywords(
+        self, content: str, appendix_list: list
+    ) -> str:  # !! appendix feature (2025-09-30, geonhee kim) !!
+        if not content or not appendix_list:
+            return ""
+
+        matched_appendices = []
+
+        # 1. Find appendix patterns in content first
+        found_patterns = []
+
+        # Complex patterns: 별지/별표/장부 + numbers (with hyphens, Roman numerals)
+        # Updated regex to capture full patterns like "별지 제 Ⅰ -1 호 서식" by matching until closing delimiters
+        content = re.sub(r"\s+", "", content)
+        complex_patterns = re.findall(r"(별지|별표|장부)(?:제)?([^<>()\[\]]+?)(?=(?:호|서식)|[<>\)\]]|$)", content)
+        for pattern_type, number in complex_patterns:
+            found_patterns.extend(
+                [
+                    f"{pattern_type} {number}",
+                    f"{pattern_type} 제{number}호",
+                    f"{pattern_type}{number}",
+                    f"{pattern_type}제{number}호",
+                ]
+            )
+
+        # Standalone patterns: (별표), (별지), (장부)
+        standalone_patterns = re.findall(r"[\(\[]+(별지|별표|장부)[\)\]]+", content)
+        for pattern_type in set(standalone_patterns):
+            found_patterns.append(pattern_type)
+
+        # 2. Check if found patterns match any appendix in the list
+        for appendix in appendix_list:
+            if not appendix or not isinstance(appendix, str):
+                continue
+
+            appendix_clean = appendix.replace(".pdf", "").lower().strip()
+
+            # If any found pattern exists in appendix filename, it's a match
+            for pattern in found_patterns:
+                if pattern.lower().strip() in appendix_clean:
+                    matched_appendices.append(appendix)
+                    break  # Prevent duplicates
+
+        return ", ".join(matched_appendices) if matched_appendices else ""
+
+    def ocr_all_table_cells(self, document: DoclingDocument, pdf_path) -> DoclingDocument:
+        """
+        글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR을 수행합니다.
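+        Each affected cell is cropped from the page, rendered to PNG, sent to the OCR endpoint, and its text replaced with the recognized result.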
+ Args: + document: DoclingDocument 객체 + pdf_path: PDF 파일 경로 + Returns: + OCR이 완료된 문서의 DoclingDocument 객체 + """ + import fitz + import base64 + import requests + + def post_ocr_bytes(img_bytes: bytes, timeout=60) -> dict: + HEADERS = {"Accept": "application/json", "Content-Type": "application/json"} + payload = {"file": base64.b64encode(img_bytes).decode("ascii"), "fileType": 1, "visualize": False} + r = requests.post(self.ocr_endpoint, json=payload, headers=HEADERS, timeout=timeout) + if not r.ok: + # 진단에 도움되도록 본문 일부 출력 + raise RuntimeError(f"OCR HTTP {r.status_code}: {r.text[:500]}") + return r.json() + + def extract_ocr_fields(resp: dict): + """ + resp: 위와 같은 OCR 응답 JSON(dict) + return: (rec_texts, rec_scores, rec_boxes) — 모두 list + """ + if resp is None: + return [], [], [] + + # 최상위 상태 체크 + if resp.get("errorCode") not in (0, None): + return [], [], [] + + ocr_results = resp.get("result", {}).get("ocrResults", []) + if not ocr_results: + return [], [], [] + + pruned = ocr_results[0].get("prunedResult", {}) + if not pruned: + return [], [], [] + + rec_texts = pruned.get("rec_texts", []) # list[str] + rec_scores = pruned.get("rec_scores", []) # list[float] + rec_boxes = pruned.get("rec_boxes", []) # list[[x1,y1,x2,y2]] + + # 길이 불일치 방어: 최소 길이에 맞춰 자르기 + n = min(len(rec_texts), len(rec_scores), len(rec_boxes)) + return rec_texts[:n], rec_scores[:n], rec_boxes[:n] + + try: + doc = fitz.open(pdf_path) + + for table_idx, table_item in enumerate(document.tables): + if not table_item.data or not table_item.data.table_cells: + continue + + b_ocr = False + for cell_idx, cell in enumerate(table_item.data.table_cells): + if self.check_glyph_text(cell.text, threshold=1): + b_ocr = True + break + + if b_ocr is False: + # 글리프 깨진 텍스트가 없는 경우, OCR을 수행하지 않음 + continue + + for cell_idx, cell in enumerate(table_item.data.table_cells): + + # Provenance 정보에서 위치 정보 추출 + if not table_item.prov: + continue + + page_no = table_item.prov[0].page_no - 1 + bbox = cell.bbox + + page = doc.load_page(page_no) + + # 셀의 바운딩 박스를 사용하여 이미지에서 해당 영역을 잘라냄 + cell_bbox = fitz.Rect(bbox.l, min(bbox.t, bbox.b), bbox.r, max(bbox.t, bbox.b)) + + # bbox 높이 계산 (PDF 좌표계 단위) + bbox_height = cell_bbox.height + + # 목표 픽셀 높이 + target_height = 20 + + # zoom factor 계산 + # (너무 작은 bbox일 경우 0으로 나누는 걸 방지) + zoom_factor = target_height / bbox_height if bbox_height > 0 else 1.0 + zoom_factor = min(zoom_factor, 4.0) # 최대 확대 비율 제한 + zoom_factor = max(zoom_factor, 1) # 최소 확대 비율 제한 + + # 페이지를 이미지로 렌더링 + mat = fitz.Matrix(zoom_factor, zoom_factor) + pix = page.get_pixmap(matrix=mat, clip=cell_bbox) + img_data = pix.tobytes("png") + + result = post_ocr_bytes(img_data, timeout=60) + rec_texts, rec_scores, rec_boxes = extract_ocr_fields(result) + + cell.text = "" + for t in rec_texts: + if len(cell.text) > 0: + cell.text += " " + cell.text += t if t else "" + except Exception as e: + print(f"OCR processing failed: {e}") + pass + + return document + + def setup_logging(self, level_num: int): + """ + 5"DEBUG", 4"INFO", 3"WARNING", 2"ERROR", 1"CRITICAL", 0"NOLOG" 중 하나를 받아서 로깅 레벨을 설정하는 메서드 + """ + + def get_level_name(level_num: int) -> str: + level_map = {5: "DEBUG", 4: "INFO", 3: "WARNING", 2: "ERROR", 1: "CRITICAL", 0: "NOLOG"} + return level_map.get(level_num, "INFO") + + level_name = get_level_name(level_num) + print(f"Setting log level to: {level_name}") + + if level_name == "NOLOG" or not hasattr(logging, level_name): + logging.disable(logging.CRITICAL) # 모든 로그 비활성화 + return + + level = getattr(logging, level_name.upper()) + + # root logger 설정 (핸들러는 
main에서만 설정) + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + handlers=[logging.StreamHandler()], # 콘솔 출력 + ) + + # root logger level 적용 + logging.getLogger().setLevel(level) + + async def __call__(self, request: Request, file_path: str, **kwargs: dict): + self.setup_logging(kwargs.get("log_level", 4)) + + _log.info(f"file_path: {file_path}") + _log.info(f"kwargs: {kwargs}") + + document: DoclingDocument = self.load_documents(file_path, **kwargs) + + # @@@@ 성민: 이게....... 여기 있는게 아니라 로드 중간에 있어야 할거같은데. + if not check_document(document, self.enrichment_options) or self.check_glyphs(document): + # OCR이 필요하다고 판단되면 OCR 수행 + document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs) + + # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지) + document: DoclingDocument = self.ocr_all_table_cells(document, file_path) + + output_path, output_file = os.path.split(file_path) + filename, _ = os.path.splitext(output_file) + artifacts_dir = Path(f"{output_path}/{filename}") + if artifacts_dir.is_absolute(): + reference_path = None + else: + reference_path = artifacts_dir.parent + + document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path) + + document = self.enrichment(document, **kwargs) + + has_text_items = False + for item, _ in document.iterate_items(): + if ( + isinstance(item, (TextItem, ListItem, CodeItem, SectionHeaderItem)) and item.text and item.text.strip() + ) or (isinstance(item, TableItem) and item.data and len(item.data.table_cells) == 0): + has_text_items = True + break + + if has_text_items: + # Extract Chunk from DoclingDocument + chunks: List[DocChunk] = self.split_documents(document, **kwargs) + else: + # text가 있는 item이 없을 때 document에 임의의 text item 추가 + # 첫 번째 페이지의 기본 정보 사용 (1-based indexing) + page_no = 1 + + # ProvenanceItem 생성 + prov = ProvenanceItem(page_no=page_no, bbox=BoundingBox(l=0, t=0, r=1, b=1), charspan=(0, 1)) # 최소 bbox + + # document에 temp text item 추가 + document.add_text(label=DocItemLabel.TEXT, text=".", prov=prov) + + # split_documents 호출 + chunks: List[DocChunk] = self.split_documents(document, **kwargs) + # await assert_cancelled(request) + + vectors = [] + if len(chunks) >= 1: + vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs) + else: + raise GenosServiceException(1, f"chunk length is 0") + + """ + # 미디어 파일 업로드 방법 + media_files = [ + { 'path': '/tmp/graph.jpg', 'name': 'graph.jpg', 'type': 'image' }, + { 'path': '/result/1/graph.jpg', 'name': '1/graph.jpg', 'type': 'image' }, + ] + + # 업로드 요청 시에는 path, name 필요 + file_list = [{k: v for k, v in file.items() if k != 'type'} for file in media_files] + await upload_files(file_list, request=request) + + # 메타에 저장시에는 name, type 필요 + meta = [{k: v for k, v in file.items() if k != 'path'} for file in media_files] + vectors[0].media_files = meta + """ + + return vectors + + +class GenosServiceException(Exception): + # GenOS 와의 의존성 부분 제거를 위해 추가 + def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None: + self.code = 1 + self.error_code = error_code + self.error_msg = error_msg or "GenOS Service Exception" + self.msg_params = msg_params or {} + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})" + + +# GenOS 와의 의존성 제거를 위해 추가 +async def assert_cancelled(request: Request): + if await request.is_disconnected(): + 
raise GenosServiceException(1, f"Cancelled") + + +# ----------------------------------------------------------------- +# enrichment 프롬프트 +# ----------------------------------------------------------------- + +toc_system_prompt = """You are an expert at generating table of contents (목차) from Korean documents. You specialize in regulatory documents, terms of service, contracts, and mixed-format documents that combine formal regulatory structures with general section headers. +""".strip() +toc_user_prompt = """ +Here is the Korean document you need to analyze: + + +{{raw_text}} + + +Your task is to extract and organize all structural elements from this document into a hierarchical table of contents. Korean documents often have mixed structures where some sections follow formal regulatory patterns (제x장/절/관/조) while others use general section numbering and headers. + +## Analysis Process + +Before generating the final table of contents, work through the document systematically in `` tags. It's OK for this section to be quite long. Follow these steps: + +1. **Document Title Extraction**: Quote the main document title exactly as it appears at the beginning of the document. + +2. **Structural Marker Identification**: Scan through the document and quote all the key structural markers you find, such as: + - Formal regulatory patterns: 제x장, 제x절, 제x관, 제x조 + - General section patterns: numbered headers (1., 2., etc.), lettered headers (가., 나., etc.) + - Special sections: 부칙, 별지, 별표, etc. + +3. **Systematic Section Extraction**: Work through the document from beginning to end, extracting each structural element in order: + - For each main section, quote the exact title as it appears + - For each subsection, quote the exact title and note which main section it belongs under + - For each article/item, quote the exact title and note its parent section + - Include any appendices, attachments, and addenda + +4. **Hierarchy Building**: For each extracted element, explicitly note: + - What level it should be at (main section, subsection, sub-subsection, etc.) + - What its parent section is (if any) + - What numbering it should receive in the final TOC (1., 1.1., 1.1.1., etc.) + +5. **Structure Verification**: Review your extracted elements to ensure: + - All structural elements are captured in document order + - The hierarchy makes logical sense + - No elements are duplicated or missed + +## Output Requirements + +After your analysis, generate the table of contents with this exact format: + +``` + +TITLE: +1. +1.1. +1.1.1. +1.2. +2. +2.1. +3. + +``` + +## Formatting Guidelines + +- Start with `TITLE:` followed by the document title +- Use hierarchical decimal numbering (1, 1.1, 1.1.1, etc.) 
+- Follow each number with a space and the original title exactly as it appears +- Maintain the document's logical hierarchy +- Include appendices, attachments, and addenda as separate top-level items +- Extract titles exactly as they appear - do not include explanatory content +- Handle both formal regulatory structures and general section headers +- Wrap the entire table of contents in `` tags +""".strip() diff --git a/genon/preprocessor/module/test.py b/genon/preprocessor/module/test.py new file mode 100644 index 0000000000..b8b1310f51 --- /dev/null +++ b/genon/preprocessor/module/test.py @@ -0,0 +1,59 @@ +import os +from fastapi import Request +import logging +import asyncio +import json +import time + +import sys + +sys.path.insert(0, "../../../") # 현재 doc_parser의 docling 폴더 참조 + +# 테스트할 전처리기 임포트 +# from attachment_processor import DocumentProcessor # 첨부용 +# from convert_processor import DocumentProcessor # 변환형 +# from intelligent_processor import DocumentProcessor # 지능형 +from test_processor import DocumentProcessor + +# 파일 경로 +file_path = "../sample_files/pdf_sample.pdf" +# file_path = "/home/gamy0315/doc_parser/삼성전자_재무제표.pdf" +# file_path = "/home/gamy0315/doc_parser/genon/preprocessor/sample_files/docx_sample.docx" + +# 파일 존재 여부 확인 +if not os.path.exists(file_path): + print(f"Sample file not found: {file_path}") + print("Please add a file to the sample_files folder.") + exit(1) + +# DocumentProcessor 인스턴스 생성 +doc_processor = DocumentProcessor() + +# FastAPI 요청 예제 +mock_request = Request(scope={"type": "http"}) + + +# 비동기 메서드 실행 +async def process_document(): + # print(file_path) + kwargs = {} + kwargs["org_filename"] = os.path.basename(file_path) + vectors = await doc_processor(mock_request, file_path, **kwargs) + return vectors + + +begin = time.time() +# 메인 루프 실행 +result = asyncio.run(process_document()) + +result_list_as_dict = [item.model_dump() for item in result] + +# 최종적으로 이 리스트를 JSON으로 저장 +with open("result.json", "w", encoding="utf-8") as f: + json.dump(result_list_as_dict, f, ensure_ascii=False, indent=4) + +end = time.time() +print(f"Processing time: {end - begin:.2f} seconds") + + +breakpoint() diff --git a/genon/preprocessor/module/test_processor.py b/genon/preprocessor/module/test_processor.py new file mode 100644 index 0000000000..5b3637166a --- /dev/null +++ b/genon/preprocessor/module/test_processor.py @@ -0,0 +1,55 @@ +# from typing import Any + +# from fastapi import Request +# from langchain_core.documents import Document + +from base_processor import BaseProcessor + + +ocr_config = { + "model": "~~~~~~~~~", # easy, paddle + "end_point": "~~~~", +} + +vlm_layout_config = { + "model": "~~~~~~~~", + "end_point": "~~~~~~~~~", +} + +vlm_toc_config = { + "model": "~~~~~~~~", + "end_point": "~~~~~~~~~", +} + + +pdf_config = { + "pipeline_options": "~~~~~", + "backend": "~~~~", +} + +toc_config = { + "pipeline_options": "~~~~~", + "backend": "~~~~", +} + + +config = { + # TODO: ["pdf", "hwp", "hwpx", "doc", "docx", "xlsx", "csv", "ppt", "pptx", "md", "json", "html"], + image + "format_options": { + "pdf": { + "pipeline_options": "pdf", # simple | pdf + "backend": "pypdf", # + "generate_picture_images": True, # pdf일때만 설정 가능한듯 + }, + "docx": { + "pipeline_options": "simple", + "backend": "msword", + }, + }, + "chunker": "simple", # TODO: static bucket, dynamic bucket +} + + +class DocumentProcessor(BaseProcessor): + def __init__(self): + super().__init__(config) diff --git a/genon/preprocessor/module/utils/chunkers.py b/genon/preprocessor/module/utils/chunkers.py new 
file mode 100644 index 0000000000..ccb124e40a --- /dev/null +++ b/genon/preprocessor/module/utils/chunkers.py @@ -0,0 +1,836 @@ +import math, bisect +from typing import Any, Iterable, Iterator, List, Optional, Union +from typing_extensions import Self +from pydantic import ConfigDict, model_validator + + +from docling_core.types import DoclingDocument +from docling_core.types.doc import DocItem, TextItem, SectionHeaderItem, CodeItem, TableItem, PictureItem +from docling_core.types.doc.document import LevelNumber, ContentLayer, ListItem +from docling_core.transforms.chunker import BaseChunker, BaseChunk, DocChunk, DocMeta +from docling_core.types.doc.labels import DocItemLabel + +import semchunk +from transformers import AutoTokenizer, PreTrainedTokenizerBase + + +class GenosBucketChunker(BaseChunker): + """토큰 제한을 고려하여 섹션별 청크를 분할하고 병합하는 청커 (v2)""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + tokenizer: Union[PreTrainedTokenizerBase, str] = "sentence-transformers/all-MiniLM-L6-v2" + max_tokens: int = 1024 + merge_peers: bool = True + + # _inner_chunker: BaseChunker = None + _tokenizer: PreTrainedTokenizerBase = None + merge_list_items: bool = True + + @model_validator(mode="after") + def _initialize_components(self) -> Self: + # 토크나이저 초기화 + self._tokenizer = ( + self.tokenizer + if isinstance(self.tokenizer, PreTrainedTokenizerBase) + else AutoTokenizer.from_pretrained(self.tokenizer) + ) + return self + + def preprocess(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서의 모든 아이템을 헤더 정보와 함께 청크로 생성 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 문서의 모든 아이템을 포함하는 하나의 청크 + """ + # 모든 아이템과 헤더 정보 수집 + all_items = [] + all_header_info = [] # 각 아이템의 헤더 정보 + current_heading_by_level: dict[LevelNumber, str] = {} + all_header_short_info = [] # 각 아이템의 짧은 헤더 정보 + current_heading_short_by_level: dict[LevelNumber, str] = {} + list_items: list[TextItem] = [] + + # iterate_items()로 수집된 아이템들의 self_ref 추적 + processed_refs = set() + + # 모든 아이템 순회 + for item, level in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}): + if hasattr(item, "self_ref"): + processed_refs.add(item.self_ref) + + if not isinstance(item, DocItem): + continue + + # 리스트 아이템 병합 처리 + if self.merge_list_items: + if isinstance(item, ListItem) or (isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM): + list_items.append(item) + continue + elif list_items: + # 누적된 리스트 아이템들을 추가 + for list_item in list_items: + all_items.append(list_item) + # 리스트 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + list_items = [] + + # 섹션 헤더 처리 + if isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ): + # 새로운 헤더 레벨 설정 + header_level = ( + item.level + if isinstance(item, SectionHeaderItem) + else (0 if item.label == DocItemLabel.TITLE else 1) + ) + current_heading_by_level[header_level] = item.text + current_heading_short_by_level[header_level] = item.orig # 첫 단어로 짧은 헤더 정보 설정 + + # 더 깊은 레벨의 헤더들 제거 + keys_to_del = [k for k in current_heading_by_level if k > header_level] + for k in keys_to_del: + current_heading_by_level.pop(k, None) + keys_to_del_short = [k for k in current_heading_short_by_level if k > header_level] + for k in keys_to_del_short: + current_heading_short_by_level.pop(k, None) + + # 헤더 아이템도 추가 (헤더 자체도 아이템임) + all_items.append(item) + 
all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + continue + + if ( + isinstance(item, TextItem) + or isinstance(item, ListItem) + or isinstance(item, CodeItem) + or isinstance(item, TableItem) + or isinstance(item, PictureItem) + ): + # if item.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]: + # item.text = "" + all_items.append(item) + # 현재 아이템의 헤더 정보 저장 + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # 마지막 리스트 아이템들 처리 + if list_items: + for list_item in list_items: + all_items.append(list_item) + all_header_info.append({k: v for k, v in current_heading_by_level.items()}) + all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()}) + + # iterate_items()에서 누락된 테이블들을 별도로 추가 + missing_tables = [] + for table in dl_doc.tables: + table_ref = getattr(table, "self_ref", None) + if table_ref not in processed_refs: + missing_tables.append(table) + + # 누락된 테이블들을 문서 앞부분에 추가 (페이지 1의 테이블들일 가능성이 높음) + if missing_tables: + for missing_table in missing_tables: + # 첫 번째 위치에 삽입 (헤더 테이블일 가능성이 높음) + all_items.insert(0, missing_table) + all_header_info.insert(0, {}) # 빈 헤더 정보 + all_header_short_info.insert(0, {}) # 빈 짧은 헤더 정보 + + # 아이템이 없으면 빈 문서 + if not all_items: + return + + # 모든 아이템을 하나의 청크로 반환 (HybridChunker에서 분할) + # headings는 None으로 설정하고, 헤더 정보는 별도로 관리 + chunk = DocChunk( + text="", # 텍스트는 HybridChunker에서 생성 + meta=DocMeta( + doc_items=all_items, + headings=None, # DocMeta의 원래 형식 유지 + captions=None, + origin=dl_doc.origin, + ), + ) + # 헤더 정보를 별도 속성으로 저장 + chunk._header_info_list = all_header_info + chunk._header_short_info_list = all_header_short_info # 짧은 헤더 정보도 저장 + yield chunk + + def _count_tokens(self, text: str) -> int: + """텍스트의 토큰 수 계산 (안전한 분할 처리)""" + if not text: + return 0 + + # 텍스트를 더 작은 단위로 분할하여 계산 + max_chunk_length = 300 # 더 안전한 길이로 설정 + total_tokens = 0 + + # 텍스트를 줄 단위로 먼저 분할 + lines = text.split("\n") + current_chunk = "" + + for line in lines: + # 현재 청크에 줄을 추가했을 때 길이 확인 + temp_chunk = current_chunk + "\n" + line if current_chunk else line + + if len(temp_chunk) <= max_chunk_length: + current_chunk = temp_chunk + else: + # 현재 청크가 있으면 토큰 계산 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + # 새로운 청크 시작 + current_chunk = line + + # 마지막 청크 처리 + if current_chunk: + try: + total_tokens += len(self._tokenizer.tokenize(current_chunk)) + except Exception: + total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산 + + return total_tokens + + def _generate_text_from_items_with_headers( + self, items: list[DocItem], header_info_list: list[dict], dl_doc: DoclingDocument + ) -> str: + """DocItem 리스트로부터 헤더 정보를 포함한 텍스트 생성""" + text_parts = [] + current_section_headers = {} # 현재 섹션의 헤더 정보 + + for i, item in enumerate(items): + item_headers = header_info_list[i] if i < len(header_info_list) else {} + + # 헤더 정보가 변경된 경우 (새로운 섹션 시작) + if item_headers != current_section_headers: + # 변경된 헤더 레벨들만 추가 + headers_to_add = [] + for level in sorted(item_headers.keys()): + # 이전 섹션과 다른 헤더만 추가 + if level not in current_section_headers or current_section_headers[level] != item_headers[level]: + # 해당 레벨까지의 모든 상위 헤더 포함 + for l in sorted(item_headers.keys()): + if l < level: + 
headers_to_add.append(item_headers[l]) + elif l == level: + headers_to_add.append("") + + break + + # 헤더가 있으면 추가 + if headers_to_add: + header_text = ", ".join(headers_to_add) + if header_text not in text_parts: + text_parts.append(header_text) + + current_section_headers = item_headers.copy() + + # 아이템 텍스트 추가 + if isinstance(item, TableItem): + table_text = self._extract_table_text(item, dl_doc) + if table_text: + text_parts.append(table_text) + elif hasattr(item, "text") and item.text: + # 타이틀과 섹션 헤더 처리 개선 + # is_section_header = ( + # isinstance(item, SectionHeaderItem) or + # (isinstance(item, TextItem) and + # item.label in [DocItemLabel.SECTION_HEADER]) # TITLE은 제외 + # ) + + # 타이틀은 항상 포함, 섹션 헤더는 중복 방지를 위해 스킵 + # if not is_section_header: + # 20250909, shkim, text_parts에 없는 경우만 추가. 섹션헤더가 반복해서 추가되는 것 방지 + if item.text not in text_parts: + text_parts.append(item.text) + elif isinstance(item, PictureItem): + text_parts.append("") # 이미지는 빈 텍스트 + + result_text = self.delim.join(text_parts) + return result_text + + def _extract_table_text(self, table_item: TableItem, dl_doc: DoclingDocument) -> str: + """테이블에서 텍스트를 추출하는 일반화된 메서드""" + try: + # 먼저 export_to_markdown 시도 + table_text = table_item.export_to_markdown(dl_doc) + if table_text and table_text.strip(): + return table_text + except Exception: + pass + + # export_to_markdown 실패 시 테이블 셀 데이터에서 직접 텍스트 추출 + try: + if hasattr(table_item, "data") and table_item.data: + cell_texts = [] + + # table_cells에서 텍스트 추출 + if hasattr(table_item.data, "table_cells"): + for cell in table_item.data.table_cells: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # grid에서 텍스트 추출 (table_cells가 없는 경우) + elif hasattr(table_item.data, "grid") and table_item.data.grid: + for row in table_item.data.grid: + if isinstance(row, list): + for cell in row: + if hasattr(cell, "text") and cell.text and cell.text.strip(): + cell_texts.append(cell.text.strip()) + + # 추출된 셀 텍스트들을 결합 + if cell_texts: + return " ".join(cell_texts) + except Exception: + pass + + # 모든 방법 실패 시 item.text 사용 (있는 경우) + if hasattr(table_item, "text") and table_item.text: + return table_item.text + + return "" + + def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[str]]: + """헤더 정보 리스트에서 실제 사용되는 모든 헤더들을 level 순서대로 추출하고 ', '로 연결""" + if not header_info_list: + return None + + all_headers = [] # header 순서대로 추가 + seen_headers = set() # 중복 방지용 + + for header_info in header_info_list: + if header_info: + for level in sorted(header_info.keys()): + header_text = header_info[level] + if header_text and header_text not in seen_headers: + all_headers.append(header_text) + seen_headers.add(header_text) + + return all_headers if all_headers else None + + def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]: + """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)""" + if not table_text: + return [table_text] + + # 전체 테이블이 토큰 제한 내인지 확인 + if self._count_tokens(table_text) <= max_tokens: + return [table_text] + + # 단순히 토큰 수 기준으로 텍스트 분할 + # semchunk 사용하여 토큰 제한에 맞게 분할 + chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens) + chunks = chunker(table_text) + return chunks if chunks else [table_text] + + def _is_section_header(self, item: DocItem) -> bool: + """아이템이 section header인지 확인""" + return isinstance(item, SectionHeaderItem) or ( + isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE] + ) + + def _get_section_header_level(self, item: DocItem) -> Optional[int]: + 
"""Section header의 level을 반환""" + if isinstance(item, SectionHeaderItem): + return item.level + elif isinstance(item, TextItem): + if item.label == DocItemLabel.TITLE: + return 0 + elif item.label == DocItemLabel.SECTION_HEADER: + return 1 + return None + + def _generate_section_text_with_heading( + self, section_items: list[DocItem], section_header_infos: list[dict], dl_doc: DoclingDocument + ) -> str: + """섹션의 텍스트를 생성하되, 앞에 heading을 붙임""" + # 첫 번째 item의 header_info에서 heading 추출 + if section_header_infos and section_header_infos[0]: + merged_headers = {} + for level, header_text in section_header_infos[0].items(): + if header_text: + merged_headers[level] = header_text + + # level 순서대로 정렬해서 ', '로 연결 + if merged_headers: + sorted_levels = sorted(merged_headers.keys()) + headers = [merged_headers[level] for level in sorted_levels] + heading_text = ", ".join(headers) + else: + heading_text = "" + else: + heading_text = "" + + # 섹션의 일반 텍스트 생성 + section_text = self._generate_text_from_items_with_headers(section_items, section_header_infos, dl_doc) + + # heading이 있으면 앞에 붙이기 + if heading_text: + return heading_text + ", " + section_text + else: + return section_text + + def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument) -> list[DocChunk]: + """문서를 토큰 제한에 맞게 분할 (v2: 섹션 헤더 기준으로 분할 후 max_tokens로 병합)""" + items = doc_chunk.meta.doc_items + header_info_list = getattr(doc_chunk, "_header_info_list", []) + header_short_info_list = getattr(doc_chunk, "_header_short_info_list", []) + + if not items: + return [] + + # ================================================================ + # 헬퍼 함수들 + # ================================================================ + + def get_header_level(header_infos, *, first=False, default=-1): + """header_infos에서 최종 레벨 계산""" + if not header_infos: + return default + info = header_infos[0] if first else header_infos[-1] + return max(info.keys(), default=default) + + def get_current_chunk( + doc_chunk: DocChunk, + merged_texts: list[str], + merged_header_short_infos: list[dict], + merged_items: list[DocItem], + ): + """현재까지 병합된 내용으로 DocChunk 생성""" + if not merged_texts: + return None + chunk_text = "\n".join(merged_texts) + used_headers = self._extract_used_headers(merged_header_short_infos) + + return DocChunk( + text=chunk_text, + meta=DocMeta( + doc_items=merged_items, + headings=used_headers, + captions=None, + origin=doc_chunk.meta.origin, + ), + ) + + def get_text_from_item(item: DocItem) -> str: + """DocItem에서 텍스트 추출""" + if isinstance(item, TableItem): + return self._extract_table_text(item, dl_doc) + elif hasattr(item, "text") and item.text: + return item.text + elif isinstance(item, PictureItem): + text = "" + for annotation in item.annotations: + if hasattr(annotation, "text"): + text += annotation.text + return text + return "" + + def split_items_evenly_by_tokens(item_token_counts, max_tokens): + n = len(item_token_counts) + total = sum(item_token_counts) + if n == 0: + return [] + if total <= max_tokens: + return [(0, n)] # ✅ 항상 (a,b) + + k = math.ceil(total / max_tokens) + target = total / k + + P = [0] + for c in item_token_counts: + P.append(P[-1] + c) + + cuts = [0] + used = {0} + for t in range(1, k): + goal = t * target + j = bisect.bisect_left(P, goal) + + cand = [] + if 0 < j < len(P): + cand.append(j) + if 0 <= j - 1 < len(P): + cand.append(j - 1) + + best = None + best_dist = float("inf") + for x in cand: + if x in used: + continue + if x <= cuts[-1]: + continue + if x >= len(P) - 1: # n + continue + dist = abs(P[x] - 
goal) + if dist < best_dist: + best_dist = dist + best = x + + if best is None: + best = min(max(cuts[-1] + 1, 1), len(P) - 2) + + cuts.append(best) + used.add(best) + + cuts.append(n) + + return [(a, b) for a, b in zip(cuts[:-1], cuts[1:])] + + def adjust_captions(items_group): + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + ref_idx_list = [] + if hasattr(item, "captions") and item.captions: + for cap in item.captions: + cap_ref = cap.cref + cap_idx = -1 + for j, it in enumerate(items_group): + if it is None: + continue + if getattr(it[0][0], "self_ref", None) == cap_ref: + cap_idx = j + break + if cap_idx != -1: + ref_idx_list.append(cap_idx) + if ref_idx_list: + ref_idx_list = sorted(ref_idx_list) + + if not ref_idx_list: + continue + + # caption 아이템들을 부모 아이템 바로 뒤로 이동 + for cap_idx in ref_idx_list: + for g in items_group[cap_idx]: + items_group[idx].append(g) + items_group[cap_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + def adjust_pictures_in_tables(items_group): + # picture in table 처리 + + b_modified = False + for idx, group in enumerate(items_group): + if group is None: + continue + item = group[0][0] + pic_idx_list = [] + if isinstance(item, TableItem): + table_bbox = item.prov[0].bbox + table_page_no = item.prov[0].page_no + + for j in range(len(items_group)): + if items_group[j] is None: + continue + pic_item = items_group[j][0][0] + if isinstance(pic_item, PictureItem): + # table 안의 picture인지 확인. iou 사용 + pic_bbox = pic_item.prov[0].bbox + pic_page_no = pic_item.prov[0].page_no + if pic_page_no != table_page_no: + continue + ios = pic_bbox.intersection_over_self(table_bbox) + if ios > 0.5: # picture가 50% 이상 table 안에 포함되면 table 안의 picture로 간주 + pic_idx_list.append(j) + if pic_idx_list: + pic_idx_list = sorted(pic_idx_list) + + if not pic_idx_list: + continue + + for pic_idx in pic_idx_list: + for g in items_group[pic_idx]: + items_group[idx].append(g) + items_group[pic_idx] = None # 나중에 None 제거 + b_modified = True + + if b_modified: + items_group = [it for it in items_group if it is not None] + + return items_group + + # ================================================================ + # 1단계: 섹션 헤더 기준으로 분할 + # ================================================================ + + sections = [] # [(items, header_infos, header_short_infos), ...] 
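+        # Walk the items in order: every section header closes the current section and starts a new one.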
+ cur_items, cur_h_infos, cur_h_short = [], [], [] + + for i, item in enumerate(items): + h_info = header_info_list[i] if i < len(header_info_list) else {} + h_short = header_short_info_list[i] if i < len(header_short_info_list) else {} + + # 섹션 헤더를 만나면 + if self._is_section_header(item): + # 이전 섹션이 있으면 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # 새로운 섹션 시작 + cur_items = [item] + cur_h_infos = [h_info] + cur_h_short = [h_short] + else: + # 섹션 헤더가 아니면 현재 섹션에 추가 + cur_items.append(item) + cur_h_infos.append(h_info) + cur_h_short.append(h_short) + + # 마지막 섹션 저장 + if cur_items: + sections.append((cur_items, cur_h_infos, cur_h_short)) + + # ================================================================ + # 2단계: 각 섹션의 텍스트에 heading 붙이기 + # ================================================================ + + sections_with_text = [] + for items, header_infos, header_short_infos in sections: + text = self._generate_section_text_with_heading(items, header_short_infos, dl_doc) + sections_with_text.append((text, items, header_infos, header_short_infos)) + + # ================================================================ + # 2.5단계: 너무 긴 청크는 분할 + # ================================================================ + if self.max_tokens > 0: + for i in range(len(sections_with_text)): + text, items, h_infos, h_short = sections_with_text[i] + token_count = self._count_tokens(text) + if token_count < self.max_tokens: + continue + + # caption 및 table 내 그림은 같은 섹션에 있도록 조정 + items_group = [[(item, info, short)] for item, info, short in zip(items, h_infos, h_short)] + items_group = adjust_captions(items_group) + items_group = adjust_pictures_in_tables(items_group) + + # 너무 긴 섹션은 분할 + # 각 아이템 별 token 수 계산 + item_token_counts = [] + for group in items_group: + cur_count = 0 + for g in group: + cur_count += self._count_tokens(get_text_from_item(g[0])) + item_token_counts.append(cur_count) + + # 아이템 그룹들을 토큰 기준으로 균등 분할 + split_info = split_items_evenly_by_tokens(item_token_counts, self.max_tokens) + + # item_groups를 섹션으로 다시 구성 + new_sections = [] + for a, b in split_info: + + # 각 그룹에서 items, h_infos, h_short로 분리 + group_items = [] + group_h_infos = [] + group_h_short = [] + for idx in range(a, b): + for g in items_group[idx]: + group_items.append(g[0]) + group_h_infos.append(g[1]) + group_h_short.append(g[2]) + + new_text = self._generate_section_text_with_heading(group_items, group_h_short, dl_doc) + new_sections.append((new_text, group_items, group_h_infos, group_h_short)) + + # 원래 섹션을 새로 분할된 섹션들로 교체 + sections_with_text.pop(i) + for new_section in reversed(new_sections): + sections_with_text.insert(i, new_section) + + # ================================================================ + # 3단계: 단독 타이틀(1줄만) → 다음 섹션으로 병합 + # ================================================================ + + for i in range(len(sections_with_text) - 2, -1, -1): + text, items, h_infos, h_short = sections_with_text[i] + + # 아이템이 하나인 섹션 헤더만 검사 + if len(items) != 1 or not self._is_section_header(items[0]): + continue + + # 문단이 이미 구성된 것은 제외 (문자 수가 30자 이상이면 문단을 구성했다고 간주) + item_text = "".join(getattr(it, "text", "") for it in items) + if len(item_text) > 30: + continue + + # 현재 섹션헤더 레벨이 다음 섹션헤더 레벨보다 더 높은 경우에만 병합 (높은 레벨이 더 작은 숫자) + n_text, n_items, n_h_infos, n_h_short = sections_with_text[i + 1] + current_level = get_header_level(h_infos, first=False) + next_level = get_header_level(n_h_infos, first=True) + if 0 <= next_level < current_level: + continue + + # 다음 섹션과 병합 + sections_with_text[i] = (text + 
"\n" + n_text, items + n_items, h_infos + n_h_infos, h_short + n_h_short) + sections_with_text.pop(i + 1) + + # ================================================================ + # 4단계: 토큰 기준 병합 + # ================================================================ + + result_chunks = [] + merged_texts, merged_items = [], [] + merged_header_infos, merged_header_short_infos = [], [] + + for text, items, header_infos, header_short_infos in sections_with_text: + + b_new_chunk = False + + # ---------------------------------- + # 병합 가능 여부 판단 + + # 병합 가능 토큰 수 계산 + test_tokens = self._count_tokens("\n".join(merged_texts + [text])) + + # 현재 섹션헤더 레벨과 병합된 섹션헤더 레벨 + section_level = get_header_level(header_infos, first=True) + merged_level = get_header_level(merged_header_infos, first=False) + + # 토큰 수 초과 시 새로운 청크 생성 + if test_tokens > self.max_tokens and len(merged_texts) > 0: + b_new_chunk = True + # 현재 섹션헤더 레벨이 더 높으면 새로운 청크 생성 + elif 0 <= section_level < merged_level: + b_new_chunk = True + # ---------------------------------- + + # 새로운 청크 생성 + if b_new_chunk: + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + # 새로운 병합 시작 + merged_texts = [text] + merged_items = items + merged_header_infos = header_infos + merged_header_short_infos = header_short_infos + else: + # 현재 섹션 병합 + merged_texts.append(text) + merged_items.extend(items) + merged_header_infos.extend(header_infos) + merged_header_short_infos.extend(header_short_infos) + + # 마지막 병합된 items 처리 + cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items) + if cur_chunk: + result_chunks.append(cur_chunk) + + return result_chunks + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """문서를 청킹하여 반환 + + Args: + dl_doc: 청킹할 문서 + + Yields: + 토큰 제한에 맞게 분할된 청크들 + """ + doc_chunks = list(self.preprocess(dl_doc=dl_doc, **kwargs)) + + if not doc_chunks: + return iter([]) + + doc_chunk = doc_chunks[0] # preprocess는 하나의 청크만 반환 + + final_chunks = self._split_document_by_tokens(doc_chunk, dl_doc) + + return iter(final_chunks) + + +class SimpleChunker(BaseChunker): + chunk_size: int = 1000 + + def chunk(self, dl_doc: DoclingDocument, **kwargs: dict): + if "chunk_size" in kwargs: + print(f"@@@@ 기본 chunk_size를 사용합니다: {self.chunk_size}") + + chunk_size = kwargs.get("chunk_size", self.chunk_size) + + # 모든 아이템 수집 + all_items: list[DocItem] = [] + for item, _ in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}): + if isinstance(item, (TextItem, ListItem, CodeItem, SectionHeaderItem, TableItem, PictureItem)): + all_items.append(item) + + if not all_items: + return iter([]) + + def get_text(item: DocItem) -> str: + if isinstance(item, TableItem): + try: + return item.export_to_markdown(dl_doc) or "" + except Exception: + return getattr(item, "text", "") or "" + return getattr(item, "text", "") or "" + + chunks: list[DocChunk] = [] + current_items: list[DocItem] = [] + current_len = 0 + + for item in all_items: + item_text = get_text(item) + item_len = len(item_text) + + # 현재 청크가 비어있지 않고 추가하면 chunk_size 초과 시 저장 + if current_items and current_len + item_len + 1 > chunk_size: + chunks.append( + DocChunk( + text="\n".join(get_text(it) for it in current_items), + meta=DocMeta( + doc_items=current_items, + headings=None, + captions=None, + origin=dl_doc.origin, + ), + ) + ) + current_items = [] + current_len = 0 + + current_items.append(item) + current_len += item_len + 
(1 if current_len else 0)
+
+        # 마지막 청크 처리
+        if current_items:
+            chunks.append(
+                DocChunk(
+                    text="\n".join(get_text(it) for it in current_items),
+                    meta=DocMeta(
+                        doc_items=current_items,
+                        headings=None,
+                        captions=None,
+                        origin=dl_doc.origin,
+                    ),
+                )
+            )
+
+        return iter(chunks)
+
+
+CHUNKERS = {
+    "bucket": GenosBucketChunker,
+    "simple": SimpleChunker,
+}
diff --git a/genon/preprocessor/module/utils/genos_util.py b/genon/preprocessor/module/utils/genos_util.py
new file mode 100644
index 0000000000..23d808cfe9
--- /dev/null
+++ b/genon/preprocessor/module/utils/genos_util.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+
+class GenosServiceException(Exception):
+    """GenOS 와의 의존성 부분 제거를 위해 추가"""
+
+    def __init__(
+        self,
+        error_code: str,
+        error_msg: Optional[str] = None,
+        msg_params: Optional[dict] = None,
+    ) -> None:
+        self.code = 1
+        self.error_code = error_code
+        self.error_msg = error_msg or "GenOS Service Exception"
+        self.msg_params = msg_params or {}
+
+    def __repr__(self) -> str:
+        class_name = self.__class__.__name__
+        return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})"
diff --git a/genon/preprocessor/module/utils/metadata.py b/genon/preprocessor/module/utils/metadata.py
new file mode 100644
index 0000000000..6c0f96ab49
--- /dev/null
+++ b/genon/preprocessor/module/utils/metadata.py
@@ -0,0 +1,352 @@
+import re
+from collections import defaultdict
+from pydantic import BaseModel
+from typing import Optional, List
+from datetime import datetime
+import json
+
+import asyncio
+from fastapi import Request
+from langchain_core.documents import Document
+from docling_core.types import DoclingDocument
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.transforms.chunker import DocChunk
+from docling_core.types.doc import PictureItem
+
+try:
+    from genos_utils import upload_files  # TODO 이거 확인하기
+except ImportError:
+    upload_files = None
+
+
+class GenOSVectorMeta(BaseModel):
+    class Config:
+        extra = "allow"
+
+    text: str = None
+    n_char: int = None
+    n_word: int = None
+    n_line: int = None
+    e_page: int = None
+    i_page: int = None
+    i_chunk_on_page: int = None
+    n_chunk_of_page: int = None
+    i_chunk_on_doc: int = None
+    n_chunk_of_doc: int = None
+    n_page: int = None
+    reg_date: str = None
+    chunk_bboxes: str = None
+    media_files: str = None
+    title: str = None
+    created_date: int = None
+    appendix: str = None  ## !! appendix feature (2025-09-30, geonhee kim) !!
+
+
+class GenOSVectorMetaBuilder:
+    def __init__(self):
+        """빌더 초기화"""
+        self.text: Optional[str] = None
+        self.n_char: Optional[int] = None
+        self.n_word: Optional[int] = None
+        self.n_line: Optional[int] = None
+        self.i_page: Optional[int] = None
+        self.e_page: Optional[int] = None
+        self.i_chunk_on_page: Optional[int] = None
+        self.n_chunk_of_page: Optional[int] = None
+        self.i_chunk_on_doc: Optional[int] = None
+        self.n_chunk_of_doc: Optional[int] = None
+        self.n_page: Optional[int] = None
+        self.reg_date: Optional[str] = None
+        self.chunk_bboxes: Optional[str] = None
+        self.media_files: Optional[str] = None
+        self.title: Optional[str] = None
+        self.created_date: Optional[int] = None
+        self.appendix: Optional[str] = None  # !! appendix feature (2025-09-30, geonhee kim) !!
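+
+    # parse_created_date normalizes the authoring date to an int: "2024-03" -> 20240301, "2024" -> 20240101, unparseable -> 0.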
+ + def parse_created_date(self, date_text: str) -> Optional[int]: + """ + 작성일 텍스트를 파싱하여 YYYYMMDD 형식의 정수로 변환 + + Args: + date_text: 작성일 텍스트 (YYYY-MM 또는 YYYY-MM-DD 형식) + + Returns: + YYYYMMDD 형식의 정수, 파싱 실패시 None + """ + if not date_text or not isinstance(date_text, str) or date_text == "None": + return 0 + + # 공백 제거 및 정리 + date_text = date_text.strip() + + # YYYY-MM-DD 형식 매칭 + match_full = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", date_text) + if match_full: + year, month, day = match_full.groups() + try: + # 유효한 날짜인지 검증 + datetime(int(year), int(month), int(day)) + return int(f"{year}{month.zfill(2)}{day.zfill(2)}") + except ValueError: + pass + + # YYYY-MM 형식 매칭 (일자는 01로 설정) + match_month = re.match(r"^(\d{4})-(\d{1,2})$", date_text) + if match_month: + year, month = match_month.groups() + try: + # 유효한 월인지 검증 + datetime(int(year), int(month), 1) + return int(f"{year}{month.zfill(2)}01") + except ValueError: + pass + + # YYYY 형식 매칭 (월일은 0101로 설정) + match_year = re.match(r"^(\d{4})$", date_text) + if match_year: + year = match_year.group(1) + try: + datetime(int(year), 1, 1) + return int(f"{year}0101") + except ValueError: + pass + + return 0 + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + "l": bbox.l / size.width, + "t": bbox.t / size.height, + "r": bbox.r / size.width, + "b": bbox.b / size.height, + "coord_origin": bbox.coord_origin.value, + } + chunk_bboxes.append({"page": page_no, "bbox": bbox_data, "type": type_, "ref": label}) + self.e_page = max([bbox["page"] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + for item in doc_items: + if isinstance(item, PictureItem): + if item.image is None: + print("@@@@ item.image is None: pipeline_options - generate_picture_images False!!") + continue + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({"name": name, "type": "image", "ref": item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def get_title(self, document): + title = "" + for item, _ in document.iterate_items(): + if hasattr(item, "label"): + if item.label == DocItemLabel.TITLE: + title = item.text.strip() if item.text else "" + break + + return title + + def get_created_data(self, document: DoclingDocument): + created_date = 0 + try: + if ( + document.key_value_items + 
and len(document.key_value_items) > 0 + and hasattr(document.key_value_items[0], "graph") + and hasattr(document.key_value_items[0].graph, "cells") + and len(document.key_value_items[0].graph.cells) > 1 + ): + # 작성일 추출 (cells[1]) + date_text = document.key_value_items[0].graph.cells[1].text + created_date = self.parse_created_date(date_text) + except (AttributeError, IndexError) as e: + pass + + return created_date + + def get_appendix_keywords( + self, content: str, appendix_list: list + ) -> str: # !! appendix feature (2025-09-30, geonhee kim) !! + if not content or not appendix_list: + return "" + + matched_appendices = [] + + # 1. Find appendix patterns in content first + found_patterns = [] + + # Complex patterns: 별지/별표/장부 + numbers (with hyphens, Roman numerals) + # Updated regex to capture full patterns like "별지 제 Ⅰ -1 호 서식" by matching until closing delimiters + content = re.sub(r"\s+", "", content) + complex_patterns = re.findall(r"(별지|별표|장부)(?:제)?([^<>()\[\]]+?)(?=(?:호|서식)|[<>\)\]]|$)", content) + for pattern_type, number in complex_patterns: + found_patterns.extend( + [ + f"{pattern_type} {number}", + f"{pattern_type} 제{number}호", + f"{pattern_type}{number}", + f"{pattern_type}제{number}호", + ] + ) + + # Standalone patterns: (별표), (별지), (장부) + standalone_patterns = re.findall(r"[\(\[]+(별지|별표|장부)[\)\]]+", content) + for pattern_type in set(standalone_patterns): + found_patterns.extend( + [ + pattern_type, + f"{pattern_type}", + ] + ) + + # 2. Check if found patterns match any appendix in the list + for appendix in appendix_list: + if not appendix or not isinstance(appendix, str): + continue + + appendix_clean = appendix.replace(".pdf", "").lower().strip() + + # If any found pattern exists in appendix filename, it's a match + for pattern in found_patterns: + if pattern.lower().strip() in appendix_clean: + matched_appendices.append(appendix) + break # Prevent duplicates + + return ", ".join(matched_appendices) if matched_appendices else "" + + def get_chunk_count(self, chunks: List[DocChunk]): + page_chunk_counts = defaultdict(int) + + for chunk in chunks: + page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1 + + return page_chunk_counts + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + title=self.title, + created_date=self.created_date, + appendix=self.appendix or "", # !! appendix feature (2025-09-30, geonhee kim) !! + ) + + async def __call__( + self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, **kwargs: dict + ): + title = self.get_title(document) + created_date = self.get_created_data(document) + page_chunk_counts = self.get_chunk_count(chunks) + + # kwargs에서 부록 정보 추출 !! appendix feature (2025-09-30, geonhee kim) !! 
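+        # "appendix" may arrive as a JSON-encoded string or as an already-parsed list; both are normalized below.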
+ appendix_info = kwargs.get("appendix", "") + appendix_list = [] + if isinstance(appendix_info, str): + appendix_list = ( + [item.strip() for item in json.loads(appendix_info) if item.strip()] if appendix_info else [] + ) + elif isinstance(appendix_info, list): + appendix_list = appendix_info + else: + appendix_list = [] + + global_metadata = dict( + n_chunk_of_doc=len(chunks), + n_page=document.num_pages(), + reg_date=datetime.now().isoformat(timespec="seconds") + "Z", + created_date=created_date, + title=title, + ) + + current_page = None + chunk_index_on_page = 0 + vectors = [] + upload_tasks = [] + for chunk_idx, chunk in enumerate(chunks): + chunk_page = chunk.meta.doc_items[0].prov[0].page_no + # header 앞에 헤더 마커 추가 (HEADER: ) + headers_text = "HEADER: " + ", ".join(chunk.meta.headings) + "\n" if chunk.meta.headings else "" + content = headers_text + chunk.text + + # appendix 추출 !! appendix feature (2025-09-30, geonhee kim) !! + matched_appendices = self.get_appendix_keywords(content, appendix_list) + # print(appendix_list, matched_appendices) + chunk_global_metadata = global_metadata.copy() + chunk_global_metadata["appendix"] = matched_appendices # Only matched ones + ### + + if chunk_page != current_page: + current_page = chunk_page + chunk_index_on_page = 0 + + vector = ( + GenOSVectorMetaBuilder() + .set_text(content) + .set_page_info(chunk_page, chunk_index_on_page, page_chunk_counts[chunk_page]) + .set_chunk_index(chunk_idx) + .set_global_metadata(**chunk_global_metadata) #!! appendix feature (2025-09-30, geonhee kim) !! + .set_chunk_bboxes(chunk.meta.doc_items, document) + .set_media_files(chunk.meta.doc_items) + ).build() + vectors.append(vector) + + chunk_index_on_page += 1 + if upload_files: + file_list = self.get_media_files(chunk.meta.doc_items) + upload_tasks.append(asyncio.create_task(upload_files(file_list, request=request))) + + if upload_tasks: + await asyncio.gather(*upload_tasks) + + return vectors diff --git a/genon/preprocessor/module/utils/util.py b/genon/preprocessor/module/utils/util.py new file mode 100644 index 0000000000..eca19f8b7c --- /dev/null +++ b/genon/preprocessor/module/utils/util.py @@ -0,0 +1,146 @@ +import os +from pathlib import Path + +import shutil +import unicodedata +import tempfile +import subprocess + +from markdown2 import markdown + +try: + from weasyprint import HTML +except ImportError: + print("Warning: WeasyPrint could not be imported. PDF conversion features will be disabled.") + HTML = None + + +def convert_to_pdf(file_path: str) -> str | None: + """ + LibreOffice로 PDF 변환을 시도한다. + 실패해도 예외를 던지지 않고 None을 반환한다. 
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix(".pdf") + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in (".ppt", ".pptx"): + convert_arg = "pdf:impress_pdf_Export" + elif ext in (".doc", ".docx"): + convert_arg = "pdf:writer_pdf_Export" + elif ext in (".xls", ".xlsx", ".csv"): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode("ascii") + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize("NFKD", in_path.stem).encode("ascii", "ignore").decode("ascii") or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", + "--headless", + "--convert-to", + convert_arg, + "--outdir", + str(out_dir), + str(cand), + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str, CONVERTIBLE_EXTENSIONS: list) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, ".pdf") + return pdf_path + + +def get_real_file_type(file_path: str) -> str: + """파일 확장자가 아닌 실제 내용으로 파일 타입 판단""" + with open(file_path, "rb") as f: + header = f.read(8) + if header.startswith(b"%PDF-"): + return "pdf" + elif header.startswith(b"\x89PNG"): + return "png" + elif header.startswith(b"\xff\xd8\xff"): + return "jpg" + + # 매직 헤더로 판단할 수 없으면 확장자 사용 + return os.path.splitext(file_path)[-1].lower() + + +def convert_md_to_pdf(md_path): + """Markdown 파일을 PDF로 변환""" + install_packages(["chardet"]) + import chardet + + pdf_path = md_path.replace(".md", ".pdf") + with open(md_path, "rb") as f: + raw_file = f.read() + candidates = ["utf-8", "utf-8-sig"] + try: + det = (chardet.detect(raw_file) or {}).get("encoding") or "" + # chardet가 ascii/unknown이면 무시. 그 외면 후보에 추가 + if det and det.lower() not in ("ascii", "unknown"): + if det.lower() not in [c.lower() for c in candidates]: + candidates.append(det) + except Exception: + pass + candidates += ["cp949", "euc-kr", "iso-8859-1", "latin-1"] + md_content = None + for enc in candidates: + try: + md_content = raw_file.decode(enc) + break + except UnicodeDecodeError: + continue + if md_content is None: + md_content = raw_file.decode("utf-8", errors="replace") + + html_content = markdown(md_content) + if HTML: + HTML(string=html_content).write_pdf(pdf_path) + return pdf_path