diff --git a/build-script/doc-parser-build.config b/build-script/doc-parser-build.config
index f54b799ee3..c909a43497 100644
--- a/build-script/doc-parser-build.config
+++ b/build-script/doc-parser-build.config
@@ -6,19 +6,10 @@
 DOCKER_REGISTRY=mncregistry:30500
 IMAGE_NAME=doc-parser-preprocessor

 # 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능)
-IMAGE_VERSION=1.3.6.2
+IMAGE_VERSION=1.3.7-komipo

 # 실제 Dockerfile 위치 (루트 기준)
 DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile

 # 빌드 후 push 할지 여부
-PUSH_IMAGE=true
-
-# USER, GROUP
-APP_UID=3000
-APP_GID=3000
-APP_UNAME=genos
-APP_GNAME=genos
-
-# NLTK packages (comma-separated). Use "all" to download everything.
-APP_NLTK_PACKAGES=punkt,stopwords,averaged_perceptron_tagger,averaged_perceptron_tagger_eng,wordnet,omw-1.4
+PUSH_IMAGE=false
diff --git a/build-script/paddle-ocr-build.config b/build-script/paddle-ocr-build.config
index 8c9ced262e..ea2f74486d 100644
--- a/build-script/paddle-ocr-build.config
+++ b/build-script/paddle-ocr-build.config
@@ -6,7 +6,7 @@
 DOCKERFILE=genon/serving/paddle/docker/Dockerfile

 # 이미지 이름/태그
 IMAGE_NAME=doc-parser-ocr
-IMAGE_TAG=0.0.0
+IMAGE_TAG=1.3.7-komipo

 # 푸시할 레지스트리 (없으면 빈값)
 REGISTRY=mncregistry:30500
diff --git a/genon/README.md b/genon/README.md
index a04ebd0783..4e28533cac 100644
--- a/genon/README.md
+++ b/genon/README.md
@@ -60,7 +60,7 @@ 6. 사이트 배포 시
 ```shell
 1. 이미지 저장
-docker save mncregistry:30500/mnc/doc-parser-preprocessor:latest | gzip > doc-parser-preprocessor.tar.gz
+docker save mncregistry:30500/mnc/doc-parser-preprocessor:1.3.3-komipo | gzip > doc-parser-preprocessor.tar.gz
 2. 사이트에서 이미지 복원
 gunzip -c doc-parser-preprocessor.tar.gz | docker load
 3. register_image.sh 파일 실행
@@ -75,4 +75,10 @@ gunzip -c doc-parser-preprocessor.tar.gz | docker load
 ```shell
 kubectl apply -f doc-parser-ocr-deployment.yaml
 ```
-5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml)
\ No newline at end of file
+5. 노드 포트로 배포시는 [doc-parser-ocr-deployment-node-port.yaml](serving/paddle/k8s-manifest/doc-parser-ocr-deployment-node-port.yaml)
+
+사이트에서
+```
+docker save mncregistry:30500/doc-parser-ocr:1.3.3-komipo | gzip > doc-parser-ocr.tar.gz
+gunzip -c doc-parser-ocr.tar.gz | docker load
+```
diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py
index f5fea12eba..2adb82a385 100644
--- a/genon/preprocessor/facade/attachment_processor.py
+++ b/genon/preprocessor/facade/attachment_processor.py
@@ -99,6 +99,41 @@
 # pdf 변환 대상 확장자
 CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx']

+## 보안컨설팅 조치로 인한 가드레일 추가
+
+import requests
+import re
+import json
+
+GUARDRAIL_WORKFLOW_ID = 694
+GUARDRAIL_BEARER_TOKEN = ""
+GENOS_URL = ""
+
+from functools import wraps
+
+def guardrail(func):
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        result = await func(*args, **kwargs)
+
+        for r in result:
+            url = f"{GENOS_URL}/api/gateway/workflow/{GUARDRAIL_WORKFLOW_ID}"
+            headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}")
+
+            if hasattr(r, "text"):
+                body = {"question": r.text}
+
+                res = requests.post(f"{url}/run/v2", json=body, headers=headers)
+
+                answer = res.json()["data"]["text"]
+
+                if answer.startswith("[UNSAFE]"):
+                    r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다."
+
+        return result
+
+    return wrapper
+

 def convert_to_pdf(file_path: str) -> str | None:
     """
@@ -179,6 +214,7 @@ def _get_pdf_path(file_path: str) -> str:
     return pdf_path


+
 def install_packages(packages):
     for package in packages:
         try:
@@ -1432,6 +1468,7 @@ def get_level_name(level_num: int) -> str:
         # root logger level 적용
         logging.getLogger().setLevel(level)

+    @guardrail
     async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         self.setup_logging(kwargs.get('log_level', 4))
diff --git a/genon/preprocessor/facade/attachment_processor_guardrail.py b/genon/preprocessor/facade/attachment_processor_guardrail.py
new file mode 100644
index 0000000000..0064b76832
--- /dev/null
+++ b/genon/preprocessor/facade/attachment_processor_guardrail.py
@@ -0,0 +1,1626 @@
+from __future__ import annotations
+
+from collections import defaultdict
+
+import asyncio
+import fitz
+import json
+import math
+import os
+import pandas as pd
+import pydub
+import requests
+import shutil
+import subprocess
+import sys
+import threading
+import uuid
+import warnings
+from datetime import datetime
+from fastapi import Request
+from glob import glob
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import (
+    # TextLoader,  # TXT
+    PyMuPDFLoader,  # PDF
+    DataFrameLoader,  # DataFrame
+    UnstructuredWordDocumentLoader,  # DOC and DOCX
+    UnstructuredPowerPointLoader,  # PPT and PPTX
+    UnstructuredImageLoader,  # JPG, PNG
+    UnstructuredMarkdownLoader,  # Markdown
+    UnstructuredFileLoader,  # Generic fallback
+)
+from langchain_core.documents import Document
+from markdown2 import markdown
+from pandas import DataFrame
+from pathlib import Path
+from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
+from typing import Any, Iterable, Iterator, List, Optional, Union
+from typing_extensions import Self
+
+try:
+    import semchunk
+    from transformers import AutoTokenizer, PreTrainedTokenizerBase
+except ImportError:
+    raise RuntimeError(
+        "Module requires 'chunking' extra; to install, run: "
+        "`pip install 'docling-core[chunking]'`"
+    )
+try:
+    import chardet
+except ImportError:
+    raise RuntimeError("Module 'chardet' not imported. Run `pip install chardet`.")
+try:
+    from weasyprint import HTML
+except ImportError:
+    print("Warning: WeasyPrint could not be imported. PDF conversion features will be disabled.")
+    HTML = None
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.document_converter import DocumentConverter, HwpxFormatOption, WordFormatOption
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta
+from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc import (
+    DocItem, DocItemLabel, DoclingDocument,
+    PictureItem, SectionHeaderItem, TableItem, TextItem
+)
+from docling_core.types.doc.document import LevelNumber, ListItem, CodeItem
+from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend
+# from utils import assert_cancelled
+# from genos_utils import upload_files, merge_overlapping_bboxes
+
+# import platform
+from pathlib import Path
+import os
+import subprocess
+import tempfile
+import shutil
+import unicodedata
+
+import logging
+
+for n in ("fontTools", "fontTools.ttLib", "fontTools.ttLib.ttFont"):
+    lg = logging.getLogger(n)
+    lg.setLevel(logging.CRITICAL)
+    lg.propagate = False
+    logging.getLogger().setLevel(logging.WARNING)
+# pdf 변환 대상 확장자
+CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx']
+
+
+
+### 가드레일 용 ###
+import requests
+import re
+import json
+
+GUARDRAIL_WORKFLOW_ID = 694
+GUARDRAIL_BEARER_TOKEN = '23c3898fe3264fd597961af23a68fe7c'
+# GENOS_URL = 'https://ai.komipo.co.kr:30908/'
+# @@@@ 내부 호출로 변경
+GENOS_URL = 'http://llmops-gateway-api-service:8080'
+
+
+from functools import wraps
+
+def guardrail(func):
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        result = await func(*args, **kwargs)
+        for r in result:
+            url = f"{GENOS_URL}/workflow/{GUARDRAIL_WORKFLOW_ID}"
+            headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}")
+
+            if hasattr(r, "text"):
+                body = {'question': r.text}
+
+                res = requests.post(f'{url}/run/v2', json=body, headers=headers)
+
+                answer = res.json()['data']['text']
+
+                if answer.startswith("[UNSAFE]"):
+                    r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다."
+
+
+        return result
+    return wrapper
+
+
+def convert_to_pdf(file_path: str) -> str | None:
+    """
+    LibreOffice로 PDF 변환을 시도한다.
+    실패해도 예외를 던지지 않고 None을 반환한다.
+ """ + try: + in_path = Path(file_path).resolve() + out_dir = in_path.parent + pdf_path = in_path.with_suffix('.pdf') + + # headless에서 UTF-8 locale 보장 + env = os.environ.copy() + env.setdefault("LANG", "C.UTF-8") + env.setdefault("LC_ALL", "C.UTF-8") + + # 확장자에 따라 필터(특히 .ppt는 impress 필터) + ext = in_path.suffix.lower() + if ext in ('.ppt', '.pptx'): + convert_arg = "pdf:impress_pdf_Export" + elif ext in ('.doc', '.docx'): + convert_arg = "pdf:writer_pdf_Export" + elif ext in ('.xls', '.xlsx', '.csv'): + convert_arg = "pdf:calc_pdf_Export" + else: + convert_arg = "pdf" + + # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도 + try: + in_path.name.encode('ascii') + candidates = [in_path] + tmp_dir = None + except UnicodeEncodeError: + tmp_dir = Path(tempfile.mkdtemp()) + ascii_name = unicodedata.normalize('NFKD', in_path.stem).encode('ascii', 'ignore').decode('ascii') or "file" + ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}" + shutil.copy2(in_path, ascii_copy) + candidates = [ascii_copy, in_path] + + for cand in candidates: + cmd = [ + "soffice", "--headless", + "--convert-to", convert_arg, + "--outdir", str(out_dir), + str(cand) + ] + proc = subprocess.run(cmd, env=env, capture_output=True, text=True) + if proc.returncode == 0 and pdf_path.exists(): + # 성공 + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return str(pdf_path) + # 실패해도 계속 시도 (로그만 찍고 무시) + print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}") + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + return None + except Exception as e: + # 어떤 에러든 삼키고 None 반환 + print(f"[convert_to_pdf] error: {e}") + return None + + +def _get_pdf_path(file_path: str) -> str: + """ + 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수 + + Args: + file_path (str): 원본 파일 경로 + + Returns: + str: PDF 확장자로 변경된 파일 경로 + """ + pdf_path = file_path + for ext in CONVERTIBLE_EXTENSIONS: + pdf_path = pdf_path.replace(ext, '.pdf') + return pdf_path + + +def install_packages(packages): + for package in packages: + try: + __import__(package) + except ImportError: + print(f"[!] {package} 패키지가 없습니다. 
설치를 시도합니다.") + subprocess.run([sys.executable, "-m", "pip", "install", package], check=True) + + +class GenOSVectorMeta(BaseModel): + class Config: + extra = 'allow' + + text: str | None = None + n_char: int | None = None + n_word: int | None = None + n_line: int | None = None + i_page: int | None = None + e_page: int | None = None + i_chunk_on_page: int | None = None + n_chunk_of_page: int | None = None + i_chunk_on_doc: int | None = None + n_chunk_of_doc: int | None = None + n_page: int | None = None + reg_date: str | None = None + chunk_bboxes: str | None = None + media_files: str | None = None + + +class GenOSVectorMetaBuilder: + def __init__(self): + """빌더 초기화""" + self.text: Optional[str] = None + self.n_char: Optional[int] = None + self.n_word: Optional[int] = None + self.n_line: Optional[int] = None + self.i_page: Optional[int] = None + self.e_page: Optional[int] = None + self.i_chunk_on_page: Optional[int] = None + self.n_chunk_of_page: Optional[int] = None + self.i_chunk_on_doc: Optional[int] = None + self.n_chunk_of_doc: Optional[int] = None + self.n_page: Optional[int] = None + self.reg_date: Optional[str] = None + self.chunk_bboxes: Optional[str] = None + self.media_files: Optional[str] = None + # self.title: Optional[str] = None + # self.created_date: Optional[int] = None + + def set_text(self, text: str) -> "GenOSVectorMetaBuilder": + """텍스트와 관련된 데이터를 설정""" + self.text = text + self.n_char = len(text) + self.n_word = len(text.split()) + self.n_line = len(text.splitlines()) + return self + + def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder": + """페이지 정보 설정""" + self.i_page = i_page + self.i_chunk_on_page = i_chunk_on_page + self.n_chunk_of_page = n_chunk_of_page + return self + + def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder": + """문서 전체의 청크 인덱스 설정""" + self.i_chunk_on_doc = i_chunk_on_doc + return self + + def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder": + """글로벌 메타데이터 병합""" + for key, value in global_metadata.items(): + if hasattr(self, key): + setattr(self, key, value) + return self + + def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder": + chunk_bboxes = [] + for item in doc_items: + for prov in item.prov: + label = item.self_ref + type_ = item.label + size = document.pages.get(prov.page_no).size + page_no = prov.page_no + bbox = prov.bbox + bbox_data = { + 'l': bbox.l / size.width, + 't': bbox.t / size.height, + 'r': bbox.r / size.width, + 'b': bbox.b / size.height, + 'coord_origin': bbox.coord_origin.value + } + chunk_bboxes.append({ + 'page': page_no, + 'bbox': bbox_data, + 'type': type_, + 'ref': label + }) + self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None + self.chunk_bboxes = json.dumps(chunk_bboxes) + return self + + def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder": + temp_list = [] + if not doc_items: + self.media_files = "" + return self + for item in doc_items: + if isinstance(item, PictureItem): + path = str(item.image.uri) + name = path.rsplit("/", 1)[-1] + temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref}) + self.media_files = json.dumps(temp_list) + return self + + def build(self) -> GenOSVectorMeta: + """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성""" + return GenOSVectorMeta( + text=self.text, + n_char=self.n_char, + n_word=self.n_word, + n_line=self.n_line, + i_page=self.i_page, + e_page=self.e_page, + 
i_chunk_on_page=self.i_chunk_on_page, + n_chunk_of_page=self.n_chunk_of_page, + i_chunk_on_doc=self.i_chunk_on_doc, + n_chunk_of_doc=self.n_chunk_of_doc, + n_page=self.n_page, + reg_date=self.reg_date, + chunk_bboxes=self.chunk_bboxes, + media_files=self.media_files, + ) + + +class HwpLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + subprocess.run(['hwp5html', self.file_path, '--output', self.output_dir], check=True, timeout=600) + converted_file_path = os.path.join(self.output_dir, 'index.xhtml') + pdf_save_path = _get_pdf_path(self.file_path) + HTML(converted_file_path).write_pdf(pdf_save_path) + loader = PyMuPDFLoader(pdf_save_path) + return loader.load() + except Exception as e: + print(f"Failed to convert {self.file_path} to XHTML") + raise e + finally: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TextLoader: + def __init__(self, file_path: str): + self.file_path = file_path + self.output_dir = os.path.join('/tmp', str(uuid.uuid4())) + os.makedirs(self.output_dir, exist_ok=True) + + def load(self): + try: + with open(self.file_path, 'rb') as f: + raw = f.read() + enc = chardet.detect(raw).get('encoding') or '' + encodings = [enc] if enc and enc.lower() not in ('ascii', 'unknown') else [] + encodings += ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1'] + + content = None + for e in encodings: + try: + content = raw.decode(e) # 전체 파일로 디코딩 + break + except UnicodeDecodeError: + continue + if content is None: + content = raw.decode('utf-8', errors='replace') + + # 4) PDF 변환 유지 + html = f"
{content}"
+ html_path = os.path.join(self.output_dir, 'temp.html')
+ with open(html_path, 'w', encoding='utf-8') as f:
+ f.write(html)
+ # pdf_path = (self.file_path
+ # .replace('.txt', '.pdf')
+ # .replace('.json', '.pdf'))
+ pdf_path = _get_pdf_path(self.file_path)
+ if HTML:
+ HTML(html_path).write_pdf(pdf_path)
+ loader = PyMuPDFLoader(pdf_path)
+ return loader.load()
+ # PDF가 불가하면 Document 직접 반환 (원형 스키마 유지)
+ return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})]
+
+ except Exception:
+ # 실패 시에도 스키마는 그대로 유지해 반환
+ for e in ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1']:
+ try:
+ with open(self.file_path, 'r', encoding=e) as f:
+ content = f.read()
+ return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})]
+ except UnicodeDecodeError:
+ continue
+ with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f:
+ content = f.read()
+ return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})]
+ finally:
+ if os.path.exists(self.output_dir):
+ shutil.rmtree(self.output_dir)
+
+
+class TabularLoader:
+ def __init__(self, file_path: str, ext: str):
+
+ packages = ['openpyxl', 'chardet']
+
+ install_packages(packages)
+
+ self.file_path = file_path
+ if ext == ".csv":
+ # convert_to_pdf(file_path) csv는 Pdf 변환 안 함
+ self.data_dict = self.load_csv_documents(file_path)
+ elif ext == ".xlsx":
+ # convert_to_pdf(file_path) xlsx는 Pdf 변환 안 함
+ self.data_dict = self.load_xlsx_documents(file_path)
+ else:
+ print(f"[!] Inadequate extension for TabularLoader: {ext}")
+ return
+
+ def check_sql_dtypes(self, df):
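+        # Infer a SQL type label per column from the pandas dtype
+        # (e.g. int64 -> BIGINT, other ints -> INT, float -> FLOAT, bool -> BOOLEAN);
+        # date/datetime columns are additionally cast to str so they serialize cleanly.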
+ df = df.convert_dtypes()
+ res = []
+ for col in df.columns:
+ # col_name = col.strip().replace(' ', '_')
+ dtype = str(df.dtypes[col]).lower()
+
+ if 'int' in dtype:
+ if '64' in dtype:
+ sql_dtype = 'BIGINT'
+ else:
+ sql_dtype = 'INT'
+ elif 'float' in dtype:
+ sql_dtype = 'FLOAT'
+ elif 'bool' in dtype:
+ sql_dtype = 'BOOLEAN'
+ elif 'date' in dtype:
+ sql_dtype = 'DATE'
+ df[col] = df[col].astype(str)
+ elif 'datetime' in dtype:
+ sql_dtype = 'DATETIME'
+ df[col] = df[col].astype(str)
+ # else:
+ # max_len = df[col].str.len().max().item() + 10
+ # sql_dtype = f'VARCHAR({max_len})'
+ else:
+ lens = df[col].astype(str).str.len()
+ max_len_val = lens.max()
+ max_len = int(0 if pd.isna(max_len_val) else max_len_val) + 10
+ sql_dtype = f'VARCHAR({max_len})'
+
+ res.append([col, sql_dtype])
+
+ return df, res
+
+ def process_data_rows(self, data: dict):
+ """Arg: data (keys: 'sheet_name', 'page_column', 'page_column_type', 'documents')"""
+
+ rows = []
+ for doc in data["documents"]:
+ row = {}
+ if 'int' in data["page_column_type"]:
+ row[data["page_column"]] = int(doc.page_content)
+ elif 'float' in data["page_column_type"]:
+ row[data["page_column"]] = float(doc.page_content)
+ elif 'bool' in data["page_column_type"]:
+ if doc.page_content.lower() == 'true':
+ row[data["page_column"]] = True
+ elif doc.page_content.lower() == 'false':
+ row[data["page_column"]] = False
+ else:
+ raise ValueError(f"Invalid boolean string: {doc.page_content}")
+ else:
+ row[data["page_column"]] = doc.page_content
+
+ row.update(doc.metadata)
+ rows.append(row)
+
+ processed_data = {"sheet_name": data["sheet_name"], "data_rows": rows, "data_types": data["dtypes"]}
+ return processed_data
+
+ def load_csv_documents(self, file_path: str, **kwargs: dict):
+ import chardet
+
+ with open(file_path, "rb") as f:
+ raw_file = f.read(10000)
+ enc_type = chardet.detect(raw_file)['encoding']
+ df = pd.read_csv(file_path, encoding=enc_type, index_col=False)
+ df = df.fillna('null') # csv 파일에서도 xlsx 파일과 동일하게 null로 채움
+ df, dtypes_str = self.check_sql_dtypes(df)
+
+ for i in range(len(df.columns)):
+ try:
+ col = df.columns[0]
+ # col_type = str(type(col))
+ col_type = str(df[col].dtype)
+ df = df.astype({col: 'str'})
+ break
+            except Exception:
+                raise ValueError(
+                    f"No column could be converted to string type, so LangChain Documents cannot be loaded: {dtypes_str}")
+
+ loader = DataFrameLoader(df, page_content_column=col)
+ documents = loader.load()
+
+ data = {
+ "sheet_name": "table_1",
+ "page_column": col,
+ "page_column_type": col_type,
+ "documents": documents,
+ "dtypes": dtypes_str
+ }
+ data = self.process_data_rows(data) # including only one sheet as it's a csv file
+ data_dict = {"data": [data]}
+ return data_dict
+
+ def load_xlsx_documents(self, file_path: str, **kwargs: dict):
+ dfs = pd.read_excel(file_path, sheet_name=None)
+ sheets = []
+ for sheet_name, df in dfs.items():
+ df = df.fillna('null')
+ df, dtypes_str = self.check_sql_dtypes(df)
+
+ for i in range(len(df.columns)):
+ try:
+ col = df.columns[0]
+ col_type = str(type(col))
+ df = df.astype({col: 'str'})
+ break
+                except Exception:
+                    raise ValueError(
+                        f"No column could be converted to string type, so LangChain Documents cannot be loaded: {dtypes_str}")
+
+ loader = DataFrameLoader(df, page_content_column=col)
+ documents = loader.load()
+
+ sheet = {
+ "sheet_name": sheet_name,
+ "page_column": col,
+ "page_column_type": col_type,
+ "documents": documents,
+ "dtypes": dtypes_str
+ }
+ sheets.append(sheet)
+
+ data_dict = {"data": []}
+ for sheet in sheets:
+ data = self.process_data_rows(sheet)
+ data_dict["data"].append(data)
+
+ return data_dict
+
+ def return_vectormeta_format(self):
+ if not self.data_dict:
+ return None
+
+ text = "[DA] " + str(self.data_dict) # Add a token to indicate this string is for data analysis
+
+ # @@@@ 성민: 토큰 수 줄이기위한 후처리(임시조치)
+ text = text.replace("Unnamed: ", "")
+ text = text[:2000]
+
+
+ vectors = [GenOSVectorMeta.model_validate({
+ 'text': text,
+ 'n_char': 1,
+ 'n_word': 1,
+ 'n_line': 1,
+ 'i_page': 1,
+ 'e_page': 1,
+ 'n_page': 1,
+ 'i_chunk_on_page': 1,
+ 'n_chunk_of_page': 1,
+ 'i_chunk_on_doc': 1,
+ 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z',
+ 'chunk_bboxes': ".",
+ 'media_files': "."
+ })]
+
+
+ return vectors
+
+
+class AudioLoader:
+ def __init__(self,
+ file_path: str,
+ req_url: str,
+ req_data: dict,
+ chunk_sec: int = 29,
+ tmp_path: str = '.',
+ ):
+ self.file_path = file_path
+ self.tmp_path = tmp_path
+ self.chunk_sec = chunk_sec
+ self.req_url = req_url
+ self.req_data = req_data
+
+ def split_file_as_chunks(self) -> list:
+ audio = pydub.AudioSegment.from_file(self.file_path)
+ chunk_len = self.chunk_sec * 1000
+ n_chunks = math.ceil(len(audio) / chunk_len)
+
+ for i in range(n_chunks):
+ start_ms = i * chunk_len
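+            # start each chunk 300 ms early (except the first) so words at chunk boundaries are not clipped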
+ overlap_start_ms = start_ms - 300 if start_ms > 0 else start_ms
+ end_ms = start_ms + chunk_len
+ audio_chunk = audio[overlap_start_ms:end_ms]
+            # zero-pad the chunk index so that the later filename-based sort preserves chunk order
+            audio_chunk.export(os.path.join(self.tmp_path, "tmp_{:04d}.wav".format(i)), format="wav")
+ tmp_files = glob(os.path.join(self.tmp_path, "*.wav"))
+ return tmp_files
+
+ def transcribe_audio(self, file_path_lst: list):
+ transcribed_text_chunks = []
+
+ def _send_request(filepath: str):
+ """Send a request to 'whisper' model served"""
+ files = {
+ 'file': (filepath, open(filepath, 'rb'), 'audio/mp3'),
+ }
+
+ response = requests.post(self.req_url, data=self.req_data, files=files)
+ text = response.json().get('text', ', ')
+ transcribed_text_chunks.append({
+ 'file_name': os.path.basename(filepath),
+ 'text': text
+ })
+
+ # Send parallel requests
+ threads = [threading.Thread(target=_send_request, args=(f,)) for f in file_path_lst]
+ for t in threads: t.start()
+ for t in threads: t.join()
+
+ # Merge transcribed text snippets in order
+ transcribed_text_chunks.sort(key=lambda x: x['file_name'])
+ transcribed_text = "[AUDIO]" + ' '.join([t['text'] for t in transcribed_text_chunks])
+ return transcribed_text
+
+ def return_vectormeta_format(self):
+ audio_chunks = self.split_file_as_chunks()
+ transcribed_text = self.transcribe_audio(audio_chunks)
+ res = [GenOSVectorMeta.model_validate({
+ 'text': transcribed_text,
+ 'n_char': 1,
+ 'n_word': 1,
+ 'n_line': 1,
+ 'i_page': 1,
+ 'e_page': 1,
+ 'n_page': 1,
+ 'i_chunk_on_page': 1,
+ 'n_chunk_of_page': 1,
+ 'i_chunk_on_doc': 1,
+ 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z',
+ 'chunk_bboxes': ".",
+ 'media_files': "."
+ })]
+ return res
+
+
+### for HWPX from 지능형 전처리기 ###
+# * GenOSVectorMetaBuilder #
+# * HierarchicalChunker #
+# * HybridChunker #
+# * HwpxProcessor #
+# * GenosServiceException #
+
+class HierarchicalChunker(BaseChunker):
+ r""" Chunker implementation leveraging the document layout.
+ Args:
+ merge_list_items (bool): Whether to merge successive list items.
+ Defaults to True.
+ delim (str): Delimiter to use for merging text. Defaults to "\n".
+ """
+ merge_list_items: bool = True
+
+ @classmethod
+ def _triplet_serialize(cls, table_df: DataFrame) -> str:
+ # copy header as first row and shift all rows by one
+ table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
+ table_df.index = table_df.index + 1
+ table_df = table_df.sort_index()
+
+ rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
+ cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
+
+ nrows = table_df.shape[0]
+ ncols = table_df.shape[1]
+ texts = [
+ f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+ for i in range(1, nrows)
+ for j in range(1, ncols)
+ ]
+ output_text = ". ".join(texts)
+
+ return output_text
+
+ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ r"""Chunk the provided document.
+ Args:
+ dl_doc (DLDocument): document to chunk
+
+ Yields:
+ Iterator[Chunk]: iterator over extracted chunks
+ """
+ heading_by_level: dict[LevelNumber, str] = {}
+ list_items: list[TextItem] = []
+ for item, level in dl_doc.iterate_items():
+ captions = None
+ if isinstance(item, DocItem):
+ # first handle any merging needed
+ if self.merge_list_items:
+ if isinstance(
+ item, ListItem
+ ) or ( # TODO remove when all captured as ListItem:
+ isinstance(item, TextItem)
+ and item.label == DocItemLabel.LIST_ITEM
+ ):
+ list_items.append(item)
+ continue
+ elif list_items: # need to yield
+ yield DocChunk(
+ text=self.delim.join([i.text for i in list_items]),
+ meta=DocMeta(
+ doc_items=list_items,
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ origin=dl_doc.origin,
+ ),
+ )
+ list_items = [] # reset
+
+ if isinstance(item, SectionHeaderItem) or (
+ isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]):
+ level = (
+ item.level
+ if isinstance(item, SectionHeaderItem)
+ else (0 if item.label == DocItemLabel.TITLE else 1)
+ )
+ heading_by_level[level] = item.text
+ text = ''.join(str(value) for value in heading_by_level.values())
+
+ # remove headings of higher level as they just went out of scope
+ keys_to_del = [k for k in heading_by_level if k > level]
+ for k in keys_to_del:
+ heading_by_level.pop(k, None)
+ c = DocChunk(
+ text=text,
+ meta=DocMeta(
+ doc_items=[item],
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ captions=captions,
+ origin=dl_doc.origin
+ ),
+ )
+ yield c
+ continue
+
+ if isinstance(item, TextItem) or (
+ (not self.merge_list_items) and isinstance(item, ListItem)) or isinstance(item, CodeItem):
+ text = item.text
+
+ elif isinstance(item, TableItem):
+ text = item.export_to_markdown(dl_doc)
+ # dataframe으로 추출할 때 사용되는 코드
+ # if table_df.shape[0] < 1 or table_df.shape[1] < 2:
+ # # at least two cols needed, as first column contains row headers
+ # continue
+ # text = self._triplet_serialize(table_df=table_df)
+ captions = [c.text for c in [r.resolve(dl_doc) for r in item.captions]] or None
+
+ elif isinstance(item, PictureItem):
+ text = ''.join(str(value) for value in heading_by_level.values())
+ else:
+ continue
+ c = DocChunk(
+ text=text,
+ meta=DocMeta(
+ doc_items=[item],
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ captions=captions,
+ origin=dl_doc.origin,
+ ),
+ )
+ yield c
+
+ if self.merge_list_items and list_items: # need to yield
+ yield DocChunk(
+ text=self.delim.join([i.text for i in list_items]),
+ meta=DocMeta(
+ doc_items=list_items,
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ origin=dl_doc.origin,
+ ),
+ )
+
+
+class HybridChunker(BaseChunker):
+ r"""Chunker doing tokenization-aware refinements on top of document layout chunking.
+ Args:
+ tokenizer: The tokenizer to use; either instantiated object or name or path of
+ respective pretrained model
+ max_tokens: The maximum number of tokens per chunk. If not set, limit is
+ resolved from the tokenizer
+ merge_peers: Whether to merge undersized chunks sharing same relevant metadata
+ """
+
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+ tokenizer: Union[PreTrainedTokenizerBase, str] = (
+ "/nfs-root/all-MiniLM-L6-v2"
+ )
+ max_tokens: int = int(1e30) # type: ignore[assignment]
+ merge_peers: bool = True
+ _inner_chunker: HierarchicalChunker = HierarchicalChunker()
+
+ @model_validator(mode="after")
+ def _patch_tokenizer_and_max_tokens(self) -> Self:
+ self._tokenizer = (
+ self.tokenizer
+ if isinstance(self.tokenizer, PreTrainedTokenizerBase)
+ else AutoTokenizer.from_pretrained(self.tokenizer)
+ )
+ if self.max_tokens is None:
+ self.max_tokens = TypeAdapter(PositiveInt).validate_python(
+ self._tokenizer.model_max_length
+ )
+ return self
+
+ def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
+ if text is None:
+ return 0
+ elif isinstance(text, list):
+ total = 0
+ for t in text:
+ total += self._count_text_tokens(t)
+ return total
+ return len(self._tokenizer.tokenize(text))
+
+ class _ChunkLengthInfo(BaseModel):
+ total_len: int
+ text_len: int
+ other_len: int
+
+ def _count_chunk_tokens(self, doc_chunk: DocChunk):
+ ser_txt = self.serialize(chunk=doc_chunk)
+ return len(self._tokenizer.tokenize(text=ser_txt))
+
+ def _doc_chunk_length(self, doc_chunk: DocChunk):
+ text_length = self._count_text_tokens(doc_chunk.text)
+ total = self._count_chunk_tokens(doc_chunk=doc_chunk)
+ return self._ChunkLengthInfo(
+ total_len=total,
+ text_len=text_length,
+ other_len=total - text_length,
+ )
+
+ def _make_chunk_from_doc_items(
+ self, doc_chunk: DocChunk, window_start: int, window_end: int
+ ):
+ doc_items = doc_chunk.meta.doc_items[window_start: window_end + 1]
+ meta = DocMeta(
+ doc_items=doc_items,
+ headings=doc_chunk.meta.headings,
+ captions=doc_chunk.meta.captions,
+ origin=doc_chunk.meta.origin,
+ )
+ window_text = (
+ doc_chunk.text
+ if len(doc_chunk.meta.doc_items) == 1
+ else self.delim.join(
+ [
+ doc_item.text
+ for doc_item in doc_items
+ if isinstance(doc_item, TextItem)
+ ]
+ )
+ )
+ new_chunk = DocChunk(text=window_text, meta=meta)
+ return new_chunk
+
+ def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
+ chunks = []
+ window_start = 0
+ window_end = 0 # an inclusive index
+ num_items = len(doc_chunk.meta.doc_items)
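+        # greedily grow the window [window_start, window_end] over doc_items while the
+        # serialized chunk still fits within max_tokens; emit it and start a new window otherwise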
+ while window_end < num_items:
+ new_chunk = self._make_chunk_from_doc_items(
+ doc_chunk=doc_chunk,
+ window_start=window_start,
+ window_end=window_end,
+ )
+ if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
+ if window_end < num_items - 1:
+ window_end += 1
+ # 아직 청크에 여유가 있고, 남은 아이템도 있으므로 계속 추가 시도
+ continue
+ else:
+ # 현재 윈도우의 모든 아이템이 청크에 들어갔고, 더 이상 아이템이 없음
+ window_end = num_items # signalizing the last loop
+ elif window_start == window_end:
+ # 아이템 1개도 청크에 안 들어감 → 단독 청크로 처리, 이후 재분할
+ window_end += 1
+ window_start = window_end
+ else:
+ # 마지막 아이템 빼고 청크 생성 → 남은 아이템으로 새 윈도우 시작
+ new_chunk = self._make_chunk_from_doc_items(
+ doc_chunk=doc_chunk,
+ window_start=window_start,
+ window_end=window_end - 1,
+ )
+ window_start = window_end
+ chunks.append(new_chunk)
+ return chunks
+
+ def _split_using_plain_text(self, doc_chunk: DocChunk) -> list[DocChunk]:
+ lengths = self._doc_chunk_length(doc_chunk)
+ if lengths.total_len <= self.max_tokens:
+ return [doc_chunk]
+ else:
+ # 헤더/캡션을 제외하고 본문 텍스트에 할당 가능한 토큰 수 계산
+ available_length = self.max_tokens - lengths.other_len
+ sem_chunker = semchunk.chunkerify(
+ self._tokenizer, chunk_size=available_length
+ )
+ if available_length <= 0:
+ warnings.warn(
+ f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}"
+ # noqa
+ )
+ return []
+ text = doc_chunk.text
+ segments = sem_chunker.chunk(text)
+ chunks = [type(doc_chunk)(text=s, meta=doc_chunk.meta) for s in segments]
+ return chunks
+
+ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
+ output_chunks = []
+ window_start = 0
+ window_end = 0 # an inclusive index
+ num_chunks = len(chunks)
+
+ while window_end < num_chunks:
+ chunk = chunks[window_end]
+ headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
+ ready_to_append = False
+
+ if window_start == window_end:
+ current_headings_and_captions = headings_and_captions
+ window_end += 1
+ first_chunk_of_window = chunk
+
+ else:
+ chks = chunks[window_start: window_end + 1]
+ doc_items = [it for chk in chks for it in chk.meta.doc_items]
+ candidate = DocChunk(
+ text=self.delim.join([chk.text for chk in chks]),
+ meta=DocMeta(
+ doc_items=doc_items,
+ headings=current_headings_and_captions[0],
+ captions=current_headings_and_captions[1],
+ origin=chunk.meta.origin,
+ ),
+ )
+
+ if (headings_and_captions == current_headings_and_captions
+ and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
+ ):
+ # 토큰 수 여유 있음 → 청크 확장 계속
+ window_end += 1
+ new_chunk = candidate
+ else:
+ ready_to_append = True
+
+ if ready_to_append or window_end == num_chunks:
+ # no more room OR the start of new metadata.
+ if window_start + 1 == window_end:
+ output_chunks.append(first_chunk_of_window)
+ else:
+ output_chunks.append(new_chunk)
+ window_start = window_end
+
+ return output_chunks
+
+ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ r"""Chunk the provided document.
+ Args:
+ dl_doc (DLDocument): document to chunk
+ Yields:
+ Iterator[Chunk]: iterator over extracted chunks
+ """
+ res: Iterable[DocChunk]
+ res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs) # type: ignore
+ res = [x for c in res for x in self._split_by_doc_items(c)]
+ res = [x for c in res for x in self._split_using_plain_text(c)]
+
+ if self.merge_peers:
+ res = self._merge_chunks_with_matching_metadata(res)
+ return iter(res)
+
+
+class DocxProcessor:
+ def __init__(self):
+ self.page_chunk_counts = defaultdict(int)
+ self.pipeline_options = PipelineOptions()
+ self.converter = DocumentConverter(
+ format_options={
+ InputFormat.DOCX: WordFormatOption(
+ pipeline_cls=SimplePipeline, backend=GenosMsWordDocumentBackend
+ ),
+ }
+ )
+
+ def get_paths(self, file_path: str):
+ output_path, output_file = os.path.split(file_path)
+ filename, _ = os.path.splitext(output_file)
+ artifacts_dir = Path(f"{output_path}/{filename}")
+ if artifacts_dir.is_absolute():
+ reference_path = None
+ else:
+ reference_path = artifacts_dir.parent
+ return artifacts_dir, reference_path
+
+ def get_media_files(self, doc_items: list):
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'path': path, 'name': name})
+ return temp_list
+
+ def safe_join(self, iterable):
+ if not isinstance(iterable, (list, tuple, set)):
+ return ''
+ return ''.join(map(str, iterable)) + '\n'
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]:
+ chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True)
+ chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs))
+ for chunk in chunks:
+ self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1
+ return chunks
+
+ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request,
+ **kwargs: dict) -> list[dict]:
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=document.num_pages(),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z',
+ )
+
+ current_page = None
+ chunk_index_on_page = 0
+ vectors = []
+ upload_tasks = []
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_page = chunk.meta.doc_items[0].prov[0].page_no
+ content = self.safe_join(chunk.meta.headings) + chunk.text
+
+ if chunk_page != current_page:
+ current_page = chunk_page
+ chunk_index_on_page = 0
+
+ vector = (GenOSVectorMetaBuilder()
+ .set_text(content)
+ .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page])
+ .set_chunk_index(chunk_idx)
+ .set_global_metadata(**global_metadata)
+ .set_chunk_bboxes(chunk.meta.doc_items, document)
+ .set_media_files(chunk.meta.doc_items)
+ ).build()
+ vectors.append(vector)
+
+ chunk_index_on_page += 1
+ # file_list = self.get_media_files(chunk.meta.doc_items)
+ # upload_tasks.append(asyncio.create_task(
+ # upload_files(file_list, request=request)
+ # ))
+
+ if upload_tasks:
+ await asyncio.gather(*upload_tasks)
+ return vectors
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ document: DoclingDocument = self.load_documents(file_path, **kwargs)
+ artifacts_dir, reference_path = self.get_paths(file_path)
+ document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+
+ chunks: list[DocChunk] = self.split_documents(document, **kwargs)
+
+ vectors = []
+ if len(chunks) >= 1:
+ vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs)
+ else:
+ raise GenosServiceException(1, f"chunk length is 0")
+ return vectors
+
+
+class HwpxProcessor:
+ def __init__(self):
+ self.page_chunk_counts = defaultdict(int)
+ self.pipeline_options = PipelineOptions()
+ self.pipeline_options.save_images = False
+ self.converter = DocumentConverter(
+ format_options={
+ InputFormat.XML_HWPX: HwpxFormatOption(
+ pipeline_options=self.pipeline_options
+ )
+ }
+ )
+
+ def get_paths(self, file_path: str):
+ output_path, output_file = os.path.split(file_path)
+ filename, _ = os.path.splitext(output_file)
+ artifacts_dir = Path(f"{output_path}/{filename}")
+ if artifacts_dir.is_absolute():
+ reference_path = None
+ else:
+ reference_path = artifacts_dir.parent
+ return artifacts_dir, reference_path
+
+ def get_media_files(self, doc_items: list):
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'path': path, 'name': name})
+ return temp_list
+
+ def safe_join(self, iterable):
+ if not isinstance(iterable, (list, tuple, set)):
+ return ''
+ return ''.join(map(str, iterable)) + '\n'
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ save_images = kwargs.get('save_images', False)
+
+ if self.pipeline_options.save_images != save_images:
+ self.pipeline_options.save_images = save_images
+ # self._create_converters()
+
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]:
+ chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True)
+ chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs))
+ for chunk in chunks:
+ self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1
+ return chunks
+
+ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request,
+ **kwargs: dict) -> list[dict]:
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=document.num_pages(),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z',
+ )
+
+ current_page = None
+ chunk_index_on_page = 0
+ vectors = []
+ upload_tasks = []
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_page = chunk.meta.doc_items[0].prov[0].page_no
+ content = self.safe_join(chunk.meta.headings) + chunk.text
+
+ if chunk_page != current_page:
+ current_page = chunk_page
+ chunk_index_on_page = 0
+
+ vector = (GenOSVectorMetaBuilder()
+ .set_text(content)
+ .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page])
+ .set_chunk_index(chunk_idx)
+ .set_global_metadata(**global_metadata)
+ .set_chunk_bboxes(chunk.meta.doc_items, document)
+ .set_media_files(chunk.meta.doc_items)
+ ).build()
+ vectors.append(vector)
+
+ chunk_index_on_page += 1
+ # file_list = self.get_media_files(chunk.meta.doc_items)
+ # upload_tasks.append(asyncio.create_task(
+ # upload_files(file_list, request=request)
+ # ))
+
+ if upload_tasks:
+ await asyncio.gather(*upload_tasks)
+ return vectors
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ document: DoclingDocument = self.load_documents(file_path, **kwargs)
+ artifacts_dir, reference_path = self.get_paths(file_path)
+ document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+
+ chunks: list[DocChunk] = self.split_documents(document, **kwargs)
+
+ vectors = []
+ if len(chunks) >= 1:
+ vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs)
+ else:
+ raise GenosServiceException(1, f"chunk length is 0")
+
+ text = ""
+ for vector in vectors:
+ if len(text) + len(vector.text) > 8192:
+ break
+ text += vector.text
+
+ return [vectors[0]]
+
+
+class GenosServiceException(Exception):
+ """GenOS 와의 의존성 부분 제거를 위해 추가"""
+
+ def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None:
+ self.code = 1
+ self.error_code = error_code
+ self.error_msg = error_msg or "GenOS Service Exception"
+ self.msg_params = msg_params or {}
+
+ def __repr__(self) -> str:
+ class_name = self.__class__.__name__
+ return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})"
+
+
+# async def assert_cancelled(request: Request):
+# """GenOS 와의 의존성 제거를 위해 추가"""
+# if await request.is_disconnected():
+# raise GenosServiceException(1, f"Cancelled")
+
+
+# @@@@ 성민: OCR을 위해서 추가
+from docling.datamodel.pipeline_options import (
+ AcceleratorDevice,
+ AcceleratorOptions,
+ # OcrEngine,
+ # PdfBackend,
+ PdfPipelineOptions,
+ TableFormerMode,
+ PipelineOptions,
+ PaddleOcrOptions,
+)
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.document_converter import PdfFormatOption
+
+class DocumentProcessor:
+ def __init__(self):
+ self.page_chunk_counts = defaultdict(int)
+ self.hwpx_processor = HwpxProcessor()
+ self.docx_processor = DocxProcessor()
+
+
+
+ # @@@@ 성민: OCR을 위해서 추가
+ self.ocr_endpoint = "http://doc-parser-ocr-service:8080/ocr"
+ ocr_options = PaddleOcrOptions(
+ force_full_page_ocr=False,
+ lang=['korean'],
+ ocr_endpoint=self.ocr_endpoint,
+ text_score=0.3)
+
+
+ device = AcceleratorDevice.AUTO
+ num_threads = 8
+ accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+
+ # PDF 파이프라인 옵션 설정
+ self.pipe_line_options = PdfPipelineOptions()
+ self.pipe_line_options.generate_page_images = True
+ self.pipe_line_options.generate_picture_images = True
+ self.pipe_line_options.do_ocr = False
+ self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.ocr_options.lang = ["ko", 'en']
+ # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model"
+ # self.pipe_line_options.ocr_options.force_full_page_ocr = True
+ # ocr_options = TesseractOcrOptions()
+ # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert']
+ # ocr_options.path = './.tesseract/tessdata'
+ # self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.artifacts_path = Path("/models/")
+ self.pipe_line_options.do_table_structure = True
+ self.pipe_line_options.images_scale = 2
+ self.pipe_line_options.table_structure_options.do_cell_matching = True
+ self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE
+ self.pipe_line_options.accelerator_options = accelerator_options
+
+ # Simple 파이프라인 옵션을 인스턴스 변수로 저장
+ self.simple_pipeline_options = PipelineOptions()
+ self.simple_pipeline_options.save_images = False
+
+ # ocr 파이프라인 옵션
+ self.ocr_pipe_line_options = PdfPipelineOptions()
+ self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.do_ocr = True
+ self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True
+
+ self.ocr_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options,
+ backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
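+        # note: this OCR converter is only used as a fallback (see load_documents) when the
+        # primary loader extracts no text, e.g. for scanned PDFs or image-only documents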
+
+ def get_loader(self, file_path: str):
+ ext = os.path.splitext(file_path)[-1].lower()
+ real_type = self.get_real_file_type(file_path)
+
+ # 확장자와 실제 파일 타입이 다를 때만 real_type 사용
+ if ext != real_type and real_type == 'pdf':
+ return PyMuPDFLoader(file_path)
+ elif ext != real_type and real_type in ['txt', 'json', 'md']:
+ return TextLoader(file_path)
+ # 원래 확장자 기반 로직
+ elif ext == '.pdf':
+ return PyMuPDFLoader(file_path)
+ elif ext == '.doc':
+ convert_to_pdf(file_path)
+ return UnstructuredWordDocumentLoader(file_path)
+ elif ext in ['.ppt', '.pptx']:
+ convert_to_pdf(file_path)
+ return UnstructuredPowerPointLoader(file_path)
+ elif ext in ['.jpg', '.jpeg', '.png']:
+ convert_to_pdf(file_path)
+ # 한국어 OCR 지원을 위한 언어 설정
+ return UnstructuredImageLoader(
+ file_path,
+ languages=["kor", "eng"], # 한국어 + 영어 OCR
+ )
+ elif ext in ['.txt', '.json', '.md']:
+ return TextLoader(file_path)
+ elif ext == '.hwp':
+ return HwpLoader(file_path)
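+        # note: the next branch is unreachable for '.md', since '.md' is already matched
+        # by the ['.txt', '.json', '.md'] branch above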
+ elif ext == '.md':
+ return UnstructuredMarkdownLoader(file_path)
+ else:
+ return UnstructuredFileLoader(file_path)
+
+ def get_real_file_type(self, file_path: str) -> str:
+ """파일 확장자가 아닌 실제 내용으로 파일 타입 판단"""
+ with open(file_path, 'rb') as f:
+ header = f.read(8)
+ if header.startswith(b'%PDF-'):
+ return 'pdf'
+ elif header.startswith(b'\x89PNG'):
+ return 'png'
+ elif header.startswith(b'\xff\xd8\xff'):
+ return 'jpg'
+
+ # 매직 헤더로 판단할 수 없으면 확장자 사용
+ return os.path.splitext(file_path)[-1].lower()
+
+ def convert_md_to_pdf(self, md_path):
+ """Markdown 파일을 PDF로 변환"""
+ install_packages(['chardet'])
+ import chardet
+
+ pdf_path = md_path.replace('.md', '.pdf')
+ with open(md_path, 'rb') as f:
+ raw_file = f.read()
+ candidates = ['utf-8', 'utf-8-sig']
+ try:
+ det = (chardet.detect(raw_file) or {}).get('encoding') or ''
+ # chardet가 ascii/unknown이면 무시. 그 외면 후보에 추가
+ if det and det.lower() not in ('ascii', 'unknown'):
+ if det.lower() not in [c.lower() for c in candidates]:
+ candidates.append(det)
+ except Exception:
+ pass
+ candidates += ['cp949', 'euc-kr', 'iso-8859-1', 'latin-1']
+ md_content = None
+ for enc in candidates:
+ try:
+ md_content = raw_file.decode(enc)
+ break
+ except UnicodeDecodeError:
+ continue
+ if md_content is None:
+ md_content = raw_file.decode('utf-8', errors='replace')
+
+ html_content = markdown(md_content)
+ if HTML:
+ HTML(string=html_content).write_pdf(pdf_path)
+ return pdf_path
+
+
+
+ def _create_converters(self):
+ """컨버터들을 생성하는 헬퍼 메서드"""
+ self.ocr_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options,
+ backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
+
+
+ def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ # kwargs에서 save_images 값을 가져와서 옵션 업데이트
+ save_images = kwargs.get('save_images', True)
+ include_wmf = kwargs.get('include_wmf', False)
+
+ # save_images 옵션이 현재 설정과 다르면 컨버터 재생성
+ if (self.simple_pipeline_options.save_images != save_images or
+ getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf):
+ self.simple_pipeline_options.save_images = save_images
+ self.simple_pipeline_options.include_wmf = include_wmf
+ self._create_converters()
+
+ try:
+ conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True)
+ except Exception as e:
+ print("@@@@", e)
+ # conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True)
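+            # note: if the conversion above raised, conv_result is undefined here and the
+            # return below will fail; the fallback converter is currently commented out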
+
+ return conv_result.document
+
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
+ loader = self.get_loader(file_path)
+ documents = loader.load()
+
+ # @@@@ 성민: 밑에 주석
+ # 이미지 파일의 경우 텍스트 추출 안되었을 시 기본 텍스트 제공
+ # ext = os.path.splitext(file_path)[-1].lower()
+ # if ext in ['.jpg', '.jpeg', '.png']:
+ # # documents가 없거나, 있어도 모든 page_content가 비어있는 경우
+ # if not documents or not any(doc.page_content.strip() for doc in documents):
+ # documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})]
+
+ # @@@@ 성민 새로 작성: 텍스트가 없을 경우 OCR 수행
+ if not documents or not any(doc.page_content.strip() for doc in documents):
+ document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
+
+ documents = list([Document(page_content=document.export_to_markdown(), metadata={})])
+
+ return documents
+
+ def split_documents(self, documents, **kwargs: dict) -> list[Document]:
+ # @@@@ 성민: GenOS에서 바꿔도 안바뀌는듯?
+ print("@@@@ kwargs", kwargs)
+
+ kwargs.setdefault("chunk_size", 20_000)
+
+ text_splitter = RecursiveCharacterTextSplitter(**kwargs)
+
+ chunks = text_splitter.split_documents(documents)
+ chunks = [chunk for chunk in chunks if chunk.page_content]
+
+ if not chunks:
+ raise Exception('Empty document')
+
+ for chunk in chunks:
+ page = chunk.metadata.get('page', 0)
+ self.page_chunk_counts[page] += 1
+ return chunks
+
+ def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict) -> list[dict]:
+ ext = os.path.splitext(file_path)[-1].lower()
+ real_type = self.get_real_file_type(file_path)
+
+ # 확장자와 실제 파일 타입이 다를 때만 real_type 사용
+ if ext != real_type and real_type == 'pdf':
+ pdf_path = file_path
+ elif ext != real_type and real_type in ['txt', 'json', 'md']:
+ pdf_path = _get_pdf_path(file_path)
+ # 원래 확장자 기반 로직
+ elif file_path.endswith('.md'):
+ pdf_path = self.convert_md_to_pdf(file_path)
+ elif file_path.endswith(('.ppt', '.pptx')):
+ pdf_path = _get_pdf_path(file_path)
+ else:
+ pdf_path = _get_pdf_path(file_path)
+
+ # doc = fitz.open(pdf_path) if (pdf_path and os.path.exists(pdf_path)) else None
+
+ if file_path.endswith(('.ppt', '.pptx')):
+ if os.path.exists(pdf_path):
+ subprocess.run(["rm", pdf_path], check=True)
+
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=max([chunk.metadata.get('page', 0) for chunk in chunks]),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z'
+ )
+ current_page = None
+ chunk_index_on_page = 0
+
+ vectors = []
+ for chunk_idx, chunk in enumerate(chunks):
+ page = chunk.metadata.get('page', 0)
+ text = chunk.page_content
+
+ if page != current_page:
+ current_page = page
+ chunk_index_on_page = 0
+
+ # 첨부용에서는 bbox 정보 추출 X
+ # if doc:
+ # fitz_page = doc.load_page(page)
+ # global_metadata['chunk_bboxes'] = json.dumps(merge_overlapping_bboxes([{
+ # 'page': page + 1,
+ # 'type': 'text',
+ # 'bbox': {
+ # 'l': rect[0] / fitz_page.rect.width,
+ # 't': rect[1] / fitz_page.rect.height,
+ # 'r': rect[2] / fitz_page.rect.width,
+ # 'b': rect[3] / fitz_page.rect.height,
+ # }
+ # } for rect in fitz_page.search_for(text)], x_tolerance=1 / fitz_page.rect.width,
+ # y_tolerance=1 / fitz_page.rect.height))
+
+ vectors.append(GenOSVectorMeta.model_validate({
+ 'text': text,
+ 'n_char': len(text),
+ 'n_word': len(text.split()),
+ 'n_line': len(text.splitlines()),
+ 'i_page': page,
+ 'e_page': page,
+ 'i_chunk_on_page': chunk_index_on_page,
+ 'n_chunk_of_page': self.page_chunk_counts[page],
+ 'i_chunk_on_doc': chunk_idx,
+ **global_metadata
+ }))
+ chunk_index_on_page += 1
+
+ return vectors
+
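+    # Guardrail hook added as a security-consulting measure: guardrail() sends each
+    # returned chunk's text to GenOS workflow 694 and, when the workflow answer starts
+    # with "[UNSAFE]", replaces the chunk text with a removal notice.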
+ @guardrail
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ ext = os.path.splitext(file_path)[-1].lower()
+ if ext in ('.wav', '.mp3', '.m4a'):
+            # Generate a temporary path for saving audio chunks: the audio file is split into several chunks because of the model's input-length limit
+ tmp_path = "./tmp_audios_{}".format(os.path.basename(file_path).split('.')[0])
+ if not os.path.exists(tmp_path):
+ os.makedirs(tmp_path)
+
+ # Use 'Whisper' model served in-house
+            # [!] Modify the request parameters to change the STT model being used
+ loader = AudioLoader(
+ file_path=file_path,
+ req_url="http://192.168.74.164:30100/v1/audio/transcriptions",
+ req_data={
+ 'model': 'model',
+ 'language': 'ko',
+ 'response_format': 'json',
+ 'temperature': '0',
+ 'stream': 'false',
+ 'timestamp_granularities[]': 'word'
+ },
+ chunk_sec=29, # length(sec) of a chunk from the uploaded audio
+ tmp_path=tmp_path
+ )
+ vectors = loader.return_vectormeta_format()
+ # await assert_cancelled(request)
+
+            # Remove the temporary chunk files
+ try:
+ subprocess.run(['rm', '-r', tmp_path], check=True)
+ except:
+ pass
+ # await assert_cancelled(request)
+ return vectors
+
+ elif ext in ('.csv', '.xlsx'):
+ loader = TabularLoader(file_path, ext)
+ vectors = loader.return_vectormeta_format()
+ # pdf_path = _get_pdf_path(file_path)
+ # await assert_cancelled(request)
+ return vectors
+
+ elif ext == '.hwp':
+ documents: list[Document] = self.load_documents(file_path, **kwargs)
+ # await assert_cancelled(request)
+ chunks: list[Document] = self.split_documents(documents, **kwargs)
+ # await assert_cancelled(request)
+ vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs)
+
+ return vectors
+
+ elif ext == '.hwpx':
+ return await self.hwpx_processor(request, file_path, **kwargs)
+
+ elif ext == '.docx':
+ return await self.docx_processor(request, file_path, **kwargs)
+
+ else:
+ documents: list[Document] = self.load_documents(file_path, **kwargs)
+ # await assert_cancelled(request)
+
+ chunks: list[Document] = self.split_documents(documents, **kwargs)
+ # await assert_cancelled(request)
+
+ vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs)
+
+ return vectors
\ No newline at end of file
diff --git a/genon/preprocessor/facade/intelligent_processor_ocr.py b/genon/preprocessor/facade/intelligent_processor_ocr.py
new file mode 100644
index 0000000000..b720d3bf26
--- /dev/null
+++ b/genon/preprocessor/facade/intelligent_processor_ocr.py
@@ -0,0 +1,1384 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+from collections import defaultdict
+from datetime import datetime
+from typing import Optional, Iterable, Any, List, Dict, Tuple
+
+from fastapi import Request
+
+# docling imports
+
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.pipeline.simple_pipeline import SimplePipeline
+# from docling.datamodel.document import ConversionStatus
+from docling.datamodel.pipeline_options import (
+ AcceleratorDevice,
+ AcceleratorOptions,
+ # OcrEngine,
+ # PdfBackend,
+ PdfPipelineOptions,
+ TableFormerMode,
+ PipelineOptions,
+ PaddleOcrOptions,
+)
+
+from docling.document_converter import (
+ DocumentConverter,
+ PdfFormatOption,
+ FormatOption
+)
+from docling.datamodel.pipeline_options import DataEnrichmentOptions
+from docling.utils.document_enrichment import enrich_document, check_document
+from docling.datamodel.document import ConversionResult
+from docling_core.transforms.chunker import (
+ BaseChunk,
+ BaseChunker,
+ DocChunk,
+ DocMeta,
+)
+from docling_core.types import DoclingDocument
+
+from pandas import DataFrame
+import asyncio
+from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc.document import (
+ DocumentOrigin,
+ LevelNumber,
+ ListItem,
+ CodeItem,
+ ContentLayer,
+)
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import (
+ BoundingBox,
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ DocItem,
+ PictureItem,
+ SectionHeaderItem,
+ TableItem,
+ TextItem,
+ PageItem
+)
+from collections import Counter
+import re
+import json
+import warnings
+from typing import Iterable, Iterator, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
+from typing_extensions import Self
+
+try:
+ import semchunk
+ from transformers import AutoTokenizer, PreTrainedTokenizerBase
+except ImportError:
+ raise RuntimeError(
+ "Module requires 'chunking' extra; to install, run: "
+ "`pip install 'docling-core[chunking]'`"
+ )
+
+# from genos_utils import upload_files
+
+# ============================================
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Chunker implementation leveraging the document structure."""
+
+
+class HierarchicalChunker(BaseChunker):
+ """문서 구조와 헤더 계층을 유지하면서 아이템을 순차적으로 처리하는 청커"""
+
+ merge_list_items: bool = True
+
+ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ """문서의 모든 아이템을 헤더 정보와 함께 청크로 생성
+
+ Args:
+ dl_doc: 청킹할 문서
+
+ Yields:
+ 문서의 모든 아이템을 포함하는 하나의 청크
+ """
+ # 모든 아이템과 헤더 정보 수집
+ all_items = []
+ all_header_info = [] # 각 아이템의 헤더 정보
+ current_heading_by_level: dict[LevelNumber, str] = {}
+ list_items: list[TextItem] = []
+
+ # iterate_items()로 수집된 아이템들의 self_ref 추적
+ processed_refs = set()
+
+ # 모든 아이템 순회
+ for item, level in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}):
+ if hasattr(item, 'self_ref'):
+ processed_refs.add(item.self_ref)
+
+ if not isinstance(item, DocItem):
+ continue
+
+ # 리스트 아이템 병합 처리
+ if self.merge_list_items:
+ if isinstance(item, ListItem) or (
+ isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM
+ ):
+ list_items.append(item)
+ continue
+ elif list_items:
+ # 누적된 리스트 아이템들을 추가
+ for list_item in list_items:
+ all_items.append(list_item)
+ # 리스트 아이템의 헤더 정보 저장
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+ list_items = []
+
+ # 섹션 헤더 처리
+ if isinstance(item, SectionHeaderItem) or (
+ isinstance(item, TextItem) and
+ item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
+ ):
+ # 새로운 헤더 레벨 설정
+ header_level = (
+ item.level if isinstance(item, SectionHeaderItem)
+ else (0 if item.label == DocItemLabel.TITLE else 1)
+ )
+ current_heading_by_level[header_level] = item.text
+
+ # 더 깊은 레벨의 헤더들 제거
+ keys_to_del = [k for k in current_heading_by_level if k > header_level]
+ for k in keys_to_del:
+ current_heading_by_level.pop(k, None)
+
+ # 헤더 아이템도 추가 (헤더 자체도 아이템임)
+ all_items.append(item)
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+ continue
+
+ if (isinstance(item, TextItem) or
+ isinstance(item, ListItem) or
+ isinstance(item, CodeItem) or
+ isinstance(item, TableItem) or
+ isinstance(item, PictureItem)):
+ all_items.append(item)
+ # 현재 아이템의 헤더 정보 저장
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+
+ # 마지막 리스트 아이템들 처리
+ if list_items:
+ for list_item in list_items:
+ all_items.append(list_item)
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+
+ # iterate_items()에서 누락된 테이블들을 별도로 추가
+ missing_tables = []
+ for table in dl_doc.tables:
+ table_ref = getattr(table, 'self_ref', None)
+ if table_ref not in processed_refs:
+ missing_tables.append(table)
+
+ # 누락된 테이블들을 문서 앞부분에 추가 (페이지 1의 테이블들일 가능성이 높음)
+ if missing_tables:
+ for missing_table in missing_tables:
+ # 첫 번째 위치에 삽입 (헤더 테이블일 가능성이 높음)
+ all_items.insert(0, missing_table)
+ all_header_info.insert(0, {}) # 빈 헤더 정보
+
+ # 아이템이 없으면 빈 문서
+ if not all_items:
+ return
+
+ # 모든 아이템을 하나의 청크로 반환 (HybridChunker에서 분할)
+ # headings는 None으로 설정하고, 헤더 정보는 별도로 관리
+ chunk = DocChunk(
+ text="", # 텍스트는 HybridChunker에서 생성
+ meta=DocMeta(
+ doc_items=all_items,
+ headings=None, # DocMeta의 원래 형식 유지
+ captions=None,
+ origin=dl_doc.origin,
+ ),
+ )
+ # 헤더 정보를 별도 속성으로 저장
+ chunk._header_info_list = all_header_info
+ yield chunk
+
+class HybridChunker(BaseChunker):
+ """토큰 제한을 고려하여 섹션별 청크를 분할하고 병합하는 청커"""
+
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
+ tokenizer: Union[PreTrainedTokenizerBase, str] = "sentence-transformers/all-MiniLM-L6-v2"
+ max_tokens: int = 1024
+ merge_peers: bool = True
+
+ _inner_chunker: BaseChunker = None
+ _tokenizer: PreTrainedTokenizerBase = None
+
+ @model_validator(mode="after")
+ def _initialize_components(self) -> Self:
+ # 토크나이저 초기화
+ self._tokenizer = (
+ self.tokenizer
+ if isinstance(self.tokenizer, PreTrainedTokenizerBase)
+ else AutoTokenizer.from_pretrained(self.tokenizer)
+ )
+
+ # HierarchicalChunker 초기화
+ if self._inner_chunker is None:
+ self._inner_chunker = HierarchicalChunker()
+
+ return self
+
+ def _count_tokens(self, text: str) -> int:
+ """텍스트의 토큰 수 계산 (안전한 분할 처리)"""
+ if not text:
+ return 0
+
+ # 텍스트를 더 작은 단위로 분할하여 계산
+ max_chunk_length = 300 # 더 안전한 길이로 설정
+ total_tokens = 0
+
+ # 텍스트를 줄 단위로 먼저 분할
+ lines = text.split('\n')
+ current_chunk = ""
+
+ for line in lines:
+ # 현재 청크에 줄을 추가했을 때 길이 확인
+ temp_chunk = current_chunk + '\n' + line if current_chunk else line
+
+ if len(temp_chunk) <= max_chunk_length:
+ current_chunk = temp_chunk
+ else:
+ # 현재 청크가 있으면 토큰 계산
+ if current_chunk:
+ try:
+ total_tokens += len(self._tokenizer.tokenize(current_chunk))
+ except Exception:
+ total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산
+
+ # 새로운 청크 시작
+ current_chunk = line
+
+ # 마지막 청크 처리
+ if current_chunk:
+ try:
+ total_tokens += len(self._tokenizer.tokenize(current_chunk))
+ except Exception:
+ total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산
+
+ return total_tokens
+
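+    # Illustrative note on the fallback above (an assumption-free arithmetic example,
+    # not executed): when tokenization raises, a 10-word line is counted as
+    # int(10 * 1.3) = 13 tokens, i.e. a rough word-count approximation rather than
+    # an exact token count.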
+ def _generate_text_from_items_with_headers(self, items: list[DocItem],
+ header_info_list: list[dict],
+ dl_doc: DoclingDocument) -> str:
+ """DocItem 리스트로부터 헤더 정보를 포함한 텍스트 생성"""
+ text_parts = []
+ current_section_headers = {} # 현재 섹션의 헤더 정보
+
+ for i, item in enumerate(items):
+ item_headers = header_info_list[i] if i < len(header_info_list) else {}
+
+ # 헤더 정보가 변경된 경우 (새로운 섹션 시작)
+ if item_headers != current_section_headers:
+ # 변경된 헤더 레벨들만 추가
+ headers_to_add = []
+
+ for level in sorted(item_headers.keys()):
+ # 이전 섹션과 다른 헤더만 추가
+ if (level not in current_section_headers or
+ current_section_headers[level] != item_headers[level]):
+ # 해당 레벨까지의 모든 상위 헤더 포함
+ for l in sorted(item_headers.keys()):
+ if l <= level:
+ headers_to_add.append(item_headers[l])
+ break
+
+ # 헤더가 있으면 추가
+ if headers_to_add:
+ header_text = ", ".join(headers_to_add)
+ if header_text not in text_parts:
+ text_parts.append(header_text)
+
+ current_section_headers = item_headers.copy()
+
+ # 아이템 텍스트 추가
+ if isinstance(item, TableItem):
+ table_text = self._extract_table_text(item, dl_doc)
+ if table_text:
+ text_parts.append(table_text)
+ elif hasattr(item, 'text') and item.text:
+ # 타이틀과 섹션 헤더 처리 개선
+ # is_section_header = (
+ # isinstance(item, SectionHeaderItem) or
+ # (isinstance(item, TextItem) and
+ # item.label in [DocItemLabel.SECTION_HEADER]) # TITLE은 제외
+ # )
+
+ # 타이틀은 항상 포함, 섹션 헤더는 중복 방지를 위해 스킵
+ # if not is_section_header:
+ # 20250909, shkim, text_parts에 없는 경우만 추가. 섹션헤더가 반복해서 추가되는 것 방지
+ if item.text not in text_parts:
+ text_parts.append(item.text)
+ elif isinstance(item, PictureItem):
+ text_parts.append("") # 이미지는 빈 텍스트
+
+ # delim이 정의되지 않은 경우 기본값 사용
+ delim = getattr(self, 'delim', '\n')
+ result_text = delim.join(text_parts)
+
+ return result_text
+
+ def _extract_table_text(self, table_item: TableItem, dl_doc: DoclingDocument) -> str:
+ """테이블에서 텍스트를 추출하는 일반화된 메서드"""
+ try:
+ # 먼저 export_to_markdown 시도
+ table_text = table_item.export_to_markdown(dl_doc)
+ if table_text and table_text.strip():
+ return table_text
+ except Exception:
+ pass
+
+ # export_to_markdown 실패 시 테이블 셀 데이터에서 직접 텍스트 추출
+ try:
+ if hasattr(table_item, 'data') and table_item.data:
+ cell_texts = []
+
+ # table_cells에서 텍스트 추출
+ if hasattr(table_item.data, 'table_cells'):
+ for cell in table_item.data.table_cells:
+ if hasattr(cell, 'text') and cell.text and cell.text.strip():
+ cell_texts.append(cell.text.strip())
+
+ # grid에서 텍스트 추출 (table_cells가 없는 경우)
+ elif hasattr(table_item.data, 'grid') and table_item.data.grid:
+ for row in table_item.data.grid:
+ if isinstance(row, list):
+ for cell in row:
+ if hasattr(cell, 'text') and cell.text and cell.text.strip():
+ cell_texts.append(cell.text.strip())
+
+ # 추출된 셀 텍스트들을 결합
+ if cell_texts:
+ return ' '.join(cell_texts)
+ except Exception:
+ pass
+
+ # 모든 방법 실패 시 item.text 사용 (있는 경우)
+ if hasattr(table_item, 'text') and table_item.text:
+ return table_item.text
+
+ return ""
+
+ def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[str]]:
+ """헤더 정보 리스트에서 실제 사용되는 헤더들을 추출 """
+ if not header_info_list:
+ return None
+
+ all_headers = [] # header 순서대로 추가
+ seen_headers = set() # 중복 방지용
+
+ for header_info in header_info_list:
+ if header_info:
+ for level in sorted(header_info.keys()):
+ header_text = header_info[level]
+ if header_text and header_text not in seen_headers:
+ all_headers.append(header_text)
+ seen_headers.add(header_text)
+
+ return all_headers if all_headers else None
+
+ def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]:
+ """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)"""
+ if not table_text:
+ return [table_text]
+
+ # 전체 테이블이 토큰 제한 내인지 확인
+ if self._count_tokens(table_text) <= max_tokens:
+ return [table_text]
+
+ # 단순히 토큰 수 기준으로 텍스트 분할
+ # semchunk 사용하여 토큰 제한에 맞게 분할
+ chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens)
+ chunks = chunker(table_text)
+ return chunks if chunks else [table_text]
+
+ def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument) -> list[DocChunk]:
+ """문서를 토큰 제한에 맞게 분할 (여러 섹션이 하나의 청크에 포함 가능)"""
+ items = doc_chunk.meta.doc_items
+ header_info_list = getattr(doc_chunk, '_header_info_list', []) # 각 아이템의 헤더 정보 리스트
+
+ if not items:
+ return []
+
+ result_chunks = []
+ current_items = []
+ current_header_infos = []
+
+ i = 0
+ while i < len(items):
+ item = items[i]
+ header_info = header_info_list[i] if i < len(header_info_list) else {}
+
+ # 테이블 아이템인 경우 특별 처리
+ if isinstance(item, TableItem):
+ # 현재까지 누적된 아이템들이 있으면 먼저 청크로 생성
+ if current_items:
+ chunk_text = self._generate_text_from_items_with_headers(
+ current_items, current_header_infos, dl_doc
+ )
+ tokens = self._count_tokens(chunk_text)
+
+ # 실제 사용된 헤더들만 추출
+ used_headers = self._extract_used_headers(current_header_infos)
+ result_chunks.append(DocChunk(
+ text=chunk_text,
+ meta=DocMeta(
+ doc_items=current_items.copy(),
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ )
+ ))
+ current_items = []
+ current_header_infos = []
+
+ # 테이블과 앞뒤 아이템을 포함한 청크 생성
+ table_items = []
+ table_header_infos = []
+
+ # 앞 아이템 추가 (가능한 경우)
+ # if i > 0 and len(result_chunks) == 0: # 첫 번째 테이블이고 앞에 아이템이 있는 경우
+ # table_items.append(items[i-1])
+ # prev_header_info = header_info_list[i-1] if i-1 < len(header_info_list) else {}
+ # table_header_infos.append(prev_header_info)
+
+ # 테이블 추가
+ table_items.append(item)
+ table_header_infos.append(header_info)
+
+ # 뒤 아이템 추가 (가능한 경우)
+ # if i + 1 < len(items):
+ # table_items.append(items[i+1])
+ # next_header_info = header_info_list[i+1] if i+1 < len(header_info_list) else {}
+ # table_header_infos.append(next_header_info)
+ # i += 1 # 다음 아이템은 이미 처리했으므로 스킵
+
+ # 테이블 청크 생성 (토큰 제한 확인)
+ table_text = self._generate_text_from_items_with_headers(
+ table_items, table_header_infos, dl_doc
+ )
+ table_tokens = self._count_tokens(table_text)
+
+ # 테이블이 max_tokens를 초과하는 경우, 테이블을 분할
+ if table_tokens > self.max_tokens:
+ # 테이블 텍스트만 추출하여 분할
+ table_only_text = self._extract_table_text(item, dl_doc)
+ # split_tables = self._split_table_text(table_only_text, 4096)
+ split_tables = [table_only_text]
+
+ # 분할된 각 테이블에 대해 청크 생성
+ for split_table in split_tables:
+ # 기존 _generate_text_from_items_with_headers 함수 활용
+ full_text = self._generate_text_from_items_with_headers(
+ [item], [header_info], dl_doc
+ )
+ # 원본 테이블 텍스트를 분할된 테이블로 교체
+ full_text = full_text.replace(table_only_text, split_table)
+
+ # 원래 tableitem에 들어갔어야 할 heading 값 유지
+ used_headers = self._extract_used_headers([header_info])
+ result_chunks.append(DocChunk(
+ text=full_text,
+ meta=DocMeta(
+ doc_items=[item],
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ )
+ ))
+ else:
+ used_headers = self._extract_used_headers(table_header_infos)
+ result_chunks.append(DocChunk(
+ text=table_text,
+ meta=DocMeta(
+ doc_items=table_items,
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ )
+ ))
+
+ i += 1
+ continue
+
+ # 일반 아이템 처리 - 토큰 제한 확인
+ test_items = current_items + [item]
+ test_header_infos = current_header_infos + [header_info]
+ test_text = self._generate_text_from_items_with_headers(
+ test_items, test_header_infos, dl_doc
+ )
+ test_tokens = self._count_tokens(test_text)
+
+ if test_tokens <= self.max_tokens:
+ current_items.append(item)
+ current_header_infos.append(header_info)
+ else:
+ # 토큰 제한 초과 - 현재까지의 아이템들로 청크 생성
+ if current_items:
+ chunk_text = self._generate_text_from_items_with_headers(
+ current_items, current_header_infos, dl_doc
+ )
+ chunk_tokens = self._count_tokens(chunk_text)
+
+ used_headers = self._extract_used_headers(current_header_infos)
+ result_chunks.append(DocChunk(
+ text=chunk_text,
+ meta=DocMeta(
+ doc_items=current_items.copy(),
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ )
+ ))
+ # 새로운 청크 시작
+ current_items = [item]
+ current_header_infos = [header_info]
+ else:
+ # 단일 아이템이 토큰 제한을 초과하는 경우
+ single_text = self._generate_text_from_items_with_headers(
+ [item], [header_info], dl_doc
+ )
+ single_tokens = self._count_tokens(single_text)
+
+ used_headers = self._extract_used_headers([header_info])
+ result_chunks.append(DocChunk(
+ text=single_text,
+ meta=DocMeta(
+ doc_items=[item],
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ )
+ ))
+
+ i += 1
+
+ # 마지막 남은 아이템들 처리
+ if current_items:
+ chunk_text = self._generate_text_from_items_with_headers(
+ current_items, current_header_infos, dl_doc
+ )
+ chunk_tokens = self._count_tokens(chunk_text)
+
+ used_headers = self._extract_used_headers(current_header_infos)
+ result_chunks.append(DocChunk(
+ text=chunk_text,
+ meta=DocMeta(
+ doc_items=current_items,
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ )
+ ))
+
+ # 작은 청크들 병합 처리
+ return self._merge_small_chunks(result_chunks, dl_doc)
+
+ def _merge_small_chunks(self, chunks: list[DocChunk], dl_doc: DoclingDocument) -> list[DocChunk]:
+ """작은 청크들을 병합하여 토큰 효율성을 높임 (개선된 버전)"""
+ if not chunks:
+ return chunks
+
+        min_chunk_size = self.max_tokens // 3  # 최소 청크 크기: max_tokens // 3 (예: max_tokens=1000이면 약 333토큰)
+ merged_chunks = []
+ current_merge_candidate = None
+
+ for i, chunk in enumerate(chunks):
+ chunk_tokens = self._count_tokens(chunk.text)
+
+ # 아주 큰 청크는 분할 필요
+ if chunk_tokens > self.max_tokens:
+ if current_merge_candidate:
+ merged_chunks.append(current_merge_candidate)
+ current_merge_candidate = None
+
+ # 큰 청크를 분할 (임시로 그대로 추가하되, 경고 표시)
+ merged_chunks.append(chunk)
+ continue
+
+ # 작은 청크인 경우 병합 대상 (테이블 청크도 포함)
+ if chunk_tokens < min_chunk_size:
+ if current_merge_candidate is None:
+ current_merge_candidate = chunk
+ else:
+ # 병합 시도
+ merged_items = current_merge_candidate.meta.doc_items + chunk.meta.doc_items
+ merged_header_infos = (
+ getattr(current_merge_candidate, '_header_info_list', []) +
+ getattr(chunk, '_header_info_list', [])
+ )
+
+ merged_text = self._generate_text_from_items_with_headers(
+ merged_items, merged_header_infos, dl_doc
+ )
+ merged_tokens = self._count_tokens(merged_text)
+
+ if merged_tokens <= self.max_tokens:
+ current_merge_candidate = DocChunk(
+ text=merged_text,
+ meta=DocMeta(
+ doc_items=merged_items,
+ headings=self._extract_used_headers(merged_header_infos),
+ captions=None,
+ origin=chunk.meta.origin,
+ )
+ )
+ current_merge_candidate._header_info_list = merged_header_infos
+ else:
+ merged_chunks.append(current_merge_candidate)
+ current_merge_candidate = chunk
+ else:
+ if current_merge_candidate:
+ # 이전 병합 후보가 있으면 현재 청크와 병합 시도
+ candidate_tokens = self._count_tokens(current_merge_candidate.text)
+ if candidate_tokens < min_chunk_size:
+ # 현재 청크와 병합 시도
+ merged_items = current_merge_candidate.meta.doc_items + chunk.meta.doc_items
+ merged_header_infos = (
+ getattr(current_merge_candidate, '_header_info_list', []) +
+ getattr(chunk, '_header_info_list', [])
+ )
+
+ merged_text = self._generate_text_from_items_with_headers(
+ merged_items, merged_header_infos, dl_doc
+ )
+ merged_tokens = self._count_tokens(merged_text)
+
+ if merged_tokens <= self.max_tokens:
+ merged_chunks.append(DocChunk(
+ text=merged_text,
+ meta=DocMeta(
+ doc_items=merged_items,
+ headings=self._extract_used_headers(merged_header_infos),
+ captions=None,
+ origin=chunk.meta.origin,
+ )
+ ))
+ current_merge_candidate = None
+ continue
+
+ # 병합할 수 없으면 후보를 먼저 추가
+ merged_chunks.append(current_merge_candidate)
+ current_merge_candidate = None
+
+ merged_chunks.append(chunk)
+
+ # 마지막 병합 후보 처리
+ if current_merge_candidate:
+ merged_chunks.append(current_merge_candidate)
+
+ return merged_chunks
+
+ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ """문서를 청킹하여 반환
+
+ Args:
+ dl_doc: 청킹할 문서
+
+ Yields:
+ 토큰 제한에 맞게 분할된 청크들
+ """
+ doc_chunks = list(self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs))
+
+ if not doc_chunks:
+ return iter([])
+
+ doc_chunk = doc_chunks[0] # HierarchicalChunker는 하나의 청크만 반환
+
+ final_chunks = self._split_document_by_tokens(doc_chunk, dl_doc)
+
+ return iter(final_chunks)
+
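+# Illustrative usage sketch for the two chunkers above (comments only; `doc` is assumed
+# to be a DoclingDocument produced by DocumentProcessor.load_documents below):
+#
+#     chunker = HybridChunker(max_tokens=1000, merge_peers=True)
+#     chunks = list(chunker.chunk(dl_doc=doc))
+#     for c in chunks:
+#         print(c.meta.headings, len(c.text))
+#
+# HierarchicalChunker emits one structure-preserving chunk; HybridChunker then splits
+# it against the token budget and merges undersized neighbours.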
+
+class GenOSVectorMeta(BaseModel):
+ class Config:
+ extra = 'allow'
+
+    text: Optional[str] = None
+    n_char: Optional[int] = None
+    n_word: Optional[int] = None
+    n_line: Optional[int] = None
+    e_page: Optional[int] = None
+    i_page: Optional[int] = None
+    i_chunk_on_page: Optional[int] = None
+    n_chunk_of_page: Optional[int] = None
+    i_chunk_on_doc: Optional[int] = None
+    n_chunk_of_doc: Optional[int] = None
+    n_page: Optional[int] = None
+    reg_date: Optional[str] = None
+    chunk_bboxes: Optional[str] = None
+    media_files: Optional[str] = None
+    title: Optional[str] = None
+    created_date: Optional[int] = None
+
+
+class GenOSVectorMetaBuilder:
+ def __init__(self):
+ """빌더 초기화"""
+ self.text: Optional[str] = None
+ self.n_char: Optional[int] = None
+ self.n_word: Optional[int] = None
+ self.n_line: Optional[int] = None
+ self.i_page: Optional[int] = None
+ self.e_page: Optional[int] = None
+ self.i_chunk_on_page: Optional[int] = None
+ self.n_chunk_of_page: Optional[int] = None
+ self.i_chunk_on_doc: Optional[int] = None
+ self.n_chunk_of_doc: Optional[int] = None
+ self.n_page: Optional[int] = None
+ self.reg_date: Optional[str] = None
+ self.chunk_bboxes: Optional[str] = None
+ self.media_files: Optional[str] = None
+ self.title: Optional[str] = None
+ self.created_date: Optional[int] = None
+
+ def set_text(self, text: str) -> "GenOSVectorMetaBuilder":
+ """텍스트와 관련된 데이터를 설정"""
+ self.text = text
+ self.n_char = len(text)
+ self.n_word = len(text.split())
+ self.n_line = len(text.splitlines())
+ return self
+
+ def set_page_info(
+ self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int
+ ) -> "GenOSVectorMetaBuilder":
+ """페이지 정보 설정"""
+ self.i_page = i_page
+ self.i_chunk_on_page = i_chunk_on_page
+ self.n_chunk_of_page = n_chunk_of_page
+ return self
+
+ def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder":
+ """문서 전체의 청크 인덱스 설정"""
+ self.i_chunk_on_doc = i_chunk_on_doc
+ return self
+
+ def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder":
+ """글로벌 메타데이터 병합"""
+ for key, value in global_metadata.items():
+ if hasattr(self, key):
+ setattr(self, key, value)
+ return self
+
+ def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder":
+ chunk_bboxes = []
+ for item in doc_items:
+ for prov in item.prov:
+ label = item.self_ref
+ type_ = item.label
+ size = document.pages.get(prov.page_no).size
+ page_no = prov.page_no
+ bbox = prov.bbox
+ bbox_data = {'l': bbox.l / size.width,
+ 't': bbox.t / size.height,
+ 'r': bbox.r / size.width,
+ 'b': bbox.b / size.height,
+ 'coord_origin': bbox.coord_origin.value}
+ chunk_bboxes.append({'page': page_no, 'bbox': bbox_data, 'type': type_, 'ref': label})
+ self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None
+ self.chunk_bboxes = json.dumps(chunk_bboxes)
+ return self
+
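+    # Worked example for the bbox normalisation above (illustrative numbers only):
+    # a cell at l=72, t=700, r=300, b=650 on a 612x792 pt page is stored as
+    # {'l': 0.1176, 't': 0.8838, 'r': 0.4902, 'b': 0.8207, 'coord_origin': 'BOTTOMLEFT'}.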
+ def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder":
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref})
+ self.media_files = json.dumps(temp_list)
+ return self
+
+ def build(self) -> GenOSVectorMeta:
+ """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성"""
+ return GenOSVectorMeta(
+ text=self.text,
+ n_char=self.n_char,
+ n_word=self.n_word,
+ n_line=self.n_line,
+ i_page=self.i_page,
+ e_page=self.e_page,
+ i_chunk_on_page=self.i_chunk_on_page,
+ n_chunk_of_page=self.n_chunk_of_page,
+ i_chunk_on_doc=self.i_chunk_on_doc,
+ n_chunk_of_doc=self.n_chunk_of_doc,
+ n_page=self.n_page,
+ reg_date=self.reg_date,
+ chunk_bboxes=self.chunk_bboxes,
+ media_files=self.media_files,
+ title=self.title,
+ created_date=self.created_date,
+ )
+
+
+class DocumentProcessor:
+
+ def __init__(self):
+ '''
+ initialize Document Converter
+ '''
+ self.ocr_endpoint = "http://192.168.73.170:30880/ocr"
+ ocr_options = PaddleOcrOptions(
+ force_full_page_ocr=False,
+ lang=['korean'],
+ ocr_endpoint=self.ocr_endpoint,
+ text_score=0.3)
+
+ self.page_chunk_counts = defaultdict(int)
+ device = AcceleratorDevice.AUTO
+ num_threads = 8
+ accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+ # PDF 파이프라인 옵션 설정
+ self.pipe_line_options = PdfPipelineOptions()
+ self.pipe_line_options.generate_page_images = True
+ self.pipe_line_options.generate_picture_images = True
+ self.pipe_line_options.do_ocr = False
+ self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.ocr_options.lang = ["ko", 'en']
+ # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model"
+ # self.pipe_line_options.ocr_options.force_full_page_ocr = True
+ # ocr_options = TesseractOcrOptions()
+ # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert']
+ # ocr_options.path = './.tesseract/tessdata'
+ # self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.artifacts_path = Path("/models/")
+ self.pipe_line_options.do_table_structure = True
+ self.pipe_line_options.images_scale = 2
+ self.pipe_line_options.table_structure_options.do_cell_matching = True
+ self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE
+ self.pipe_line_options.accelerator_options = accelerator_options
+
+ # Simple 파이프라인 옵션을 인스턴스 변수로 저장
+ self.simple_pipeline_options = PipelineOptions()
+ self.simple_pipeline_options.save_images = False
+
+ # ocr 파이프라인 옵션
+ self.ocr_pipe_line_options = PdfPipelineOptions()
+ self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.do_ocr = True
+ self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True
+
+ # 기본 컨버터들 생성
+ self._create_converters()
+
+ # enrichment 옵션 설정
+ self.enrichment_options = DataEnrichmentOptions(
+ do_toc_enrichment=False,
+ extract_metadata=True,
+ toc_api_provider="custom",
+
+ # Mistral-Small-3.1-24B-Instruct-2503, 운영망
+ toc_api_base_url="https://genos.mnc.ai:3443/api/gateway/rep/serving/502/v1/chat/completions",
+ metadata_api_base_url="https://genos.mnc.ai:3443/api/gateway/rep/serving/502/v1/chat/completions",
+ toc_api_key="022653a3743849e299f19f19d323490b",
+ metadata_api_key="022653a3743849e299f19f19d323490b",
+
+ # Mistral-Small-3.1-24B-Instruct-2503, 한국은행 클러스터
+ # toc_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions",
+ # metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions",
+ # toc_api_key="9e32423947fd4a5da07a28962fe88487",
+ # metadata_api_key="9e32423947fd4a5da07a28962fe88487",
+
+ toc_model="model",
+ metadata_model="model",
+
+ toc_temperature=0.0,
+ toc_top_p=0.00001,
+ toc_seed=33,
+ toc_max_tokens=1000
+ )
+
+ def _create_converters(self):
+ """컨버터들을 생성하는 헬퍼 메서드"""
+ self.converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.pipe_line_options,
+ backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
+ self.second_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.pipe_line_options,
+ backend=PyPdfiumDocumentBackend
+ ),
+ },
+ )
+ self.ocr_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options,
+ backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
+ self.ocr_second_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options,
+ backend=PyPdfiumDocumentBackend
+ ),
+ },
+ )
+
+ def load_documents_with_docling(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ # kwargs에서 save_images 값을 가져와서 옵션 업데이트
+ save_images = kwargs.get('save_images', True)
+ include_wmf = kwargs.get('include_wmf', False)
+
+ # save_images 옵션이 현재 설정과 다르면 컨버터 재생성
+ if (self.simple_pipeline_options.save_images != save_images or
+ getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf):
+ self.simple_pipeline_options.save_images = save_images
+ self.simple_pipeline_options.include_wmf = include_wmf
+ self._create_converters()
+
+ try:
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+ except Exception as e:
+ conv_result: ConversionResult = self.second_converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ # kwargs에서 save_images 값을 가져와서 옵션 업데이트
+ save_images = kwargs.get('save_images', True)
+ include_wmf = kwargs.get('include_wmf', False)
+
+ # save_images 옵션이 현재 설정과 다르면 컨버터 재생성
+ if (self.simple_pipeline_options.save_images != save_images or
+ getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf):
+ self.simple_pipeline_options.save_images = save_images
+ self.simple_pipeline_options.include_wmf = include_wmf
+ self._create_converters()
+
+ try:
+ conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True)
+ except Exception as e:
+ conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def load_documents(self, file_path: str, **kwargs) -> DoclingDocument:
+ return self.load_documents_with_docling(file_path, **kwargs)
+
+ def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]:
+ chunker: HybridChunker = HybridChunker(
+ max_tokens=1000,
+ merge_peers=True
+ )
+
+ chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs))
+ for chunk in chunks:
+ self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1
+ return chunks
+
+ def safe_join(self, iterable):
+ if not isinstance(iterable, (list, tuple, set)):
+ return ''
+ return ''.join(map(str, iterable)) + '\n'
+
+ def parse_created_date(self, date_text: str) -> Optional[int]:
+ """
+ 작성일 텍스트를 파싱하여 YYYYMMDD 형식의 정수로 변환
+
+ Args:
+ date_text: 작성일 텍스트 (YYYY-MM 또는 YYYY-MM-DD 형식)
+
+ Returns:
+ YYYYMMDD 형식의 정수, 파싱 실패시 None
+ """
+ if not date_text or not isinstance(date_text, str) or date_text == "None":
+ return 0
+
+ # 공백 제거 및 정리
+ date_text = date_text.strip()
+
+ # YYYY-MM-DD 형식 매칭
+ match_full = re.match(r'^(\d{4})-(\d{1,2})-(\d{1,2})$', date_text)
+ if match_full:
+ year, month, day = match_full.groups()
+ try:
+ # 유효한 날짜인지 검증
+ datetime(int(year), int(month), int(day))
+ return int(f"{year}{month.zfill(2)}{day.zfill(2)}")
+ except ValueError:
+ pass
+
+ # YYYY-MM 형식 매칭 (일자는 01로 설정)
+ match_month = re.match(r'^(\d{4})-(\d{1,2})$', date_text)
+ if match_month:
+ year, month = match_month.groups()
+ try:
+ # 유효한 월인지 검증
+ datetime(int(year), int(month), 1)
+ return int(f"{year}{month.zfill(2)}01")
+ except ValueError:
+ pass
+
+ # YYYY 형식 매칭 (월일은 0101로 설정)
+ match_year = re.match(r'^(\d{4})$', date_text)
+ if match_year:
+ year = match_year.group(1)
+ try:
+ datetime(int(year), 1, 1)
+ return int(f"{year}0101")
+ except ValueError:
+ pass
+
+ return 0
+
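+    # Illustrative expectations for parse_created_date (comments only; values follow
+    # the regex and validation logic above):
+    #   "2024-3-5" -> 20240305
+    #   "2024-07"  -> 20240701
+    #   "2024"     -> 20240101
+    #   "N/A"      -> 0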
+ def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocument:
+
+ # 새로운 enriched result 받기
+ document = enrich_document(document, self.enrichment_options, **kwargs)
+ return document
+
+    async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str,
+                              request: Request, **kwargs: dict) -> list[dict]:
+ title = ""
+ created_date = 0
+ try:
+ if (document.key_value_items and
+ len(document.key_value_items) > 0 and
+ hasattr(document.key_value_items[0], 'graph') and
+ hasattr(document.key_value_items[0].graph, 'cells') and
+ len(document.key_value_items[0].graph.cells) > 1):
+ # 작성일 추출 (cells[1])
+ date_text = document.key_value_items[0].graph.cells[1].text
+ created_date = self.parse_created_date(date_text)
+ except (AttributeError, IndexError) as e:
+ pass
+
+ for item, _ in document.iterate_items():
+ if hasattr(item, 'label'):
+ if item.label == DocItemLabel.TITLE:
+ title = item.text.strip() if item.text else ""
+ break
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=document.num_pages(),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z',
+ created_date=created_date,
+ title=title
+ )
+
+ current_page = None
+ chunk_index_on_page = 0
+ vectors = []
+ upload_tasks = []
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_page = chunk.meta.doc_items[0].prov[0].page_no
+ # header 앞에 헤더 마커 추가 (HEADER: )
+ headers_text = "HEADER: " + ", ".join(chunk.meta.headings) + '\n' if chunk.meta.headings else ''
+ content = headers_text + chunk.text
+
+ if chunk_page != current_page:
+ current_page = chunk_page
+ chunk_index_on_page = 0
+
+ vector = (GenOSVectorMetaBuilder()
+ .set_text(content)
+ .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page])
+ .set_chunk_index(chunk_idx)
+ .set_global_metadata(**global_metadata)
+ .set_chunk_bboxes(chunk.meta.doc_items, document)
+ .set_media_files(chunk.meta.doc_items)
+ ).build()
+ vectors.append(vector)
+
+ chunk_index_on_page += 1
+ # file_list = self.get_media_files(chunk.meta.doc_items)
+ # upload_tasks.append(asyncio.create_task(
+ # upload_files(file_list, request=request)
+ # ))
+
+ if upload_tasks:
+ await asyncio.gather(*upload_tasks)
+
+ return vectors
+
+ def get_media_files(self, doc_items: list):
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'path': path, 'name': name})
+ return temp_list
+
+ def check_glyph_text(self, text: str, threshold: int = 1) -> bool:
+ """텍스트에 GLYPH 항목이 있는지 확인하는 메서드"""
+ if not text:
+ return False
+
+ # GLYPH 항목이 있는지 정규식으로 확인
+ matches = re.findall(r'GLYPH\w*', text)
+ if len(matches) >= threshold:
+ # print(f"Text has glyphs. len(matches): {len(matches)}. ")
+ return True
+
+ return False
+
+ def check_glyphs(self, document: DoclingDocument) -> bool:
+ """문서에 글리프가 있는지 확인하는 메서드"""
+ for item, level in document.iterate_items():
+ if isinstance(item, TextItem) and hasattr(item, 'prov') and item.prov:
+ page_no = item.prov[0].page_no
+ # page_texts += item.text
+
+ # GLYPH 항목이 있는지 확인. 정규식사용
+ matches = re.findall(r'GLYPH\w*', item.text)
+ if len(matches) > 10:
+ # print(f"Document has glyphs on page {page_no}. len(matches): {len(matches)}. ")
+ return True
+
+ return False
+
+ def ocr_all_table_cells(self, document: DoclingDocument, pdf_path) -> List[Dict[str, Any]]:
+ """
+ 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR을 수행합니다.
+ Args:
+ document: DoclingDocument 객체
+ pdf_path: PDF 파일 경로
+ Returns:
+ OCR이 완료된 문서의 DoclingDocument 객체
+ """
+ import fitz
+ import base64
+ import requests
+
+ def post_ocr_bytes(img_bytes: bytes, timeout=60) -> dict:
+ HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
+ payload = {"file": base64.b64encode(img_bytes).decode("ascii"), "fileType": 1, "visualize": False}
+ r = requests.post(self.ocr_endpoint, json=payload, headers=HEADERS, timeout=timeout)
+ if not r.ok:
+ # 진단에 도움되도록 본문 일부 출력
+ raise RuntimeError(f"OCR HTTP {r.status_code}: {r.text[:500]}")
+ return r.json()
+
+ def extract_ocr_fields(resp: dict):
+ """
+ resp: 위와 같은 OCR 응답 JSON(dict)
+ return: (rec_texts, rec_scores, rec_boxes) — 모두 list
+ """
+ if resp is None:
+ return [], [], []
+
+ # 최상위 상태 체크
+ if resp.get("errorCode") not in (0, None):
+ return [], [], []
+
+ ocr_results = (
+ resp.get("result", {})
+ .get("ocrResults", [])
+ )
+ if not ocr_results:
+ return [], [], []
+
+ pruned = (
+ ocr_results[0]
+ .get("prunedResult", {})
+ )
+ if not pruned:
+ return [], [], []
+
+ rec_texts = pruned.get("rec_texts", []) # list[str]
+ rec_scores = pruned.get("rec_scores", []) # list[float]
+ rec_boxes = pruned.get("rec_boxes", []) # list[[x1,y1,x2,y2]]
+
+ # 길이 불일치 방어: 최소 길이에 맞춰 자르기
+ n = min(len(rec_texts), len(rec_scores), len(rec_boxes))
+ return rec_texts[:n], rec_scores[:n], rec_boxes[:n]
+
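+        # Shape of the OCR response assumed by post_ocr_bytes / extract_ocr_fields
+        # (illustrative only, inferred from the key accesses above):
+        #   {
+        #     "errorCode": 0,
+        #     "result": {"ocrResults": [{"prunedResult": {
+        #         "rec_texts": ["셀 텍스트"], "rec_scores": [0.98],
+        #         "rec_boxes": [[x1, y1, x2, y2]]
+        #     }}]}
+        #   }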
+ try:
+ doc = fitz.open(pdf_path)
+
+ for table_idx, table_item in enumerate(document.tables):
+ if not table_item.data or not table_item.data.table_cells:
+ continue
+
+ b_ocr = False
+ for cell_idx, cell in enumerate(table_item.data.table_cells):
+ if self.check_glyph_text(cell.text, threshold=1):
+ b_ocr = True
+ break
+
+ if b_ocr is False:
+ # 글리프 깨진 텍스트가 없는 경우, OCR을 수행하지 않음
+ continue
+
+ for cell_idx, cell in enumerate(table_item.data.table_cells):
+
+ # Provenance 정보에서 위치 정보 추출
+ if not table_item.prov:
+ continue
+
+ page_no = table_item.prov[0].page_no - 1
+ bbox = cell.bbox
+
+ page = doc.load_page(page_no)
+
+ # 셀의 바운딩 박스를 사용하여 이미지에서 해당 영역을 잘라냄
+ cell_bbox = fitz.Rect(
+ bbox.l, min(bbox.t, bbox.b),
+ bbox.r, max(bbox.t, bbox.b)
+ )
+
+ # bbox 높이 계산 (PDF 좌표계 단위)
+ bbox_height = cell_bbox.height
+
+ # 목표 픽셀 높이
+ target_height = 20
+
+ # zoom factor 계산
+ # (너무 작은 bbox일 경우 0으로 나누는 걸 방지)
+ zoom_factor = target_height / bbox_height if bbox_height > 0 else 1.0
+ zoom_factor = min(zoom_factor, 4.0) # 최대 확대 비율 제한
+ zoom_factor = max(zoom_factor, 1) # 최소 확대 비율 제한
+
+ # 페이지를 이미지로 렌더링
+ mat = fitz.Matrix(zoom_factor, zoom_factor)
+ pix = page.get_pixmap(matrix=mat, clip=cell_bbox)
+ img_data = pix.tobytes("png")
+
+ result = post_ocr_bytes(img_data, timeout=60)
+ rec_texts, rec_scores, rec_boxes = extract_ocr_fields(result)
+
+ cell.text = ""
+ for t in rec_texts:
+ if len(cell.text) > 0:
+ cell.text += " "
+ cell.text += t if t else ""
+ except Exception as e:
+ print(f"OCR processing failed: {e}")
+ pass
+
+ return document
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ # kwargs['save_images'] = True # 이미지 처리
+ # kwargs['include_wmf'] = True # wmf 처리
+ document: DoclingDocument = self.load_documents(file_path, **kwargs)
+
+ if not check_document(document, self.enrichment_options) or self.check_glyphs(document):
+ # OCR이 필요하다고 판단되면 OCR 수행
+ document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
+
+ if document.origin.mimetype == "text/html":
+ pass
+ else:
+ # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지)
+ document: DoclingDocument = self.ocr_all_table_cells(document, file_path)
+
+ output_path, output_file = os.path.split(file_path)
+ filename, _ = os.path.splitext(output_file)
+ artifacts_dir = Path(f"{output_path}/{filename}")
+ if artifacts_dir.is_absolute():
+ reference_path = None
+ else:
+ reference_path = artifacts_dir.parent
+
+ document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+
+ document = self.enrichment(document, **kwargs)
+
+ has_text_items = False
+ for item, _ in document.iterate_items():
+ if (isinstance(item, (TextItem, ListItem, CodeItem, SectionHeaderItem)) and item.text and item.text.strip()) or (isinstance(item, TableItem) and item.data and len(item.data.table_cells) == 0):
+ has_text_items = True
+ break
+
+ if has_text_items:
+ # Extract Chunk from DoclingDocument
+ chunks: List[DocChunk] = self.split_documents(document, **kwargs)
+ else:
+ # text가 있는 item이 없을 때 document에 임의의 text item 추가
+ from docling_core.types.doc import ProvenanceItem
+
+ # 첫 번째 페이지의 기본 정보 사용 (1-based indexing)
+ page_no = 1
+
+ # ProvenanceItem 생성
+ prov = ProvenanceItem(
+ page_no=page_no,
+ bbox=BoundingBox(l=0, t=0, r=1, b=1), # 최소 bbox
+ charspan=(0, 1)
+ )
+
+ # document에 temp text item 추가
+ document.add_text(
+ label=DocItemLabel.TEXT,
+ text=".",
+ prov=prov
+ )
+
+ # split_documents 호출
+ chunks: List[DocChunk] = self.split_documents(document, **kwargs)
+ # await assert_cancelled(request)
+
+ vectors = []
+ if len(chunks) >= 1:
+ vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs)
+ else:
+ raise GenosServiceException(1, f"chunk length is 0")
+
+ """
+ # 미디어 파일 업로드 방법
+ media_files = [
+ { 'path': '/tmp/graph.jpg', 'name': 'graph.jpg', 'type': 'image' },
+ { 'path': '/result/1/graph.jpg', 'name': '1/graph.jpg', 'type': 'image' },
+ ]
+
+ # 업로드 요청 시에는 path, name 필요
+ file_list = [{k: v for k, v in file.items() if k != 'type'} for file in media_files]
+ await upload_files(file_list, request=request)
+
+ # 메타에 저장시에는 name, type 필요
+ meta = [{k: v for k, v in file.items() if k != 'path'} for file in media_files]
+ vectors[0].media_files = meta
+ """
+
+ return vectors
+
+
+class GenosServiceException(Exception):
+ # GenOS 와의 의존성 부분 제거를 위해 추가
+ def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None:
+ self.code = 1
+ self.error_code = error_code
+ self.error_msg = error_msg or "GenOS Service Exception"
+ self.msg_params = msg_params or {}
+
+ def __repr__(self) -> str:
+ class_name = self.__class__.__name__
+ return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})"
+
+
+# GenOS 와의 의존성 제거를 위해 추가
+async def assert_cancelled(request: Request):
+ if await request.is_disconnected():
+ raise GenosServiceException(1, f"Cancelled")
diff --git a/genon/preprocessor/facade/json_processor.py b/genon/preprocessor/facade/json_processor.py
new file mode 100644
index 0000000000..8616aab79b
--- /dev/null
+++ b/genon/preprocessor/facade/json_processor.py
@@ -0,0 +1,344 @@
+from datetime import datetime
+from typing import Optional, Iterable, Any, List, Dict, Tuple
+from collections import defaultdict
+from fastapi import Request
+from pydantic import BaseModel, ConfigDict
+from collections import Counter
+
+import re
+import asyncio
+import json
+import ast
+
+import pandas as pd
+
+from docling_core.types.doc import (
+ BoundingBox,
+ #CoordOrigin,
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ GroupLabel,
+ #ImageRef,
+ #ProvenanceItem,
+ #Size,
+ #TableCell,
+ #TableData,
+ #GroupItem,
+ DocItem,
+ PictureItem,
+ SectionHeaderItem,
+ TableItem,
+ TextItem,
+ PageItem
+)
+
+from docling.document_converter import DocumentConverter, PdfFormatOption, HTMLFormatOption
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling_core.types import DoclingDocument
+
+KV_MAP = {
+ "url": ["URL"],
+ "ins_date": [
+ "입력일", # 경상오더
+ "발행일자", # TM
+ ],
+ "title": [
+ "오더제목", # 경상오더
+ "고장내용", # 발전정지
+ "TM제목", # TM
+ ],
+ "num": [
+ "오더번호", # 경상오더
+ "번호", # 발전정지
+ "TM번호", # TM
+ ],
+ "Powersys": ["발전소"], # 발전정지
+ "desman": ["설계자"], # 경상오더
+ "desdept": ["설계부서"], # 경상오더, TM
+ "hogi": ["호기"],
+ "des_date": ["설계일"],
+ "stopcat": ["정지종별"], # 발전정지
+ "stopcat_code": ["정지종별코드"], # 발전정지
+ "parcat": ["대분류"], # 발전정지
+ "cat": ["분류"], # 발전정지
+ "event_date": ["발생일시"], # 발전정지
+ "rec_date": ["복구일시"], # 발전정지
+ "pubman": ["발행자"],
+ "pubdept": ["발행부서"],
+ "status": ["진행상태"]
+}
+
+class GenOSVectorMeta(BaseModel):
+ model_config = ConfigDict(extra="allow")
+
+class GenOSVectorMetaBuilder:
+ def __init__(self):
+ """빌더 초기화"""
+ self.text: Optional[str] = None
+ self.n_char: Optional[int] = None
+ self.n_word: Optional[int] = None
+ self.n_line: Optional[int] = None
+ self.i_page: Optional[int] = None
+ self.i_chunk_on_page: Optional[int] = None
+ self.n_chunk_of_page: Optional[int] = None
+ self.i_chunk_on_doc: Optional[int] = None
+ self.n_chunk_of_doc: Optional[int] = None
+ self.n_page: Optional[int] = None
+ self.reg_date: Optional[str] = None
+ self.bboxes: Optional[str] = None
+ self.url: Optional[str] = None
+
+ self.data = {"text": None,
+ "n_char": None,
+ "n_line": None,
+ "i_page": None,
+ "i_chunk_on_page": None,
+ "n_chunk_of_page": None,
+ "i_chunk_on_doc": None,
+ "n_chunk_of_doc": None,
+ "n_page": None,
+ "reg_date": None,
+ "bboxes": None,
+ "url": None
+ }
+
+ def set_text(self, text: str) -> "GenOSVectorMetaBuilder":
+ """텍스트와 관련된 데이터를 설정"""
+
+ self.text = text
+ self.n_char = len(text)
+ self.n_word = len(text.split())
+ self.n_line = len(text.splitlines())
+
+ self.data["text"] = text
+ self.data["n_char"] = len(text)
+ self.data["n_word"] = len(text.split())
+ self.data["n_line"] = len(text.splitlines())
+
+ return self
+
+ def set_page_info(
+ self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int
+ ) -> "GenOSVectorMetaBuilder":
+ """페이지 정보 설정"""
+ self.i_page = i_page
+ self.i_chunk_on_page = i_chunk_on_page
+ self.n_chunk_of_page = n_chunk_of_page
+
+ self.data["i_page"] = i_page
+ self.data["i_chunk_on_page"] = i_chunk_on_page
+ self.data["n_chunk_of_page"] = n_chunk_of_page
+
+ return self
+
+ def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder":
+ """문서 전체의 청크 인덱스 설정"""
+ self.i_chunk_on_doc = i_chunk_on_doc
+
+ self.data["i_chunk_on_doc"] = i_chunk_on_doc
+
+ return self
+
+ def set_bboxes(self, bbox: BoundingBox) -> "GenOSVectorMetaBuilder":
+ """Bounding Boxes 정보 설정"""
+ # bboxes.append({
+ # 'p1': {'x': rect[0] / fitz_page.rect.width, 'y': rect[1] / fitz_page.rect.height},
+ # 'p2': {'x': rect[2] / fitz_page.rect.width, 'y': rect[3] / fitz_page.rect.height},
+ # })
+ # NOTE: docling은 BOTTOMLEFT인데 해당 좌표 그대로 활용되는지 ?
+ conv = []
+ conv.append({
+ 'p1': {'x': 0, 'y': 0},
+ 'p2': {'x': 0, 'y': 0},
+ })
+ self.bboxes = json.dumps(conv)
+
+ self.data["bboxes"] = json.dumps(conv)
+
+ return self
+
+ def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder":
+ """글로벌 메타데이터 병합"""
+
+ for key, value in global_metadata.items():
+ setattr(self, key, value)
+ self.data[key] = value
+
+
+ return self
+
+ def build(self) -> GenOSVectorMeta:
+ """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성"""
+ return GenOSVectorMeta(text=self.data.pop("text", "ERROR: no text"), **self.data)
+
+class DocumentProcessor:
+ def __init__(self):
+ '''
+ initialize Document Converter
+ '''
+ self.page_chunk_counts = defaultdict(int)
+ # device = AcceleratorDevice.AUTO
+ num_threads = 4
+
+ def preprocess_json(self, jsonf):
+
+ metadata_keys = []
+ date_keys = []
+
+ for jsonf_k, _ in jsonf.items():
+ for k, v in KV_MAP.items():
+ if jsonf_k in v:
+ metadata_keys.append(jsonf_k)
+ if "date" in k:
+ date_keys.append(jsonf_k)
+
+ # date처리, json확인해보고 빼도됨.
+ for k in date_keys:
+ if k in jsonf:
+                try:
+                    # jsonf[k] = pd.to_datetime(jsonf[k], errors='coerce').isoformat()
+                    date_value = jsonf[k]
+                    if date_value:
+                        dt_obj = self._parse_date_string(str(date_value))
+                        jsonf[k] = int(dt_obj.strftime("%Y%m%d")) if dt_obj else ""
+                    else:
+                        jsonf[k] = ""
+                except Exception:
+                    pass
+
+ # nan 처리
+ for key in jsonf.keys():
+ try:
+ if not isinstance(jsonf[key], list) and pd.isna(jsonf[key]):
+ jsonf[key] = ""
+            except Exception:
+ pass
+
+ # 메타데이터 처리
+ metadata = {key: jsonf[key] for key in metadata_keys if key in jsonf}
+
+ # formatted text 생성
+ formatted_text = "\n ".join([f"{key} : {str(jsonf[key])}" for key in jsonf if not key.startswith('Unnamed')])
+
+ metadata_key_list = list(metadata.keys())
+ for k, v_list in KV_MAP.items():
+ for metadata_key in metadata_key_list:
+ if metadata_key in v_list:
+ if k in metadata.keys():
+ print(f"@@@@ 이미 있는 키: {metadata_key} ---X-->> {k}")
+ print(f"@@@@ 있는 값: {k} : {metadata[k]}")
+ metadata.pop(metadata_key)
+ else:
+ metadata[k] = metadata.pop(metadata_key)
+
+ chunk = {
+ "id": 1,
+ "text": formatted_text,
+ "metadata": metadata
+ }
+
+ return chunk
+
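+    # Illustrative example of preprocess_json (comments only; the record is hypothetical,
+    # field names are taken from KV_MAP above):
+    #   input : {"오더번호": "A-100", "오더제목": "펌프 점검", "입력일": "2024-03-05"}
+    #   output: {"id": 1,
+    #            "text": "오더번호 : A-100\n 오더제목 : 펌프 점검\n 입력일 : 20240305",
+    #            "metadata": {"num": "A-100", "title": "펌프 점검", "ins_date": 20240305}}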
+
+ def load_documents(self, file_path: str):
+ with open(file_path, 'r', encoding='utf-8') as f:
+ jsonfile = json.load(f)
+
+ return jsonfile
+
+ def _parse_date_string(self, date_str:str)-> Optional[datetime]:
+ formats = [
+ "%Y-%m-%d",
+ "%Y%m%d",
+ "%Y-%m-%d %H:%M:%S",
+ "%Y-%m-%dT%H:%M:%SZ",
+ "%Y-%m-%d %H:%M:%S%z",
+ "%Y-%m-%dT%H:%M:%S.%fZ"
+ ]
+
+        if not date_str or date_str.strip() == "":
+            return None
+ for fmt in formats:
+ try:
+ return datetime.strptime(date_str, fmt)
+ except ValueError:
+ continue
+
+        return None
+
+
+    def split_documents(self, documents: dict, **kwargs: dict) -> List[Dict]:
+ chunk_size = 1000
+ text = documents.get("text", "error")
+ chunks = []
+ chunk = ""
+
+ words = text.split(" ")
+
+ for word in words:
+ if len(chunk) + len(word) > chunk_size:
+ chunks.append(chunk)
+ chunk = word
+ else:
+ chunk += (" " + word) if chunk else word
+
+ if chunk:
+ chunks.append(chunk)
+
+ new_chunks = []
+ for chunk in chunks:
+ documents['text'] = chunk
+ new_chunks.append(documents.copy())
+
+ return new_chunks
+
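+    # Sketch of the splitting behaviour above (illustrative): a "text" field of roughly
+    # 2,500 characters yields three dict copies whose "text" slices are each at most
+    # about 1000 characters (unless a single word is longer), split on spaces; every
+    # copy keeps the same "id" and "metadata" values.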
+
+    def compose_vectors(self, chunks: list[dict], file_path: str) -> list[dict]:
+
+        first_chunk = chunks[0]
+
+        global_metadata = dict(
+            n_chunk_of_doc=len(chunks),
+            n_page=int(1),
+            reg_date=datetime.now().isoformat(timespec='seconds') + 'Z',
+            **first_chunk['metadata'],
+        )
+
+        vectors = []
+        for i_chunk, chunk in enumerate(chunks):
+            vector = (GenOSVectorMetaBuilder()
+                      .set_text(chunk["text"])
+                      .set_page_info(1, i_chunk, len(chunks))
+                      .set_chunk_index(i_chunk)
+                      .set_global_metadata(**global_metadata)
+                      .set_bboxes(None)
+                      ).build()
+            vectors.append(vector)
+
+ return vectors
+
+ async def __call__(self, request: Request, file_path: str, **kwargs): # request: Request
+
+ file: dict = self.load_documents(file_path)
+
+ document: dict = self.preprocess_json(file)
+
+ chunks: list[dict] = self.split_documents(document, **kwargs)
+
+ vectors: list[dict] = self.compose_vectors(chunks=chunks, file_path= file_path)
+
+ return vectors
\ No newline at end of file
diff --git "a/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py" "b/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py"
index df58f2f2a7..c69a67dd5b 100644
--- "a/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py"
+++ "b/genon/preprocessor/facade/legacy/\354\240\201\354\236\254\354\232\251(\354\231\270\353\266\200)_ocr.py"
@@ -1253,8 +1253,11 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
# OCR이 필요하다고 판단되면 OCR 수행
document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
- # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지)
- document: DoclingDocument = self.ocr_all_table_cells(document, file_path)
+ if document.origin.mimetype == "text/html":
+ pass
+ else:
+ # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지)
+ document: DoclingDocument = self.ocr_all_table_cells(document, file_path)
output_path, output_file = os.path.split(file_path)
filename, _ = os.path.splitext(output_file)
diff --git a/genon/preprocessor/facade/oneagent_processor.py b/genon/preprocessor/facade/oneagent_processor.py
new file mode 100644
index 0000000000..93870bace0
--- /dev/null
+++ b/genon/preprocessor/facade/oneagent_processor.py
@@ -0,0 +1,1646 @@
+from __future__ import annotations
+
+from collections import defaultdict
+
+import asyncio
+import fitz
+import json
+import math
+import os
+import pandas as pd
+import pydub
+import requests
+import shutil
+import subprocess
+import sys
+import threading
+import uuid
+import warnings
+from datetime import datetime
+from fastapi import Request
+from glob import glob
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import (
+ # TextLoader, # TXT
+ PyMuPDFLoader, # PDF
+ DataFrameLoader, # DataFrame
+ UnstructuredWordDocumentLoader, # DOC and DOCX
+ UnstructuredPowerPointLoader, # PPT and PPTX
+ UnstructuredImageLoader, # JPG, PNG
+ UnstructuredMarkdownLoader, # Markdown
+ UnstructuredFileLoader, # Generic fallback
+)
+from langchain_core.documents import Document
+from markdown2 import markdown
+from pandas import DataFrame
+from pathlib import Path
+from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
+from typing import Any, Iterable, Iterator, List, Optional, Union
+from typing_extensions import Self
+
+try:
+ import semchunk
+ from transformers import AutoTokenizer, PreTrainedTokenizerBase
+except ImportError:
+ raise RuntimeError(
+ "Module requires 'chunking' extra; to install, run: "
+ "`pip install 'docling-core[chunking]'`"
+ )
+try:
+ import chardet
+except ImportError:
+ raise RuntimeError("Module 'chardet' not imported. Run `pip install chardet`.")
+try:
+ from weasyprint import HTML
+except ImportError:
+ print("Warning: WeasyPrint could not be imported. PDF conversion features will be disabled.")
+ HTML = None
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.document_converter import DocumentConverter, HwpxFormatOption, WordFormatOption
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta
+from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc import (
+ DocItem, DocItemLabel, DoclingDocument,
+ PictureItem, SectionHeaderItem, TableItem, TextItem
+)
+from docling_core.types.doc.document import LevelNumber, ListItem, CodeItem
+from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend
+# from utils import assert_cancelled
+# from genos_utils import upload_files, merge_overlapping_bboxes
+
+# import platform
+from pathlib import Path
+import os
+import subprocess
+import tempfile
+import shutil
+import unicodedata
+
+import logging
+
+for n in ("fontTools", "fontTools.ttLib", "fontTools.ttLib.ttFont"):
+    lg = logging.getLogger(n)
+    lg.setLevel(logging.CRITICAL)
+    lg.propagate = False
+logging.getLogger().setLevel(logging.WARNING)
+
+# pdf 변환 대상 확장자
+CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx']
+
+
+
+### @@@@ 성민: 가드레일 용 ###
+import re
+
+GUARDRAIL_WORKFLOW_ID = 694
+GUARDRAIL_BEARER_TOKEN = '23c3898fe3264fd597961af23a68fe7c'
+# GENOS_URL = 'https://ai.komipo.co.kr:30908/'
+# @@@@ 내부 호출로 변경
+GENOS_URL = 'http://llmops-gateway-api-service:8080'
+
+
+from functools import wraps
+
+def guardrail(func):
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        result = await func(*args, **kwargs)
+        for r in result:
+            if not hasattr(r, "text"):
+                continue
+
+            url = f"{GENOS_URL}/workflow/{GUARDRAIL_WORKFLOW_ID}"
+            headers = dict(Authorization=f"Bearer {GUARDRAIL_BEARER_TOKEN}")
+            body = {'question': r.text}
+
+            res = requests.post(f'{url}/run/v2', json=body, headers=headers)
+            answer = res.json()['data']['text']
+
+            if answer.startswith("[UNSAFE]"):
+                r.text = "부적절한 텍스트가 포함되어 있으므로 해당 청크를 제거합니다."
+
+        return result
+    return wrapper
+
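+# Illustrative use of the guardrail decorator (a sketch, not wired up here): it is
+# intended to wrap an async processor entry point that returns chunk/vector objects
+# exposing a `.text` attribute, e.g.
+#
+#     @guardrail
+#     async def __call__(self, request, file_path, **kwargs):
+#         ...
+#         return vectors
+#
+# Note that requests.post is a blocking call inside an async wrapper, and one
+# synchronous workflow request is made per returned item that has a `.text` attribute.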
+
+def convert_to_pdf(file_path: str) -> str | None:
+ """
+ LibreOffice로 PDF 변환을 시도한다.
+ 실패해도 예외를 던지지 않고 None을 반환한다.
+ """
+ try:
+ in_path = Path(file_path).resolve()
+ out_dir = in_path.parent
+ pdf_path = in_path.with_suffix('.pdf')
+
+ # headless에서 UTF-8 locale 보장
+ env = os.environ.copy()
+ env.setdefault("LANG", "C.UTF-8")
+ env.setdefault("LC_ALL", "C.UTF-8")
+
+ # 확장자에 따라 필터(특히 .ppt는 impress 필터)
+ ext = in_path.suffix.lower()
+ if ext in ('.ppt', '.pptx'):
+ convert_arg = "pdf:impress_pdf_Export"
+ elif ext in ('.doc', '.docx'):
+ convert_arg = "pdf:writer_pdf_Export"
+ elif ext in ('.xls', '.xlsx', '.csv'):
+ convert_arg = "pdf:calc_pdf_Export"
+ else:
+ convert_arg = "pdf"
+
+ # 비ASCII 파일명 이슈 대비 임시 ASCII 파일명 복사본 시도
+ try:
+ in_path.name.encode('ascii')
+ candidates = [in_path]
+ tmp_dir = None
+ except UnicodeEncodeError:
+ tmp_dir = Path(tempfile.mkdtemp())
+ ascii_name = unicodedata.normalize('NFKD', in_path.stem).encode('ascii', 'ignore').decode('ascii') or "file"
+ ascii_copy = tmp_dir / f"{ascii_name}{in_path.suffix}"
+ shutil.copy2(in_path, ascii_copy)
+ candidates = [ascii_copy, in_path]
+
+ for cand in candidates:
+ cmd = [
+ "soffice", "--headless",
+ "--convert-to", convert_arg,
+ "--outdir", str(out_dir),
+ str(cand)
+ ]
+ proc = subprocess.run(cmd, env=env, capture_output=True, text=True)
+ if proc.returncode == 0 and pdf_path.exists():
+ # 성공
+ if tmp_dir:
+ shutil.rmtree(tmp_dir, ignore_errors=True)
+ return str(pdf_path)
+ # 실패해도 계속 시도 (로그만 찍고 무시)
+ print(f"[convert_to_pdf] stderr: {proc.stderr.strip()}")
+
+ if tmp_dir:
+ shutil.rmtree(tmp_dir, ignore_errors=True)
+ return None
+ except Exception as e:
+ # 어떤 에러든 삼키고 None 반환
+ print(f"[convert_to_pdf] error: {e}")
+ return None
+
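+# Minimal usage sketch for convert_to_pdf (hypothetical path; the function returns the
+# generated PDF path on success and None when LibreOffice conversion fails):
+#
+#     pdf_path = convert_to_pdf("/data/docs/report.docx")
+#     if pdf_path is None:
+#         print("conversion failed; continuing with the original file")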
+
+def _get_pdf_path(file_path: str) -> str:
+ """
+ 다양한 파일 확장자를 PDF 확장자로 변경하는 공통 함수
+
+ Args:
+ file_path (str): 원본 파일 경로
+
+ Returns:
+ str: PDF 확장자로 변경된 파일 경로
+ """
+    path = Path(file_path)
+    if path.suffix.lower() in CONVERTIBLE_EXTENSIONS:
+        return str(path.with_suffix('.pdf'))
+    return file_path
+
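+# Illustrative examples for _get_pdf_path (comments only):
+#   "/data/공문.hwp"    -> "/data/공문.pdf"
+#   "/data/report.docx" -> "/data/report.pdf"
+#   "/data/scan.tiff"   -> "/data/scan.tiff"   (not a convertible extension)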
+
+def install_packages(packages):
+ for package in packages:
+ try:
+ __import__(package)
+ except ImportError:
+ print(f"[!] {package} 패키지가 없습니다. 설치를 시도합니다.")
+ subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
+
+
+class GenOSVectorMeta(BaseModel):
+ class Config:
+ extra = 'allow'
+
+ text: str | None = None
+ n_char: int | None = None
+ n_word: int | None = None
+ n_line: int | None = None
+ i_page: int | None = None
+ e_page: int | None = None
+ i_chunk_on_page: int | None = None
+ n_chunk_of_page: int | None = None
+ i_chunk_on_doc: int | None = None
+ n_chunk_of_doc: int | None = None
+ n_page: int | None = None
+ reg_date: str | None = None
+ chunk_bboxes: str | None = None
+ media_files: str | None = None
+
+
+class GenOSVectorMetaBuilder:
+ def __init__(self):
+ """빌더 초기화"""
+ self.text: Optional[str] = None
+ self.n_char: Optional[int] = None
+ self.n_word: Optional[int] = None
+ self.n_line: Optional[int] = None
+ self.i_page: Optional[int] = None
+ self.e_page: Optional[int] = None
+ self.i_chunk_on_page: Optional[int] = None
+ self.n_chunk_of_page: Optional[int] = None
+ self.i_chunk_on_doc: Optional[int] = None
+ self.n_chunk_of_doc: Optional[int] = None
+ self.n_page: Optional[int] = None
+ self.reg_date: Optional[str] = None
+ self.chunk_bboxes: Optional[str] = None
+ self.media_files: Optional[str] = None
+ # self.title: Optional[str] = None
+ # self.created_date: Optional[int] = None
+
+ def set_text(self, text: str) -> "GenOSVectorMetaBuilder":
+ """텍스트와 관련된 데이터를 설정"""
+ self.text = text
+ self.n_char = len(text)
+ self.n_word = len(text.split())
+ self.n_line = len(text.splitlines())
+ return self
+
+ def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder":
+ """페이지 정보 설정"""
+ self.i_page = i_page
+ self.i_chunk_on_page = i_chunk_on_page
+ self.n_chunk_of_page = n_chunk_of_page
+ return self
+
+ def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder":
+ """문서 전체의 청크 인덱스 설정"""
+ self.i_chunk_on_doc = i_chunk_on_doc
+ return self
+
+ def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder":
+ """글로벌 메타데이터 병합"""
+ for key, value in global_metadata.items():
+ if hasattr(self, key):
+ setattr(self, key, value)
+ return self
+
+ def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder":
+ chunk_bboxes = []
+ for item in doc_items:
+ for prov in item.prov:
+ label = item.self_ref
+ type_ = item.label
+ size = document.pages.get(prov.page_no).size
+ page_no = prov.page_no
+ bbox = prov.bbox
+ bbox_data = {
+ 'l': bbox.l / size.width,
+ 't': bbox.t / size.height,
+ 'r': bbox.r / size.width,
+ 'b': bbox.b / size.height,
+ 'coord_origin': bbox.coord_origin.value
+ }
+ chunk_bboxes.append({
+ 'page': page_no,
+ 'bbox': bbox_data,
+ 'type': type_,
+ 'ref': label
+ })
+ self.e_page = max([bbox['page'] for bbox in chunk_bboxes]) if chunk_bboxes else None
+ self.chunk_bboxes = json.dumps(chunk_bboxes)
+ return self
+
+ def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder":
+ temp_list = []
+ if not doc_items:
+ self.media_files = ""
+ return self
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'name': name, 'type': 'image', 'ref': item.self_ref})
+ self.media_files = json.dumps(temp_list)
+ return self
+
+ def build(self) -> GenOSVectorMeta:
+ """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성"""
+ return GenOSVectorMeta(
+ text=self.text,
+ n_char=self.n_char,
+ n_word=self.n_word,
+ n_line=self.n_line,
+ i_page=self.i_page,
+ e_page=self.e_page,
+ i_chunk_on_page=self.i_chunk_on_page,
+ n_chunk_of_page=self.n_chunk_of_page,
+ i_chunk_on_doc=self.i_chunk_on_doc,
+ n_chunk_of_doc=self.n_chunk_of_doc,
+ n_page=self.n_page,
+ reg_date=self.reg_date,
+ chunk_bboxes=self.chunk_bboxes,
+ media_files=self.media_files,
+ )
+
+
+class HwpLoader:
+ def __init__(self, file_path: str):
+ self.file_path = file_path
+ self.output_dir = os.path.join('/tmp', str(uuid.uuid4()))
+ os.makedirs(self.output_dir, exist_ok=True)
+
+ def load(self):
+ try:
+ subprocess.run(['hwp5html', self.file_path, '--output', self.output_dir], check=True, timeout=600)
+ converted_file_path = os.path.join(self.output_dir, 'index.xhtml')
+ pdf_save_path = _get_pdf_path(self.file_path)
+ HTML(converted_file_path).write_pdf(pdf_save_path)
+ loader = PyMuPDFLoader(pdf_save_path)
+ return loader.load()
+ except Exception as e:
+ print(f"Failed to convert {self.file_path} to XHTML")
+ raise e
+ finally:
+ if os.path.exists(self.output_dir):
+ shutil.rmtree(self.output_dir)
+
+
+class TextLoader:
+ def __init__(self, file_path: str):
+ self.file_path = file_path
+ self.output_dir = os.path.join('/tmp', str(uuid.uuid4()))
+ os.makedirs(self.output_dir, exist_ok=True)
+
+ def load(self):
+ try:
+ with open(self.file_path, 'rb') as f:
+ raw = f.read()
+ enc = chardet.detect(raw).get('encoding') or ''
+ encodings = [enc] if enc and enc.lower() not in ('ascii', 'unknown') else []
+ encodings += ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1']
+
+ content = None
+ for e in encodings:
+ try:
+ content = raw.decode(e) # 전체 파일로 디코딩
+ break
+ except UnicodeDecodeError:
+ continue
+ if content is None:
+ content = raw.decode('utf-8', errors='replace')
+
+ # 4) PDF 변환 유지
+ html = f"{content}"
+ html_path = os.path.join(self.output_dir, 'temp.html')
+ with open(html_path, 'w', encoding='utf-8') as f:
+ f.write(html)
+ # pdf_path = (self.file_path
+ # .replace('.txt', '.pdf')
+ # .replace('.json', '.pdf'))
+ pdf_path = _get_pdf_path(self.file_path)
+            if HTML:
+                HTML(html_path).write_pdf(pdf_path)
+                loader = PyMuPDFLoader(pdf_path)
+                return loader.load()
+            # WeasyPrint를 사용할 수 없으면 Document 직접 반환 (원형 스키마 유지)
+            return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})]
+
+ except Exception:
+ # 실패 시에도 스키마는 그대로 유지해 반환
+ for e in ['utf-8', 'cp949', 'euc-kr', 'iso-8859-1']:
+ try:
+ with open(self.file_path, 'r', encoding=e) as f:
+ content = f.read()
+ return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})]
+ except UnicodeDecodeError:
+ continue
+ with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f:
+ content = f.read()
+ return [Document(page_content=content, metadata={'source': self.file_path, 'page': 0})]
+ finally:
+ if os.path.exists(self.output_dir):
+ shutil.rmtree(self.output_dir)
+
+
+class TabularLoader:
+ def __init__(self, file_path: str, ext: str):
+
+ packages = ['openpyxl', 'chardet']
+
+ install_packages(packages)
+
+ self.file_path = file_path
+ if ext == ".csv":
+ # convert_to_pdf(file_path) csv는 Pdf 변환 안 함
+ self.data_dict = self.load_csv_documents(file_path)
+ elif ext == ".xlsx":
+ # convert_to_pdf(file_path) xlsx는 Pdf 변환 안 함
+ self.data_dict = self.load_xlsx_documents(file_path)
+ else:
+ print(f"[!] Inadequate extension for TabularLoader: {ext}")
+ return
+
+ def check_sql_dtypes(self, df):
+ df = df.convert_dtypes()
+ res = []
+ for col in df.columns:
+ # col_name = col.strip().replace(' ', '_')
+ dtype = str(df.dtypes[col]).lower()
+
+ if 'int' in dtype:
+ if '64' in dtype:
+ sql_dtype = 'BIGINT'
+ else:
+ sql_dtype = 'INT'
+ elif 'float' in dtype:
+ sql_dtype = 'FLOAT'
+ elif 'bool' in dtype:
+ sql_dtype = 'BOOLEAN'
+ elif 'date' in dtype:
+ sql_dtype = 'DATE'
+ df[col] = df[col].astype(str)
+ elif 'datetime' in dtype:
+ sql_dtype = 'DATETIME'
+ df[col] = df[col].astype(str)
+ # else:
+ # max_len = df[col].str.len().max().item() + 10
+ # sql_dtype = f'VARCHAR({max_len})'
+ else:
+ lens = df[col].astype(str).str.len()
+ max_len_val = lens.max()
+ max_len = int(0 if pd.isna(max_len_val) else max_len_val) + 10
+ sql_dtype = f'VARCHAR({max_len})'
+
+ res.append([col, sql_dtype])
+
+ return df, res
+
+ def process_data_rows(self, data: dict):
+ """Arg: data (keys: 'sheet_name', 'page_column', 'page_column_type', 'documents')"""
+
+ rows = []
+ for doc in data["documents"]:
+ row = {}
+ if 'int' in data["page_column_type"]:
+ row[data["page_column"]] = int(doc.page_content)
+ elif 'float' in data["page_column_type"]:
+ row[data["page_column"]] = float(doc.page_content)
+ elif 'bool' in data["page_column_type"]:
+ if doc.page_content.lower() == 'true':
+ row[data["page_column"]] = True
+ elif doc.page_content.lower() == 'false':
+ row[data["page_column"]] = False
+ else:
+ raise ValueError(f"Invalid boolean string: {doc.page_content}")
+ else:
+ row[data["page_column"]] = doc.page_content
+
+ row.update(doc.metadata)
+ rows.append(row)
+
+ processed_data = {"sheet_name": data["sheet_name"], "data_rows": rows, "data_types": data["dtypes"]}
+ return processed_data
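+    # Illustrative example (not from the original code): with page_column="name" (a string column)
+    # and one document whose page_content is "kim" with metadata {"age": 30}, the result is
+    #   {"sheet_name": "table_1", "data_rows": [{"name": "kim", "age": 30}],
+    #    "data_types": [["name", "VARCHAR(128,100)"], ["age", "BIGINT"]]}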
+
+ def load_csv_documents(self, file_path: str, **kwargs: dict):
+ import chardet
+
+ with open(file_path, "rb") as f:
+ raw_file = f.read(10000)
+ enc_type = chardet.detect(raw_file)['encoding']
+ df = pd.read_csv(file_path, encoding=enc_type, index_col=False)
+ df = df.fillna('null') # csv 파일에서도 xlsx 파일과 동일하게 null로 채움
+ df, dtypes_str = self.check_sql_dtypes(df)
+
+        col = None
+        col_type = None
+        for candidate in df.columns:
+            try:
+                col_type = str(df[candidate].dtype)
+                df = df.astype({candidate: 'str'})
+                col = candidate
+                break
+            except Exception:
+                continue
+        if col is None:
+            raise ValueError(
+                f"No column could be converted to the string type, so LangChain Documents cannot be loaded: {dtypes_str}")
+
+ loader = DataFrameLoader(df, page_content_column=col)
+ documents = loader.load()
+
+ data = {
+ "sheet_name": "table_1",
+ "page_column": col,
+ "page_column_type": col_type,
+ "documents": documents,
+ "dtypes": dtypes_str
+ }
+ data = self.process_data_rows(data) # including only one sheet as it's a csv file
+ data_dict = {"data": [data]}
+ return data_dict
+
+ def load_xlsx_documents(self, file_path: str, **kwargs: dict):
+ dfs = pd.read_excel(file_path, sheet_name=None)
+ sheets = []
+ for sheet_name, df in dfs.items():
+ df = df.fillna('null')
+ df, dtypes_str = self.check_sql_dtypes(df)
+
+            col = None
+            col_type = None
+            for candidate in df.columns:
+                try:
+                    col_type = str(df[candidate].dtype)
+                    df = df.astype({candidate: 'str'})
+                    col = candidate
+                    break
+                except Exception:
+                    continue
+            if col is None:
+                raise ValueError(
+                    f"No column could be converted to the string type, so LangChain Documents cannot be loaded: {dtypes_str}")
+
+ loader = DataFrameLoader(df, page_content_column=col)
+ documents = loader.load()
+
+ sheet = {
+ "sheet_name": sheet_name,
+ "page_column": col,
+ "page_column_type": col_type,
+ "documents": documents,
+ "dtypes": dtypes_str
+ }
+ sheets.append(sheet)
+
+ data_dict = {"data": []}
+ for sheet in sheets:
+ data = self.process_data_rows(sheet)
+ data_dict["data"].append(data)
+
+ return data_dict
+
+ def return_vectormeta_format(self):
+ if not self.data_dict:
+ return None
+
+ text = "[DA] " + str(self.data_dict) # Add a token to indicate this string is for data analysis
+
+ # @@@@ 성민: 토큰 수 줄이기위한 후처리(임시조치)
+ text = text.replace("Unnamed: ", "")
+ text = text[:2000]
+
+ vectors = [GenOSVectorMeta.model_validate({
+ 'text': text,
+ 'n_char': 1,
+ 'n_word': 1,
+ 'n_line': 1,
+ 'i_page': 1,
+ 'e_page': 1,
+ 'n_page': 1,
+ 'i_chunk_on_page': 1,
+ 'n_chunk_of_page': 1,
+ 'i_chunk_on_doc': 1,
+ 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z',
+ 'chunk_bboxes': ".",
+ 'media_files': "."
+ })]
+
+
+ return vectors
+
+
+class AudioLoader:
+ def __init__(self,
+ file_path: str,
+ req_url: str,
+ req_data: dict,
+ chunk_sec: int = 29,
+ tmp_path: str = '.',
+ ):
+ self.file_path = file_path
+ self.tmp_path = tmp_path
+ self.chunk_sec = chunk_sec
+ self.req_url = req_url
+ self.req_data = req_data
+
+ def split_file_as_chunks(self) -> list:
+ audio = pydub.AudioSegment.from_file(self.file_path)
+ chunk_len = self.chunk_sec * 1000
+ n_chunks = math.ceil(len(audio) / chunk_len)
+
+ for i in range(n_chunks):
+ start_ms = i * chunk_len
+ overlap_start_ms = start_ms - 300 if start_ms > 0 else start_ms
+ end_ms = start_ms + chunk_len
+ audio_chunk = audio[overlap_start_ms:end_ms]
+            # zero-pad the index so lexicographic sorting of file names matches chunk order
+            audio_chunk.export(os.path.join(self.tmp_path, "tmp_{:05d}.wav".format(i)), format="wav")
+ tmp_files = glob(os.path.join(self.tmp_path, "*.wav"))
+ return tmp_files
+
+ def transcribe_audio(self, file_path_lst: list):
+ transcribed_text_chunks = []
+
+        def _send_request(filepath: str):
+            """Send a request to the in-house 'whisper' serving endpoint"""
+            with open(filepath, 'rb') as fh:
+                files = {
+                    'file': (filepath, fh, 'audio/wav'),
+                }
+                response = requests.post(self.req_url, data=self.req_data, files=files)
+            text = response.json().get('text', ', ')
+            transcribed_text_chunks.append({
+                'file_name': os.path.basename(filepath),
+                'text': text
+            })
+
+ # Send parallel requests
+ threads = [threading.Thread(target=_send_request, args=(f,)) for f in file_path_lst]
+ for t in threads: t.start()
+ for t in threads: t.join()
+
+ # Merge transcribed text snippets in order
+ transcribed_text_chunks.sort(key=lambda x: x['file_name'])
+ transcribed_text = "[AUDIO]" + ' '.join([t['text'] for t in transcribed_text_chunks])
+ return transcribed_text
+
+ def return_vectormeta_format(self):
+ audio_chunks = self.split_file_as_chunks()
+ transcribed_text = self.transcribe_audio(audio_chunks)
+ res = [GenOSVectorMeta.model_validate({
+ 'text': transcribed_text,
+ 'n_char': 1,
+ 'n_word': 1,
+ 'n_line': 1,
+ 'i_page': 1,
+ 'e_page': 1,
+ 'n_page': 1,
+ 'i_chunk_on_page': 1,
+ 'n_chunk_of_page': 1,
+ 'i_chunk_on_doc': 1,
+ 'reg_date': datetime.now().isoformat(timespec='seconds') + 'Z',
+ 'chunk_bboxes': ".",
+ 'media_files': "."
+ })]
+ return res
+
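+# Minimal usage sketch for AudioLoader (illustrative; the file name and tmp_path are hypothetical,
+# the URL is the in-house Whisper endpoint referenced later in this file):
+#
+#     loader = AudioLoader(file_path="/tmp/meeting.wav",
+#                          req_url="http://192.168.74.164:30100/v1/audio/transcriptions",
+#                          req_data={"model": "model", "language": "ko", "response_format": "json"},
+#                          chunk_sec=29, tmp_path="/tmp/tmp_audios_meeting")
+#     vectors = loader.return_vectormeta_format()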
+
+### for HWPX from 지능형 전처리기 ###
+# * GenOSVectorMetaBuilder #
+# * HierarchicalChunker #
+# * HybridChunker #
+# * HwpxProcessor #
+# * GenosServiceException #
+
+class HierarchicalChunker(BaseChunker):
+ r""" Chunker implementation leveraging the document layout.
+ Args:
+ merge_list_items (bool): Whether to merge successive list items.
+ Defaults to True.
+ delim (str): Delimiter to use for merging text. Defaults to "\n".
+ """
+ merge_list_items: bool = True
+
+ @classmethod
+ def _triplet_serialize(cls, table_df: DataFrame) -> str:
+ # copy header as first row and shift all rows by one
+ table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
+ table_df.index = table_df.index + 1
+ table_df = table_df.sort_index()
+
+ rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
+ cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
+
+ nrows = table_df.shape[0]
+ ncols = table_df.shape[1]
+ texts = [
+ f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+ for i in range(1, nrows)
+ for j in range(1, ncols)
+ ]
+ output_text = ". ".join(texts)
+
+ return output_text
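+    # Illustrative example (not from the original code): for a DataFrame with columns
+    # ["Item", "Price"] and a single row ["Pen", "1000"], the header is copied in as the first
+    # row and the output becomes "Pen, Price = 1000".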
+
+ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ r"""Chunk the provided document.
+ Args:
+ dl_doc (DLDocument): document to chunk
+
+ Yields:
+ Iterator[Chunk]: iterator over extracted chunks
+ """
+ heading_by_level: dict[LevelNumber, str] = {}
+ list_items: list[TextItem] = []
+ for item, level in dl_doc.iterate_items():
+ captions = None
+ if isinstance(item, DocItem):
+ # first handle any merging needed
+ if self.merge_list_items:
+ if isinstance(
+ item, ListItem
+ ) or ( # TODO remove when all captured as ListItem:
+ isinstance(item, TextItem)
+ and item.label == DocItemLabel.LIST_ITEM
+ ):
+ list_items.append(item)
+ continue
+ elif list_items: # need to yield
+ yield DocChunk(
+ text=self.delim.join([i.text for i in list_items]),
+ meta=DocMeta(
+ doc_items=list_items,
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ origin=dl_doc.origin,
+ ),
+ )
+ list_items = [] # reset
+
+ if isinstance(item, SectionHeaderItem) or (
+ isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]):
+ level = (
+ item.level
+ if isinstance(item, SectionHeaderItem)
+ else (0 if item.label == DocItemLabel.TITLE else 1)
+ )
+ heading_by_level[level] = item.text
+ text = ''.join(str(value) for value in heading_by_level.values())
+
+ # remove headings of higher level as they just went out of scope
+ keys_to_del = [k for k in heading_by_level if k > level]
+ for k in keys_to_del:
+ heading_by_level.pop(k, None)
+ c = DocChunk(
+ text=text,
+ meta=DocMeta(
+ doc_items=[item],
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ captions=captions,
+ origin=dl_doc.origin
+ ),
+ )
+ yield c
+ continue
+
+ if isinstance(item, TextItem) or (
+ (not self.merge_list_items) and isinstance(item, ListItem)) or isinstance(item, CodeItem):
+ text = item.text
+
+ elif isinstance(item, TableItem):
+ text = item.export_to_markdown(dl_doc)
+ # dataframe으로 추출할 때 사용되는 코드
+ # if table_df.shape[0] < 1 or table_df.shape[1] < 2:
+ # # at least two cols needed, as first column contains row headers
+ # continue
+ # text = self._triplet_serialize(table_df=table_df)
+ captions = [c.text for c in [r.resolve(dl_doc) for r in item.captions]] or None
+
+ elif isinstance(item, PictureItem):
+ text = ''.join(str(value) for value in heading_by_level.values())
+ else:
+ continue
+ c = DocChunk(
+ text=text,
+ meta=DocMeta(
+ doc_items=[item],
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ captions=captions,
+ origin=dl_doc.origin,
+ ),
+ )
+ yield c
+
+ if self.merge_list_items and list_items: # need to yield
+ yield DocChunk(
+ text=self.delim.join([i.text for i in list_items]),
+ meta=DocMeta(
+ doc_items=list_items,
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None,
+ origin=dl_doc.origin,
+ ),
+ )
+
+
+class HybridChunker(BaseChunker):
+ r"""Chunker doing tokenization-aware refinements on top of document layout chunking.
+ Args:
+ tokenizer: The tokenizer to use; either instantiated object or name or path of
+ respective pretrained model
+ max_tokens: The maximum number of tokens per chunk. If not set, limit is
+ resolved from the tokenizer
+ merge_peers: Whether to merge undersized chunks sharing same relevant metadata
+ """
+
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+ tokenizer: Union[PreTrainedTokenizerBase, str] = (
+ "/nfs-root/all-MiniLM-L6-v2"
+ )
+ max_tokens: int = int(1e30) # type: ignore[assignment]
+ merge_peers: bool = True
+ _inner_chunker: HierarchicalChunker = HierarchicalChunker()
+
+ @model_validator(mode="after")
+ def _patch_tokenizer_and_max_tokens(self) -> Self:
+ self._tokenizer = (
+ self.tokenizer
+ if isinstance(self.tokenizer, PreTrainedTokenizerBase)
+ else AutoTokenizer.from_pretrained(self.tokenizer)
+ )
+ if self.max_tokens is None:
+ self.max_tokens = TypeAdapter(PositiveInt).validate_python(
+ self._tokenizer.model_max_length
+ )
+ return self
+
+ def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
+ if text is None:
+ return 0
+ elif isinstance(text, list):
+ total = 0
+ for t in text:
+ total += self._count_text_tokens(t)
+ return total
+ return len(self._tokenizer.tokenize(text))
+
+ class _ChunkLengthInfo(BaseModel):
+ total_len: int
+ text_len: int
+ other_len: int
+
+ def _count_chunk_tokens(self, doc_chunk: DocChunk):
+ ser_txt = self.serialize(chunk=doc_chunk)
+ return len(self._tokenizer.tokenize(text=ser_txt))
+
+ def _doc_chunk_length(self, doc_chunk: DocChunk):
+ text_length = self._count_text_tokens(doc_chunk.text)
+ total = self._count_chunk_tokens(doc_chunk=doc_chunk)
+ return self._ChunkLengthInfo(
+ total_len=total,
+ text_len=text_length,
+ other_len=total - text_length,
+ )
+
+ def _make_chunk_from_doc_items(
+ self, doc_chunk: DocChunk, window_start: int, window_end: int
+ ):
+ doc_items = doc_chunk.meta.doc_items[window_start: window_end + 1]
+ meta = DocMeta(
+ doc_items=doc_items,
+ headings=doc_chunk.meta.headings,
+ captions=doc_chunk.meta.captions,
+ origin=doc_chunk.meta.origin,
+ )
+ window_text = (
+ doc_chunk.text
+ if len(doc_chunk.meta.doc_items) == 1
+ else self.delim.join(
+ [
+ doc_item.text
+ for doc_item in doc_items
+ if isinstance(doc_item, TextItem)
+ ]
+ )
+ )
+ new_chunk = DocChunk(text=window_text, meta=meta)
+ return new_chunk
+
+ def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
+ chunks = []
+ window_start = 0
+ window_end = 0 # an inclusive index
+ num_items = len(doc_chunk.meta.doc_items)
+ while window_end < num_items:
+ new_chunk = self._make_chunk_from_doc_items(
+ doc_chunk=doc_chunk,
+ window_start=window_start,
+ window_end=window_end,
+ )
+ if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
+ if window_end < num_items - 1:
+ window_end += 1
+ # 아직 청크에 여유가 있고, 남은 아이템도 있으므로 계속 추가 시도
+ continue
+ else:
+ # 현재 윈도우의 모든 아이템이 청크에 들어갔고, 더 이상 아이템이 없음
+ window_end = num_items # signalizing the last loop
+ elif window_start == window_end:
+ # 아이템 1개도 청크에 안 들어감 → 단독 청크로 처리, 이후 재분할
+ window_end += 1
+ window_start = window_end
+ else:
+ # 마지막 아이템 빼고 청크 생성 → 남은 아이템으로 새 윈도우 시작
+ new_chunk = self._make_chunk_from_doc_items(
+ doc_chunk=doc_chunk,
+ window_start=window_start,
+ window_end=window_end - 1,
+ )
+ window_start = window_end
+ chunks.append(new_chunk)
+ return chunks
+
+ def _split_using_plain_text(self, doc_chunk: DocChunk) -> list[DocChunk]:
+ lengths = self._doc_chunk_length(doc_chunk)
+ if lengths.total_len <= self.max_tokens:
+ return [doc_chunk]
+ else:
+            # 헤더/캡션을 제외하고 본문 텍스트에 할당 가능한 토큰 수 계산
+            available_length = self.max_tokens - lengths.other_len
+            if available_length <= 0:
+                warnings.warn(
+                    f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}"  # noqa
+                )
+                return []
+            sem_chunker = semchunk.chunkerify(
+                self._tokenizer, chunk_size=available_length
+            )
+ text = doc_chunk.text
+ segments = sem_chunker.chunk(text)
+ chunks = [type(doc_chunk)(text=s, meta=doc_chunk.meta) for s in segments]
+ return chunks
+
+ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
+ output_chunks = []
+ window_start = 0
+ window_end = 0 # an inclusive index
+ num_chunks = len(chunks)
+
+ while window_end < num_chunks:
+ chunk = chunks[window_end]
+ headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
+ ready_to_append = False
+
+ if window_start == window_end:
+ current_headings_and_captions = headings_and_captions
+ window_end += 1
+ first_chunk_of_window = chunk
+
+ else:
+ chks = chunks[window_start: window_end + 1]
+ doc_items = [it for chk in chks for it in chk.meta.doc_items]
+ candidate = DocChunk(
+ text=self.delim.join([chk.text for chk in chks]),
+ meta=DocMeta(
+ doc_items=doc_items,
+ headings=current_headings_and_captions[0],
+ captions=current_headings_and_captions[1],
+ origin=chunk.meta.origin,
+ ),
+ )
+
+ if (headings_and_captions == current_headings_and_captions
+ and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
+ ):
+ # 토큰 수 여유 있음 → 청크 확장 계속
+ window_end += 1
+ new_chunk = candidate
+ else:
+ ready_to_append = True
+
+ if ready_to_append or window_end == num_chunks:
+ # no more room OR the start of new metadata.
+ if window_start + 1 == window_end:
+ output_chunks.append(first_chunk_of_window)
+ else:
+ output_chunks.append(new_chunk)
+ window_start = window_end
+
+ return output_chunks
+
+ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ r"""Chunk the provided document.
+ Args:
+ dl_doc (DLDocument): document to chunk
+ Yields:
+ Iterator[Chunk]: iterator over extracted chunks
+ """
+ res: Iterable[DocChunk]
+ res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs) # type: ignore
+ res = [x for c in res for x in self._split_by_doc_items(c)]
+ res = [x for c in res for x in self._split_using_plain_text(c)]
+
+ if self.merge_peers:
+ res = self._merge_chunks_with_matching_metadata(res)
+ return iter(res)
+
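+# Minimal usage sketch for HybridChunker (illustrative; assumes `doc` is a DoclingDocument
+# produced by DocumentConverter.convert(...).document):
+#
+#     chunker = HybridChunker(tokenizer="/nfs-root/all-MiniLM-L6-v2", max_tokens=1024, merge_peers=True)
+#     for chunk in chunker.chunk(dl_doc=doc):
+#         print(chunk.meta.headings, chunk.text[:80])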
+
+class DocxProcessor:
+ def __init__(self):
+ self.page_chunk_counts = defaultdict(int)
+ self.pipeline_options = PipelineOptions()
+ self.converter = DocumentConverter(
+ format_options={
+ InputFormat.DOCX: WordFormatOption(
+ pipeline_cls=SimplePipeline, backend=GenosMsWordDocumentBackend
+ ),
+ }
+ )
+
+ def get_paths(self, file_path: str):
+ output_path, output_file = os.path.split(file_path)
+ filename, _ = os.path.splitext(output_file)
+ artifacts_dir = Path(f"{output_path}/{filename}")
+ if artifacts_dir.is_absolute():
+ reference_path = None
+ else:
+ reference_path = artifacts_dir.parent
+ return artifacts_dir, reference_path
+
+ def get_media_files(self, doc_items: list):
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'path': path, 'name': name})
+ return temp_list
+
+ def safe_join(self, iterable):
+ if not isinstance(iterable, (list, tuple, set)):
+ return ''
+ return ''.join(map(str, iterable)) + '\n'
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]:
+ chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True)
+ chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs))
+ for chunk in chunks:
+ self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1
+ return chunks
+
+ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request,
+ **kwargs: dict) -> list[dict]:
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=document.num_pages(),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z',
+ )
+
+ current_page = None
+ chunk_index_on_page = 0
+ vectors = []
+ upload_tasks = []
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_page = chunk.meta.doc_items[0].prov[0].page_no
+ content = self.safe_join(chunk.meta.headings) + chunk.text
+
+ if chunk_page != current_page:
+ current_page = chunk_page
+ chunk_index_on_page = 0
+
+ vector = (GenOSVectorMetaBuilder()
+ .set_text(content)
+ .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page])
+ .set_chunk_index(chunk_idx)
+ .set_global_metadata(**global_metadata)
+ .set_chunk_bboxes(chunk.meta.doc_items, document)
+ .set_media_files(chunk.meta.doc_items)
+ ).build()
+ vectors.append(vector)
+
+ chunk_index_on_page += 1
+ # file_list = self.get_media_files(chunk.meta.doc_items)
+ # upload_tasks.append(asyncio.create_task(
+ # upload_files(file_list, request=request)
+ # ))
+
+ if upload_tasks:
+ await asyncio.gather(*upload_tasks)
+ return vectors
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ document: DoclingDocument = self.load_documents(file_path, **kwargs)
+ artifacts_dir, reference_path = self.get_paths(file_path)
+ document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+
+ chunks: list[DocChunk] = self.split_documents(document, **kwargs)
+
+ vectors = []
+ if len(chunks) >= 1:
+ vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs)
+ else:
+ raise GenosServiceException(1, f"chunk length is 0")
+ return vectors
+
+
+class HwpxProcessor:
+ def __init__(self):
+ self.page_chunk_counts = defaultdict(int)
+ self.pipeline_options = PipelineOptions()
+ self.pipeline_options.save_images = False
+ self.converter = DocumentConverter(
+ format_options={
+ InputFormat.XML_HWPX: HwpxFormatOption(
+ pipeline_options=self.pipeline_options
+ )
+ }
+ )
+
+ def get_paths(self, file_path: str):
+ output_path, output_file = os.path.split(file_path)
+ filename, _ = os.path.splitext(output_file)
+ artifacts_dir = Path(f"{output_path}/{filename}")
+ if artifacts_dir.is_absolute():
+ reference_path = None
+ else:
+ reference_path = artifacts_dir.parent
+ return artifacts_dir, reference_path
+
+ def get_media_files(self, doc_items: list):
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({'path': path, 'name': name})
+ return temp_list
+
+ def safe_join(self, iterable):
+ if not isinstance(iterable, (list, tuple, set)):
+ return ''
+ return ''.join(map(str, iterable)) + '\n'
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ save_images = kwargs.get('save_images', False)
+
+ if self.pipeline_options.save_images != save_images:
+ self.pipeline_options.save_images = save_images
+ # self._create_converters()
+
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]:
+ chunker = HybridChunker(max_tokens=int(1e30), merge_peers=True)
+ chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs))
+ for chunk in chunks:
+ self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1
+ return chunks
+
+ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request,
+ **kwargs: dict) -> list[dict]:
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=document.num_pages(),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z',
+ )
+
+ current_page = None
+ chunk_index_on_page = 0
+ vectors = []
+ upload_tasks = []
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_page = chunk.meta.doc_items[0].prov[0].page_no
+ content = self.safe_join(chunk.meta.headings) + chunk.text
+
+ if chunk_page != current_page:
+ current_page = chunk_page
+ chunk_index_on_page = 0
+
+ vector = (GenOSVectorMetaBuilder()
+ .set_text(content)
+ .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page])
+ .set_chunk_index(chunk_idx)
+ .set_global_metadata(**global_metadata)
+ .set_chunk_bboxes(chunk.meta.doc_items, document)
+ .set_media_files(chunk.meta.doc_items)
+ ).build()
+ vectors.append(vector)
+
+ chunk_index_on_page += 1
+ # file_list = self.get_media_files(chunk.meta.doc_items)
+ # upload_tasks.append(asyncio.create_task(
+ # upload_files(file_list, request=request)
+ # ))
+
+ if upload_tasks:
+ await asyncio.gather(*upload_tasks)
+ return vectors
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ document: DoclingDocument = self.load_documents(file_path, **kwargs)
+ artifacts_dir, reference_path = self.get_paths(file_path)
+ document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+
+ chunks: list[DocChunk] = self.split_documents(document, **kwargs)
+
+ vectors = []
+ if len(chunks) >= 1:
+ vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs)
+ else:
+ raise GenosServiceException(1, f"chunk length is 0")
+
+        # NOTE: `text` accumulated below (capped at 8192 characters) is currently unused;
+        # only the first vector is returned from this processor.
+        text = ""
+        for vector in vectors:
+            if len(text) + len(vector.text) > 8192:
+                break
+            text += vector.text
+
+        return [vectors[0]]
+
+
+class GenosServiceException(Exception):
+ """GenOS 와의 의존성 부분 제거를 위해 추가"""
+
+ def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None:
+ self.code = 1
+ self.error_code = error_code
+ self.error_msg = error_msg or "GenOS Service Exception"
+ self.msg_params = msg_params or {}
+
+ def __repr__(self) -> str:
+ class_name = self.__class__.__name__
+ return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})"
+
+
+# async def assert_cancelled(request: Request):
+# """GenOS 와의 의존성 제거를 위해 추가"""
+# if await request.is_disconnected():
+# raise GenosServiceException(1, f"Cancelled")
+
+
+# @@@@ 성민: OCR을 위해서 추가
+from docling.datamodel.pipeline_options import (
+ AcceleratorDevice,
+ AcceleratorOptions,
+ # OcrEngine,
+ # PdfBackend,
+ PdfPipelineOptions,
+ TableFormerMode,
+ PipelineOptions,
+ PaddleOcrOptions,
+)
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.document_converter import PdfFormatOption
+
+class DocumentProcessor:
+ def __init__(self):
+ self.page_chunk_counts = defaultdict(int)
+ self.hwpx_processor = HwpxProcessor()
+ self.docx_processor = DocxProcessor()
+
+ # @@@@ 성민: OCR을 위해서 추가
+ self.ocr_endpoint = "http://doc-parser-ocr-service:8080/ocr"
+ ocr_options = PaddleOcrOptions(
+ force_full_page_ocr=False,
+ lang=['korean'],
+ ocr_endpoint=self.ocr_endpoint,
+ text_score=0.3)
+
+
+ device = AcceleratorDevice.AUTO
+ num_threads = 8
+ accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+
+ # PDF 파이프라인 옵션 설정
+ self.pipe_line_options = PdfPipelineOptions()
+ self.pipe_line_options.generate_page_images = True
+ self.pipe_line_options.generate_picture_images = True
+ self.pipe_line_options.do_ocr = False
+ self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.ocr_options.lang = ["ko", 'en']
+ # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model"
+ # self.pipe_line_options.ocr_options.force_full_page_ocr = True
+ # ocr_options = TesseractOcrOptions()
+ # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert']
+ # ocr_options.path = './.tesseract/tessdata'
+ # self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.artifacts_path = Path("/models/")
+ self.pipe_line_options.do_table_structure = True
+ self.pipe_line_options.images_scale = 2
+ self.pipe_line_options.table_structure_options.do_cell_matching = True
+ self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE
+ self.pipe_line_options.accelerator_options = accelerator_options
+
+ # Simple 파이프라인 옵션을 인스턴스 변수로 저장
+ self.simple_pipeline_options = PipelineOptions()
+ self.simple_pipeline_options.save_images = False
+
+ # ocr 파이프라인 옵션
+        self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.do_ocr = True
+ self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = True
+
+ self.ocr_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options,
+ backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
+
+ def get_loader(self, file_path: str):
+ ext = os.path.splitext(file_path)[-1].lower()
+ real_type = self.get_real_file_type(file_path)
+
+ # 확장자와 실제 파일 타입이 다를 때만 real_type 사용
+ if ext != real_type and real_type == 'pdf':
+ return PyMuPDFLoader(file_path)
+ elif ext != real_type and real_type in ['txt', 'json', 'md']:
+ return TextLoader(file_path)
+ # 원래 확장자 기반 로직
+ elif ext == '.pdf':
+ return PyMuPDFLoader(file_path)
+ elif ext == '.doc':
+ convert_to_pdf(file_path)
+ return UnstructuredWordDocumentLoader(file_path)
+ elif ext in ['.ppt', '.pptx']:
+ convert_to_pdf(file_path)
+ return UnstructuredPowerPointLoader(file_path)
+ elif ext in ['.jpg', '.jpeg', '.png']:
+ convert_to_pdf(file_path)
+ # 한국어 OCR 지원을 위한 언어 설정
+ return UnstructuredImageLoader(
+ file_path,
+ languages=["kor", "eng"], # 한국어 + 영어 OCR
+ )
+ elif ext in ['.txt', '.json', '.md']:
+ return TextLoader(file_path)
+ elif ext == '.hwp':
+ return HwpLoader(file_path)
+ else:
+ return UnstructuredFileLoader(file_path)
+
+ def get_real_file_type(self, file_path: str) -> str:
+ """파일 확장자가 아닌 실제 내용으로 파일 타입 판단"""
+ with open(file_path, 'rb') as f:
+ header = f.read(8)
+ if header.startswith(b'%PDF-'):
+ return 'pdf'
+ elif header.startswith(b'\x89PNG'):
+ return 'png'
+ elif header.startswith(b'\xff\xd8\xff'):
+ return 'jpg'
+
+ # 매직 헤더로 판단할 수 없으면 확장자 사용
+ return os.path.splitext(file_path)[-1].lower()
+
+ def convert_md_to_pdf(self, md_path):
+ """Markdown 파일을 PDF로 변환"""
+ install_packages(['chardet'])
+ import chardet
+
+ pdf_path = md_path.replace('.md', '.pdf')
+ with open(md_path, 'rb') as f:
+ raw_file = f.read()
+ candidates = ['utf-8', 'utf-8-sig']
+ try:
+ det = (chardet.detect(raw_file) or {}).get('encoding') or ''
+ # chardet가 ascii/unknown이면 무시. 그 외면 후보에 추가
+ if det and det.lower() not in ('ascii', 'unknown'):
+ if det.lower() not in [c.lower() for c in candidates]:
+ candidates.append(det)
+ except Exception:
+ pass
+ candidates += ['cp949', 'euc-kr', 'iso-8859-1', 'latin-1']
+ md_content = None
+ for enc in candidates:
+ try:
+ md_content = raw_file.decode(enc)
+ break
+ except UnicodeDecodeError:
+ continue
+ if md_content is None:
+ md_content = raw_file.decode('utf-8', errors='replace')
+
+ html_content = markdown(md_content)
+ if HTML:
+ HTML(string=html_content).write_pdf(pdf_path)
+ return pdf_path
+
+
+
+ def _create_converters(self):
+ """컨버터들을 생성하는 헬퍼 메서드"""
+ self.ocr_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options,
+ backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
+
+
+ def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ # kwargs에서 save_images 값을 가져와서 옵션 업데이트
+ save_images = kwargs.get('save_images', True)
+ include_wmf = kwargs.get('include_wmf', False)
+
+ # save_images 옵션이 현재 설정과 다르면 컨버터 재생성
+ if (self.simple_pipeline_options.save_images != save_images or
+ getattr(self.simple_pipeline_options, 'include_wmf', False) != include_wmf):
+ self.simple_pipeline_options.save_images = save_images
+ self.simple_pipeline_options.include_wmf = include_wmf
+ self._create_converters()
+
+        try:
+            conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True)
+        except Exception as e:
+            print("@@@@", e)
+            # conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True)
+            raise  # without conv_result the return below would raise NameError, so re-raise the original error
+
+        return conv_result.document
+
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
+ loader = self.get_loader(file_path)
+ documents = loader.load()
+
+ # @@@@ 성민: 밑에 주석
+ # 이미지 파일의 경우 텍스트 추출 안되었을 시 기본 텍스트 제공
+ # ext = os.path.splitext(file_path)[-1].lower()
+ # if ext in ['.jpg', '.jpeg', '.png']:
+ # # documents가 없거나, 있어도 모든 page_content가 비어있는 경우
+ # if not documents or not any(doc.page_content.strip() for doc in documents):
+ # documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})]
+
+ # @@@@ 성민 새로 작성: 텍스트가 없을 경우 OCR 수행
+ if not documents or not any(doc.page_content.strip() for doc in documents):
+ document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
+
+ documents = list([Document(page_content=document.export_to_markdown(), metadata={})])
+
+ return documents
+
+ def split_documents(self, documents, **kwargs: dict) -> list[Document]:
+ # @@@@ 성민: GenOS에서 바꿔도 안바뀌는듯?
+ print("@@@@ kwargs", kwargs)
+
+ kwargs.setdefault("chunk_size", 20_000)
+
+ splitter_kwargs = {
+ k: v for k, v in kwargs.items()
+ if k in ["chunk_size", "chunk_overlap", "separators", "length_function"]
+ }
+
+ text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
+
+ chunks = text_splitter.split_documents(documents)
+ chunks = [chunk for chunk in chunks if chunk.page_content]
+
+ if not chunks:
+ raise Exception('Empty document')
+
+ for chunk in chunks:
+ page = chunk.metadata.get('page', 0)
+ self.page_chunk_counts[page] += 1
+ return chunks
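+    # Illustrative example for split_documents() above: calling it with chunk_size=1000 and
+    # chunk_overlap=100 forwards only those two keys to RecursiveCharacterTextSplitter;
+    # unrelated kwargs (e.g. log_level) are ignored by the splitter.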
+
+ def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict) -> list[dict]:
+ ext = os.path.splitext(file_path)[-1].lower()
+ real_type = self.get_real_file_type(file_path)
+
+ # 확장자와 실제 파일 타입이 다를 때만 real_type 사용
+ if ext != real_type and real_type == 'pdf':
+ pdf_path = file_path
+ elif ext != real_type and real_type in ['txt', 'json', 'md']:
+ pdf_path = _get_pdf_path(file_path)
+ # 원래 확장자 기반 로직
+ elif file_path.endswith('.md'):
+ pdf_path = self.convert_md_to_pdf(file_path)
+ elif file_path.endswith(('.ppt', '.pptx')):
+ pdf_path = _get_pdf_path(file_path)
+ else:
+ pdf_path = _get_pdf_path(file_path)
+
+ # doc = fitz.open(pdf_path) if (pdf_path and os.path.exists(pdf_path)) else None
+
+ if file_path.endswith(('.ppt', '.pptx')):
+ if os.path.exists(pdf_path):
+ subprocess.run(["rm", pdf_path], check=True)
+
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=max([chunk.metadata.get('page', 0) for chunk in chunks]),
+ reg_date=datetime.now().isoformat(timespec='seconds') + 'Z'
+ )
+ current_page = None
+ chunk_index_on_page = 0
+
+ vectors = []
+ for chunk_idx, chunk in enumerate(chunks):
+ page = chunk.metadata.get('page', 0)
+ text = chunk.page_content
+
+ if page != current_page:
+ current_page = page
+ chunk_index_on_page = 0
+
+ # 첨부용에서는 bbox 정보 추출 X
+ # if doc:
+ # fitz_page = doc.load_page(page)
+ # global_metadata['chunk_bboxes'] = json.dumps(merge_overlapping_bboxes([{
+ # 'page': page + 1,
+ # 'type': 'text',
+ # 'bbox': {
+ # 'l': rect[0] / fitz_page.rect.width,
+ # 't': rect[1] / fitz_page.rect.height,
+ # 'r': rect[2] / fitz_page.rect.width,
+ # 'b': rect[3] / fitz_page.rect.height,
+ # }
+ # } for rect in fitz_page.search_for(text)], x_tolerance=1 / fitz_page.rect.width,
+ # y_tolerance=1 / fitz_page.rect.height))
+
+ vectors.append(GenOSVectorMeta.model_validate({
+ 'text': text,
+ 'n_char': len(text),
+ 'n_word': len(text.split()),
+ 'n_line': len(text.splitlines()),
+ 'i_page': page,
+ 'e_page': page,
+ 'i_chunk_on_page': chunk_index_on_page,
+ 'n_chunk_of_page': self.page_chunk_counts[page],
+ 'i_chunk_on_doc': chunk_idx,
+ **global_metadata
+ }))
+ chunk_index_on_page += 1
+
+ return vectors
+
+ @guardrail
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+
+ # @@@@ 성민: OneAgent 연동용
+ if "uploads" in kwargs.keys():
+ import base64
+ uploads = kwargs.get("uploads", None)[0]
+
+ # @@@@ 전처리기 파일 저장 경로
+ folder = "/nfs-root/tmp/uploads"
+
+ decoded = base64.b64decode(uploads['data'].split(",", 1)[1])
+ file_path = os.path.join(folder, uploads['name'])
+
+ with open(file_path, 'wb') as f:
+ f.write(decoded)
+
+ ext = os.path.splitext(file_path)[-1].lower()
+ if ext in ('.wav', '.mp3', '.m4a'):
+            # Generate a temporary path for saving audio chunks: the audio file is split into several chunks because of the model's input-length limit
+ tmp_path = "./tmp_audios_{}".format(os.path.basename(file_path).split('.')[0])
+ if not os.path.exists(tmp_path):
+ os.makedirs(tmp_path)
+
+ # Use 'Whisper' model served in-house
+ # [!] Modify the request parameters to change a STT model to be used
+ loader = AudioLoader(
+ file_path=file_path,
+ req_url="http://192.168.74.164:30100/v1/audio/transcriptions",
+ req_data={
+ 'model': 'model',
+ 'language': 'ko',
+ 'response_format': 'json',
+ 'temperature': '0',
+ 'stream': 'false',
+ 'timestamp_granularities[]': 'word'
+ },
+ chunk_sec=29, # length(sec) of a chunk from the uploaded audio
+ tmp_path=tmp_path
+ )
+ vectors = loader.return_vectormeta_format()
+ # await assert_cancelled(request)
+
+            # Remove the temporary chunk files
+            try:
+                shutil.rmtree(tmp_path)
+            except Exception:
+                pass
+ # await assert_cancelled(request)
+ return vectors
+
+ elif ext in ('.csv', '.xlsx'):
+ loader = TabularLoader(file_path, ext)
+ vectors = loader.return_vectormeta_format()
+ # pdf_path = _get_pdf_path(file_path)
+ # await assert_cancelled(request)
+ return vectors
+
+ elif ext == '.hwp':
+ documents: list[Document] = self.load_documents(file_path, **kwargs)
+ # await assert_cancelled(request)
+ chunks: list[Document] = self.split_documents(documents, **kwargs)
+ # await assert_cancelled(request)
+ vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs)
+
+ return vectors
+
+ elif ext == '.hwpx':
+ return await self.hwpx_processor(request, file_path, **kwargs)
+
+ elif ext == '.docx':
+ return await self.docx_processor(request, file_path, **kwargs)
+
+ else:
+ documents: list[Document] = self.load_documents(file_path, **kwargs)
+ # await assert_cancelled(request)
+
+ chunks: list[Document] = self.split_documents(documents, **kwargs)
+ # await assert_cancelled(request)
+
+ vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs)
+
+
+ return vectors
\ No newline at end of file
diff --git a/genon/preprocessor/module/base_processor.py b/genon/preprocessor/module/base_processor.py
new file mode 100644
index 0000000000..fd277a8b08
--- /dev/null
+++ b/genon/preprocessor/module/base_processor.py
@@ -0,0 +1,175 @@
+from typing import Any, List
+
+from fastapi import Request
+from langchain_core.documents import Document
+from docling.document_converter import DocumentConverter, PdfFormatOption, HwpxFormatOption, WordFormatOption
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+ AcceleratorDevice,
+ AcceleratorOptions,
+ PdfPipelineOptions,
+ TableFormerMode,
+ PipelineOptions,
+ PaddleOcrOptions,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.genos_msword_backend import GenosMsWordDocumentBackend
+from docling_core.transforms.chunker import BaseChunker, DocChunk
+from docling_core.types import DoclingDocument
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+
+
+from utils.chunkers import CHUNKERS
+from utils.metadata import GenOSVectorMetaBuilder
+
+
+# load 파이프라인
+# open -> table -> reading order
+
+"""
+모델들
+- detection model
+- recognition model
+- ocr
+ - easy
+ - paddle
+
+- table 모델
+
+- vlm
+ - 이미지 디스크립션 모델
+ - 문서 로테이션 모델
+ - TOC 생성 모델
+"""
+
+"""
+컴포넌트
+ - base64로 오면 저장 (oneagent용)
+ - 파일 오픈: 확장자 별로....
+ - pdf로 저장(GenOS에서 보여주려고)
+ - 리딩오더
+ - 레이아웃
+ - 테이블 디텍션
+ - 이미지 디스크립션
+ - ocr -> 용도별로
+ - 이미지 로테이션
+"""
+
+# TODO all ext
+FORMAT_MAP = {
+ "pdf": InputFormat.PDF,
+ # "hwp": InputFormat.HWP,
+ # "hwpx":InputFormat.HWPX,# TODO
+ # "doc":InputFormat.DOC, # TODO
+ "docx": InputFormat.DOCX,
+ # "ppt": InputFormat.PPT, #TODO
+ # "pptx": InputFormat.PPTX,
+ # "xlsx": InputFormat.XLSX,
+ # "csv": InputFormat.CSV,
+ # "md": InputFormat.MD,
+ # "json": InputFormat.JSON,
+ # "html": InputFormat.HTML,
+}
+
+
+# TODO all ext
+FORMAT_OPTION_MAP = {
+ InputFormat.PDF: PdfFormatOption,
+ # InputFormat.HWP: HwpFormatOption # TODO 왜 HwpFormatOption은 없는지
+ # "hwpx":InputFormat.HWPX,# TODO
+ # "doc":InputFormat.DOC, # TODO
+ InputFormat.DOCX: WordFormatOption,
+ # "ppt": InputFormat.PPT, #TODO
+ # InputFormat.PPTX,
+ # InputFormat.XLSX,
+ # InputFormat.CSV,
+ # InputFormat.MD,
+ # InputFormat.JSON,
+ # InputFormat.HTML,
+}
+
+PIPELINE_MAP = {
+ "pdf": StandardPdfPipeline,
+ "simple": SimplePipeline,
+}
+
+BACKEND_MAP = {
+ "pypdf": PyPdfiumDocumentBackend,
+ "msword": GenosMsWordDocumentBackend,
+}
+
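+# Example config shape expected by BaseProcessor (illustrative sketch, not part of the original
+# code): the keys of "format_options" must exist in FORMAT_MAP, and the nested values must
+# reference PIPELINE_MAP / BACKEND_MAP keys.
+#
+# EXAMPLE_CONFIG = {
+#     "format_options": {
+#         "pdf":  {"pipeline_options": "pdf",    "backend": "pypdf",  "generate_picture_images": True},
+#         "docx": {"pipeline_options": "simple", "backend": "msword"},
+#     },
+# }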
+
+class BaseProcessor:
+ pipeline: list[str] = None
+    format_options: dict = None
+ chunker: BaseChunker = None
+ loaders: list = None
+ converter: DocumentConverter = None
+ config: dict = None
+
+ def __init__(self, config: dict) -> None:
+ # mapping 해주자
+ self.config = config
+ self.allowed_formats = self._build_allowed_formats()
+ self.format_options = self._build_format_options()
+ self.converter = DocumentConverter(
+ allowed_formats=self.allowed_formats,
+ format_options=self.format_options,
+ )
+
+ # self.loaders = LOADERS["pdf"] # 로더 왜 필요하더라
+
+ self.chunker = CHUNKERS["simple"]()
+ self.genos_meta_builder = GenOSVectorMetaBuilder()
+
+ def _build_allowed_formats(self):
+ allowed_formats = []
+ for _format in self.config["format_options"].keys():
+ format = FORMAT_MAP.get(_format, None)
+ assert format is not None, f"@@@@ 잘못된 확장자입니다. {_format}, 가능한 확장자: {list(FORMAT_MAP.keys())}"
+ allowed_formats.append(format)
+ return allowed_formats
+
+ def _build_format_options(self):
+ format_options = {}
+ for _format, option in self.config["format_options"].items():
+ format = FORMAT_MAP.get(_format, None)
+
+ format_options[format] = FORMAT_OPTION_MAP[format](
+ pipeline_cls=PIPELINE_MAP[option["pipeline_options"]],
+ backend=BACKEND_MAP[option["backend"]],
+ )
+
+ # @@@ 성민: pdf 일때만 이미지 저장이 가능하네
+ if "generate_picture_images" in option and option["generate_picture_images"] == True:
+ format_options[format].pipeline_options.generate_picture_images = True
+
+ return format_options
+
+ def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
+ """
+ 설명: 확장자에 해당하는 DocumentConverter를 사용하여 ConversionResult 리턴
+ """
+        # TODO: decide whether the OneAgent-call check belongs here or in __call__.
+
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+
+ return conv_result.document
+
+ def split_documents(self, documents: list[Document], **kwargs: dict) -> list[Document]:
+ chunks = list(self.chunker.chunk(documents, **kwargs))
+ return chunks
+
+ async def compose_vectors(
+ self, request: Request, file_path: str, document: DoclingDocument, chunks: List[DocChunk], **kwargs: dict
+ ) -> list[dict]:
+ return await self.genos_meta_builder(document, chunks, file_path, request, **kwargs)
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict) -> Any:
+ documents = self.load_documents(file_path, **kwargs)
+ chunks = self.split_documents(documents, **kwargs)
+ vectors = await self.compose_vectors(request, file_path, documents, chunks, **kwargs)
+ return vectors
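+
+
+# Minimal usage sketch (illustrative; assumes a config shaped like the EXAMPLE_CONFIG comment
+# near FORMAT_MAP above, and a FastAPI Request available in an async context):
+#
+#     processor = BaseProcessor({"format_options": {"pdf": {"pipeline_options": "pdf", "backend": "pypdf"}}})
+#     vectors = await processor(request, "/tmp/sample.pdf")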
diff --git a/genon/preprocessor/module/intelligent_processor.py b/genon/preprocessor/module/intelligent_processor.py
new file mode 100644
index 0000000000..79742ec8ab
--- /dev/null
+++ b/genon/preprocessor/module/intelligent_processor.py
@@ -0,0 +1,1675 @@
+from __future__ import annotations
+
+import json
+import os
+import logging
+import math, bisect
+from pathlib import Path
+
+from collections import defaultdict
+from datetime import datetime
+from typing import Optional, Iterable, Any, List, Dict, Tuple
+
+from fastapi import Request
+
+_log = logging.getLogger(__name__)
+
+# docling imports
+
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+
+# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.backend.genos_pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.pipeline.simple_pipeline import SimplePipeline
+
+# from docling.datamodel.document import ConversionStatus
+from docling.datamodel.pipeline_options import (
+ AcceleratorDevice,
+ AcceleratorOptions,
+ # OcrEngine,
+ # PdfBackend,
+ PdfPipelineOptions,
+ TableFormerMode,
+ PipelineOptions,
+ PaddleOcrOptions,
+)
+
+from docling.document_converter import DocumentConverter, PdfFormatOption, FormatOption
+from docling.datamodel.pipeline_options import DataEnrichmentOptions
+from docling.utils.document_enrichment import enrich_document, check_document
+from docling.datamodel.document import ConversionResult
+from docling_core.transforms.chunker import (
+ BaseChunk,
+ BaseChunker,
+ DocChunk,
+ DocMeta,
+)
+from docling_core.types import DoclingDocument
+
+from pandas import DataFrame
+import asyncio
+from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc.document import (
+ DocumentOrigin,
+ LevelNumber,
+ ListItem,
+ CodeItem,
+ ContentLayer,
+)
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import (
+ BoundingBox,
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ DocItem,
+ PictureItem,
+ SectionHeaderItem,
+ TableItem,
+ TextItem,
+ PageItem,
+ ProvenanceItem,
+)
+from collections import Counter
+import re
+import json
+import warnings
+from typing import Iterable, Iterator, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
+from typing_extensions import Self
+
+try:
+ import semchunk
+ from transformers import AutoTokenizer, PreTrainedTokenizerBase
+except ImportError:
+ raise RuntimeError("Module requires 'chunking' extra; to install, run: " "`pip install 'docling-core[chunking]'`")
+
+try:
+ from genos_utils import upload_files
+except ImportError:
+ upload_files = None
+
+# ============================================
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Chunker implementation leveraging the document structure."""
+
+
+class GenosBucketChunker(BaseChunker):
+ """토큰 제한을 고려하여 섹션별 청크를 분할하고 병합하는 청커 (v2)"""
+
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
+ tokenizer: Union[PreTrainedTokenizerBase, str] = "sentence-transformers/all-MiniLM-L6-v2"
+ max_tokens: int = 1024
+ merge_peers: bool = True
+
+ # _inner_chunker: BaseChunker = None
+ _tokenizer: PreTrainedTokenizerBase = None
+ merge_list_items: bool = True
+
+ @model_validator(mode="after")
+ def _initialize_components(self) -> Self:
+ # 토크나이저 초기화
+ self._tokenizer = (
+ self.tokenizer
+ if isinstance(self.tokenizer, PreTrainedTokenizerBase)
+ else AutoTokenizer.from_pretrained(self.tokenizer)
+ )
+ return self
+
+ def preprocess(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ """문서의 모든 아이템을 헤더 정보와 함께 청크로 생성
+
+ Args:
+ dl_doc: 청킹할 문서
+
+ Yields:
+ 문서의 모든 아이템을 포함하는 하나의 청크
+ """
+ # 모든 아이템과 헤더 정보 수집
+ all_items = []
+ all_header_info = [] # 각 아이템의 헤더 정보
+ current_heading_by_level: dict[LevelNumber, str] = {}
+ all_header_short_info = [] # 각 아이템의 짧은 헤더 정보
+ current_heading_short_by_level: dict[LevelNumber, str] = {}
+ list_items: list[TextItem] = []
+
+ # iterate_items()로 수집된 아이템들의 self_ref 추적
+ processed_refs = set()
+
+ # 모든 아이템 순회
+ for item, level in dl_doc.iterate_items(included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}):
+ if hasattr(item, "self_ref"):
+ processed_refs.add(item.self_ref)
+
+ if not isinstance(item, DocItem):
+ continue
+
+ # 리스트 아이템 병합 처리
+ if self.merge_list_items:
+ if isinstance(item, ListItem) or (isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM):
+ list_items.append(item)
+ continue
+ elif list_items:
+ # 누적된 리스트 아이템들을 추가
+ for list_item in list_items:
+ all_items.append(list_item)
+ # 리스트 아이템의 헤더 정보 저장
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+ all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()})
+ list_items = []
+
+ # 섹션 헤더 처리
+ if isinstance(item, SectionHeaderItem) or (
+ isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
+ ):
+ # 새로운 헤더 레벨 설정
+ header_level = (
+ item.level
+ if isinstance(item, SectionHeaderItem)
+ else (0 if item.label == DocItemLabel.TITLE else 1)
+ )
+ current_heading_by_level[header_level] = item.text
+ current_heading_short_by_level[header_level] = item.orig # 첫 단어로 짧은 헤더 정보 설정
+
+ # 더 깊은 레벨의 헤더들 제거
+ keys_to_del = [k for k in current_heading_by_level if k > header_level]
+ for k in keys_to_del:
+ current_heading_by_level.pop(k, None)
+ keys_to_del_short = [k for k in current_heading_short_by_level if k > header_level]
+ for k in keys_to_del_short:
+ current_heading_short_by_level.pop(k, None)
+
+ # 헤더 아이템도 추가 (헤더 자체도 아이템임)
+ all_items.append(item)
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+ all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()})
+ continue
+
+ if (
+ isinstance(item, TextItem)
+ or isinstance(item, ListItem)
+ or isinstance(item, CodeItem)
+ or isinstance(item, TableItem)
+ or isinstance(item, PictureItem)
+ ):
+ # if item.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
+ # item.text = ""
+ all_items.append(item)
+ # 현재 아이템의 헤더 정보 저장
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+ all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()})
+
+ # 마지막 리스트 아이템들 처리
+ if list_items:
+ for list_item in list_items:
+ all_items.append(list_item)
+ all_header_info.append({k: v for k, v in current_heading_by_level.items()})
+ all_header_short_info.append({k: v for k, v in current_heading_short_by_level.items()})
+
+ # iterate_items()에서 누락된 테이블들을 별도로 추가
+ missing_tables = []
+ for table in dl_doc.tables:
+ table_ref = getattr(table, "self_ref", None)
+ if table_ref not in processed_refs:
+ missing_tables.append(table)
+
+ # 누락된 테이블들을 문서 앞부분에 추가 (페이지 1의 테이블들일 가능성이 높음)
+ if missing_tables:
+ for missing_table in missing_tables:
+ # 첫 번째 위치에 삽입 (헤더 테이블일 가능성이 높음)
+ all_items.insert(0, missing_table)
+ all_header_info.insert(0, {}) # 빈 헤더 정보
+ all_header_short_info.insert(0, {}) # 빈 짧은 헤더 정보
+
+ # 아이템이 없으면 빈 문서
+ if not all_items:
+ return
+
+ # 모든 아이템을 하나의 청크로 반환 (HybridChunker에서 분할)
+ # headings는 None으로 설정하고, 헤더 정보는 별도로 관리
+ chunk = DocChunk(
+ text="", # 텍스트는 HybridChunker에서 생성
+ meta=DocMeta(
+ doc_items=all_items,
+ headings=None, # DocMeta의 원래 형식 유지
+ captions=None,
+ origin=dl_doc.origin,
+ ),
+ )
+ # 헤더 정보를 별도 속성으로 저장
+ chunk._header_info_list = all_header_info
+ chunk._header_short_info_list = all_header_short_info # 짧은 헤더 정보도 저장
+ yield chunk
+
+ def _count_tokens(self, text: str) -> int:
+ """텍스트의 토큰 수 계산 (안전한 분할 처리)"""
+ if not text:
+ return 0
+
+ # 텍스트를 더 작은 단위로 분할하여 계산
+ max_chunk_length = 300 # 더 안전한 길이로 설정
+ total_tokens = 0
+
+ # 텍스트를 줄 단위로 먼저 분할
+ lines = text.split("\n")
+ current_chunk = ""
+
+ for line in lines:
+ # 현재 청크에 줄을 추가했을 때 길이 확인
+ temp_chunk = current_chunk + "\n" + line if current_chunk else line
+
+ if len(temp_chunk) <= max_chunk_length:
+ current_chunk = temp_chunk
+ else:
+ # 현재 청크가 있으면 토큰 계산
+ if current_chunk:
+ try:
+ total_tokens += len(self._tokenizer.tokenize(current_chunk))
+ except Exception:
+ total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산
+
+ # 새로운 청크 시작
+ current_chunk = line
+
+ # 마지막 청크 처리
+ if current_chunk:
+ try:
+ total_tokens += len(self._tokenizer.tokenize(current_chunk))
+ except Exception:
+ total_tokens += int(len(current_chunk.split()) * 1.3) # 대략적인 계산
+
+ return total_tokens
+
+ def _generate_text_from_items_with_headers(
+ self, items: list[DocItem], header_info_list: list[dict], dl_doc: DoclingDocument
+ ) -> str:
+ """DocItem 리스트로부터 헤더 정보를 포함한 텍스트 생성"""
+ text_parts = []
+ current_section_headers = {} # 현재 섹션의 헤더 정보
+
+ for i, item in enumerate(items):
+ item_headers = header_info_list[i] if i < len(header_info_list) else {}
+
+ # 헤더 정보가 변경된 경우 (새로운 섹션 시작)
+ if item_headers != current_section_headers:
+ # 변경된 헤더 레벨들만 추가
+ headers_to_add = []
+ for level in sorted(item_headers.keys()):
+ # 이전 섹션과 다른 헤더만 추가
+ if level not in current_section_headers or current_section_headers[level] != item_headers[level]:
+ # 해당 레벨까지의 모든 상위 헤더 포함
+ for l in sorted(item_headers.keys()):
+ if l < level:
+ headers_to_add.append(item_headers[l])
+ elif l == level:
+ headers_to_add.append("")
+
+ break
+
+ # 헤더가 있으면 추가
+ if headers_to_add:
+ header_text = ", ".join(headers_to_add)
+ if header_text not in text_parts:
+ text_parts.append(header_text)
+
+ current_section_headers = item_headers.copy()
+
+ # 아이템 텍스트 추가
+ if isinstance(item, TableItem):
+ table_text = self._extract_table_text(item, dl_doc)
+ if table_text:
+ text_parts.append(table_text)
+ elif hasattr(item, "text") and item.text:
+ # 타이틀과 섹션 헤더 처리 개선
+ # is_section_header = (
+ # isinstance(item, SectionHeaderItem) or
+ # (isinstance(item, TextItem) and
+ # item.label in [DocItemLabel.SECTION_HEADER]) # TITLE은 제외
+ # )
+
+ # 타이틀은 항상 포함, 섹션 헤더는 중복 방지를 위해 스킵
+ # if not is_section_header:
+ # 20250909, shkim, text_parts에 없는 경우만 추가. 섹션헤더가 반복해서 추가되는 것 방지
+ if item.text not in text_parts:
+ text_parts.append(item.text)
+ elif isinstance(item, PictureItem):
+ text_parts.append("") # 이미지는 빈 텍스트
+
+ result_text = self.delim.join(text_parts)
+ return result_text
+
+ def _extract_table_text(self, table_item: TableItem, dl_doc: DoclingDocument) -> str:
+ """테이블에서 텍스트를 추출하는 일반화된 메서드"""
+ try:
+ # 먼저 export_to_markdown 시도
+ table_text = table_item.export_to_markdown(dl_doc)
+ if table_text and table_text.strip():
+ return table_text
+ except Exception:
+ pass
+
+ # export_to_markdown 실패 시 테이블 셀 데이터에서 직접 텍스트 추출
+ try:
+ if hasattr(table_item, "data") and table_item.data:
+ cell_texts = []
+
+ # table_cells에서 텍스트 추출
+ if hasattr(table_item.data, "table_cells"):
+ for cell in table_item.data.table_cells:
+ if hasattr(cell, "text") and cell.text and cell.text.strip():
+ cell_texts.append(cell.text.strip())
+
+ # grid에서 텍스트 추출 (table_cells가 없는 경우)
+ elif hasattr(table_item.data, "grid") and table_item.data.grid:
+ for row in table_item.data.grid:
+ if isinstance(row, list):
+ for cell in row:
+ if hasattr(cell, "text") and cell.text and cell.text.strip():
+ cell_texts.append(cell.text.strip())
+
+ # 추출된 셀 텍스트들을 결합
+ if cell_texts:
+ return " ".join(cell_texts)
+ except Exception:
+ pass
+
+ # 모든 방법 실패 시 item.text 사용 (있는 경우)
+ if hasattr(table_item, "text") and table_item.text:
+ return table_item.text
+
+ return ""
+
+ def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[str]]:
+ """헤더 정보 리스트에서 실제 사용되는 모든 헤더들을 level 순서대로 추출하고 ', '로 연결"""
+ if not header_info_list:
+ return None
+
+ all_headers = [] # header 순서대로 추가
+ seen_headers = set() # 중복 방지용
+
+ for header_info in header_info_list:
+ if header_info:
+ for level in sorted(header_info.keys()):
+ header_text = header_info[level]
+ if header_text and header_text not in seen_headers:
+ all_headers.append(header_text)
+ seen_headers.add(header_text)
+
+ return all_headers if all_headers else None
+
+ def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]:
+ """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)"""
+ if not table_text:
+ return [table_text]
+
+ # 전체 테이블이 토큰 제한 내인지 확인
+ if self._count_tokens(table_text) <= max_tokens:
+ return [table_text]
+
+ # 단순히 토큰 수 기준으로 텍스트 분할
+ # semchunk 사용하여 토큰 제한에 맞게 분할
+ chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens)
+ chunks = chunker(table_text)
+ return chunks if chunks else [table_text]
+
+ def _is_section_header(self, item: DocItem) -> bool:
+ """아이템이 section header인지 확인"""
+ return isinstance(item, SectionHeaderItem) or (
+ isinstance(item, TextItem) and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
+ )
+
+ def _get_section_header_level(self, item: DocItem) -> Optional[int]:
+ """Section header의 level을 반환"""
+ if isinstance(item, SectionHeaderItem):
+ return item.level
+ elif isinstance(item, TextItem):
+ if item.label == DocItemLabel.TITLE:
+ return 0
+ elif item.label == DocItemLabel.SECTION_HEADER:
+ return 1
+ return None
+
+ def _generate_section_text_with_heading(
+ self, section_items: list[DocItem], section_header_infos: list[dict], dl_doc: DoclingDocument
+ ) -> str:
+ """섹션의 텍스트를 생성하되, 앞에 heading을 붙임"""
+ # 첫 번째 item의 header_info에서 heading 추출
+ if section_header_infos and section_header_infos[0]:
+ merged_headers = {}
+ for level, header_text in section_header_infos[0].items():
+ if header_text:
+ merged_headers[level] = header_text
+
+ # level 순서대로 정렬해서 ', '로 연결
+ if merged_headers:
+ sorted_levels = sorted(merged_headers.keys())
+ headers = [merged_headers[level] for level in sorted_levels]
+ heading_text = ", ".join(headers)
+ else:
+ heading_text = ""
+ else:
+ heading_text = ""
+
+ # 섹션의 일반 텍스트 생성
+ section_text = self._generate_text_from_items_with_headers(section_items, section_header_infos, dl_doc)
+
+ # heading이 있으면 앞에 붙이기
+ if heading_text:
+ return heading_text + ", " + section_text
+ else:
+ return section_text
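+        # Illustrative sketch (hypothetical values): with a first-item header_info of
+        # {0: "Title", 1: "Section 1"}, heading_text becomes "Title, Section 1" and the
+        # return value is "Title, Section 1, " followed by the section text built above.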
+
+ def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument) -> list[DocChunk]:
+ """문서를 토큰 제한에 맞게 분할 (v2: 섹션 헤더 기준으로 분할 후 max_tokens로 병합)"""
+ items = doc_chunk.meta.doc_items
+ header_info_list = getattr(doc_chunk, "_header_info_list", [])
+ header_short_info_list = getattr(doc_chunk, "_header_short_info_list", [])
+
+ if not items:
+ return []
+
+ # ================================================================
+ # 헬퍼 함수들
+ # ================================================================
+
+ def get_header_level(header_infos, *, first=False, default=-1):
+ """header_infos에서 최종 레벨 계산"""
+ if not header_infos:
+ return default
+ info = header_infos[0] if first else header_infos[-1]
+ return max(info.keys(), default=default)
+
+ def get_current_chunk(
+ doc_chunk: DocChunk,
+ merged_texts: list[str],
+ merged_header_short_infos: list[dict],
+ merged_items: list[DocItem],
+ ):
+ """현재까지 병합된 내용으로 DocChunk 생성"""
+ if not merged_texts:
+ return None
+ chunk_text = "\n".join(merged_texts)
+ used_headers = self._extract_used_headers(merged_header_short_infos)
+
+ return DocChunk(
+ text=chunk_text,
+ meta=DocMeta(
+ doc_items=merged_items,
+ headings=used_headers,
+ captions=None,
+ origin=doc_chunk.meta.origin,
+ ),
+ )
+
+ def get_text_from_item(item: DocItem) -> str:
+ """DocItem에서 텍스트 추출"""
+ if isinstance(item, TableItem):
+ return self._extract_table_text(item, dl_doc)
+ elif hasattr(item, "text") and item.text:
+ return item.text
+ elif isinstance(item, PictureItem):
+ text = ""
+ for annotation in item.annotations:
+ if hasattr(annotation, "text"):
+ text += annotation.text
+ return text
+ return ""
+
+        def split_items_evenly_by_tokens(item_token_counts, max_tokens):
+            import bisect  # local import; used for the prefix-sum cut-point search below
+
+            n = len(item_token_counts)
+ total = sum(item_token_counts)
+ if n == 0:
+ return []
+ if total <= max_tokens:
+                return [(0, n)]  # always a list of (start, end) pairs
+
+ k = math.ceil(total / max_tokens)
+ target = total / k
+
+ P = [0]
+ for c in item_token_counts:
+ P.append(P[-1] + c)
+
+ cuts = [0]
+ used = {0}
+ for t in range(1, k):
+ goal = t * target
+ j = bisect.bisect_left(P, goal)
+
+ cand = []
+ if 0 < j < len(P):
+ cand.append(j)
+ if 0 <= j - 1 < len(P):
+ cand.append(j - 1)
+
+ best = None
+ best_dist = float("inf")
+ for x in cand:
+ if x in used:
+ continue
+ if x <= cuts[-1]:
+ continue
+ if x >= len(P) - 1: # n
+ continue
+ dist = abs(P[x] - goal)
+ if dist < best_dist:
+ best_dist = dist
+ best = x
+
+ if best is None:
+ best = min(max(cuts[-1] + 1, 1), len(P) - 2)
+
+ cuts.append(best)
+ used.add(best)
+
+ cuts.append(n)
+
+ return [(a, b) for a, b in zip(cuts[:-1], cuts[1:])]
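+            # Illustrative sketch (hypothetical numbers, not from a real document):
+            #   item_token_counts = [300, 500, 400, 200], max_tokens = 800
+            #   total = 1400 -> k = ceil(1400 / 800) = 2, target = 700
+            #   prefix sums P = [0, 300, 800, 1200, 1400]; the cut for t=1 lands at
+            #   index 2 (P[2] = 800 is closest to 700), giving [(0, 2), (2, 4)]
+            #   with per-group token sums of 800 and 600.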
+
+ def adjust_captions(items_group):
+
+ b_modified = False
+ for idx, group in enumerate(items_group):
+ if group is None:
+ continue
+ item = group[0][0]
+ ref_idx_list = []
+ if hasattr(item, "captions") and item.captions:
+ for cap in item.captions:
+ cap_ref = cap.cref
+ cap_idx = -1
+ for j, it in enumerate(items_group):
+ if it is None:
+ continue
+ if getattr(it[0][0], "self_ref", None) == cap_ref:
+ cap_idx = j
+ break
+ if cap_idx != -1:
+ ref_idx_list.append(cap_idx)
+ if ref_idx_list:
+ ref_idx_list = sorted(ref_idx_list)
+
+ if not ref_idx_list:
+ continue
+
+ # caption 아이템들을 부모 아이템 바로 뒤로 이동
+ for cap_idx in ref_idx_list:
+ for g in items_group[cap_idx]:
+ items_group[idx].append(g)
+ items_group[cap_idx] = None # 나중에 None 제거
+ b_modified = True
+
+ if b_modified:
+ items_group = [it for it in items_group if it is not None]
+
+ return items_group
+
+ def adjust_pictures_in_tables(items_group):
+ # picture in table 처리
+
+ b_modified = False
+ for idx, group in enumerate(items_group):
+ if group is None:
+ continue
+ item = group[0][0]
+ pic_idx_list = []
+ if isinstance(item, TableItem):
+ table_bbox = item.prov[0].bbox
+ table_page_no = item.prov[0].page_no
+
+ for j in range(len(items_group)):
+ if items_group[j] is None:
+ continue
+ pic_item = items_group[j][0][0]
+ if isinstance(pic_item, PictureItem):
+ # table 안의 picture인지 확인. iou 사용
+ pic_bbox = pic_item.prov[0].bbox
+ pic_page_no = pic_item.prov[0].page_no
+ if pic_page_no != table_page_no:
+ continue
+ ios = pic_bbox.intersection_over_self(table_bbox)
+ if ios > 0.5: # picture가 50% 이상 table 안에 포함되면 table 안의 picture로 간주
+ pic_idx_list.append(j)
+ if pic_idx_list:
+ pic_idx_list = sorted(pic_idx_list)
+
+ if not pic_idx_list:
+ continue
+
+ for pic_idx in pic_idx_list:
+ for g in items_group[pic_idx]:
+ items_group[idx].append(g)
+ items_group[pic_idx] = None # 나중에 None 제거
+ b_modified = True
+
+ if b_modified:
+ items_group = [it for it in items_group if it is not None]
+
+ return items_group
+
+ # ================================================================
+ # 1단계: 섹션 헤더 기준으로 분할
+ # ================================================================
+
+ sections = [] # [(items, header_infos, header_short_infos), ...]
+ cur_items, cur_h_infos, cur_h_short = [], [], []
+
+ for i, item in enumerate(items):
+ h_info = header_info_list[i] if i < len(header_info_list) else {}
+ h_short = header_short_info_list[i] if i < len(header_short_info_list) else {}
+
+ # 섹션 헤더를 만나면
+ if self._is_section_header(item):
+ # 이전 섹션이 있으면 저장
+ if cur_items:
+ sections.append((cur_items, cur_h_infos, cur_h_short))
+
+ # 새로운 섹션 시작
+ cur_items = [item]
+ cur_h_infos = [h_info]
+ cur_h_short = [h_short]
+ else:
+ # 섹션 헤더가 아니면 현재 섹션에 추가
+ cur_items.append(item)
+ cur_h_infos.append(h_info)
+ cur_h_short.append(h_short)
+
+ # 마지막 섹션 저장
+ if cur_items:
+ sections.append((cur_items, cur_h_infos, cur_h_short))
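+        # Illustrative sketch (hypothetical item sequence):
+        #   [Title, Text, SectionHeader A, Text, Table, SectionHeader B, Text]
+        # is grouped into three sections:
+        #   [Title, Text], [SectionHeader A, Text, Table], [SectionHeader B, Text]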
+
+ # ================================================================
+ # 2단계: 각 섹션의 텍스트에 heading 붙이기
+ # ================================================================
+
+ sections_with_text = []
+ for items, header_infos, header_short_infos in sections:
+ text = self._generate_section_text_with_heading(items, header_short_infos, dl_doc)
+ sections_with_text.append((text, items, header_infos, header_short_infos))
+
+ # ================================================================
+ # 2.5단계: 너무 긴 청크는 분할
+ # ================================================================
+ if self.max_tokens > 0:
+            # use a while loop: splitting a section changes the list length, so a
+            # fixed range() would skip the trailing sections after an insert
+            i = 0
+            while i < len(sections_with_text):
+                text, items, h_infos, h_short = sections_with_text[i]
+                token_count = self._count_tokens(text)
+                if token_count < self.max_tokens:
+                    i += 1
+                    continue
+
+ # caption 및 table 내 그림은 같은 섹션에 있도록 조정
+ items_group = [[(item, info, short)] for item, info, short in zip(items, h_infos, h_short)]
+ items_group = adjust_captions(items_group)
+ items_group = adjust_pictures_in_tables(items_group)
+
+ # 너무 긴 섹션은 분할
+ # 각 아이템 별 token 수 계산
+ item_token_counts = []
+ for group in items_group:
+ cur_count = 0
+ for g in group:
+ cur_count += self._count_tokens(get_text_from_item(g[0]))
+ item_token_counts.append(cur_count)
+
+ # 아이템 그룹들을 토큰 기준으로 균등 분할
+ split_info = split_items_evenly_by_tokens(item_token_counts, self.max_tokens)
+
+ # item_groups를 섹션으로 다시 구성
+ new_sections = []
+ for a, b in split_info:
+
+ # 각 그룹에서 items, h_infos, h_short로 분리
+ group_items = []
+ group_h_infos = []
+ group_h_short = []
+ for idx in range(a, b):
+ for g in items_group[idx]:
+ group_items.append(g[0])
+ group_h_infos.append(g[1])
+ group_h_short.append(g[2])
+
+ new_text = self._generate_section_text_with_heading(group_items, group_h_short, dl_doc)
+ new_sections.append((new_text, group_items, group_h_infos, group_h_short))
+
+                # replace the original section with the newly split sections
+                sections_with_text.pop(i)
+                for new_section in reversed(new_sections):
+                    sections_with_text.insert(i, new_section)
+                i += max(len(new_sections), 1)
+
+ # ================================================================
+ # 3단계: 단독 타이틀(1줄만) → 다음 섹션으로 병합
+ # ================================================================
+
+ for i in range(len(sections_with_text) - 2, -1, -1):
+ text, items, h_infos, h_short = sections_with_text[i]
+
+ # 아이템이 하나인 섹션 헤더만 검사
+ if len(items) != 1 or not self._is_section_header(items[0]):
+ continue
+
+ # 문단이 이미 구성된 것은 제외 (문자 수가 30자 이상이면 문단을 구성했다고 간주)
+ item_text = "".join(getattr(it, "text", "") for it in items)
+ if len(item_text) > 30:
+ continue
+
+ # 현재 섹션헤더 레벨이 다음 섹션헤더 레벨보다 더 높은 경우에만 병합 (높은 레벨이 더 작은 숫자)
+ n_text, n_items, n_h_infos, n_h_short = sections_with_text[i + 1]
+ current_level = get_header_level(h_infos, first=False)
+ next_level = get_header_level(n_h_infos, first=True)
+ if 0 <= next_level < current_level:
+ continue
+
+ # 다음 섹션과 병합
+ sections_with_text[i] = (text + "\n" + n_text, items + n_items, h_infos + n_h_infos, h_short + n_h_short)
+ sections_with_text.pop(i + 1)
+
+ # ================================================================
+ # 4단계: 토큰 기준 병합
+ # ================================================================
+
+ result_chunks = []
+ merged_texts, merged_items = [], []
+ merged_header_infos, merged_header_short_infos = [], []
+
+ for text, items, header_infos, header_short_infos in sections_with_text:
+
+ b_new_chunk = False
+
+ # ----------------------------------
+ # 병합 가능 여부 판단
+
+ # 병합 가능 토큰 수 계산
+ test_tokens = self._count_tokens("\n".join(merged_texts + [text]))
+
+ # 현재 섹션헤더 레벨과 병합된 섹션헤더 레벨
+ section_level = get_header_level(header_infos, first=True)
+ merged_level = get_header_level(merged_header_infos, first=False)
+
+ # 토큰 수 초과 시 새로운 청크 생성
+ if test_tokens > self.max_tokens and len(merged_texts) > 0:
+ b_new_chunk = True
+ # 현재 섹션헤더 레벨이 더 높으면 새로운 청크 생성
+ elif 0 <= section_level < merged_level:
+ b_new_chunk = True
+ # ----------------------------------
+
+ # 새로운 청크 생성
+ if b_new_chunk:
+ cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items)
+ if cur_chunk:
+ result_chunks.append(cur_chunk)
+
+ # 새로운 병합 시작
+ merged_texts = [text]
+ merged_items = items
+ merged_header_infos = header_infos
+ merged_header_short_infos = header_short_infos
+ else:
+ # 현재 섹션 병합
+ merged_texts.append(text)
+ merged_items.extend(items)
+ merged_header_infos.extend(header_infos)
+ merged_header_short_infos.extend(header_short_infos)
+
+ # 마지막 병합된 items 처리
+ cur_chunk = get_current_chunk(doc_chunk, merged_texts, merged_header_short_infos, merged_items)
+ if cur_chunk:
+ result_chunks.append(cur_chunk)
+
+ return result_chunks
+
+ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+ """문서를 청킹하여 반환
+
+ Args:
+ dl_doc: 청킹할 문서
+
+ Yields:
+ 토큰 제한에 맞게 분할된 청크들
+ """
+ doc_chunks = list(self.preprocess(dl_doc=dl_doc, **kwargs))
+
+ if not doc_chunks:
+ return iter([])
+
+ doc_chunk = doc_chunks[0] # preprocess는 하나의 청크만 반환
+
+ final_chunks = self._split_document_by_tokens(doc_chunk, dl_doc)
+
+ return iter(final_chunks)
+
+
+class GenOSVectorMeta(BaseModel):
+ class Config:
+ extra = "allow"
+
+ text: str = None
+ n_char: int = None
+ n_word: int = None
+ n_line: int = None
+ e_page: int = None
+ i_page: int = None
+ i_chunk_on_page: int = None
+ n_chunk_of_page: int = None
+ i_chunk_on_doc: int = None
+ n_chunk_of_doc: int = None
+ n_page: int = None
+ reg_date: str = None
+ chunk_bboxes: str = None
+ media_files: str = None
+ title: str = None
+ created_date: int = None
+ appendix: str = None ## !! appendix feature (2025-09-30, geonhee kim) !!
+
+
+class GenOSVectorMetaBuilder:
+ def __init__(self):
+ """빌더 초기화"""
+ self.text: Optional[str] = None
+ self.n_char: Optional[int] = None
+ self.n_word: Optional[int] = None
+ self.n_line: Optional[int] = None
+ self.i_page: Optional[int] = None
+ self.e_page: Optional[int] = None
+ self.i_chunk_on_page: Optional[int] = None
+ self.n_chunk_of_page: Optional[int] = None
+ self.i_chunk_on_doc: Optional[int] = None
+ self.n_chunk_of_doc: Optional[int] = None
+ self.n_page: Optional[int] = None
+ self.reg_date: Optional[str] = None
+ self.chunk_bboxes: Optional[str] = None
+ self.media_files: Optional[str] = None
+ self.title: Optional[str] = None
+ self.created_date: Optional[int] = None
+ self.appendix: Optional[str] = None # !! appendix feature (2025-09-30, geonhee kim) !!
+
+ def set_text(self, text: str) -> "GenOSVectorMetaBuilder":
+ """텍스트와 관련된 데이터를 설정"""
+ self.text = text
+ self.n_char = len(text)
+ self.n_word = len(text.split())
+ self.n_line = len(text.splitlines())
+ return self
+
+ def set_page_info(self, i_page: int, i_chunk_on_page: int, n_chunk_of_page: int) -> "GenOSVectorMetaBuilder":
+ """페이지 정보 설정"""
+ self.i_page = i_page
+ self.i_chunk_on_page = i_chunk_on_page
+ self.n_chunk_of_page = n_chunk_of_page
+ return self
+
+ def set_chunk_index(self, i_chunk_on_doc: int) -> "GenOSVectorMetaBuilder":
+ """문서 전체의 청크 인덱스 설정"""
+ self.i_chunk_on_doc = i_chunk_on_doc
+ return self
+
+ def set_global_metadata(self, **global_metadata) -> "GenOSVectorMetaBuilder":
+ """글로벌 메타데이터 병합"""
+ for key, value in global_metadata.items():
+ if hasattr(self, key):
+ setattr(self, key, value)
+ return self
+
+ def set_chunk_bboxes(self, doc_items: list, document: DoclingDocument) -> "GenOSVectorMetaBuilder":
+ chunk_bboxes = []
+ for item in doc_items:
+ for prov in item.prov:
+ label = item.self_ref
+ type_ = item.label
+ size = document.pages.get(prov.page_no).size
+ page_no = prov.page_no
+ bbox = prov.bbox
+ bbox_data = {
+ "l": bbox.l / size.width,
+ "t": bbox.t / size.height,
+ "r": bbox.r / size.width,
+ "b": bbox.b / size.height,
+ "coord_origin": bbox.coord_origin.value,
+ }
+ chunk_bboxes.append({"page": page_no, "bbox": bbox_data, "type": type_, "ref": label})
+ self.e_page = max([bbox["page"] for bbox in chunk_bboxes]) if chunk_bboxes else None
+ self.chunk_bboxes = json.dumps(chunk_bboxes)
+ return self
+
+ def set_media_files(self, doc_items: list) -> "GenOSVectorMetaBuilder":
+ temp_list = []
+ for item in doc_items:
+ if isinstance(item, PictureItem):
+ path = str(item.image.uri)
+                # print(item)  # debug output disabled
+ name = path.rsplit("/", 1)[-1]
+ temp_list.append({"name": name, "type": "image", "ref": item.self_ref})
+ self.media_files = json.dumps(temp_list)
+ return self
+
+ def build(self) -> GenOSVectorMeta:
+ """설정된 데이터를 사용해 최종적으로 GenOSVectorMeta 객체 생성"""
+ return GenOSVectorMeta(
+ text=self.text,
+ n_char=self.n_char,
+ n_word=self.n_word,
+ n_line=self.n_line,
+ i_page=self.i_page,
+ e_page=self.e_page,
+ i_chunk_on_page=self.i_chunk_on_page,
+ n_chunk_of_page=self.n_chunk_of_page,
+ i_chunk_on_doc=self.i_chunk_on_doc,
+ n_chunk_of_doc=self.n_chunk_of_doc,
+ n_page=self.n_page,
+ reg_date=self.reg_date,
+ chunk_bboxes=self.chunk_bboxes,
+ media_files=self.media_files,
+ title=self.title,
+ created_date=self.created_date,
+ appendix=self.appendix or "", # !! appendix feature (2025-09-30, geonhee kim) !!
+ )
+
+
+class DocumentProcessor:
+
+ def __init__(self):
+ """
+ initialize Document Converter
+ """
+ self.ocr_endpoint = "http://192.168.73.172:48080/ocr"
+ ocr_options = PaddleOcrOptions(
+ force_full_page_ocr=False, lang=["korean"], ocr_endpoint=self.ocr_endpoint, text_score=0.3
+ )
+
+ self.page_chunk_counts = defaultdict(int)
+ device = AcceleratorDevice.AUTO
+ num_threads = 8
+ accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+ # PDF 파이프라인 옵션 설정
+ self.pipe_line_options = PdfPipelineOptions()
+ self.pipe_line_options.generate_page_images = True
+ self.pipe_line_options.generate_picture_images = True
+ self.pipe_line_options.do_ocr = False
+ # self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.ocr_options.lang = ["ko", 'en']
+ # self.pipe_line_options.ocr_options.model_storage_directory = "./.EasyOCR/model"
+ # self.pipe_line_options.ocr_options.force_full_page_ocr = True
+ # ocr_options = TesseractOcrOptions()
+ # ocr_options.lang = ['kor', 'kor_vert', 'eng', 'jpn', 'jpn_vert']
+ # ocr_options.path = './.tesseract/tessdata'
+ # self.pipe_line_options.ocr_options = ocr_options
+ # self.pipe_line_options.artifacts_path = Path("/models/")
+ # self.pipe_line_options.do_table_structure = True
+ self.pipe_line_options.do_table_structure = False
+ self.pipe_line_options.images_scale = 2
+ self.pipe_line_options.table_structure_options.do_cell_matching = False
+ # self.pipe_line_options.table_structure_options.mode = TableFormerMode.ACCURATE
+ # self.pipe_line_options.accelerator_options = accelerator_options
+
+ # Simple 파이프라인 옵션을 인스턴스 변수로 저장
+ self.simple_pipeline_options = PipelineOptions()
+ self.simple_pipeline_options.save_images = False
+
+        # OCR pipeline options (deep copy of the base PDF pipeline options)
+        self.ocr_pipe_line_options = self.pipe_line_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.do_ocr = False
+ self.ocr_pipe_line_options.ocr_options = ocr_options.model_copy(deep=True)
+ self.ocr_pipe_line_options.ocr_options.force_full_page_ocr = False
+
+ # 기본 컨버터들 생성
+ self._create_converters()
+
+ # enrichment 옵션 설정
+ self.enrichment_options = DataEnrichmentOptions(
+ do_toc_enrichment=False,
+ # toc_doc_type="law",
+ # extract_metadata=False,
+ # toc_api_provider="custom",
+ # # Mistral-Small-3.1-24B-Instruct-2503, 운영망
+ # toc_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/502/v1/chat/completions",
+ # metadata_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/502/v1/chat/completions",
+ # toc_api_key="022653a3743849e299f19f19d323490b",
+ # metadata_api_key="022653a3743849e299f19f19d323490b",
+ # # Mistral-Small-3.1-24B-Instruct-2503, 한국은행 클러스터
+ # # toc_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions",
+ # # metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/13/31/v1/chat/completions",
+ # # toc_api_key="9e32423947fd4a5da07a28962fe88487",
+ # # metadata_api_key="9e32423947fd4a5da07a28962fe88487",
+ # toc_model="model",
+ # metadata_model="model",
+ # toc_temperature=0.0,
+ # toc_top_p=0.00001,
+ # toc_seed=33,
+ # toc_max_tokens=10000,
+ # toc_system_prompt=toc_system_prompt,
+ # toc_user_prompt=toc_user_prompt,
+ )
+
+ def _create_converters(self):
+ """컨버터들을 생성하는 헬퍼 메서드"""
+ self.converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.pipe_line_options, backend=PyPdfiumDocumentBackend
+ ),
+ }
+ )
+ self.second_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.pipe_line_options, backend=PyPdfiumDocumentBackend
+ ),
+ },
+ )
+ self.ocr_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options, backend=DoclingParseV4DocumentBackend
+ ),
+ }
+ )
+ self.ocr_second_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=self.ocr_pipe_line_options, backend=PyPdfiumDocumentBackend
+ ),
+ },
+ )
+
+ def load_documents_with_docling(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ # kwargs에서 save_images 값을 가져와서 옵션 업데이트
+ save_images = kwargs.get("save_images", True)
+ include_wmf = kwargs.get("include_wmf", False)
+
+ # save_images 옵션이 현재 설정과 다르면 컨버터 재생성
+ if (
+ self.simple_pipeline_options.save_images != save_images
+ or getattr(self.simple_pipeline_options, "include_wmf", False) != include_wmf
+ ):
+ self.simple_pipeline_options.save_images = save_images
+ self.simple_pipeline_options.include_wmf = include_wmf
+ self._create_converters()
+
+ try:
+ conv_result: ConversionResult = self.converter.convert(file_path, raises_on_error=True)
+ except Exception as e:
+ conv_result: ConversionResult = self.second_converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def load_documents_with_docling_ocr(self, file_path: str, **kwargs: dict) -> DoclingDocument:
+ # kwargs에서 save_images 값을 가져와서 옵션 업데이트
+ save_images = kwargs.get("save_images", True)
+ include_wmf = kwargs.get("include_wmf", False)
+
+ # save_images 옵션이 현재 설정과 다르면 컨버터 재생성
+ if (
+ self.simple_pipeline_options.save_images != save_images
+ or getattr(self.simple_pipeline_options, "include_wmf", False) != include_wmf
+ ):
+ self.simple_pipeline_options.save_images = save_images
+ self.simple_pipeline_options.include_wmf = include_wmf
+ self._create_converters()
+
+ try:
+ conv_result: ConversionResult = self.ocr_converter.convert(file_path, raises_on_error=True)
+ except Exception as e:
+ conv_result: ConversionResult = self.ocr_second_converter.convert(file_path, raises_on_error=True)
+ return conv_result.document
+
+ def load_documents(self, file_path: str, **kwargs) -> DoclingDocument:
+ return self.load_documents_with_docling(file_path, **kwargs)
+
+ def split_documents(self, documents: DoclingDocument, **kwargs: dict) -> List[DocChunk]:
+ chunker: GenosBucketChunker = GenosBucketChunker(max_tokens=0, merge_peers=True)
+
+ chunks: List[DocChunk] = list(chunker.chunk(dl_doc=documents, **kwargs))
+ for chunk in chunks:
+ self.page_chunk_counts[chunk.meta.doc_items[0].prov[0].page_no] += 1
+ return chunks
+
+ def safe_join(self, iterable):
+ if not isinstance(iterable, (list, tuple, set)):
+ return ""
+ return "".join(map(str, iterable)) + "\n"
+
+    def parse_created_date(self, date_text: str) -> int:
+        """
+        Parse a creation-date string into an integer in YYYYMMDD form.
+
+        Args:
+            date_text: creation-date text (YYYY, YYYY-MM, or YYYY-MM-DD)
+
+        Returns:
+            Integer in YYYYMMDD form; 0 if parsing fails
+        """
+        if not date_text or not isinstance(date_text, str) or date_text == "None":
+            return 0
+
+ # 공백 제거 및 정리
+ date_text = date_text.strip()
+
+ # YYYY-MM-DD 형식 매칭
+ match_full = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", date_text)
+ if match_full:
+ year, month, day = match_full.groups()
+ try:
+ # 유효한 날짜인지 검증
+ datetime(int(year), int(month), int(day))
+ return int(f"{year}{month.zfill(2)}{day.zfill(2)}")
+ except ValueError:
+ pass
+
+ # YYYY-MM 형식 매칭 (일자는 01로 설정)
+ match_month = re.match(r"^(\d{4})-(\d{1,2})$", date_text)
+ if match_month:
+ year, month = match_month.groups()
+ try:
+ # 유효한 월인지 검증
+ datetime(int(year), int(month), 1)
+ return int(f"{year}{month.zfill(2)}01")
+ except ValueError:
+ pass
+
+ # YYYY 형식 매칭 (월일은 0101로 설정)
+ match_year = re.match(r"^(\d{4})$", date_text)
+ if match_year:
+ year = match_year.group(1)
+ try:
+ datetime(int(year), 1, 1)
+ return int(f"{year}0101")
+ except ValueError:
+ pass
+
+ return 0
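+        # Illustrative examples (hypothetical inputs):
+        #   "2024-03-05" -> 20240305, "2024-3" -> 20240301,
+        #   "2024" -> 20240101, "unknown" -> 0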
+
+    def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocument:
+        # Enrichment is currently disabled; the document is returned unchanged.
+        return document
+
+        # To re-enable, remove the early return above and fetch the enriched result:
+        # document = enrich_document(document, self.enrichment_options, **kwargs)
+        # return document
+
+ async def compose_vectors(
+ self, document: DoclingDocument, chunks: List[DocChunk], file_path: str, request: Request, **kwargs: dict
+ ) -> list[dict]:
+ title = ""
+ created_date = 0
+ try:
+ if (
+ document.key_value_items
+ and len(document.key_value_items) > 0
+ and hasattr(document.key_value_items[0], "graph")
+ and hasattr(document.key_value_items[0].graph, "cells")
+ and len(document.key_value_items[0].graph.cells) > 1
+ ):
+ # 작성일 추출 (cells[1])
+ date_text = document.key_value_items[0].graph.cells[1].text
+ created_date = self.parse_created_date(date_text)
+ except (AttributeError, IndexError) as e:
+ pass
+
+ for item, _ in document.iterate_items():
+ if hasattr(item, "label"):
+ if item.label == DocItemLabel.TITLE:
+ title = item.text.strip() if item.text else ""
+ break
+
+ # kwargs에서 부록 정보 추출 !! appendix feature (2025-09-30, geonhee kim) !!
+ appendix_info = kwargs.get("appendix", "")
+ appendix_list = []
+ if isinstance(appendix_info, str):
+ appendix_list = (
+ [item.strip() for item in json.loads(appendix_info) if item.strip()] if appendix_info else []
+ )
+ elif isinstance(appendix_info, list):
+ appendix_list = appendix_info
+ else:
+ appendix_list = []
+
+ global_metadata = dict(
+ n_chunk_of_doc=len(chunks),
+ n_page=document.num_pages(),
+ reg_date=datetime.now().isoformat(timespec="seconds") + "Z",
+ created_date=created_date,
+ title=title,
+ )
+
+ current_page = None
+ chunk_index_on_page = 0
+ vectors = []
+ upload_tasks = []
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_page = chunk.meta.doc_items[0].prov[0].page_no
+ # header 앞에 헤더 마커 추가 (HEADER: )
+ headers_text = "HEADER: " + ", ".join(chunk.meta.headings) + "\n" if chunk.meta.headings else ""
+ content = headers_text + chunk.text
+
+ # appendix 추출 !! appendix feature (2025-09-30, geonhee kim) !!
+ matched_appendices = self.check_appendix_keywords(content, appendix_list)
+ # print(appendix_list, matched_appendices)
+ chunk_global_metadata = global_metadata.copy()
+ chunk_global_metadata["appendix"] = matched_appendices # Only matched ones
+ ###
+
+ if chunk_page != current_page:
+ current_page = chunk_page
+ chunk_index_on_page = 0
+
+ vector = (
+ GenOSVectorMetaBuilder()
+ .set_text(content)
+ .set_page_info(chunk_page, chunk_index_on_page, self.page_chunk_counts[chunk_page])
+ .set_chunk_index(chunk_idx)
+ .set_global_metadata(**chunk_global_metadata) #!! appendix feature (2025-09-30, geonhee kim) !!
+ .set_chunk_bboxes(chunk.meta.doc_items, document)
+ .set_media_files(chunk.meta.doc_items)
+ ).build()
+ vectors.append(vector)
+
+ chunk_index_on_page += 1
+ if upload_files:
+ file_list = self.get_media_files(chunk.meta.doc_items)
+ upload_tasks.append(asyncio.create_task(upload_files(file_list, request=request)))
+
+ if upload_tasks:
+ await asyncio.gather(*upload_tasks)
+
+ return vectors
+
+    def get_media_files(self, doc_items: list):
+        temp_list = []
+        for item in doc_items:
+            if isinstance(item, PictureItem):
+                path = str(item.image.uri)
+                name = path.rsplit("/", 1)[-1]
+                temp_list.append({"path": path, "name": name})
+        return temp_list
+
+ def check_glyph_text(self, text: str, threshold: int = 1) -> bool:
+ """텍스트에 GLYPH 항목이 있는지 확인하는 메서드"""
+ if not text:
+ return False
+
+ # GLYPH 항목이 있는지 정규식으로 확인
+ matches = re.findall(r"GLYPH\w*", text)
+ if len(matches) >= threshold:
+ # print(f"Text has glyphs. len(matches): {len(matches)}. ")
+ return True
+
+ return False
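+        # Illustrative sketch (hypothetical input): "GLYPH12 GLYPH34 foo" with
+        # threshold=1 yields two matches and returns True.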
+
+ def check_glyphs(self, document: DoclingDocument) -> bool:
+ """문서에 글리프가 있는지 확인하는 메서드"""
+ for item, level in document.iterate_items():
+ if isinstance(item, TextItem) and hasattr(item, "prov") and item.prov:
+ page_no = item.prov[0].page_no
+ # page_texts += item.text
+
+ # GLYPH 항목이 있는지 확인. 정규식사용
+ matches = re.findall(r"GLYPH\w*", item.text)
+ if len(matches) > 10:
+ # print(f"Document has glyphs on page {page_no}. len(matches): {len(matches)}. ")
+ return True
+
+ return False
+
+ def check_appendix_keywords(
+ self, content: str, appendix_list: list
+ ) -> str: # !! appendix feature (2025-09-30, geonhee kim) !!
+ if not content or not appendix_list:
+ return ""
+
+ matched_appendices = []
+
+ # 1. Find appendix patterns in content first
+ found_patterns = []
+
+ # Complex patterns: 별지/별표/장부 + numbers (with hyphens, Roman numerals)
+ # Updated regex to capture full patterns like "별지 제 Ⅰ -1 호 서식" by matching until closing delimiters
+ content = re.sub(r"\s+", "", content)
+ complex_patterns = re.findall(r"(별지|별표|장부)(?:제)?([^<>()\[\]]+?)(?=(?:호|서식)|[<>\)\]]|$)", content)
+ for pattern_type, number in complex_patterns:
+ found_patterns.extend(
+ [
+ f"{pattern_type} {number}",
+ f"{pattern_type} 제{number}호",
+ f"{pattern_type}{number}",
+ f"{pattern_type}제{number}호",
+ ]
+ )
+
+ # Standalone patterns: (별표), (별지), (장부)
+ standalone_patterns = re.findall(r"[\(\[]+(별지|별표|장부)[\)\]]+", content)
+        for pattern_type in set(standalone_patterns):
+            found_patterns.append(pattern_type)
+
+ # 2. Check if found patterns match any appendix in the list
+ for appendix in appendix_list:
+ if not appendix or not isinstance(appendix, str):
+ continue
+
+ appendix_clean = appendix.replace(".pdf", "").lower().strip()
+
+ # If any found pattern exists in appendix filename, it's a match
+ for pattern in found_patterns:
+ if pattern.lower().strip() in appendix_clean:
+ matched_appendices.append(appendix)
+ break # Prevent duplicates
+
+ return ", ".join(matched_appendices) if matched_appendices else ""
+
+    def ocr_all_table_cells(self, document: DoclingDocument, pdf_path) -> DoclingDocument:
+        """
+        Run OCR only on tables that contain glyph-corrupted text.
+        Args:
+            document: DoclingDocument object
+            pdf_path: path to the PDF file
+        Returns:
+            DoclingDocument with OCR applied to the affected cells
+        """
+ import fitz
+ import base64
+ import requests
+
+ def post_ocr_bytes(img_bytes: bytes, timeout=60) -> dict:
+ HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
+ payload = {"file": base64.b64encode(img_bytes).decode("ascii"), "fileType": 1, "visualize": False}
+ r = requests.post(self.ocr_endpoint, json=payload, headers=HEADERS, timeout=timeout)
+ if not r.ok:
+ # 진단에 도움되도록 본문 일부 출력
+ raise RuntimeError(f"OCR HTTP {r.status_code}: {r.text[:500]}")
+ return r.json()
+
+ def extract_ocr_fields(resp: dict):
+ """
+ resp: 위와 같은 OCR 응답 JSON(dict)
+ return: (rec_texts, rec_scores, rec_boxes) — 모두 list
+ """
+ if resp is None:
+ return [], [], []
+
+ # 최상위 상태 체크
+ if resp.get("errorCode") not in (0, None):
+ return [], [], []
+
+ ocr_results = resp.get("result", {}).get("ocrResults", [])
+ if not ocr_results:
+ return [], [], []
+
+ pruned = ocr_results[0].get("prunedResult", {})
+ if not pruned:
+ return [], [], []
+
+ rec_texts = pruned.get("rec_texts", []) # list[str]
+ rec_scores = pruned.get("rec_scores", []) # list[float]
+ rec_boxes = pruned.get("rec_boxes", []) # list[[x1,y1,x2,y2]]
+
+ # 길이 불일치 방어: 최소 길이에 맞춰 자르기
+ n = min(len(rec_texts), len(rec_scores), len(rec_boxes))
+ return rec_texts[:n], rec_scores[:n], rec_boxes[:n]
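+            # Illustrative response shape, inferred only from the fields accessed above
+            # (the real OCR service schema may differ):
+            #   {"errorCode": 0, "result": {"ocrResults": [{"prunedResult": {
+            #       "rec_texts": ["page", "text"], "rec_scores": [0.98, 0.95],
+            #       "rec_boxes": [[10, 5, 60, 25], [70, 5, 120, 25]]}}]}}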
+
+ try:
+ doc = fitz.open(pdf_path)
+
+ for table_idx, table_item in enumerate(document.tables):
+ if not table_item.data or not table_item.data.table_cells:
+ continue
+
+ b_ocr = False
+ for cell_idx, cell in enumerate(table_item.data.table_cells):
+ if self.check_glyph_text(cell.text, threshold=1):
+ b_ocr = True
+ break
+
+ if b_ocr is False:
+ # 글리프 깨진 텍스트가 없는 경우, OCR을 수행하지 않음
+ continue
+
+ for cell_idx, cell in enumerate(table_item.data.table_cells):
+
+ # Provenance 정보에서 위치 정보 추출
+ if not table_item.prov:
+ continue
+
+ page_no = table_item.prov[0].page_no - 1
+ bbox = cell.bbox
+
+ page = doc.load_page(page_no)
+
+ # 셀의 바운딩 박스를 사용하여 이미지에서 해당 영역을 잘라냄
+ cell_bbox = fitz.Rect(bbox.l, min(bbox.t, bbox.b), bbox.r, max(bbox.t, bbox.b))
+
+ # bbox 높이 계산 (PDF 좌표계 단위)
+ bbox_height = cell_bbox.height
+
+ # 목표 픽셀 높이
+ target_height = 20
+
+ # zoom factor 계산
+ # (너무 작은 bbox일 경우 0으로 나누는 걸 방지)
+ zoom_factor = target_height / bbox_height if bbox_height > 0 else 1.0
+ zoom_factor = min(zoom_factor, 4.0) # 최대 확대 비율 제한
+ zoom_factor = max(zoom_factor, 1) # 최소 확대 비율 제한
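+                # Illustrative arithmetic (hypothetical cell): a 10 pt tall bbox gives
+                # zoom_factor = 20 / 10 = 2.0; a 3 pt bbox would give ~6.7 but is capped
+                # at 4.0, and anything taller than 20 pt is floored at 1.0.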
+
+ # 페이지를 이미지로 렌더링
+ mat = fitz.Matrix(zoom_factor, zoom_factor)
+ pix = page.get_pixmap(matrix=mat, clip=cell_bbox)
+ img_data = pix.tobytes("png")
+
+ result = post_ocr_bytes(img_data, timeout=60)
+ rec_texts, rec_scores, rec_boxes = extract_ocr_fields(result)
+
+ cell.text = ""
+ for t in rec_texts:
+ if len(cell.text) > 0:
+ cell.text += " "
+ cell.text += t if t else ""
+ except Exception as e:
+ print(f"OCR processing failed: {e}")
+ pass
+
+ return document
+
+ def setup_logging(self, level_num: int):
+ """
+ 5"DEBUG", 4"INFO", 3"WARNING", 2"ERROR", 1"CRITICAL", 0"NOLOG" 중 하나를 받아서 로깅 레벨을 설정하는 메서드
+ """
+
+ def get_level_name(level_num: int) -> str:
+ level_map = {5: "DEBUG", 4: "INFO", 3: "WARNING", 2: "ERROR", 1: "CRITICAL", 0: "NOLOG"}
+ return level_map.get(level_num, "INFO")
+
+ level_name = get_level_name(level_num)
+ print(f"Setting log level to: {level_name}")
+
+ if level_name == "NOLOG" or not hasattr(logging, level_name):
+ logging.disable(logging.CRITICAL) # 모든 로그 비활성화
+ return
+
+ level = getattr(logging, level_name.upper())
+
+ # root logger 설정 (핸들러는 main에서만 설정)
+ logging.basicConfig(
+ level=level,
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+ handlers=[logging.StreamHandler()], # 콘솔 출력
+ )
+
+ # root logger level 적용
+ logging.getLogger().setLevel(level)
+
+ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
+ self.setup_logging(kwargs.get("log_level", 4))
+
+ _log.info(f"file_path: {file_path}")
+ _log.info(f"kwargs: {kwargs}")
+
+ document: DoclingDocument = self.load_documents(file_path, **kwargs)
+
+        # NOTE (Seongmin): this check probably belongs inside the loading step, not here.
+ if not check_document(document, self.enrichment_options) or self.check_glyphs(document):
+ # OCR이 필요하다고 판단되면 OCR 수행
+ document: DoclingDocument = self.load_documents_with_docling_ocr(file_path, **kwargs)
+
+ # 글리프 깨진 텍스트가 있는 테이블에 대해서만 OCR 수행 (청크토큰 8k이상 발생 방지)
+ document: DoclingDocument = self.ocr_all_table_cells(document, file_path)
+
+ output_path, output_file = os.path.split(file_path)
+ filename, _ = os.path.splitext(output_file)
+ artifacts_dir = Path(f"{output_path}/{filename}")
+ if artifacts_dir.is_absolute():
+ reference_path = None
+ else:
+ reference_path = artifacts_dir.parent
+
+ document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
+
+ document = self.enrichment(document, **kwargs)
+
+ has_text_items = False
+ for item, _ in document.iterate_items():
+ if (
+ isinstance(item, (TextItem, ListItem, CodeItem, SectionHeaderItem)) and item.text and item.text.strip()
+ ) or (isinstance(item, TableItem) and item.data and len(item.data.table_cells) == 0):
+ has_text_items = True
+ break
+
+ if has_text_items:
+ # Extract Chunk from DoclingDocument
+ chunks: List[DocChunk] = self.split_documents(document, **kwargs)
+ else:
+ # text가 있는 item이 없을 때 document에 임의의 text item 추가
+ # 첫 번째 페이지의 기본 정보 사용 (1-based indexing)
+ page_no = 1
+
+ # ProvenanceItem 생성
+ prov = ProvenanceItem(page_no=page_no, bbox=BoundingBox(l=0, t=0, r=1, b=1), charspan=(0, 1)) # 최소 bbox
+
+ # document에 temp text item 추가
+ document.add_text(label=DocItemLabel.TEXT, text=".", prov=prov)
+
+ # split_documents 호출
+ chunks: List[DocChunk] = self.split_documents(document, **kwargs)
+ # await assert_cancelled(request)
+
+ vectors = []
+ if len(chunks) >= 1:
+ vectors: list[dict] = await self.compose_vectors(document, chunks, file_path, request, **kwargs)
+ else:
+            raise GenosServiceException(1, "chunk length is 0")
+
+ """
+ # 미디어 파일 업로드 방법
+ media_files = [
+ { 'path': '/tmp/graph.jpg', 'name': 'graph.jpg', 'type': 'image' },
+ { 'path': '/result/1/graph.jpg', 'name': '1/graph.jpg', 'type': 'image' },
+ ]
+
+ # 업로드 요청 시에는 path, name 필요
+ file_list = [{k: v for k, v in file.items() if k != 'type'} for file in media_files]
+ await upload_files(file_list, request=request)
+
+ # 메타에 저장시에는 name, type 필요
+ meta = [{k: v for k, v in file.items() if k != 'path'} for file in media_files]
+ vectors[0].media_files = meta
+ """
+
+ return vectors
+
+
+class GenosServiceException(Exception):
+ # GenOS 와의 의존성 부분 제거를 위해 추가
+ def __init__(self, error_code: str, error_msg: Optional[str] = None, msg_params: Optional[dict] = None) -> None:
+ self.code = 1
+ self.error_code = error_code
+ self.error_msg = error_msg or "GenOS Service Exception"
+ self.msg_params = msg_params or {}
+
+ def __repr__(self) -> str:
+ class_name = self.__class__.__name__
+ return f"{class_name}(code={self.code!r}, errMsg={self.error_msg!r})"
+
+
+# GenOS 와의 의존성 제거를 위해 추가
+async def assert_cancelled(request: Request):
+ if await request.is_disconnected():
+        raise GenosServiceException(1, "Cancelled")
+
+
+# -----------------------------------------------------------------
+# enrichment 프롬프트
+# -----------------------------------------------------------------
+
+toc_system_prompt = """You are an expert at generating table of contents (목차) from Korean documents. You specialize in regulatory documents, terms of service, contracts, and mixed-format documents that combine formal regulatory structures with general section headers.
+""".strip()
+toc_user_prompt = """
+Here is the Korean document you need to analyze:
+
+