From 2224d88109929857b8ed5bdc2f34e47a2b42a4c7 Mon Sep 17 00:00:00 2001 From: qianqiuer <3418979384@qq.com> Date: Mon, 30 Mar 2026 11:49:49 +0800 Subject: [PATCH] unstructuredio --- .../mapper/unstructured_npu/benchmark_npu.py | 297 ++++++++ .../unstructured_npu/fusion_result.json | 338 +++++++++ .../mapper/unstructured_npu/npu_adapter.py | 700 ++++++++++++++++++ .../unstructured_npu/ocr_npu_adapter.py | 257 +++++++ runtime/ops/mapper/unstructured_npu/run.sh | 88 +++ 5 files changed, 1680 insertions(+) create mode 100644 runtime/ops/mapper/unstructured_npu/benchmark_npu.py create mode 100644 runtime/ops/mapper/unstructured_npu/fusion_result.json create mode 100644 runtime/ops/mapper/unstructured_npu/npu_adapter.py create mode 100644 runtime/ops/mapper/unstructured_npu/ocr_npu_adapter.py create mode 100644 runtime/ops/mapper/unstructured_npu/run.sh diff --git a/runtime/ops/mapper/unstructured_npu/benchmark_npu.py b/runtime/ops/mapper/unstructured_npu/benchmark_npu.py new file mode 100644 index 00000000..b1e42289 --- /dev/null +++ b/runtime/ops/mapper/unstructured_npu/benchmark_npu.py @@ -0,0 +1,297 @@ +import os +import sys +import types +import importlib.machinery +import json + +# ============================================================================== +# [阶段 0] 绝对优先导入 OpenCV +# ============================================================================== +try: + import cv2 + cv2.setNumThreads(0) +except ImportError: + pass + +# ============================================================================== +# [阶段 1] 依赖屏蔽 (The Surgical Mock - Deep Path Fix) +# ============================================================================== +class MockClass: + """通用的伪造类,用于充当 TextBlock, UnstructuredModel 等""" + def __init__(self, *args, **kwargs): pass + def to_dict(self): return {} + def initialize(self, *args, **kwargs): pass + def predict(self, *args, **kwargs): return [] + +def create_fake_module(name, **kwargs): + fake_mod = types.ModuleType(name) + 
fake_mod.__file__ = f"fake_{name}.py" + fake_mod.__path__ = [] + fake_mod.__spec__ = importlib.machinery.ModuleSpec( + name=name, loader=None, origin=f"fake_{name}.py" + ) + fake_mod.is_available = lambda: False + for k, v in kwargs.items(): + setattr(fake_mod, k, v) + return fake_mod + +def mock_deep_path(full_path, **kwargs): + """ + 递归创建路径上的所有模块 + 例如输入 "a.b.c",会确保 a, a.b, a.b.c 都存在于 sys.modules + """ + parts = full_path.split('.') + for i in range(1, len(parts) + 1): + curr_name = ".".join(parts[:i]) + if curr_name not in sys.modules: + # 如果是路径终点,注入 kwargs;否则只创建空模块 + attrs = kwargs if i == len(parts) else {} + sys.modules[curr_name] = create_fake_module(curr_name, **attrs) + + # 将子模块挂载到父模块 (例如将 b 挂载到 a.b) + if i > 1: + parent_name = ".".join(parts[:i-1]) + child_name = parts[i-1] + setattr(sys.modules[parent_name], child_name, sys.modules[curr_name]) + + print(f"🛡️ [Deep Mock] 已构建路径: {full_path}") + +def mock_leaf(module_name, **kwargs): + """仅屏蔽叶子,假设父模块已存在或不需要""" + sys.modules[module_name] = create_fake_module(module_name, **kwargs) + print(f"🛡️ [Leaf Mock] 已屏蔽: {module_name}") + +# --- 开始屏蔽 --- + +# 1. 彻底干掉 ONNXRuntime +mock_deep_path("onnxruntime.capi._pybind_state") +sys.modules["onnxruntime"].InferenceSession = None +sys.modules["onnxruntime"].get_available_providers = lambda: ["CPUExecutionProvider"] + +# 2. 干掉 LayoutParser (关键修复:构建完整引用链) +# 报错显示代码需要 layoutparser.elements.layout.TextBlock +mock_deep_path("layoutparser.elements.layout", TextBlock=MockClass) + +# 3. 干掉 Detectron2 +mock_deep_path("detectron2.config") +mock_deep_path("detectron2.engine") + +# 4. 
干掉 Unstructured 内部模型 +mock_leaf("unstructured_inference.models.chipper", + MODEL_TYPES={}, + UnstructuredChipperModel=MockClass +) +mock_leaf("unstructured_inference.models.detectron2", + MODEL_TYPES={}, + UnstructuredDetectronModel=MockClass +) +mock_leaf("unstructured_inference.models.detectron2onnx", + MODEL_TYPES={}, + UnstructuredDetectronONNXModel=MockClass +) +mock_leaf("unstructured_inference.models.super_gradients", + UnstructuredSuperGradients=MockClass, + UnstructuredSuperGradientsModel=MockClass +) +mock_leaf("unstructured_inference.models.paddle_ocr", + UnstructuredPaddleOCRModel=MockClass +) + +import logging +import time + +# ============================================================================== +# [阶段 2] 初始化 PyTorch NPU +# ============================================================================== +import torch +try: + import torch_npu + torch.npu.set_device(0) + print(f"✅ [Main Process] PyTorch NPU Initialized: {torch.npu.get_device_name(0)}") +except ImportError: + print("❌ 严重错误: 未找到 torch_npu。") + sys.exit(1) + +# ============================================================================== +# [阶段 3] 配置环境 +# ============================================================================== +os.environ["CUSTOM_DEVICE_ROOT"] = "/tmp/block_paddle_npu_in_main_process" +# 使用 hf-mirror 访问 HuggingFace +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +# 表格结构模型(table-transformer)需要从 HuggingFace 拉取/读取缓存 +os.environ["HF_HUB_OFFLINE"] = "0" + +sys.path.append(os.getcwd()) +if os.path.exists("YOLOX-main"): + sys.path.append(os.path.abspath("YOLOX-main")) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("NPU_Benchmark") + +# ============================================================================== +# [阶段 4] 加载适配器 +# ============================================================================== +if os.path.exists("npu_adapter.py"): + try: + import npu_adapter + 
logger.info("应用 YOLOX NPU 补丁...") + npu_adapter.apply_patches() + except Exception as e: + logger.error(f"NPU 适配器加载失败: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +# ============================================================================== +# [阶段 5] 业务逻辑 +# ============================================================================== +try: + from unstructured.partition.pdf import partition_pdf + from unstructured.partition.docx import partition_docx +except ImportError as e: + logger.error(f"缺少 unstructured 库: {e}") + sys.exit(1) + +try: + from unstructured.partition.doc import partition_doc +except ImportError: + partition_doc = None + + +def save_results(file_path, elements, duration): + output_dir = os.path.join(os.getcwd(), "output") + os.makedirs(output_dir, exist_ok=True) + + file_name = os.path.splitext(os.path.basename(file_path))[0] + txt_path = os.path.join(output_dir, f"{file_name}_result.txt") + json_path = os.path.join(output_dir, f"{file_name}_result.json") + + txt_sections = [] + for idx, e in enumerate(elements): + category = getattr(e, "category", "Unknown") + text = str(getattr(e, "text", str(e))).strip() + meta = getattr(e, "metadata", None) + text_as_html = getattr(meta, "text_as_html", None) if meta else None + + txt_sections.append(f"[{idx}] [{category}] {text}") + if text_as_html: + txt_sections.append(f"HTML: {text_as_html}") + + full_text = "\n\n".join(txt_sections) + + json_items = [] + for idx, e in enumerate(elements): + meta = getattr(e, "metadata", None) + coords = getattr(meta, "coordinates", None) if meta else None + page_number = getattr(meta, "page_number", None) if meta else None + item = { + "index": idx, + "category": getattr(e, "category", "Unknown"), + "text": str(getattr(e, "text", str(e))), + "page_number": page_number, + "coordinates": str(coords) if coords is not None else None, + "text_as_html": getattr(meta, "text_as_html", None) if meta else None, + } + json_items.append(item) + + summary = { 
+ "input_file": file_path, + "duration_seconds": round(duration, 2), + "element_count": len(elements), + "elements": json_items, + } + + with open(txt_path, "w", encoding="utf-8") as f: + f.write(full_text) + + with open(json_path, "w", encoding="utf-8") as f: + json.dump(summary, f, ensure_ascii=False, indent=2) + + logger.info(f"结果已写入: {txt_path}") + logger.info(f"结果已写入: {json_path}") + +def _extract_elements(file_path): + ext = os.path.splitext(file_path)[1].lower() + + if ext == ".pdf": + return partition_pdf( + filename=file_path, + strategy="hi_res", + hi_res_model_name="yolox", + infer_table_structure=True, + ocr_strategy="force", + languages=["chi_sim", "eng"], + ), "PyTorch Native (NPU) + Deep Mock LayoutParser" + + if ext == ".docx": + return partition_docx( + filename=file_path, + infer_table_structure=True, + ), "Word 文档解析 (docx)" + + if ext == ".doc": + if partition_doc is None: + raise RuntimeError("当前环境未安装 .doc 解析依赖,请先安装 unstructured[doc] 相关依赖") + return partition_doc( + filename=file_path, + infer_table_structure=True, + ), "Word 文档解析 (doc)" + + raise ValueError(f"暂不支持该文件类型: {ext},当前仅支持 .pdf/.docx/.doc") + + +def run_benchmark(file_path): + if not os.path.exists(file_path): + logger.error(f"文件不存在: {file_path}") + return + + logger.info(f"处理文件: {file_path}") + + start_time = time.time() + + try: + elements, mode_desc = _extract_elements(file_path) + logger.info(f"模式: {mode_desc}") + except Exception as e: + logger.error(f"处理崩溃: {e}") + import traceback + traceback.print_exc() + return + + duration = time.time() - start_time + + if not elements: + logger.error("未提取到元素。") + return + + count = len(elements) + full_text = "\n".join([str(e) for e in elements]) + + logger.info("-" * 40) + logger.info(f"耗时: {duration:.2f}s") + logger.info(f"检测到元素: {count}") + logger.info(f"字符数: {len(full_text)}") + + if count > 0: + types = list(set([e.category for e in elements])) + logger.info(f"元素类型: {types}") + + if len(full_text) > 0: + 
logger.info(f"预览:\n{full_text[:300]}...") + else: + logger.warning("OCR 结果为空") + + save_results(file_path, elements, duration) + + logger.info("-" * 40) + +if __name__ == "__main__": + test_file = sys.argv[1] if len(sys.argv) > 1 else "attention.pdf" + if not os.path.exists(test_file): + if os.path.exists("test_doc.pdf"): + test_file = "test_doc.pdf" + + if os.path.exists(test_file): + run_benchmark(test_file) + else: + logger.error("找不到测试文件。") \ No newline at end of file diff --git a/runtime/ops/mapper/unstructured_npu/fusion_result.json b/runtime/ops/mapper/unstructured_npu/fusion_result.json new file mode 100644 index 00000000..bee0b8bf --- /dev/null +++ b/runtime/ops/mapper/unstructured_npu/fusion_result.json @@ -0,0 +1,338 @@ +{ + "session_and_graph_id_0_0": { + "graph_fusion": { + "ARefreshCubeC0FusionPass": { + "effect_times": "1", + "match_times": "1" + }, + "Conv2dToConv2dV2FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvFormatRefreshFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvToFullyConnectionFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvWeightCompressFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "CubeTransFixpipeFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEAPREQUANTFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FixPipeAbilityProcessPass": { + "effect_times": "1", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "TransdataCastFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFz2FzgFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFzg2FzFusionPass": { + "effect_times": "0", + "match_times": "3" + } + } + }, + "session_and_graph_id_1_1": { + "graph_fusion": { + "ARefreshCubeC0FusionPass": { + "effect_times": "1", + 
"match_times": "1" + }, + "Conv2dToConv2dV2FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvFormatRefreshFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvToFullyConnectionFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvWeightCompressFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "CubeTransFixpipeFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEAPREQUANTFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FixPipeAbilityProcessPass": { + "effect_times": "1", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "TransdataCastFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFz2FzgFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFzg2FzFusionPass": { + "effect_times": "0", + "match_times": "3" + } + } + }, + "session_and_graph_id_2_2": { + "graph_fusion": { + "ARefreshCubeC0FusionPass": { + "effect_times": "1", + "match_times": "1" + }, + "Conv2dToConv2dV2FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvFormatRefreshFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvToFullyConnectionFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvWeightCompressFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "CubeTransFixpipeFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEAPREQUANTFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FixPipeAbilityProcessPass": { + "effect_times": "1", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "TransdataCastFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + 
"TransdataFz2FzgFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFzg2FzFusionPass": { + "effect_times": "0", + "match_times": "3" + } + } + }, + "session_and_graph_id_3_3": { + "graph_fusion": { + "ARefreshCubeC0FusionPass": { + "effect_times": "1", + "match_times": "1" + }, + "Conv2dToConv2dV2FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvFormatRefreshFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvToFullyConnectionFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvWeightCompressFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "CubeTransFixpipeFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEAPREQUANTFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FixPipeAbilityProcessPass": { + "effect_times": "1", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "TransdataCastFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFz2FzgFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFzg2FzFusionPass": { + "effect_times": "0", + "match_times": "3" + } + } + }, + "session_and_graph_id_4_4": { + "graph_fusion": { + "ARefreshCubeC0FusionPass": { + "effect_times": "1", + "match_times": "1" + }, + "Conv2dToConv2dV2FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvFormatRefreshFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvToFullyConnectionFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvWeightCompressFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "CubeTransFixpipeFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEAPREQUANTFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEFUSIONPASS": { + "effect_times": "0", + 
"match_times": "1" + }, + "FixPipeAbilityProcessPass": { + "effect_times": "1", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "TransdataCastFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFz2FzgFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFzg2FzFusionPass": { + "effect_times": "0", + "match_times": "3" + } + } + }, + "session_and_graph_id_5_5": { + "graph_fusion": { + "ARefreshCubeC0FusionPass": { + "effect_times": "1", + "match_times": "1" + }, + "Conv2dToConv2dV2FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvFormatRefreshFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvToFullyConnectionFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "ConvWeightCompressFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "CubeTransFixpipeFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEAPREQUANTFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FIXPIPEFUSIONPASS": { + "effect_times": "0", + "match_times": "1" + }, + "FixPipeAbilityProcessPass": { + "effect_times": "1", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "TransdataCastFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFz2FzgFusionPass": { + "effect_times": "0", + "match_times": "3" + }, + "TransdataFzg2FzFusionPass": { + "effect_times": "0", + "match_times": "3" + } + } + } +} \ No newline at end of file diff --git a/runtime/ops/mapper/unstructured_npu/npu_adapter.py b/runtime/ops/mapper/unstructured_npu/npu_adapter.py new file mode 100644 index 00000000..0df101be --- /dev/null +++ b/runtime/ops/mapper/unstructured_npu/npu_adapter.py @@ -0,0 +1,700 @@ +import os +import sys +import types +import torch +import torch_npu +import numpy as np +import requests +from torchvision.ops import nms 
+from requests.exceptions import ConnectionError +from urllib.parse import urlparse, urlunparse + +# 如用户未显式设置,默认使用 hf-mirror +os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com") + +# ========================================== +# 0. 强力断网拦截 & 基础补丁 +# ========================================== +_orig_request = requests.Session.request + +def mocked_request(self, method, url, *args, **kwargs): + # 仅阻断 YOLOX 相关远程拉取,避免影响表格结构模型(table-transformer)下载 + lowered_url = str(url).lower() + if "yolox" in lowered_url or "yolo_x_layout" in lowered_url: + resp = requests.Response() + resp.status_code = 404 + return resp + + # 强制将 huggingface.co 请求路由到 HF_ENDPOINT(例如 https://hf-mirror.com) + hf_endpoint = os.environ.get("HF_ENDPOINT", "").strip() + if hf_endpoint and "huggingface.co" in lowered_url: + try: + src = urlparse(str(url)) + dst = urlparse(hf_endpoint) + if dst.scheme and dst.netloc: + url = urlunparse((dst.scheme, dst.netloc, src.path, src.params, src.query, src.fragment)) + except Exception: + pass + + return _orig_request(self, method, url, *args, **kwargs) + +requests.Session.request = mocked_request + +# ========================================== +# 1. 
定义增强版 LayoutElements +# ========================================== +class NpuLayoutElements(list): + def __init__(self, items=None, **kwargs): + super().__init__(items if items is not None else []) + for k, v in kwargs.items(): + try: + setattr(self, k, v) + except AttributeError: + pass + + @property + def element_class_ids(self): + return np.array([getattr(x, "type", "Uncategorized") for x in self]) + + @property + def element_coords(self): + coords = [] + for el in self: + if hasattr(el, 'bbox'): + bbox = el.bbox + if hasattr(bbox, 'x1'): + coords.append([bbox.x1, bbox.y1, bbox.x2, bbox.y2]) + elif isinstance(bbox, (list, tuple, np.ndarray)) and len(bbox) >= 4: + coords.append([bbox[0], bbox[1], bbox[2], bbox[3]]) + else: + coords.append([0, 0, 0, 0]) + elif hasattr(el, 'x1') and hasattr(el, 'y1'): + coords.append([el.x1, el.y1, el.x2, el.y2]) + else: + coords.append([0, 0, 0, 0]) + return np.array(coords) if coords else np.empty((0, 4)) + + @property + def x1(self): return self.element_coords[:, 0] + @property + def y1(self): return self.element_coords[:, 1] + @property + def x2(self): return self.element_coords[:, 2] + @property + def y2(self): return self.element_coords[:, 3] + + @property + def texts(self): + return np.array([getattr(x, "text", None) for x in self]) + + @texts.setter + def texts(self, values): + for i, val in enumerate(values): + if i < len(self): + if hasattr(self[i], 'text'): + self[i].text = val + else: + try: + setattr(self[i], 'text', val) + except AttributeError: + pass + + @property + def probs(self): + return np.array([getattr(x, "prob", 0.0) for x in self]) + + def slice(self, selection): + if isinstance(selection, np.ndarray): + if selection.dtype == bool: + subset = [item for item, keep in zip(self, selection) if keep] + else: + subset = [self[i] for i in selection] + return NpuLayoutElements(subset) + + if isinstance(selection, list): + subset = [self[i] for i in selection] + return NpuLayoutElements(subset) + + res = 
super().__getitem__(selection) + if isinstance(res, list): + return NpuLayoutElements(res) + return NpuLayoutElements([res]) + + @classmethod + def concatenate(cls, layouts): + combined_items = [] + for layout in layouts: + combined_items.extend(layout) + return cls(items=combined_items) + +# ========================================== +# 2. 核心适配器入口 +# ========================================== +class NpuInferenceContext: + def __enter__(self): + return self + def __exit__(self, exc_type, exc_val, exc_tb): + pass + +# ========================================== +# 3. NPU 强力安全算子 (带同步检测) +# ========================================== + +def safe_add(a, b): + try: + res = a + b + torch.npu.synchronize() + return res + except Exception: + return (a.cpu() + b.cpu()).to(a.device) + +def safe_cat(tensors, dim=1): + try: + res = torch.cat(tensors, dim=dim) + torch.npu.synchronize() + return res + except Exception: + cpu_tensors = [t.cpu() for t in tensors] + if not cpu_tensors: return torch.tensor([], device=tensors[0].device) + return torch.cat(cpu_tensors, dim=dim).to(tensors[0].device) + +def safe_sigmoid(x): + try: + res = torch.sigmoid(x) + torch.npu.synchronize() + return res + except Exception: + return torch.sigmoid(x.cpu()).to(x.device) + +def safe_exp(x): + try: + res = torch.exp(x) + torch.npu.synchronize() + return res + except Exception: + return torch.exp(x.cpu()).to(x.device) + +class SafeNpuSiLU(torch.nn.Module): + def __init__(self, inplace=False): + super().__init__() + + def forward(self, x): + try: + x = x.contiguous() + res = x * torch.sigmoid(x) + torch.npu.synchronize() + return res + except Exception: + device = x.device + x_cpu = x.cpu() + return (x_cpu * torch.sigmoid(x_cpu)).to(device) + +class SafeNpuUpsample(torch.nn.Module): + def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=None): + super().__init__() + self.size = size + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + 
self.op = torch.nn.Upsample(size, scale_factor, mode, align_corners) + + def forward(self, x): + dev = x.device + return self.op(x.cpu()).to(dev) + +class SafeNpuMaxPool2d(torch.nn.Module): + def __init__(self, kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False): + super().__init__() + self.op = torch.nn.MaxPool2d(kernel_size, stride, padding, dilation, return_indices, ceil_mode) + + def forward(self, x): + dev = x.device + return self.op(x.cpu()).to(dev) + +# ========================================== +# 4. YOLOX 模块补丁 +# ========================================== + +def npu_focus_forward(self, x): + target_device = x.device + x_cpu = x.cpu().float() + patch_top_left = x_cpu[..., ::2, ::2] + patch_bot_left = x_cpu[..., 1::2, ::2] + patch_top_right = x_cpu[..., ::2, 1::2] + patch_bot_right = x_cpu[..., 1::2, 1::2] + x_cat = torch.cat( + (patch_top_left, patch_bot_left, patch_top_right, patch_bot_right), + dim=1, + ).contiguous() + + x_npu = x_cat.to(target_device) + conv_out_npu = self.conv.conv(x_npu) + res_cpu = conv_out_npu.cpu() + res_cpu = res_cpu * torch.sigmoid(res_cpu) + return res_cpu.to(target_device) + +def npu_bottleneck_forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = safe_add(y, x) + return y + +def npu_csplayer_forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = safe_cat((x_1, x_2), dim=1) + return self.conv3(x) + +def npu_spp_forward(self, x): + x = self.conv1(x) + x_1 = self.m[0](x) + x_2 = self.m[1](x) + x_3 = self.m[2](x) + x = safe_cat((x, x_1, x_2, x_3), dim=1) + return self.conv2(x) + +def npu_yolopafpn_forward(self, input): + out_features = self.backbone(input) + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + fpn_out0 = self.lateral_conv0(x0) + f_out0 = self.upsample(fpn_out0) + f_out0 = safe_cat([f_out0, x1], 1) + f_out0 = self.C3_p4(f_out0) + + fpn_out1 = self.reduce_conv1(f_out0) + f_out1 = 
self.upsample(fpn_out1) + f_out1 = safe_cat([f_out1, x2], 1) + pan_out2 = self.C3_p3(f_out1) + + p_out1 = self.bu_conv2(pan_out2) + p_out1 = safe_cat([p_out1, fpn_out1], 1) + pan_out1 = self.C3_n3(p_out1) + + p_out0 = self.bu_conv1(pan_out1) + p_out0 = safe_cat([p_out0, fpn_out0], 1) + pan_out0 = self.C3_n4(p_out0) + + return (pan_out2, pan_out1, pan_out0) + +def npu_yolohead_forward(self, xin, labels=None, imgs=None): + outputs = [] + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + output = torch.cat( + [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1 + ) + else: + sig_obj = safe_sigmoid(obj_output) + sig_cls = safe_sigmoid(cls_output) + output = safe_cat([reg_output, sig_obj, sig_cls], 1) + + outputs.append(output) + + if self.training: + return outputs + else: + self.hw = [x.shape[-2:] for x in outputs] + outputs_flattened = [x.flatten(start_dim=2) for x in outputs] + cat_out = safe_cat(outputs_flattened, dim=2) + try: + outputs = cat_out.permute(0, 2, 1).contiguous() + torch.npu.synchronize() + except Exception: + outputs = cat_out.cpu().permute(0, 2, 1).contiguous() + + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + +def npu_yolohead_decode_outputs(self, outputs, dtype=None): + outputs = outputs.cpu() + grids = [] + strides = [] + + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, 
dim=1).type(outputs.dtype) + strides = torch.cat(strides, dim=1).type(outputs.dtype) + + outputs_xy = outputs[..., :2] + outputs_wh = outputs[..., 2:4] + outputs_rest = outputs[..., 4:] + + outputs_xy = (outputs_xy + grids) * strides + outputs_wh = torch.exp(outputs_wh) * strides + + return torch.cat([outputs_xy, outputs_wh, outputs_rest], dim=-1) + +# ========================================== +# 5. 模型结构优化 +# ========================================== + +def optimize_model_for_npu(model): + print("[NPU Adapter] Optimizing model structure for Ascend NPU...") + from yolox.models.network_blocks import BaseConv + import torch.nn as nn + + counts = {"bn_fused": 0, "silu_replaced": 0, "upsample_replaced": 0, "maxpool_replaced": 0} + + def recursive_replace(m): + for name, child in m.named_children(): + if isinstance(child, nn.SiLU): + setattr(m, name, SafeNpuSiLU()) + counts["silu_replaced"] += 1 + elif isinstance(child, nn.Upsample): + safe_up = SafeNpuUpsample( + size=child.size, + scale_factor=child.scale_factor, + mode=child.mode, + align_corners=child.align_corners + ) + setattr(m, name, safe_up) + counts["upsample_replaced"] += 1 + elif isinstance(child, nn.MaxPool2d): + safe_pool = SafeNpuMaxPool2d( + kernel_size=child.kernel_size, + stride=child.stride, + padding=child.padding, + dilation=child.dilation, + return_indices=child.return_indices, + ceil_mode=child.ceil_mode + ) + setattr(m, name, safe_pool) + counts["maxpool_replaced"] += 1 + else: + recursive_replace(child) + + recursive_replace(model) + + for name, m in model.named_modules(): + if isinstance(m, BaseConv): + if hasattr(m, "bn") and isinstance(m.bn, nn.BatchNorm2d): + conv = m.conv + bn = m.bn + with torch.no_grad(): + w = conv.weight + if conv.bias is None: + b = torch.zeros(w.shape[0], device=w.device, dtype=w.dtype) + else: + b = conv.bias + bn_mean = bn.running_mean + bn_var = bn.running_var + bn_gamma = bn.weight + bn_beta = bn.bias + bn_eps = bn.eps + inv_std = 1.0 / torch.sqrt(bn_var + 
bn_eps) + w_fused = w * (bn_gamma * inv_std).reshape(-1, 1, 1, 1) + b_fused = (b - bn_mean) * (bn_gamma * inv_std) + bn_beta + m.conv.weight.copy_(w_fused) + if m.conv.bias is None: + m.conv.bias = torch.nn.Parameter(b_fused) + else: + m.conv.bias.copy_(b_fused) + m.bn = nn.Identity() + counts["bn_fused"] += 1 + + print(f"[NPU Adapter] Optimization Stats: {counts}") + +def apply_patches(): + print("[NPU Adapter] Applying monkey patches...") + import unstructured_inference.models.base as model_base + model_base.get_model = npu_get_model + + try: + import unstructured_inference.inference.layout as layout_module + layout_module.get_model = npu_get_model + except ImportError: pass + + from unstructured_inference.inference.layout import PageLayout + # 覆盖 PageLayout 的构造工厂方法 + PageLayout.from_image = classmethod(npu_pagelayout_from_image) + + from unstructured_inference.models.yolox import UnstructuredYoloXModel + UnstructuredYoloXModel.predict = npu_yolox_predict + + import unstructured_inference.inference.layoutelement as layoutelement_pkg + layoutelement_pkg.LayoutElements = NpuLayoutElements + sys.modules['unstructured_inference.inference.layoutelement'].LayoutElements = NpuLayoutElements + + try: + from yolox.models.network_blocks import Focus, Bottleneck, CSPLayer, SPPBottleneck + from yolox.models.yolo_pafpn import YOLOPAFPN + from yolox.models.yolo_head import YOLOXHead + + Focus.forward = npu_focus_forward + print("✅ Patch: Focus (Hybrid CPU/NPU).") + Bottleneck.forward = npu_bottleneck_forward + print("✅ Patch: Bottleneck (Safe Add w/ Sync).") + CSPLayer.forward = npu_csplayer_forward + print("✅ Patch: CSPLayer (Safe Cat w/ Sync).") + SPPBottleneck.forward = npu_spp_forward + print("✅ Patch: SPPBottleneck (Safe Cat w/ Sync).") + YOLOPAFPN.forward = npu_yolopafpn_forward + print("✅ Patch: YOLOPAFPN (Re-implemented with Safe Cat).") + YOLOXHead.forward = npu_yolohead_forward + print("✅ Patch: YOLOXHead (Safe Sigmoid & Cat).") + YOLOXHead.decode_outputs = 
npu_yolohead_decode_outputs + print("✅ Patch: YOLOXHead.decode_outputs (Force CPU).") + + except ImportError as e: + print(f"⚠️ Warning: Could not patch YOLOX blocks: {e}") + + print("✅ Monkey Patch: All NPU hooks applied.") + +# ========================================== +# 6. 模型加载逻辑 +# ========================================== +_NPU_MODEL_CACHE = {} + +def npu_get_model(model_name: str, **kwargs): + global _NPU_MODEL_CACHE + kwargs.pop('password', None) + + if model_name in _NPU_MODEL_CACHE: + return _NPU_MODEL_CACHE[model_name] + + if os.path.exists("./yolox_l.pt"): + model_path = "./yolox_l.pt" + else: + model_path = "/mnt/nvme0n1/pjj-data/data/models/yolox_l.pt" + + print(f"[NPU Adapter] Loading local model: {model_path}") + + from unstructured_inference.models.yolox import UnstructuredYoloXModel + model = UnstructuredYoloXModel() + model.model_path = model_path + + try: + ckpt = torch.load(model_path, map_location="cpu") + except Exception: + try: + ckpt = torch.jit.load(model_path, map_location="cpu") + except Exception as e: + print(f"❌ Error loading model: {e}") + raise FileNotFoundError(f"Model file not found or corrupted: {model_path}. 
Please download it.") + + if isinstance(ckpt, dict): + state_dict = ckpt.get("model", ckpt.get("state_dict", ckpt)) + else: + state_dict = ckpt.state_dict() if hasattr(ckpt, "state_dict") else ckpt + + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + + num_classes = 5 + for k, v in state_dict.items(): + if "head.cls_preds" in k and hasattr(v, "shape"): + if v.shape[0] != num_classes: + num_classes = v.shape[0] + break + + def init_yolo(depth, width): + in_channels = [256, 512, 1024] + backbone = YOLOPAFPN(depth, width, in_channels=in_channels) + head = YOLOXHead(num_classes, width, in_channels=in_channels) + return YOLOX(backbone, head) + + model.model = init_yolo(1.0, 1.0) + model.model.load_state_dict(state_dict, strict=False) + model.model.eval() + optimize_model_for_npu(model.model) + + print("Moving model to NPU (FP32)...") + model.model.to("npu") + + print("[NPU Adapter] Model Ready.") + + _NPU_MODEL_CACHE[model_name] = model + return model + +# ========================================== +# 7. 
# ==========================================
# 7. Inference overrides
# ==========================================
def _local_yolox_preprocess(img, input_size, swap=(2, 0, 1)):
    """Letterbox-resize ``img`` onto an ``input_size`` canvas padded with 114.

    Args:
        img: HxW or HxWx3 uint8 image.
        input_size: Target (height, width) of the padded canvas.
        swap: Axis permutation applied at the end (default HWC -> CHW).

    Returns:
        Tuple ``(canvas, ratio)`` where ``canvas`` is a contiguous float32
        array and ``ratio`` maps original-image coords to canvas coords.
    """
    import cv2
    if len(img.shape) == 3:
        padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
    else:
        padded_img = np.ones(input_size, dtype=np.uint8) * 114

    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    resized_img = cv2.resize(
        img,
        (int(img.shape[1] * r), int(img.shape[0] * r)),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.uint8)

    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
    padded_img = padded_img.transpose(swap)  # HWC -> CHW
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
    return padded_img, r


def npu_yolox_predict(self, x: np.ndarray):
    """Run YOLOX layout detection on the NPU for one image.

    Replaces the stock ONNX predict: preprocess on CPU, forward on NPU,
    then confidence filtering + NMS + coordinate sanitation on CPU.

    Args:
        x: Input image as an HxWx3 array (converted with ``np.asarray`` if not).

    Returns:
        ``NpuLayoutElements`` with one ``LayoutElement`` per kept detection.
    """
    if not isinstance(x, np.ndarray):
        x = np.asarray(x)

    input_shape = (1024, 1024)
    image_h, image_w = x.shape[:2]
    preprocessed_img, ratio = _local_yolox_preprocess(x, input_shape)

    input_tensor = torch.from_numpy(preprocessed_img).unsqueeze(0).to("npu")

    with torch.no_grad():
        torch.npu.synchronize()
        outputs = self.model(input_tensor)
        torch.npu.synchronize()

    raw_out = outputs.get("det", outputs.get("dets")) if isinstance(outputs, dict) else outputs

    if raw_out is None:
        return NpuLayoutElements([])

    decoder_outputs = raw_out.float().cpu()
    # Scrub NaN/Inf the NPU kernel may emit so comparisons below are safe.
    decoder_outputs = torch.nan_to_num(decoder_outputs, nan=0.0, posinf=10000.0, neginf=0.0)
    predictions = decoder_outputs[0]

    # Convert center-format (cx, cy, w, h) boxes to corner format (x1, y1, x2, y2).
    boxes_xywh = predictions[:, :4]
    boxes_xyxy = torch.empty_like(boxes_xywh)
    boxes_xyxy[:, 0] = boxes_xywh[:, 0] - boxes_xywh[:, 2] / 2.0
    boxes_xyxy[:, 1] = boxes_xywh[:, 1] - boxes_xywh[:, 3] / 2.0
    boxes_xyxy[:, 2] = boxes_xywh[:, 0] + boxes_xywh[:, 2] / 2.0
    boxes_xyxy[:, 3] = boxes_xywh[:, 1] + boxes_xywh[:, 3] / 2.0
    obj_scores = predictions[:, 4:5]
    cls_scores = predictions[:, 5:]

    cls_max_scores, cls_ids = cls_scores.max(1, keepdim=True)
    final_scores = obj_scores * cls_max_scores

    conf_thr = 0.1
    # BUGFIX: use reshape(-1) instead of squeeze(). When exactly one row
    # survives, squeeze() yields a 0-d tensor, which breaks boolean
    # indexing and makes nms() fail on non-1-D scores.
    mask = final_scores.reshape(-1) > conf_thr

    filtered_boxes = boxes_xyxy[mask]
    filtered_scores = final_scores[mask].reshape(-1)
    filtered_cls_ids = cls_ids[mask].reshape(-1)

    if len(filtered_boxes) == 0:
        return NpuLayoutElements([])

    nms_thr = 0.45
    keep_indices = nms(filtered_boxes, filtered_scores, nms_thr)

    final_boxes = filtered_boxes[keep_indices]
    final_scores = filtered_scores[keep_indices]
    final_cls_ids = filtered_cls_ids[keep_indices]

    # Undo the letterbox scaling back to original-image coordinates.
    final_boxes /= ratio

    # Clamp coordinates to the image bounds and fix any inverted boxes.
    x1 = torch.minimum(final_boxes[:, 0], final_boxes[:, 2]).clamp(0.0, float(image_w))
    y1 = torch.minimum(final_boxes[:, 1], final_boxes[:, 3]).clamp(0.0, float(image_h))
    x2 = torch.maximum(final_boxes[:, 0], final_boxes[:, 2]).clamp(0.0, float(image_w))
    y2 = torch.maximum(final_boxes[:, 1], final_boxes[:, 3]).clamp(0.0, float(image_h))
    final_boxes = torch.stack([x1, y1, x2, y2], dim=1)

    # Drop degenerate boxes (width or height <= 1px after clamping).
    valid_mask = (final_boxes[:, 2] - final_boxes[:, 0] > 1.0) & (final_boxes[:, 3] - final_boxes[:, 1] > 1.0)
    final_boxes = final_boxes[valid_mask]
    final_scores = final_scores[valid_mask]
    final_cls_ids = final_cls_ids[valid_mask]

    if len(final_boxes) == 0:
        return NpuLayoutElements([])

    from unstructured_inference.inference.layoutelement import LayoutElement
    elements_list = []

    # NOTE(review): this looks like the DocLayNet 11-class label set, while
    # num_classes is inferred from the checkpoint head — confirm they match.
    label_map = {
        0: "Caption", 1: "Footnote", 2: "Formula", 3: "List-item",
        4: "Page-footer", 5: "Page-header", 6: "Picture", 7: "Section-header",
        8: "Table", 9: "Text", 10: "Title"
    }

    for box, score, cls_id in zip(final_boxes, final_scores, final_cls_ids):
        x1, y1, x2, y2 = box.numpy()
        label = label_map.get(int(cls_id.item()), "Text")
        elements_list.append(LayoutElement.from_coords(x1, y1, x2, y2, text=None, type=label, prob=score.item()))

    return NpuLayoutElements(elements_list)


# [Core fix] PageLayout.from_image compatible with the installed
# unstructured_inference version (intended to be bound as a classmethod).
def npu_pagelayout_from_image(
    cls,
    image,
    image_path=None,
    document_filename=None,
    number=1,
    detection_model=None,
    element_extraction_model=None,
    layout=None,
    extract_tables=False,
    fixed_layout=None,
    extract_images_in_pdf=False,
    image_output_dir_path=None,
    analysis=False,
    **kwargs,
):
    """Build a PageLayout from a PIL image, running detection if needed.

    Mirrors the upstream implementation but defaults to the patched
    "yolox" detection model and tolerates a missing deduplication helper.
    """
    if detection_model is None:
        from unstructured_inference.models.base import get_model
        detection_model = get_model("yolox", **kwargs)

    page = cls(
        number=number,
        image=image,
        layout=layout,
        detection_model=detection_model,
        element_extraction_model=element_extraction_model,
        extract_tables=extract_tables,
        analysis=analysis,
    )

    if element_extraction_model is not None:
        page.get_elements_using_image_extraction()
    elif fixed_layout is not None:
        page.elements = page.get_elements_from_layout(fixed_layout)
    else:
        inferred_layout = detection_model.predict(np.array(image))
        # Older model classes may not implement deduplication; best-effort.
        try:
            inferred_layout = detection_model.deduplicate_detected_elements(inferred_layout)
        except Exception:
            pass
        page.elements = page.get_elements_from_layout(inferred_layout)
        if analysis:
            page.inferred_layout = inferred_layout

    page.image_metadata = {
        "format": page.image.format if page.image else None,
        "width": page.image.width if page.image else None,
        "height": page.image.height if page.image else None,
    }
    page.image_path = os.path.abspath(image_path) if image_path else None
    page.document_filename = os.path.abspath(document_filename) if document_filename else None

    if extract_images_in_pdf:
        page.extract_images(image_output_dir_path)

    # Match the original implementation: release the image to save memory.
    page.image = None
    return page
+import atexit +import time +import threading +import types +import importlib.util +import importlib.machinery + +# ========================================== +# 0. Worker Process Logic (Isolated Environment) +# ========================================== +def _paddle_worker_main(in_queue, out_queue): + """ + Runs in a completely separate process. + PREVENTS Paddle from loading the NPU plugin to avoid memory conflicts. + """ + # 1. 基础环境配置 + os.environ["OMP_NUM_THREADS"] = "1" + os.environ["MKL_NUM_THREADS"] = "1" + os.environ["Paddle_OP_PARALLELISM_THREADS"] = "1" + + # 2. 内存分配器优化 + os.environ["FLAGS_allocator_strategy"] = 'naive_best_fit' + os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0' + os.environ["FLAGS_use_system_allocator"] = "1" + + # 3. 【核心修复】禁止加载 NPU 插件 + os.environ["CUSTOM_DEVICE_ROOT"] = "/tmp/dummy_empty_dir_for_isolation" + + # 4. 辅助屏蔽硬件可见性 + os.environ["CUDA_VISIBLE_DEVICES"] = "" + os.environ["ASCEND_VISIBLE_DEVICES"] = "" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "" + + try: + import paddle + from paddleocr import PaddleOCR + + warnings.filterwarnings("ignore") + paddle.disable_signal_handler() + + # 显式切换到 CPU + try: + paddle.set_device('cpu') + except Exception: + pass + + # 初始化 OCR + ocr_engine = PaddleOCR( + use_angle_cls=False, + lang="ch", + use_gpu=False, + show_log=False, + use_mp=False, + total_process_num=0, + enable_mkldnn=True, + use_tensorrt=False + ) + + out_queue.put(("INIT_SUCCESS", "CPU Mode (Plugin Disabled)")) + + while True: + task = in_queue.get() + if task is None: + break + req_id, img_array = task + try: + if not isinstance(img_array, np.ndarray): + img_array = np.array(img_array) + # 执行 OCR + result = ocr_engine.ocr(img_array, cls=False) + out_queue.put((req_id, "OK", result)) + except Exception as e: + out_queue.put((req_id, "ERROR", str(e))) + + except Exception as e: + out_queue.put(("INIT_ERROR", f"Worker Crash: {str(e)}")) + +# ========================================== +# 1. 
OCR Client (Main Process) +# ========================================== +class PaddleOCRInference: + _instance = None + + def __init__(self): + self.ctx = multiprocessing.get_context('spawn') + self.in_q = self.ctx.Queue() + self.out_q = self.ctx.Queue() + self.lock = threading.Lock() + self.is_alive = False + + print(f"\n\033[94m[OCR Adapter] Spawning isolated OCR process (CPU Mode)...\033[0m") + self.process = self.ctx.Process( + target=_paddle_worker_main, + args=(self.in_q, self.out_q) + ) + self.process.daemon = True + self.process.start() + + try: + status, msg = self.out_q.get(timeout=30) + if status == "INIT_SUCCESS": + print(f"\033[92m[OCR Adapter] OCR Process Ready. [{msg}]\033[0m") + self.is_alive = True + else: + print(f"\033[91m[OCR Adapter] Worker Init Failed: {msg}\033[0m") + self.kill() + except Exception as e: + print(f"\033[91m[OCR Adapter] Worker Timeout/Error: {e}\033[0m") + self.kill() + + atexit.register(self.kill) + + def kill(self): + if self.process.is_alive(): + self.in_q.put(None) + self.process.join(timeout=1) + if self.process.is_alive(): + self.process.terminate() + self.is_alive = False + + def ocr(self, img_array): + if not self.is_alive: + return [[]] + + with self.lock: + req_id = time.time() + try: + self.in_q.put((req_id, img_array)) + resp_id, status, data = self.out_q.get(timeout=30) + if resp_id != req_id or status == "ERROR": + return [[]] + return data + except Exception: + self.is_alive = False + return [[]] + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = PaddleOCRInference() + return cls._instance + +# ========================================== +# 2. 
Logic Implementation +# ========================================== +def _impl_paddle_to_data(image_array): + client = PaddleOCRInference.get_instance() + result = client.ocr(image_array) + + data = { + 'level': [], 'page_num': [], 'block_num': [], 'par_num': [], + 'line_num': [], 'word_num': [], 'left': [], 'top': [], + 'width': [], 'height': [], 'conf': [], 'text': [] + } + + if not result or result[0] is None: + return pd.DataFrame(data) + + for idx, line in enumerate(result[0]): + try: + box, (txt, conf) = line + xs = [pt[0] for pt in box] + ys = [pt[1] for pt in box] + x_min, y_min = int(min(xs)), int(min(ys)) + w, h = int(max(xs) - x_min), int(max(ys) - y_min) + + data['level'].append(5) + data['page_num'].append(1) + data['block_num'].append(1) + data['par_num'].append(1) + data['line_num'].append(idx + 1) + data['word_num'].append(1) + data['left'].append(x_min) + data['top'].append(y_min) + data['width'].append(w) + data['height'].append(h) + data['conf'].append(conf * 100) + data['text'].append(txt) + except Exception: + continue + return pd.DataFrame(data) + +def _impl_image_to_data(image, lang=None, output_type=None, **kwargs): + img_array = np.array(image) + df = _impl_paddle_to_data(img_array) + if output_type == 'data.frame': return df + elif output_type == 'dict': return df.to_dict(orient='list') + else: return df.to_csv(sep='\t', index=False) + +def _impl_image_to_string(image, lang=None, **kwargs): + img_array = np.array(image) + client = PaddleOCRInference.get_instance() + result = client.ocr(img_array) + if result is None or len(result) == 0 or result[0] is None: + return "" + try: + lines = [line[1][0] for line in result[0] if line[1]] + return "\n".join(lines) + except: + return "" + +def _impl_image_to_pdf(image, **kwargs): return b'' + +class _ImplOutput: + BYTES = "bytes" + DATAFRAME = "data.frame" + DICT = "dict" + STRING = "string" + +class _ImplTesseractNotFoundError(EnvironmentError): pass + +# ========================================== 
+# 3. Apply Patch (Module Injection) +# ========================================== +def apply_ocr_patch(): + # 使用 types.ModuleType 创建一个真实的模块对象 + # 这比使用 Class 伪装更稳定,兼容所有 inspect/importlib 检查 + fake_mod = types.ModuleType("pytesseract") + fake_mod.__file__ = "fake_pytesseract.py" + fake_mod.__path__ = [] + + # 关键修复:设置真实的 ModuleSpec + # loader=None 表示这是一个命名空间包或动态模块,这是允许的且不会报错 + fake_mod.__spec__ = importlib.machinery.ModuleSpec( + name="pytesseract", + loader=None, + origin="fake_pytesseract.py" + ) + + # 挂载功能函数 + fake_mod.image_to_data = _impl_image_to_data + fake_mod.image_to_string = _impl_image_to_string + fake_mod.image_to_pdf_or_hocr = _impl_image_to_pdf + fake_mod.Output = _ImplOutput + fake_mod.TesseractNotFoundError = _ImplTesseractNotFoundError + + # 强制替换系统模块 + sys.modules["pytesseract"] = fake_mod + sys.modules["unstructured_pytesseract"] = fake_mod + + # 尝试修补已经加载的引用 + modules_to_patch = [ + "unstructured.partition.pdf_image.ocr", + "unstructured.partition.utils.ocr_models" + ] + for mod_name in modules_to_patch: + if mod_name in sys.modules: + try: + sys.modules[mod_name].pytesseract = fake_mod + except AttributeError: + pass \ No newline at end of file diff --git a/runtime/ops/mapper/unstructured_npu/run.sh b/runtime/ops/mapper/unstructured_npu/run.sh new file mode 100644 index 00000000..d3516275 --- /dev/null +++ b/runtime/ops/mapper/unstructured_npu/run.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +set -euo pipefail + +# ========================================================= +# Ascend NPU 极简启动脚本 (Fix std::bad_alloc) +# ========================================================= + +# 1. 定义库路径 +JEMALLOC="/usr/lib/aarch64-linux-gnu/libjemalloc.so.2" +GOMP="/usr/lib/aarch64-linux-gnu/libgomp.so.1" + +# 0. 切换到脚本目录,避免从其他目录启动时找不到文件 +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +cd "$SCRIPT_DIR" + +# 2. 检查库是否存在 +if [ ! -f "$JEMALLOC" ]; then + echo "❌ Error: jemalloc not found at $JEMALLOC" + exit 1 +fi + +# 3. 
设置 LD_PRELOAD (覆盖式设置,防止重复) +# 注意:jemalloc 必须排在第一位,libgomp 排第二解决 TLS 问题 +export LD_PRELOAD="$JEMALLOC:$GOMP" + +# 4. Jemalloc 优化参数 (关键:关闭后台线程,防止 NPU 驱动冲突) +export MALLOC_CONF="background_thread:false,dirty_decay_ms:0,muzzy_decay_ms:0" + +# 5. NPU 环境变量 +export FLAGS_use_system_allocator=1 +export expandable_segments=True +export OMP_NUM_THREADS=1 + +# 6. Python 路径 (包含当前目录和 YOLOX) +export PYTHONPATH=$(pwd):$(pwd)/YOLOX-main:$PYTHONPATH + +# 6.1 可选加载 Ascend 环境(若存在) +if [ -f /usr/local/Ascend/ascend-toolkit/set_env.sh ]; then + # shellcheck disable=SC1091 + source /usr/local/Ascend/ascend-toolkit/set_env.sh +elif [ -f /usr/local/Ascend/ascend-toolkit/latest/set_env.sh ]; then + # shellcheck disable=SC1091 + source /usr/local/Ascend/ascend-toolkit/latest/set_env.sh +fi + +# 6.2 参数帮助 +if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then + echo "用法: bash run.sh [文件1] [文件2] ..." + echo "示例: bash run.sh demo.pdf word测试.docx" + echo "未传参时默认处理: attention.pdf" + exit 0 +fi + +# 7. 运行 +echo "🚀 Running Benchmark..." +echo "Using LD_PRELOAD=$LD_PRELOAD" + +if ! command -v python >/dev/null 2>&1; then + echo "❌ Error: python 命令不存在" + exit 1 +fi + +if [ "$#" -eq 0 ]; then + set -- "attention.pdf" +fi + +fail_count=0 +for input_file in "$@"; do + if [ ! -f "$input_file" ]; then + echo "❌ 文件不存在: $input_file" + fail_count=$((fail_count + 1)) + continue + fi + + echo "📄 Processing: $input_file" + if ! python benchmark_npu.py "$input_file"; then + echo "❌ 处理失败: $input_file" + fail_count=$((fail_count + 1)) + fi +done + +if [ "$fail_count" -gt 0 ]; then + echo "⚠️ 完成,但有 $fail_count 个文件失败" + exit 1 +fi + +echo "✅ 全部处理完成" \ No newline at end of file