Skip to content

Commit 28b80e3

Browse files
authored
Merge pull request #597 from ccprocessor/dev
v4.3.0-released
2 parents db5fbe6 + 9086a43 commit 28b80e3

26 files changed

Lines changed: 4260 additions & 315 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,5 @@ llm_web_kit.egg-info/*
4949
.llm-web-kit.jsonc
5050
.llm-web-kit-pageclassify.jsonc
5151
tests/llm_web_kit/extractor/ygq_testmd
52+
output.md
53+
output.jsonl

docs/specification/output_format/content_list_spec.md

Lines changed: 257 additions & 178 deletions
Large diffs are not rendered by default.
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
"""请求日志服务.
2+
3+
提供请求日志的创建、更新和查询功能。
4+
"""
5+
6+
import uuid
7+
from datetime import datetime
8+
from typing import Optional
9+
10+
from sqlalchemy import select
11+
from sqlalchemy.ext.asyncio import AsyncSession
12+
13+
from ..dependencies import get_logger
14+
from ..models.db_models import RequestLog
15+
16+
logger = get_logger(__name__)
17+
18+
19+
class RequestLogService:
    """Service for creating, updating and querying request-log records.

    All methods are static and tolerate a missing database session
    (``session is None``) by becoming no-ops, so logging never breaks
    the request path when the database is not configured.
    """

    @staticmethod
    def generate_request_id() -> str:
        """Return a new globally unique request id (UUID4 string)."""
        return str(uuid.uuid4())

    @staticmethod
    async def _find_log(session: AsyncSession, request_id: str) -> Optional[RequestLog]:
        """Fetch the log row matching *request_id*, or ``None`` if absent.

        Shared lookup used by the update and query methods so the
        ``select`` statement is defined in exactly one place.
        """
        result = await session.execute(
            select(RequestLog).where(RequestLog.request_id == request_id)
        )
        return result.scalar_one_or_none()

    @staticmethod
    async def create_log(
        session: Optional[AsyncSession],
        request_id: str,
        input_type: str,
        input_html: Optional[str] = None,
        url: Optional[str] = None,
    ) -> Optional[RequestLog]:
        """Create a request-log record with status ``'processing'``.

        Args:
            session: Database session; if ``None``, logging is skipped.
            request_id: Unique id of the request.
            input_type: Kind of input (``html_content``, ``url``, ``file``).
            input_html: Raw input HTML, if any.
            url: Source URL, if any.

        Returns:
            The created record, or ``None`` when the database is not
            configured or the insert fails (errors are logged, not raised).
        """
        if session is None:
            logger.debug("数据库会话为空,跳过日志记录")
            return None
        try:
            # Capture one timestamp so created_at == updated_at on creation.
            now = datetime.now()
            log = RequestLog(
                request_id=request_id,
                input_type=input_type,
                input_html=input_html,
                url=url,
                status='processing',
                created_at=now,
                updated_at=now,
            )
            session.add(log)
            await session.flush()  # write immediately so the row id is assigned
            logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing")
            return log
        except Exception as e:
            # Best-effort logging: never let a bookkeeping failure break the request.
            logger.error(f"创建请求日志失败: {e}")
            return None

    @staticmethod
    async def update_log_success(
        session: Optional[AsyncSession],
        request_id: str,
        output_markdown: Optional[str] = None,
    ) -> bool:
        """Mark the request log as successful.

        Args:
            session: Database session; if ``None``, nothing is updated.
            request_id: Id of the request to update.
            output_markdown: Resulting Markdown output to store.

        Returns:
            ``True`` if the record was found and updated, else ``False``.
        """
        if session is None:
            return False
        try:
            log = await RequestLogService._find_log(session, request_id)
            if log is None:
                logger.warning(f"未找到请求日志: request_id={request_id}")
                return False
            log.status = 'success'
            log.output_markdown = output_markdown
            log.updated_at = datetime.now()
            await session.flush()
            logger.info(f"更新请求日志为成功: request_id={request_id}, status=success")
            return True
        except Exception as e:
            logger.error(f"更新请求日志失败: {e}")
            return False

    @staticmethod
    async def update_log_failure(
        session: Optional[AsyncSession],
        request_id: str,
        error_message: str,
    ) -> bool:
        """Mark the request log as failed.

        Args:
            session: Database session; if ``None``, nothing is updated.
            request_id: Id of the request to update.
            error_message: Error description to store on the record.

        Returns:
            ``True`` if the record was found and updated, else ``False``.
        """
        if session is None:
            return False
        try:
            log = await RequestLogService._find_log(session, request_id)
            if log is None:
                logger.warning(f"未找到请求日志: request_id={request_id}")
                return False
            log.status = 'fail'
            log.error_message = error_message
            log.updated_at = datetime.now()
            await session.flush()
            logger.info(f"更新请求日志为失败: request_id={request_id}, status=fail")
            return True
        except Exception as e:
            logger.error(f"更新请求日志失败: {e}")
            return False

    @staticmethod
    async def get_log_by_request_id(
        session: Optional[AsyncSession],
        request_id: str,
    ) -> Optional[RequestLog]:
        """Look up a request log by its request id.

        Args:
            session: Database session; if ``None``, lookup is skipped.
            request_id: Id of the request to find.

        Returns:
            The matching record, or ``None`` if not found, not configured,
            or the query fails (errors are logged, not raised).
        """
        if session is None:
            return None
        try:
            return await RequestLogService._find_log(session, request_id)
        except Exception as e:
            logger.error(f"查询请求日志失败: {e}")
            return None

llm_web_kit/extractor/html/recognizer/cccode.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,29 +88,29 @@ def recognize(
8888

8989
@override
9090
def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
91-
"""
92-
把代码元素转换为content list node.
91+
"""把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),行内代码(CC_CODE_INLIN
92+
E)由TextParagraphRecognizer处理.
93+
9394
Args:
9495
base_url:
9596
parsed_content: HtmlElement对象
9697
raw_html_segment:
9798
9899
Returns:
99-
100100
"""
101101
d = {
102102
'type': 'code',
103-
# "bbox": [],
104-
'raw_content': raw_html_segment,
105-
'inline': parsed_content.get('inline', 'false') == 'true',
103+
'bbox': [],
106104
'content': {
107105
'code_content': parsed_content.text,
108106
},
109107
}
110108

109+
# 可选字段:language
111110
if lang := parsed_content.get('language', None):
112111
d['content']['language'] = lang
113112

113+
# 可选字段:by(代码高亮工具)
114114
if by := parsed_content.get('by', None):
115115
d['content']['by'] = by
116116

llm_web_kit/extractor/html/recognizer/ccmath.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl
5353

5454
@override
5555
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
56-
"""将content转换成content_list_node.
57-
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md
58-
例如代码的返回格式:
56+
"""将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考
57+
docs/specification/output_format/content_list_spec.md.
58+
59+
返回格式示例:
5960
```json
6061
{
61-
"type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种
62-
"raw_content": "<ccmath type="latex" by="mathjax">$u_{x_0}^{in}(x)$</ccmath>",
62+
"type": "equation-interline",
63+
"bbox": [],
6364
"content": {
64-
"math_content": "u_{x_0}^{in}(x)",
65+
"math_content": "a^2 + b^2 = c^2",
6566
"math_type": "latex",
6667
"by": "mathjax"
6768
}
6869
}
69-
```
70+
```
7071
71-
Args:
72-
content: str: 要转换的content
72+
Args:
73+
base_url: 基础URL
74+
parsed_content: 解析后的HtmlElement对象
75+
raw_html_segment: 原始HTML片段
7376
7477
Returns:
7578
dict: content_list_node
@@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
8689
math_content = self.cm.wrap_math_md(math_content)
8790
return {
8891
'type': DocElementType.EQUATION_INTERLINE,
89-
'raw_content': raw_html_segment,
92+
'bbox': [],
9093
'content': {
9194
'math_content': math_content,
9295
'math_type': inter_ele[0].get('type'), # 数学语言类型
@@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
97100
math_content = in_els[0].text
98101
return {
99102
'type': DocElementType.EQUATION_INLINE,
100-
'raw_content': raw_html_segment,
103+
'bbox': [],
101104
'content': {
102105
'math_content': math_content,
103106
'math_type': in_els[0].get('type'), # 数学语言类型

llm_web_kit/extractor/html/recognizer/image.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
5353
raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}')
5454

5555
def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict:
56+
caption = html_obj.get('caption')
57+
footnote = html_obj.get('footnote')
5658
result = {
5759
'type': DocElementType.IMAGE,
58-
'raw_content': raw_html_segment,
60+
'bbox': [],
5961
'content': {
6062
'url': html_obj.text if html_obj.get('format') == 'url' else None,
6163
'data': html_obj.text if html_obj.get('format') == 'base64' else None,
6264
'alt': html_obj.get('alt'),
6365
'title': html_obj.get('title'),
64-
'caption': html_obj.get('caption')
66+
'caption': [caption] if caption else [],
67+
'footnote': [footnote] if footnote else []
6568
}
6669
}
6770
return result

llm_web_kit/extractor/html/recognizer/list.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import re
23
from typing import Any, List, Tuple
34

45
from lxml import html as lxml_html
@@ -44,7 +45,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
4445

4546
ele_node = {
4647
'type': DocElementType.LIST,
47-
'raw_content': raw_html_segment,
48+
'bbox': [],
4849
'content': {
4950
'items': content_list,
5051
'list_attribute': list_attribute,
@@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
196197
if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
197198
paragraph[-1]['c'] += _new_tail
198199
else:
200+
if len(paragraph) > 0 and el.tag not in inline_tags:
201+
_new_tail = '$br$' + _new_tail
199202
paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})
200203

201204
if paragraph:
@@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
212215
text_paragraph.append(new_paragraph)
213216

214217
for n, item in enumerate(text_paragraph):
215-
tem_json = json.dumps(item).replace('$br$', '\\n\\n')
218+
tem_json = json.dumps(item, ensure_ascii=False)
219+
tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json)
216220
text_paragraph[n] = json.loads(tem_json)
217221

218222
return text_paragraph

llm_web_kit/extractor/html/recognizer/table.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
100100
# 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串
101101
if table_type:
102102
cc_table_type = DocElementType.COMPLEX_TABLE
103+
d = {
104+
'type': cc_table_type,
105+
'content': {
106+
'html': html_content,
107+
'table_nest_level': table_nest_level,
108+
"caption": [],
109+
"footnote": []
110+
}
111+
}
103112
else:
104113
cc_table_type = DocElementType.SIMPLE_TABLE
105-
d = {
106-
'type': cc_table_type,
107-
'raw_content': raw_html_segment,
108-
'content': {
109-
'html': html_content,
110-
'is_complex': table_type,
111-
'table_nest_level': table_nest_level
114+
d = {
115+
'type': cc_table_type,
116+
'content': {
117+
'html': html_content,
118+
"caption": [],
119+
"footnote": []
120+
}
112121
}
113-
}
114122
return d
115123

116124
def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool:

llm_web_kit/extractor/html/recognizer/text.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
9393
el = parsed_content
9494
node = {
9595
'type': DocElementType.PARAGRAPH,
96-
'raw_content': raw_html_segment,
96+
# 'raw_content': raw_html_segment,
9797
'content': json.loads(el.text),
9898
}
9999
return node
@@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
271271

272272
for item in para_text:
273273
if item['c'] is not None:
274-
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
274+
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n')
275275
else:
276276
item['c'] = ""
277277

llm_web_kit/extractor/html/recognizer/title.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
3636
return None
3737
cctitle_content_node = {
3838
'type': DocElementType.TITLE,
39-
'raw_content': raw_html_segment,
39+
# 'raw_content': raw_html_segment,
4040
'content': {
4141
'title_content': text,
4242
'level': level

0 commit comments

Comments
 (0)