33提供 HTML 解析、内容提取等功能的 API 端点。
44"""
55
6+ import base64
7+ import html
8+ import time
69from typing import Optional
710
8- from fastapi import APIRouter , Depends , File , HTTPException , UploadFile
11+ from fastapi import (APIRouter , BackgroundTasks , Body , Depends , File ,
12+ HTTPException , UploadFile )
913from sqlalchemy .ext .asyncio import AsyncSession
1014
1115from ..database import get_db_session
12- from ..dependencies import get_logger , get_settings
16+ from ..dependencies import get_logger , get_settings , request_id_var
1317from ..models .request import HTMLParseRequest
1418from ..models .response import HTMLParseResponse
1519from ..services .html_service import HTMLService
2327
2428@router .post ('/html/parse' , response_model = HTMLParseResponse )
2529async def parse_html (
26- request : HTMLParseRequest ,
27- html_service : HTMLService = Depends (HTMLService ),
28- db_session : Optional [AsyncSession ] = Depends (get_db_session )
30+ background_tasks : BackgroundTasks ,
31+ request : HTMLParseRequest = Body (...),
32+ html_service : HTMLService = Depends (HTMLService ),
33+ db_session : Optional [AsyncSession ] = Depends (get_db_session )
2934):
3035 """Parse HTML content.
3136
3237 Accept an HTML string and return the parsed, structured content.
3338 """
34- # Generate the request ID
35- request_id = RequestLogService .generate_request_id ()
39+ # Read request_id from the context variable
40+ request_id = request_id_var .get ()
# NOTE(review): the base64 decode below runs before the `if request.html_content`
# check and outside the try block, so a missing or non-base64 html_content would
# raise here, before initial_log is called and without the failure-log path.
# It also means callers must now send base64-encoded content — confirm both are intended.
41+ decoded_bytes = base64 .b64decode (request .html_content )
42+ decoded_str = decoded_bytes .decode ('utf-8' )
# NOTE(review): html.unescape resolves character references in the whole decoded
# document before it is parsed — confirm this is wanted for raw HTML input.
43+ unescaped_html = html .unescape (decoded_str )
3644
3745 # Determine the input type
3846 if request .html_content :
@@ -43,35 +51,32 @@ async def parse_html(
4351 input_type = 'unknown'
4452
4553 # Create the request log entry
46- await RequestLogService .create_log (
# NOTE(review): time.time() is wall-clock time; time.perf_counter() is the usual
# choice when measuring an elapsed duration.
54+ start_time = time .time ()
55+ await RequestLogService .initial_log (
4756 session = db_session ,
4857 request_id = request_id ,
4958 input_type = input_type ,
50- input_html = request . html_content ,
59+ input_html = unescaped_html ,
5160 url = request .url ,
5261 )
53-
54- # Commit immediately so the "processing" status is visible in the database
55- if db_session :
56- try :
57- await db_session .commit ()
58- except Exception as commit_error :
59- logger .error (f'提交初始日志时出错: { commit_error } ' )
62+ end_time = time .time ()
63+ logger .info (f'创建日志耗时: { end_time - start_time } 秒' )
6064
6165 try :
62- logger .info (f'开始解析 HTML [request_id= { request_id } ] ,内容长度: { len (request . html_content ) if request . html_content else 0 } ' )
66+ logger .info (f'开始解析 HTML,内容长度: { len (unescaped_html ) if unescaped_html else 0 } ' )
6367
6468 result = await html_service .parse_html (
65- html_content = request . html_content ,
69+ html_content = unescaped_html ,
6670 url = request .url ,
71+ request_id = request_id ,
6772 options = request .options
6873 )
6974
70- # Mark the log entry as successful
71- await RequestLogService . update_log_success (
72- session = db_session ,
73- request_id = request_id ,
74- output_markdown = result .get ('markdown' ),
75+ # Schedule the success-log update as a background task
# NOTE(review): the background task is given only request_id and the markdown,
# not db_session — presumably log_success_bg opens its own session; verify.
76+ background_tasks . add_task (
77+ RequestLogService . log_success_bg ,
78+ request_id ,
79+ result .get ('markdown' )
7580 )
7681
7782 return HTMLParseResponse (
@@ -81,37 +86,32 @@ async def parse_html(
8186 request_id = request_id
8287 )
8388 except Exception as e :
84- logger .error (f'HTML 解析失败 [request_id={ request_id } ]: { str (e )} ' )
85-
86- # Mark the log entry as failed
87- await RequestLogService .update_log_failure (
88- session = db_session ,
89- request_id = request_id ,
90- error_message = str (e ),
89+ error_message = str (e )
90+ logger .error (f'HTML 解析失败: { error_message } ' )
91+
92+ # Schedule the failure-log update as a background task
93+ background_tasks .add_task (
94+ RequestLogService .log_failure_bg ,
95+ request_id ,
96+ error_message
9197 )
9298
93- # Commit the transaction manually so the failure log is persisted
94- if db_session :
95- try :
96- await db_session .commit ()
97- except Exception as commit_error :
98- logger .error (f'提交失败日志时出错: { commit_error } ' )
99-
100- raise HTTPException (status_code = 500 , detail = f'HTML 解析失败: { str (e )} ' )
# NOTE(review): the raw exception text is returned to the client in the 500
# detail — confirm internal error messages are safe to expose.
99+ raise HTTPException (status_code = 500 , detail = f'HTML 解析失败: { error_message } ' )
101100
102101
103102@router .post ('/html/upload' )
104103async def upload_html_file (
105- file : UploadFile = File (...),
106- html_service : HTMLService = Depends (HTMLService ),
107- db_session : Optional [AsyncSession ] = Depends (get_db_session )
104+ background_tasks : BackgroundTasks ,
105+ file : UploadFile = File (...),
106+ html_service : HTMLService = Depends (HTMLService ),
107+ db_session : Optional [AsyncSession ] = Depends (get_db_session )
108108):
109109 """Upload an HTML file for parsing.
110110
111111 Accept an uploaded HTML file, parse it, and return the result.
112112 """
113- # Generate the request ID
114- request_id = RequestLogService . generate_request_id ()
113+ # Read request_id from the context variable
114+ request_id = request_id_var . get ()
115115
116116 try :
# NOTE(review): file.filename may be None for some uploads (presumably Optional
# on UploadFile — confirm); .endswith would then raise AttributeError.
# NOTE(review): the body of this `if` is hidden by the diff hunk; if it raises
# HTTPException, the `except Exception` below would catch it and re-raise it
# as a 500 — verify.
117117 if not file .filename .endswith (('.html' , '.htm' )):
@@ -120,31 +120,26 @@ async def upload_html_file(
120120 content = await file .read ()
# NOTE(review): a non-UTF-8 upload raises here, before initial_log has run, so
# the except branch schedules log_failure_bg for a request that presumably has
# no initial log entry yet — verify how log_failure_bg handles that.
121121 html_content = content .decode ('utf-8' )
122122
123- logger .info (f'上传 HTML 文件 [request_id= { request_id } ] : { file .filename } , 大小: { len (content )} bytes' )
124-
123+ logger .info (f'上传 HTML 文件: { file .filename } , 大小: { len (content )} bytes' )
# NOTE(review): time.time() is wall-clock time; time.perf_counter() is the usual
# choice when measuring an elapsed duration.
124+ start_time = time . time ()
125125 # Create the request log entry
126- await RequestLogService .create_log (
126+ await RequestLogService .initial_log (
127127 session = db_session ,
128128 request_id = request_id ,
129129 input_type = 'file' ,
130130 input_html = html_content ,
131131 url = None ,
132132 )
133+ end_time = time .time ()
134+ logger .info (f'创建日志耗时: { end_time - start_time } 秒' )
133135
134- # Commit immediately so the "processing" status is visible in the database
135- if db_session :
136- try :
137- await db_session .commit ()
138- except Exception as commit_error :
139- logger .error (f'提交初始日志时出错: { commit_error } ' )
140-
141- result = await html_service .parse_html (html_content = html_content , url = "www.baidu.com" )
# NOTE(review): url is hard-coded to "www.baidu.com" for every uploaded file,
# while the log entry above records url=None — looks like a placeholder; confirm.
136+ result = await html_service .parse_html (html_content = html_content , url = "www.baidu.com" , request_id = request_id )
142137
143- # Mark the log entry as successful
144- await RequestLogService . update_log_success (
145- session = db_session ,
146- request_id = request_id ,
147- output_markdown = result .get ('markdown' ),
138+ # Schedule the success-log update as a background task
# NOTE(review): the background task is given only request_id and the markdown,
# not db_session — presumably log_success_bg opens its own session; verify.
139+ background_tasks . add_task (
140+ RequestLogService . log_success_bg ,
141+ request_id ,
142+ result .get ('markdown' )
148143 )
149144
150145 return HTMLParseResponse (
@@ -154,23 +149,17 @@ async def upload_html_file(
154149 request_id = request_id
155150 )
156151 except Exception as e :
157- logger .error (f'HTML 文件解析失败 [request_id={ request_id } ]: { str (e )} ' )
158-
159- # Mark the log entry as failed
160- await RequestLogService .update_log_failure (
161- session = db_session ,
162- request_id = request_id ,
163- error_message = str (e ),
152+ error_message = str (e )
153+ logger .error (f'HTML 文件解析失败: { error_message } ' )
154+
155+ # Schedule the failure-log update as a background task
156+ background_tasks .add_task (
157+ RequestLogService .log_failure_bg ,
158+ request_id ,
159+ error_message
164160 )
165161
166- # Commit the transaction manually so the failure log is persisted
167- if db_session :
168- try :
169- await db_session .commit ()
170- except Exception as commit_error :
171- logger .error (f'提交失败日志时出错: { commit_error } ' )
172-
173- raise HTTPException (status_code = 500 , detail = f'HTML 文件解析失败: { str (e )} ' )
# NOTE(review): the raw exception text is returned to the client in the 500
# detail — confirm internal error messages are safe to expose.
162+ raise HTTPException (status_code = 500 , detail = f'HTML 文件解析失败: { error_message } ' )
174163
175164
176165@router .get ('/html/status' )
0 commit comments