Skip to content

Commit d3be896

Browse files
Release v4.1.1 (#562)
Co-authored-by: linfeng <56671143+LollipopsAndWine@users.noreply.github.com>
1 parent 0e2bae1 commit d3be896

5 files changed

Lines changed: 54406 additions & 3 deletions

File tree

llm_web_kit/extractor/html/recognizer/text.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -267,10 +267,10 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
267267
return text
268268

269269
if final := __get_paragraph_text_recusive(root, ''):
270-
para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT})
270+
para_text.append({'c': final, 't': ParagraphTextType.TEXT})
271271

272272
for item in para_text:
273-
item['c'] = restore_sub_sup_from_text_regex(item['c'])
273+
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
274274
return para_text
275275

276276
def __extract_paragraphs(self, root: HtmlElement):

llm_web_kit/input/datajson.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -519,7 +519,6 @@ def __join_one_para(self, para: list, exclude_inline_types: list = []) -> str:
519519
c = el['c']
520520
if not c or not c.strip():
521521
continue
522-
c = c.strip()
523522
new_c = self.__escape_md_special_chars(c) # 转义特殊字符
524523
one_para.append(new_c)
525524
elif el['t'] == ParagraphTextType.EQUATION_INLINE:

0 commit comments

Comments (0)