-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchapter_extractor.py
More file actions
78 lines (61 loc) · 2.67 KB
/
chapter_extractor.py
File metadata and controls
78 lines (61 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from typing import List, Dict
from logger_config import get_logger
import re
logger = get_logger(__name__)
def detect_all_chapters(pages: List[str], skip_pages: int = 5) -> Dict[str, tuple]:
    """
    Auto-detect all chapters in the document.

    A page is treated as the start of a chapter when, within the first 1000
    characters of its stripped text, some line begins with
    ``chapter <digits>`` (case-insensitive, leading whitespace allowed).
    The chapter title is the text of that heading line.

    Args:
        pages: Page texts in document order.
        skip_pages: Number of leading pages excluded from detection.

    Returns:
        A dict mapping chapter titles to ``(start_idx, end_idx)`` tuples, in
        document order. Both indices refer to ``pages[skip_pages:]`` (not to
        the list that was passed in) and ``end_idx`` is exclusive: it is the
        start of the next chapter, or the number of remaining pages for the
        last chapter. Returns an empty dict when no heading is found.

    Note:
        Titles are the dict keys, so a heading text that shows up again on a
        later page (for example a running header) is treated as part of the
        chapter that was already recorded, not as a new chapter.
    """
    logger.info(f"Auto-detecting chapters from {len(pages)} pages (skipping first {skip_pages})")
    pages = pages[skip_pages:]

    # Group 1 spans the heading itself ("chapter <n>") so its start offset
    # is available without the leading whitespace consumed by ``\s*``.
    chapter_pattern = re.compile(
        r'^\s*(chapter\s+\d+)',
        re.IGNORECASE | re.MULTILINE
    )

    # title -> index of the FIRST page on which the heading appears.
    chapter_starts: Dict[str, int] = {}
    for i, page in enumerate(pages):
        page_trimmed = page.strip()
        match = chapter_pattern.search(page_trimmed[:1000])
        if not match:
            continue

        # Take the title from the line the regex actually matched, rather
        # than from whichever early line happens to start with "chapter".
        # Offsets from the 1000-char slice are valid in the full text, so
        # the title is not cut off at the search window boundary.
        heading_start = match.start(1)
        line_end = page_trimmed.find('\n', match.end())
        if line_end == -1:
            line_end = len(page_trimmed)
        raw_title = page_trimmed[heading_start:line_end]
        if '\n' in raw_title:
            # ``\s+`` let the heading wrap onto the next line; join the
            # pieces into a single-line title.
            chapter_title = " ".join(raw_title.split())
        else:
            chapter_title = raw_title.strip()

        # Keep the first page only: overwriting here would move the start
        # of a chapter to the last page that repeats its heading.
        if chapter_title in chapter_starts:
            continue
        chapter_starts[chapter_title] = i
        logger.info(f"Found chapter at page {i}: '{chapter_title}'")

    # Pages are scanned in order and entries are never overwritten, so dict
    # insertion order is already ascending start-page order.
    chapter_list = list(chapter_starts.items())
    result = {}
    for pos, (title, start_idx) in enumerate(chapter_list):
        if pos + 1 < len(chapter_list):
            end_idx = chapter_list[pos + 1][1]
        else:
            end_idx = len(pages)
        result[title] = (start_idx, end_idx)
        logger.info(f"Chapter '{title}': pages {start_idx}-{end_idx}")
    return result
def extract_chapter_by_number(
    pages: List[str],
    chapter_number: int,
    skip_pages: int = 5
) -> tuple[str, str]:
    """
    Extract one chapter, selected by its 1-indexed position.

    ``chapter_number`` is the position of the chapter among the detected
    chapters ordered by starting page; it is not compared against the
    number written in the heading text.

    Args:
        pages: Page texts in document order.
        chapter_number: 1-indexed position of the wanted chapter.
        skip_pages: Number of leading pages excluded from detection.

    Returns:
        ``(chapter_title, chapter_text)``, where the text is the chapter's
        pages joined with newlines.

    Raises:
        ValueError: If no chapters are detected, or ``chapter_number`` is
            outside ``1..number of detected chapters``.
    """
    detected = detect_all_chapters(pages, skip_pages)
    if not detected:
        raise ValueError("No chapters detected in document")

    total = len(detected)
    if chapter_number < 1 or chapter_number > total:
        raise ValueError(f"Invalid chapter number. Found {total} chapters.")

    # Order the (title, (start, end)) entries by starting page, then pick
    # the requested position.
    by_start = sorted(detected.items(), key=lambda entry: entry[1][0])
    chapter_title, (first, last) = by_start[chapter_number - 1]

    # Detected indices are relative to the pages left after skipping.
    remaining = pages[skip_pages:]
    selected = remaining[first:last]
    chapter_text = "\n".join(selected)

    logger.info(f"Extracted chapter {chapter_number}: '{chapter_title}' ({len(selected)} pages)")
    return chapter_title, chapter_text
def list_all_chapters(pages: List[str], skip_pages: int = 5) -> List[str]:
    """Return the detected chapter titles, ordered by starting page."""
    detected = detect_all_chapters(pages, skip_pages)
    # Each value is a (start_idx, end_idx) tuple; order by the start index.
    by_start = sorted(detected.items(), key=lambda entry: entry[1][0])
    return [title for title, _ in by_start]