Skip to content

Commit ea34ccb

Browse files
authored
Merge pull request #1 from DavidLMS/page-selection
Page selection
2 parents 1365557 + 8320ab5 commit ea34ccb

17 files changed

Lines changed: 2841 additions & 2311 deletions

.env.example

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ DEFAULT_OLLAMA_SUMMARY_MODEL="mistral-small3.1"
1515
# Common Configuration
1616
DEFAULT_LANGUAGE="Spanish"
1717
DEFAULT_USE_MARKITDOWN="true"
18-
DEFAULT_USE_SUMMARY="false"
18+
DEFAULT_USE_SUMMARY="false"
19+
DEFAULT_PAGE_SELECTION=""

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ DescribePDF is an open-source tool designed to convert PDF files into detailed p
2424
<a href="https://github.com/DavidLMS/DescribePDF/issues/new?assignees=&labels=bug&projects=&template=bug_report.md&title=%5BBUG%5D">Report Bug</a>
2525
·
2626
<a href="https://github.com/DavidLMS/DescribePDF/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.md&title=%5BREQUEST%5D">Request Feature</a>
27-
·
27+
·
2828
<a href="https://github.com/DavidLMS/DescribePDF/wiki">Wiki</a>
2929
</p>
3030

@@ -56,6 +56,7 @@ DescribePDF is an open-source tool designed to convert PDF files into detailed p
5656

5757
- 📄 **Comprehensive Page Analysis** - Detailed descriptions of each page's visual and textual content
5858
- 🔍 **Context-Aware Descriptions** - Generates descriptions that understand the document's overall structure and purpose
59+
- 📑 **Selective Page Processing** - Process only specific pages or ranges for targeted analysis and faster results
5960
- 🌐 **Multilingual Support** - Generate descriptions in multiple languages
6061
- 📊 **Enhanced Extraction with Markitdown** - Optional integration with Markitdown for better text extraction
6162
- ☁️ **Cloud Model Support** - Compatible with powerful VLMs through OpenRouter
@@ -217,6 +218,7 @@ DEFAULT_OLLAMA_SUMMARY_MODEL="mistral-small3.1"
217218
DEFAULT_LANGUAGE="English"
218219
DEFAULT_USE_MARKITDOWN="true"
219220
DEFAULT_USE_SUMMARY="false"
221+
DEFAULT_PAGE_SELECTION=""
220222
```
221223

222224
## Usage
@@ -238,6 +240,9 @@ describepdf document.pdf -o result.md
238240
# Change the output language
239241
describepdf document.pdf -l Spanish
240242

243+
# Process only specific pages
244+
describepdf document.pdf --pages "1,3,5-10,15"
245+
241246
# Use Markitdown and summary generation
242247
describepdf document.pdf --use-markitdown --use-summary
243248

@@ -270,6 +275,7 @@ optional arguments:
270275
VLM model to use
271276
-l LANGUAGE, --language LANGUAGE
272277
Output language
278+
--pages PAGES Pages to process (e.g. '1,3,5-10,15')
273279
--use-markitdown Use Markitdown for enhanced text extraction
274280
--use-summary Generate and use a PDF summary
275281
--summary-model SUMMARY_MODEL
@@ -371,4 +377,4 @@ DescribePDF is released under the [MIT License](https://github.com/DavidLMS/Desc
371377

372378
## Contributing
373379

374-
Contributions to DescribePDF are welcome! Whether you're improving the code, enhancing the documentation, or suggesting new features, your input is valuable. Please check out the [CONTRIBUTING.md](https://github.com/DavidLMS/DescribePDF/blob/main/CONTRIBUTING.md) file for guidelines on how to get started and make your contributions count.
380+
Contributions to DescribePDF are welcome! Whether you're improving the code, enhancing the documentation, or suggesting new features, your input is valuable. Please check out the [CONTRIBUTING.md](https://github.com/DavidLMS/DescribePDF/blob/main/CONTRIBUTING.md) file for guidelines on how to get started and make your contributions count.

describepdf/cli.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ def setup_cli_parser() -> argparse.ArgumentParser:
6161
help="VLM model to use (default: configured in .env)"
6262
)
6363

64+
parser.add_argument(
65+
"--pages",
66+
help="Pages to process (e.g. '1,3,5-10,15'). Default: all pages."
67+
)
6468
parser.add_argument(
6569
"-l", "--language",
6670
help="Output language (default: configured in .env)"
@@ -157,6 +161,7 @@ def run_cli() -> None:
157161
"output_language": args.language if args.language else env_config.get("output_language"),
158162
"use_markitdown": args.use_markitdown if args.use_markitdown is not None else env_config.get("use_markitdown"),
159163
"use_summary": args.use_summary if args.use_summary is not None else env_config.get("use_summary"),
164+
"page_selection": args.pages if args.pages else env_config.get("page_selection")
160165
}
161166

162167
# Configure provider-specific settings
@@ -219,6 +224,11 @@ def run_cli() -> None:
219224
logger.info(f"Summary: {'Yes' if run_config['use_summary'] else 'No'}")
220225
if run_config.get('use_summary') and run_config.get('summary_llm_model'):
221226
logger.info(f"Summary model: {run_config['summary_llm_model']}")
227+
228+
if run_config.get('page_selection'):
229+
logger.info(f"Page selection: {run_config['page_selection']}")
230+
else:
231+
logger.info("Page selection: All pages")
222232

223233
# Create progress callback
224234
progress_callback = create_progress_callback()

describepdf/config.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
SCRIPT_DIR = pathlib.Path(__file__).parent.parent.absolute()
1919
PROMPTS_DIR = pathlib.Path(SCRIPT_DIR) / "prompts"
2020

21+
# Default configuration values
22+
2123
# Default configuration values
2224
DEFAULT_CONFIG: Dict[str, Any] = {
2325
"openrouter_api_key": None,
@@ -30,7 +32,8 @@
3032

3133
"output_language": "English",
3234
"use_markitdown": False,
33-
"use_summary": False
35+
"use_summary": False,
36+
"page_selection": None
3437
}
3538

3639
# Mapping of prompt template identifiers to their file names
@@ -90,6 +93,9 @@ def load_env_config() -> Dict[str, Any]:
9093

9194
if os.getenv("DEFAULT_USE_SUMMARY"):
9295
loaded_config["use_summary"] = str(os.getenv("DEFAULT_USE_SUMMARY")).lower() == 'true'
96+
97+
if os.getenv("DEFAULT_PAGE_SELECTION"):
98+
loaded_config["page_selection"] = os.getenv("DEFAULT_PAGE_SELECTION")
9399

94100
logger.info("Configuration loaded from environment variables.")
95101

describepdf/core.py

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,87 @@ class ConversionError(Exception):
2424
"""Error raised during PDF conversion process."""
2525
pass
2626

27-
def format_markdown_output(descriptions: List[str], original_filename: str) -> str:
27+
def parse_page_selection(selection_string: Optional[str], total_pages: int) -> List[int]:
    """
    Parse a page selection string into a list of page indices.

    The selection is a comma-separated list of sections. Each section is
    either a single 1-based page number ("3") or an inclusive 1-based range
    ("5-10"). Every section is validated on its own: a section that is
    malformed or out of bounds is skipped with a warning and the remaining
    sections are still honored. If no section yields a valid page, all pages
    are returned.

    Args:
        selection_string: String with page selection (e.g. "1,3,5-10,15").
            None or an empty string selects every page.
        total_pages: Total number of pages in the document

    Returns:
        List[int]: Sorted, de-duplicated list of zero-based page indices to process
    """
    all_pages = list(range(total_pages))

    if not selection_string:
        # Return all pages if selection is empty
        return all_pages

    selected = set()

    for raw_section in selection_string.split(','):
        section = raw_section.strip()
        if not section:
            continue

        # Split on the first '-' only, so "1-2-3" fails int() below instead
        # of being silently truncated to "1-2".
        start_text, dash, end_text = section.partition('-')

        try:
            start_idx = int(start_text.strip()) - 1  # Convert to 0-based index
            end_idx = int(end_text.strip()) - 1 if dash else start_idx
        except ValueError:
            # Skip only the offending section; one typo must not discard the
            # valid sections and trigger processing of the whole document.
            logger.warning(f"Invalid page selection section: '{section}'. Skipping it.")
            continue

        if dash:
            # Validate range
            if start_idx < 0 or end_idx >= total_pages or start_idx > end_idx:
                logger.warning(f"Invalid page range: {section}. Must be between 1 and {total_pages}.")
                continue
        elif start_idx < 0 or start_idx >= total_pages:
            # Validate page number
            logger.warning(f"Invalid page number: {section}. Must be between 1 and {total_pages}.")
            continue

        selected.update(range(start_idx, end_idx + 1))

    if not selected:
        logger.warning("No valid pages specified. Processing all pages.")
        return all_pages

    # The set removed duplicates; return them in ascending order
    return sorted(selected)
86+
87+
def format_markdown_output(descriptions: List[str], original_filename: str, page_numbers: Optional[List[int]] = None) -> str:
    """
    Combine page descriptions into a single Markdown file.

    Args:
        descriptions: List of strings, each being a description of a page
        original_filename: Name of the original PDF file
        page_numbers: List of actual page numbers corresponding to descriptions (1-based).
            When omitted or empty, pages are numbered sequentially from 1. If it is
            shorter than descriptions, the descriptions without a matching entry fall
            back to their sequential position instead of raising IndexError.

    Returns:
        str: Complete Markdown content
    """
    # Collect fragments and join once rather than growing a string with +=
    parts = [f"# Description of PDF: {original_filename}\n\n"]

    for i, desc in enumerate(descriptions):
        # Use actual page number if provided, otherwise use sequential numbering
        if page_numbers and i < len(page_numbers):
            page_num = page_numbers[i]
        else:
            page_num = i + 1

        parts.append(f"## Page {page_num}\n\n")
        parts.append(desc if desc else "*No description generated for this page.*")
        parts.append("\n\n---\n\n")

    return "".join(parts)
44109

45110
def convert_pdf_to_markdown(
@@ -175,7 +240,17 @@ def convert_pdf_to_markdown(
175240
page_processing_progress_start = pdf_load_progress
176241
total_page_progress_ratio = (0.98 - page_processing_progress_start) if total_pages > 0 else 0
177242

178-
for i, page in enumerate(pages):
243+
# Parse page selection
244+
page_selection = cfg.get("page_selection")
245+
selected_indices = parse_page_selection(page_selection, total_pages)
246+
247+
if page_selection:
248+
logger.info(f"Processing {len(selected_indices)} selected pages out of {total_pages} total pages.")
249+
else:
250+
logger.info(f"Processing all {total_pages} pages.")
251+
252+
for i in selected_indices:
253+
page = pages[i]
179254
page_num = i + 1
180255
current_page_ratio = (page_num / total_pages) if total_pages > 0 else 1.0
181256

@@ -306,7 +381,10 @@ def convert_pdf_to_markdown(
306381
# Generate final markdown
307382
final_progress = 0.99
308383
progress_callback(final_progress, "Combining page descriptions into final Markdown...")
309-
final_markdown = format_markdown_output(all_descriptions, original_filename)
384+
385+
actual_page_numbers = [i + 1 for i in selected_indices] if 'selected_indices' in locals() else None
386+
387+
final_markdown = format_markdown_output(all_descriptions, original_filename, actual_page_numbers)
310388
logger.info("Final Markdown content assembled.")
311389

312390
# Report completion

describepdf/ui.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def convert_pdf_to_descriptive_markdown(
2929
ui_use_md: bool,
3030
ui_use_sum: bool,
3131
ui_sum_model: str,
32+
ui_page_selection: str,
3233
progress: gr.Progress = gr.Progress(track_tqdm=True)
3334
) -> Tuple[str, gr.update, Optional[str]]:
3435
"""
@@ -47,6 +48,7 @@ def convert_pdf_to_descriptive_markdown(
4748
ui_use_md: Whether to use Markitdown for enhanced text extraction
4849
ui_use_sum: Whether to generate a document summary for context
4950
ui_sum_model: Summary model name from UI (e.g., google/gemini-2.5-flash-preview)
51+
ui_page_selection: Optional page selection string (e.g., "1,3,5-10")
5052
progress: Gradio progress tracker
5153
5254
Returns:
@@ -72,7 +74,8 @@ def convert_pdf_to_descriptive_markdown(
7274
"output_language": ui_lang,
7375
"use_markitdown": ui_use_md,
7476
"use_summary": ui_use_sum,
75-
"summary_llm_model": ui_sum_model if ui_sum_model else env_config.get("or_summary_model")
77+
"summary_llm_model": ui_sum_model if ui_sum_model else env_config.get("or_summary_model"),
78+
"page_selection": ui_page_selection.strip() if ui_page_selection.strip() else None
7679
}
7780

7881
# Validate API key
@@ -243,6 +246,12 @@ def create_ui() -> gr.Blocks:
243246
allow_custom_value=True,
244247
info="Select or type the desired output language (e.g., English, Spanish)"
245248
)
249+
page_selection_input = gr.Textbox(
250+
label="Page Selection (Optional)",
251+
value="",
252+
placeholder="Example: 1,3,5-10,15 (leave empty for all pages)",
253+
info="Specify individual pages or ranges to process"
254+
)
246255
with gr.Row():
247256
use_markitdown_checkbox = gr.Checkbox(
248257
label="Use Markitdown for extra text context",
@@ -263,7 +272,7 @@ def create_ui() -> gr.Blocks:
263272
# Connect UI components
264273
conversion_inputs = [
265274
pdf_input, api_key_input, vlm_model_input, output_language_input,
266-
use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input
275+
use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input, page_selection_input
267276
]
268277
conversion_outputs = [
269278
progress_output, download_button, markdown_output

describepdf/ui_ollama.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def convert_pdf_to_descriptive_markdown(
3030
ui_use_md: bool,
3131
ui_use_sum: bool,
3232
ui_sum_model: str,
33+
ui_page_selection: str,
3334
progress: gr.Progress = gr.Progress(track_tqdm=True)
3435
) -> Tuple[str, gr.update, Optional[str]]:
3536
"""
@@ -52,6 +53,7 @@ def convert_pdf_to_descriptive_markdown(
5253
ui_use_md: Whether to use Markitdown for enhanced text extraction
5354
ui_use_sum: Whether to generate a document summary for context
5455
ui_sum_model: Summary model name from UI (e.g., qwen2.5)
56+
ui_page_selection: Optional page selection string (e.g., "1,3,5-10")
5557
progress: Gradio progress tracker
5658
5759
Returns:
@@ -78,7 +80,8 @@ def convert_pdf_to_descriptive_markdown(
7880
"output_language": ui_lang,
7981
"use_markitdown": ui_use_md,
8082
"use_summary": ui_use_sum,
81-
"summary_llm_model": ui_sum_model
83+
"summary_llm_model": ui_sum_model,
84+
"page_selection": ui_page_selection.strip() if ui_page_selection.strip() else None
8285
}
8386

8487
# Create progress callback for Gradio
@@ -232,6 +235,12 @@ def create_ui() -> gr.Blocks:
232235
allow_custom_value=True,
233236
info="Select or type the desired output language (e.g., English, Spanish)"
234237
)
238+
page_selection_input = gr.Textbox(
239+
label="Page Selection (Optional)",
240+
value="",
241+
placeholder="Example: 1,3,5-10,15 (leave empty for all pages)",
242+
info="Specify individual pages or ranges to process"
243+
)
235244
with gr.Row():
236245
use_markitdown_checkbox = gr.Checkbox(
237246
label="Use Markitdown for extra text context",
@@ -248,11 +257,10 @@ def create_ui() -> gr.Blocks:
248257
allow_custom_value=True,
249258
info="Select or type the Ollama LLM model name for summaries"
250259
)
251-
252260
# Connect UI components
253261
conversion_inputs = [
254262
pdf_input, ollama_endpoint_input, vlm_model_input, output_language_input,
255-
use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input
263+
use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input, page_selection_input
256264
]
257265
conversion_outputs = [
258266
progress_output, download_button, markdown_output

0 commit comments

Comments
 (0)