Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 57 additions & 50 deletions app/parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,22 @@ void set_loglevel(std::string level)
if(level=="info")
{
loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
//set_verbosity(loguru::Verbosity_INFO);
}
else if(level=="warning")
{
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
//loguru::set_verbosity(loguru::Verbosity_WARNING);
}
else if(level=="error")
{
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
//loguru::set_verbosity(loguru::Verbosity_ERROR);
}
else if(level=="fatal")
{
loguru::g_stderr_verbosity = loguru::Verbosity_FATAL;
//loguru::set_verbosity(loguru::Verbosity_ERROR);
}
else
{
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
}
}

Expand Down Expand Up @@ -71,69 +67,93 @@ int main(int argc, char* argv[]) {
// Initialize loguru
loguru::init(argc, argv);

bool do_sanitization = true;
bool keep_shapes = true;
bool keep_bitmaps = true;

try {
cxxopts::Options options("PDFProcessor", "A program to process PDF files or configuration files");

// Define the options
options.add_options()
("i,input", "Input PDF file", cxxopts::value<std::string>())
("c,config", "Config file", cxxopts::value<std::string>())
("create-config", "Create config file", cxxopts::value<std::string>())
("p,page", "Pages to process (default: -1 for all)", cxxopts::value<int>()->default_value("-1"))
("password", "Password for accessing encrypted, password-protected files", cxxopts::value<std::string>())
("o,output", "Output file", cxxopts::value<std::string>())
("export-images", "Export images to directory", cxxopts::value<std::string>())
("print-cells", "Print cells to stdout [char, word, line, all] (default: none)", cxxopts::value<std::string>())
("keep-text", "Keep text cells in output (default: true)", cxxopts::value<bool>()->default_value("true"))
("keep-shapes", "Keep shapes in output (default: true)", cxxopts::value<bool>()->default_value("true"))
("keep-bitmaps", "Keep bitmaps in output (default: true)", cxxopts::value<bool>()->default_value("true"))
("do-sanitation", "Do text sanitation (default: true)", cxxopts::value<bool>()->default_value("true"))
("l,loglevel", "loglevel [error;warning;success;info]", cxxopts::value<std::string>())
("h,help", "Print usage");
("i,input", "Input PDF file", cxxopts::value<std::string>())
("c,config", "Config file", cxxopts::value<std::string>())
("create-config", "Create config file", cxxopts::value<std::string>())
("p,page", "Pages to process (default: -1 for all)", cxxopts::value<int>()->default_value("-1"))
("password", "Password for accessing encrypted, password-protected files", cxxopts::value<std::string>())
("o,output", "Output file", cxxopts::value<std::string>())
("export-images", "Export images to directory", cxxopts::value<std::string>())
("print-cells", "Print cells to stdout [char, word, line, all] (default: none)", cxxopts::value<std::string>())
("l,loglevel", "Log level [error, warning, info]", cxxopts::value<std::string>())
("h,help", "Print usage")

// ---- decode_config ----
("page-boundary", "Page boundary [crop_box, media_box, ...] (default: crop_box)", cxxopts::value<std::string>())
("do-sanitization", "Run post-parse sanitization (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("keep-char-cells", "Keep individual character cells (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("keep-shapes", "Keep shape items (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("keep-bitmaps", "Keep bitmap items (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("max-num-lines", "Cap on number of lines per page (-1 = no cap)", cxxopts::value<int>())
("max-num-bitmaps", "Cap on number of bitmaps per page (-1 = no cap)", cxxopts::value<int>())
("create-word-cells", "Build word-level cells (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("create-line-cells", "Build line-level cells (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("enforce-same-font", "Require same font within a word/line cell (default: true)", cxxopts::value<bool>()->implicit_value("true"))
("horizontal-cell-tolerance", "Horizontal merge tolerance (default: 1.0)", cxxopts::value<double>())
("word-space-factor", "Space-width factor for word merging (default: 0.33)", cxxopts::value<double>())
("line-space-factor", "Space-width factor for line merging (default: 1.0)", cxxopts::value<double>())
("line-space-factor-with-space", "Space-width factor for line merging with space (default: 0.33)", cxxopts::value<double>())
("keep-glyphs", "Keep unmapped GLYPH<...> tokens (default: false)", cxxopts::value<bool>()->implicit_value("true"))
("keep-qpdf-warnings", "Emit QPDF warnings (default: false)", cxxopts::value<bool>()->implicit_value("true"))
("populate-json", "Populate JSON objects during decode (default: false)", cxxopts::value<bool>()->implicit_value("true"));

// Parse command line arguments
auto result = options.parse(argc, argv);

// Check if either input or config file is provided (mandatory)
if (orig_argc == 1) {
LOG_S(INFO) << argc;
LOG_F(ERROR, "Either input (-i) or config (-c) must be specified.");
LOG_F(INFO, "%s", options.help().c_str());
return 1;
}

std::string level = "warning";
if (result.count("loglevel")){
if (result.count("loglevel")) {
level = result["loglevel"].as<std::string>();

// Convert the string to lowercase
std::transform(level.begin(), level.end(), level.begin(), [](unsigned char c) {
return std::tolower(c);
});

set_loglevel(level);
}

do_sanitization = result["do-sanitation"].as<bool>();
bool keep_text = result["keep-text"].as<bool>();
keep_shapes = result["keep-shapes"].as<bool>();
keep_bitmaps = result["keep-bitmaps"].as<bool>();
// Help option or no arguments provided
if (result.count("help")) {
LOG_F(INFO, "%s", options.help().c_str());
return 0;
}

// --- decode_config ---
pdflib::decode_config page_config;
if (result.count("page-boundary")) { page_config.page_boundary = result["page-boundary"].as<std::string>(); }
if (result.count("do-sanitization")) { page_config.do_sanitization = result["do-sanitization"].as<bool>(); }
if (result.count("keep-char-cells")) { page_config.keep_char_cells = result["keep-char-cells"].as<bool>(); }
if (result.count("keep-shapes")) { page_config.keep_shapes = result["keep-shapes"].as<bool>(); }
if (result.count("keep-bitmaps")) { page_config.keep_bitmaps = result["keep-bitmaps"].as<bool>(); }
if (result.count("max-num-lines")) { page_config.max_num_lines = result["max-num-lines"].as<int>(); }
if (result.count("max-num-bitmaps")) { page_config.max_num_bitmaps = result["max-num-bitmaps"].as<int>(); }
if (result.count("create-word-cells")) { page_config.create_word_cells = result["create-word-cells"].as<bool>(); }
if (result.count("create-line-cells")) { page_config.create_line_cells = result["create-line-cells"].as<bool>(); }
if (result.count("enforce-same-font")) { page_config.enforce_same_font = result["enforce-same-font"].as<bool>(); }
if (result.count("horizontal-cell-tolerance")){ page_config.horizontal_cell_tolerance = result["horizontal-cell-tolerance"].as<double>(); }
if (result.count("word-space-factor")) { page_config.word_space_width_factor_for_merge = result["word-space-factor"].as<double>(); }
if (result.count("line-space-factor")) { page_config.line_space_width_factor_for_merge = result["line-space-factor"].as<double>(); }
if (result.count("line-space-factor-with-space")) { page_config.line_space_width_factor_for_merge_with_space = result["line-space-factor-with-space"].as<double>(); }
if (result.count("keep-glyphs")) { page_config.keep_glyphs = result["keep-glyphs"].as<bool>(); }
if (result.count("keep-qpdf-warnings")) { page_config.keep_qpdf_warnings = result["keep-qpdf-warnings"].as<bool>(); }
if (result.count("populate-json")) { page_config.populate_json_objects = result["populate-json"].as<bool>(); }

if (result.count("config")) {
std::string config_file = result["config"].as<std::string>();
LOG_F(INFO, "Config file: %s", config_file.c_str());

pdflib::decode_config page_config;

page_config.do_sanitization = do_sanitization;
page_config.keep_char_cells = keep_text;
page_config.keep_shapes = keep_shapes;
page_config.keep_bitmaps = keep_bitmaps;

std::cout << "decode_config:\n" << page_config.to_string() << std::endl;

utils::timer timer;
Expand Down Expand Up @@ -196,12 +216,6 @@ int main(int argc, char* argv[]) {
config["password"] = result["password"].as<std::string>();
}

pdflib::decode_config page_config;
page_config.do_sanitization = do_sanitization;
page_config.keep_char_cells = keep_text;
page_config.keep_shapes = keep_shapes;
page_config.keep_bitmaps = keep_bitmaps;

std::cout << "decode_config:\n" << page_config.to_string() << std::endl;

utils::timer timer;
Expand Down Expand Up @@ -233,13 +247,6 @@ int main(int argc, char* argv[]) {
return 0;
}

// Help option or no arguments provided
if (result.count("help")) {
LOG_F(INFO, "%s", options.help().c_str());
return 0;
}

//} catch (const cxxopts::OptionException& e) {
} catch (const cxxopts::exceptions::exception& e) {
LOG_F(ERROR, "Error parsing options: %s", e.what());
return 1;
Expand Down
2 changes: 1 addition & 1 deletion app/render.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ int main(int argc, char* argv[])
("p,page", "Pages to process (default: -1 for all)", cxxopts::value<int>()->default_value("-1"))
("password", "Password for encrypted files", cxxopts::value<std::string>())
("o,output", "Output file or output directory (for -d mode)", cxxopts::value<std::string>())
("r,renderer", "Renderer type [NAIVE, BLEND2D] (default: NAIVE)", cxxopts::value<std::string>()->default_value("NAIVE"))
("r,renderer", "Renderer type [NAIVE, BLEND2D] (default: NAIVE)", cxxopts::value<std::string>()->default_value("BLEND2D"))
("l,loglevel", "Log level [error, warning, info]", cxxopts::value<std::string>())
("h,help", "Print usage")

Expand Down
9 changes: 9 additions & 0 deletions src/parse/page_item.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ namespace pdflib
PAGE_HYPERLINK,
PAGE_HYPERLINKS
};

// "/FT": "/Tx" | "/Btn" | "/Ch" | "/Sig"
// Kind of interactive form field a widget represents. Each enumerator
// (except UNDEFINED) corresponds to one value of the field's `/FT` entry.
enum widget_name {
  TEXT_FIELD = 0, // `/FT` is `/Tx`
  BUTTON     = 1, // `/FT` is `/Btn`
  CHOICE     = 2, // `/FT` is `/Ch`
  SIGNATURE  = 3, // `/FT` is `/Sig`
  UNDEFINED  = 4  // none of the above
};

template<item_name name>
class page_item
Expand Down
3 changes: 3 additions & 0 deletions src/parse/page_items/page_widget.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ namespace pdflib

public:

widget_name name;

// Bounding box (in page coordinates)
double x0;
double y0;
Expand All @@ -34,6 +36,7 @@ namespace pdflib
};

page_item<PAGE_WIDGET>::page_item():
name(UNDEFINED),
x0(0), y0(0), x1(0), y1(0),
text(),
description(),
Expand Down
67 changes: 64 additions & 3 deletions src/parse/page_items/render_instructions.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ namespace pdflib
// Tags identifying the kind of a render instruction.
enum RENDER_INSTRUCTION_NAME {
  // defines the size of the canvas that is rendered onto
  SIZE_INSTRUCTION = 0,

  // draws text on the canvas
  TEXT_RENDER_INSTRUCTION = 1,

  // draws a fillable-field widget (bounding box + value text)
  TEXT_WIDGET_RENDER_INSTRUCTION = 2,

  // pastes a bitmap image onto the canvas
  BITMAP_RENDER_INSTRUCTION = 3,

  // draws shapes (lines, shapes, filling, etc)
  SHAPE_RENDER_INSTRUCTION = 4,
};
Expand Down Expand Up @@ -135,6 +136,50 @@ namespace pdflib
const double base_y0;
};

// Render instruction for a form-field widget: holds the text to draw plus
// two groups of coordinates. Every member is const and only read accessors
// are exposed, so an instance never changes after construction.
//
// NOTE(review): x0/y0/x1/y1 look like an axis-aligned bounding box and
// r_x0..r_y3 like the four corner points of a quadrilateral -- confirm
// against the code that builds these instructions.
class text_widget_instruction
{
public:

  // Tag identifying this instruction kind.
  static const RENDER_INSTRUCTION_NAME instr = TEXT_WIDGET_RENDER_INSTRUCTION;

  // Stores the text (moved in) and copies all twelve coordinates verbatim.
  text_widget_instruction(std::string text,
                          double x0, double y0,
                          double x1, double y1,
                          double r_x0, double r_y0,
                          double r_x1, double r_y1,
                          double r_x2, double r_y2,
                          double r_x3, double r_y3)
    : text_{std::move(text)}
    , x0_{x0}
    , y0_{y0}
    , x1_{x1}
    , y1_{y1}
    , r_x0_{r_x0}
    , r_y0_{r_y0}
    , r_x1_{r_x1}
    , r_y1_{r_y1}
    , r_x2_{r_x2}
    , r_y2_{r_y2}
    , r_x3_{r_x3}
    , r_y3_{r_y3}
  {
  }

  // Text carried by the instruction.
  const std::string& get_text() const
  {
    return text_;
  }

  // First coordinate group (x0, y0) - (x1, y1).
  double get_x0() const { return x0_; }
  double get_y0() const { return y0_; }
  double get_x1() const { return x1_; }
  double get_y1() const { return y1_; }

  // Second coordinate group: four (x, y) pairs, indices 0..3.
  double get_r_x0() const { return r_x0_; }
  double get_r_y0() const { return r_y0_; }
  double get_r_x1() const { return r_x1_; }
  double get_r_y1() const { return r_y1_; }
  double get_r_x2() const { return r_x2_; }
  double get_r_y2() const { return r_y2_; }
  double get_r_x3() const { return r_x3_; }
  double get_r_y3() const { return r_y3_; }

private:

  const std::string text_;

  const double x0_;
  const double y0_;
  const double x1_;
  const double y1_;

  const double r_x0_;
  const double r_y0_;
  const double r_x1_;
  const double r_y1_;
  const double r_x2_;
  const double r_y2_;
  const double r_x3_;
  const double r_y3_;
};

class bitmap_instruction
{
public:
Expand Down Expand Up @@ -239,6 +284,7 @@ namespace pdflib
typedef instruction instruction_type;

typedef text_instruction text_instruction_type;
typedef text_widget_instruction text_widget_instruction_type;
typedef bitmap_instruction bitmap_instruction_type;
typedef shape_instruction shape_instruction_type;

Expand All @@ -251,6 +297,7 @@ namespace pdflib

void add_size_instruction(size_instruction& instr);
void add_text_instruction(text_instruction_type instr);
void add_widget_instruction(text_widget_instruction_type instr);
void add_bitmap_instruction(bitmap_instruction_type instr);
void add_shape_instruction(shape_instruction_type instr);

Expand All @@ -264,9 +311,10 @@ namespace pdflib

std::vector<instruction_type> instructions;

std::vector<text_instruction_type> text_instructions;
std::vector<bitmap_instruction_type> bitmap_instructions;
std::vector<shape_instruction_type> shape_instructions;
std::vector<text_instruction_type> text_instructions;
std::vector<text_widget_instruction_type> widget_instructions;
std::vector<bitmap_instruction_type> bitmap_instructions;
std::vector<shape_instruction_type> shape_instructions;

};

Expand All @@ -291,6 +339,12 @@ namespace pdflib
text_instructions.push_back(std::move(instr));
}

// Appends a widget instruction. An entry tagged
// TEXT_WIDGET_RENDER_INSTRUCTION is added to `instructions`, carrying the
// position at which `instr` is then stored in `widget_instructions`.
inline void pdf_render_instructions::add_widget_instruction(text_widget_instruction instr)
{
  // Position the new element will occupy once it is appended below.
  const auto widget_index = widget_instructions.size();

  instructions.emplace_back(TEXT_WIDGET_RENDER_INSTRUCTION, widget_index);
  widget_instructions.push_back(std::move(instr));
}

inline void pdf_render_instructions::add_bitmap_instruction(bitmap_instruction instr)
{
instructions.emplace_back(BITMAP_RENDER_INSTRUCTION, bitmap_instructions.size());
Expand Down Expand Up @@ -319,6 +373,13 @@ namespace pdflib
}
break;

case TEXT_WIDGET_RENDER_INSTRUCTION:
{
auto& widget_instr = widget_instructions.at(instr.index);
renderer.render_widget(widget_instr);
}
break;

case BITMAP_RENDER_INSTRUCTION:
{
auto& bmap_instr = bitmap_instructions.at(instr.index);
Expand Down
Loading
Loading