From efef259dd0be24b957f2ef3f004fc516b4eedebf Mon Sep 17 00:00:00 2001 From: David Regla Date: Mon, 9 Feb 2026 03:00:47 +0000 Subject: [PATCH 01/10] =?UTF-8?q?feat:=20a=C3=B1adir=20binario=20ejecutabl?= =?UTF-8?q?e=20scraper.php=20para=20uso=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/scraper.php | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100755 bin/scraper.php diff --git a/bin/scraper.php b/bin/scraper.php new file mode 100755 index 0000000..61f1e61 --- /dev/null +++ b/bin/scraper.php @@ -0,0 +1,66 @@ +#!/usr/bin/env php +" . PHP_EOL; + echo " Opciones:" . PHP_EOL; + echo " archivo.pdf : Ruta al archivo PDF local" . PHP_EOL; + echo " url : URL pública al archivo PDF" . PHP_EOL; + echo " --json-schema : Muestra el esquema JSON de la salida" . PHP_EOL; + echo " -h, --help : Muestra este mensaje de ayuda" . PHP_EOL; + echo PHP_EOL; + echo "Ejemplos:" . PHP_EOL; + echo " php bin/scraper.php constancia.pdf | jq" . PHP_EOL; + echo " php bin/scraper.php https://dominio.com/mi_constancia.pdf | jq" . PHP_EOL; + echo " php bin/scraper.php --json-schema | jq" . PHP_EOL; +} + +if ($argc < 2 || in_array($argv[1], ['-h', '--help'])) { + showUsage(); + exit(0); +} + +if ($argv[1] === '--json-schema') { + echo file_get_contents(__DIR__ . 
'/../docs/schemas/csf.schema.json'); + exit(0); +} + +$input = $argv[1]; +$isUrl = filter_var($input, FILTER_VALIDATE_URL); +$tempFile = null; + +try { + if ($isUrl) { + $tempFile = tempnam(sys_get_temp_dir(), 'csf_'); + $content = file_get_contents($input); + if ($content === false) { + throw new Exception("No se pudo descargar el archivo desde la URL: $input"); + } + file_put_contents($tempFile, $content); + $path = $tempFile; + } else { + $path = $input; + if (!file_exists($path)) { + throw new Exception("El archivo no existe en la ruta: $path"); + } + } + + $scraper = Scraper::create(); + $person = $scraper->obtainFromPdfPath($path); + + echo json_encode($person, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . PHP_EOL; +} catch (Exception $e) { + fwrite(STDERR, "Error: " . $e->getMessage() . PHP_EOL); + exit(1); +} finally { + if ($tempFile && file_exists($tempFile)) { + unlink($tempFile); + } +} From 3e81fa79f425ed8f93d4a0e3ece1c5c916ca0441 Mon Sep 17 00:00:00 2001 From: David Regla Date: Mon, 9 Feb 2026 03:00:57 +0000 Subject: [PATCH 02/10] =?UTF-8?q?feat:=20a=C3=B1adir=20Dockerfile=20y=20so?= =?UTF-8?q?porte=20para=20contenedores?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 21 +++++++++++++++++ Dockerfile | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 34 ++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..688ca65 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,21 @@ +.git +.github +.phive +build +docs/CHANGELOG.md +docs/TODO.md +docs/SEMVER.md +tests +vendor +.php-cs-fixer.dist.php +composer.lock +phpcs.xml.dist +phpstan.neon.dist +phpunit.xml.dist +sonar-project.properties +Dockerfile +.dockerignore +README.md +LICENSE +CONTRIBUTING.md +CODE_OF_CONDUCT.md diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 
0000000..311e302 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,62 @@ +# Stage 1: Build dependencies +FROM composer:2 AS builder + +WORKDIR /app + +# Copy composer files for better caching +COPY composer.json ./ + +# Install dependencies +RUN composer install \ + --no-dev \ + --no-interaction \ + --no-plugins \ + --no-scripts \ + --prefer-dist \ + --optimize-autoloader + +# Stage 2: Production image +FROM php:8.2-alpine + +LABEL maintainer="PhpCfdi" +LABEL description="Producción image for csf-scraper library" + +# Install system dependencies and PHP extensions +RUN apk add --no-cache \ + poppler-utils \ + libxml2-dev \ + oniguruma-dev \ + libcurl \ + curl-dev + +RUN docker-php-ext-install \ + curl \ + mbstring \ + opcache \ + dom + +# Configure PHP for production +RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini" + +# Create a non-root user +RUN addgroup -S appgroup && adduser -S appuser -G appgroup + +WORKDIR /app + +# Copy code and vendor from builder +COPY --from=builder /app/vendor /app/vendor +COPY ./src /app/src +COPY ./bin /app/bin +COPY ./docs /app/docs +COPY ./composer.json /app/composer.json + +# Adjust permissions +RUN chown -R appuser:appgroup /app + +# Switch to non-root user +USER appuser + +# Essential for the library (pdftotext is part of poppler-utils) +ENV PATH="/usr/bin:${PATH}" + +ENTRYPOINT ["php", "/app/bin/scraper.php"] diff --git a/README.md b/README.md index e5e7a37..8f8b6fa 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,40 @@ Ejemplo de salida de `json_encode($person)` para persona física: } ``` +### Uso con Docker + +Esta librería incluye un `Dockerfile` optimizado que funciona como una herramienta de línea de comandos (CLI). La imagen devuelve el resultado en formato **JSON** a través de la salida estándar (stdout), facilitando su integración con otras herramientas como `jq`. + +#### Construir la imagen + +```bash +docker build -t csf-scraper:latest . 
+``` + +#### Ejecutar extracción + +Puedes pasar una ruta local (montada como volumen) o una URL pública: + +**Desde un archivo local:** +```shell +docker run --rm -v $(pwd)/mi-archivo.pdf:/csf.pdf phpcfdi/csf-scraper /csf.pdf | jq +``` + +**Desde una URL:** +```shell +docker run --rm phpcfdi/csf-scraper https://dominio.com/archivo.pdf | jq +``` + +**Obtener el esquema JSON de salida:** +```shell +docker run --rm phpcfdi/csf-scraper --json-schema | jq +``` + +**Ver ayuda:** +```shell +docker run --rm phpcfdi/csf-scraper --help +``` + ## Soporte Puedes obtener soporte abriendo un ticket en Github. From 1b725de3b6504d72f8ec9d25e206aa8bbd4ab2c5 Mon Sep 17 00:00:00 2001 From: David Regla Date: Mon, 9 Feb 2026 03:01:03 +0000 Subject: [PATCH 03/10] =?UTF-8?q?docs:=20a=C3=B1adir=20esquema=20JSON=20pa?= =?UTF-8?q?ra=20la=20estructura=20de=20datos=20CSF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/schemas/csf.schema.json | 155 +++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 docs/schemas/csf.schema.json diff --git a/docs/schemas/csf.schema.json b/docs/schemas/csf.schema.json new file mode 100644 index 0000000..0cec6a4 --- /dev/null +++ b/docs/schemas/csf.schema.json @@ -0,0 +1,155 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://raw.githubusercontent.com/phpcfdi/csf-scraper/main/docs/schemas/csf.schema.json", + "title": "Persona CSF Data", + "description": "Datos fiscales extraídos de una Constancia de Situación Fiscal del SAT (Física o Moral)", + "type": "object", + "oneOf": [ + { + "title": "Persona Física", + "required": [ + "rfc", + "id_cif", + "curp", + "nombre", + "apellido_paterno", + "situacion_contribuyente", + "regimenes" + ], + "properties": { + "curp": { + "type": "string" + }, + "nombre": { + "type": "string" + }, + "apellido_paterno": { + "type": "string" + }, + "apellido_materno": { + "type": "string" + }, + "fecha_nacimiento": { + 
"$ref": "#/definitions/dateTime" + } + } + }, + { + "title": "Persona Moral", + "required": [ + "rfc", + "id_cif", + "razon_social", + "regimen_de_capital", + "situacion_contribuyente", + "regimenes" + ], + "properties": { + "razon_social": { + "type": "string" + }, + "regimen_de_capital": { + "type": "string" + }, + "fecha_constitucion": { + "$ref": "#/definitions/dateTime" + } + } + } + ], + "properties": { + "rfc": { + "type": "string", + "pattern": "^[A-Z&Ñ]{3,4}[0-9]{2}(0[1-9]|1[012])(0[1-9]|[12][0-9]|3[01])[A-Z0-9]{2}[0-9A]$" + }, + "id_cif": { + "type": "string" + }, + "fecha_inicio_operaciones": { + "$ref": "#/definitions/dateTime" + }, + "situacion_contribuyente": { + "type": "string" + }, + "fecha_ultimo_cambio_situacion": { + "$ref": "#/definitions/dateTime" + }, + "entidad_federativa": { + "type": "string" + }, + "municipio_delegacion": { + "type": "string" + }, + "colonia": { + "type": "string" + }, + "tipo_vialidad": { + "type": "string" + }, + "nombre_vialidad": { + "type": "string" + }, + "numero_exterior": { + "type": "string" + }, + "numero_interior": { + "type": "string" + }, + "codigo_postal": { + "type": "string" + }, + "correo_electronico": { + "type": "string" + }, + "al": { + "type": "string" + }, + "regimenes": { + "type": "array", + "items": { + "type": "object", + "required": [ + "regimen", + "regimen_id" + ], + "properties": { + "regimen": { + "type": "string" + }, + "regimen_id": { + "type": "string" + }, + "fecha_alta": { + "$ref": "#/definitions/dateTime" + } + } + } + }, + "extra_data": { + "type": "object", + "additionalProperties": true + } + }, + "definitions": { + "dateTime": { + "type": "object", + "required": [ + "date", + "timezone_type", + "timezone" + ], + "properties": { + "date": { + "type": "string", + "format": "date-time" + }, + "timezone_type": { + "type": "integer" + }, + "timezone": { + "type": "string" + } + } + } + } +} \ No newline at end of file From 7734eb286ab5228c4d550a994ebdf2f1da0bf8f7 Mon Sep 17 00:00:00 
2001 From: David Regla Date: Mon, 9 Feb 2026 03:01:10 +0000 Subject: [PATCH 04/10] =?UTF-8?q?ci:=20a=C3=B1adir=20workflow=20de=20Docke?= =?UTF-8?q?r=20para=20disparos=20por=20release=20y=20Docker=20Hub?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/docker.yml | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/workflows/docker.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000..bf8b2e5 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,54 @@ +name: Docker + +on: + release: + branches: [ "main" ] + types: [ "published" ] + tags: [ "v*" ] + + +# Actions +# docker/setup-buildx-action@v3 https://github.com/marketplace/actions/docker-setup-buildx +# docker/login-action@v3 https://github.com/marketplace/actions/docker-login +# docker/metadata-action@v5 https://github.com/marketplace/actions/docker-metadata-action +# docker/build-push-action@v6 https://github.com/marketplace/actions/build-and-push-docker-images + +jobs: + docker: + name: Docker image + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set application version + run: sed -i "s#@box_git_version@#${{ github.ref_name }}#" bin/* + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ github.repository }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max From 5a3cbb66653b407ae91f661741eee916835d0c68 Mon Sep 17 00:00:00 2001 From: David Regla Date: Mon, 9 Feb 2026 03:16:59 +0000 Subject: [PATCH 05/10] docs: simplificar comando de ejemplo --- README.md | 6 +++--- bin/scraper.php | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8f8b6fa..c2f5a40 100644 --- a/README.md +++ b/README.md @@ -179,17 +179,17 @@ Puedes pasar una ruta local (montada como volumen) o una URL pública: **Desde un archivo local:** ```shell -docker run --rm -v $(pwd)/mi-archivo.pdf:/csf.pdf phpcfdi/csf-scraper /csf.pdf | jq +docker run --rm -v $(pwd)/mi-archivo.pdf:/csf.pdf phpcfdi/csf-scraper /csf.pdf ``` **Desde una URL:** ```shell -docker run --rm phpcfdi/csf-scraper https://dominio.com/archivo.pdf | jq +docker run --rm phpcfdi/csf-scraper https://dominio.com/archivo.pdf ``` **Obtener el esquema JSON de salida:** ```shell -docker run --rm phpcfdi/csf-scraper --json-schema | jq +docker run --rm phpcfdi/csf-scraper --json-schema ``` **Ver ayuda:** diff --git a/bin/scraper.php b/bin/scraper.php index 61f1e61..e240136 100755 --- a/bin/scraper.php +++ b/bin/scraper.php @@ -17,9 +17,9 @@ function showUsage(): void echo " -h, --help : Muestra este mensaje de ayuda" . PHP_EOL; echo PHP_EOL; echo "Ejemplos:" . PHP_EOL; - echo " php bin/scraper.php constancia.pdf | jq" . PHP_EOL; - echo " php bin/scraper.php https://dominio.com/mi_constancia.pdf | jq" . PHP_EOL; - echo " php bin/scraper.php --json-schema | jq" . PHP_EOL; + echo " php bin/scraper.php constancia.pdf" . PHP_EOL; + echo " php bin/scraper.php https://dominio.com/mi_constancia.pdf" . PHP_EOL; + echo " php bin/scraper.php --json-schema" . 
PHP_EOL; } if ($argc < 2 || in_array($argv[1], ['-h', '--help'])) { From 155f4fdd1e01c200a33698725fa43569e0a80092 Mon Sep 17 00:00:00 2001 From: David Regla Date: Mon, 9 Feb 2026 03:19:02 +0000 Subject: [PATCH 06/10] =?UTF-8?q?feat:=20a=C3=B1adir=20soporte=20para=20le?= =?UTF-8?q?ctura=20de=20PDF=20desde=20stdin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +++++ bin/scraper.php | 14 +++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c2f5a40..72579c4 100644 --- a/README.md +++ b/README.md @@ -192,6 +192,11 @@ docker run --rm phpcfdi/csf-scraper https://dominio.com/archivo.pdf docker run --rm phpcfdi/csf-scraper --json-schema ``` +**Desde la entrada estándar (stdin):** +```shell +cat mi-archivo.pdf | docker run --rm -i phpcfdi/csf-scraper - +``` + **Ver ayuda:** ```shell docker run --rm phpcfdi/csf-scraper --help diff --git a/bin/scraper.php b/bin/scraper.php index e240136..578e00a 100755 --- a/bin/scraper.php +++ b/bin/scraper.php @@ -9,16 +9,18 @@ function showUsage(): void { - echo "Uso: php bin/scraper.php " . PHP_EOL; + echo "Uso: php bin/scraper.php " . PHP_EOL; echo " Opciones:" . PHP_EOL; echo " archivo.pdf : Ruta al archivo PDF local" . PHP_EOL; echo " url : URL pública al archivo PDF" . PHP_EOL; + echo " - : Leer el PDF desde la entrada estándar (stdin)" . PHP_EOL; echo " --json-schema : Muestra el esquema JSON de la salida" . PHP_EOL; echo " -h, --help : Muestra este mensaje de ayuda" . PHP_EOL; echo PHP_EOL; echo "Ejemplos:" . PHP_EOL; echo " php bin/scraper.php constancia.pdf" . PHP_EOL; echo " php bin/scraper.php https://dominio.com/mi_constancia.pdf" . PHP_EOL; + echo " cat constancia.pdf | php bin/scraper.php -" . PHP_EOL; echo " php bin/scraper.php --json-schema" . 
PHP_EOL; } @@ -34,14 +36,16 @@ function showUsage(): void $input = $argv[1]; $isUrl = filter_var($input, FILTER_VALIDATE_URL); +$isStdin = $input === '-'; $tempFile = null; try { - if ($isUrl) { + if ($isUrl || $isStdin) { $tempFile = tempnam(sys_get_temp_dir(), 'csf_'); - $content = file_get_contents($input); - if ($content === false) { - throw new Exception("No se pudo descargar el archivo desde la URL: $input"); + $content = $isStdin ? file_get_contents('php://stdin') : file_get_contents($input); + if ($content === false || $content === '') { + $source = $isStdin ? 'stdin' : "la URL: $input"; + throw new Exception("No se pudo obtener el contenido desde $source"); } file_put_contents($tempFile, $content); $path = $tempFile; From 4e3c0c641253f217814a1a7cca5e3315463a6604 Mon Sep 17 00:00:00 2001 From: David Regla Date: Tue, 10 Feb 2026 04:48:32 +0000 Subject: [PATCH 07/10] =?UTF-8?q?fix:=20a=C3=B1adir=20flag=20-nopgbrk=20a?= =?UTF-8?q?=20pdftotext=20para=20compatibilidad=20con=20Poppler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/PdfReader/PdfToText.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PdfReader/PdfToText.php b/src/PdfReader/PdfToText.php index debc891..239535b 100644 --- a/src/PdfReader/PdfToText.php +++ b/src/PdfReader/PdfToText.php @@ -46,6 +46,6 @@ public function extract(string $path): string /** @return list */ public function buildCommand(string $pdfFile): array { - return [$this->pdftotext, '-eol', 'unix', '-raw', '-q', $pdfFile, '-']; + return [$this->pdftotext, '-eol', 'unix', '-nopgbrk', '-raw', '-q', $pdfFile, '-']; } } From 2cd58be102ee4793c2d0fb7b1d9ff5d61c4ca42c Mon Sep 17 00:00:00 2001 From: David Regla Date: Tue, 10 Feb 2026 04:48:42 +0000 Subject: [PATCH 08/10] feat: renombrar binario a csf-scraper y unificar comando obtain --- Dockerfile | 2 +- README.md | 23 ++++++-- bin/csf-scraper | 105 ++++++++++++++++++++++++++++++++++ bin/scraper.php | 70 
----------------------- tests/Integration/CliTest.php | 74 ++++++++++++++++++++++++ 5 files changed, 197 insertions(+), 77 deletions(-) create mode 100755 bin/csf-scraper delete mode 100755 bin/scraper.php create mode 100644 tests/Integration/CliTest.php diff --git a/Dockerfile b/Dockerfile index 311e302..2e5a189 100644 --- a/Dockerfile +++ b/Dockerfile @@ -59,4 +59,4 @@ USER appuser # Essential for the library (pdftotext is part of poppler-utils) ENV PATH="/usr/bin:${PATH}" -ENTRYPOINT ["php", "/app/bin/scraper.php"] +ENTRYPOINT ["php", "/app/bin/csf-scraper"] diff --git a/README.md b/README.md index 72579c4..8beda89 100644 --- a/README.md +++ b/README.md @@ -175,31 +175,42 @@ docker build -t csf-scraper:latest . #### Ejecutar extracción -Puedes pasar una ruta local (montada como volumen) o una URL pública: +Puedes pasar una ruta local, una URL pública o directamente el ID CIF y RFC: + +**Mediante ID CIF y RFC (Flujo principal):** + +```shell +docker run --rm phpcfdi/csf-scraper obtain 19040141021 DIM8701081LA +``` **Desde un archivo local:** + ```shell -docker run --rm -v $(pwd)/mi-archivo.pdf:/csf.pdf phpcfdi/csf-scraper /csf.pdf +docker run --rm -v $(pwd)/mi-archivo.pdf:/csf.pdf phpcfdi/csf-scraper obtain /csf.pdf ``` **Desde una URL:** + ```shell -docker run --rm phpcfdi/csf-scraper https://dominio.com/archivo.pdf +docker run --rm phpcfdi/csf-scraper obtain https://dominio.com/archivo.pdf ``` **Obtener el esquema JSON de salida:** + ```shell -docker run --rm phpcfdi/csf-scraper --json-schema +docker run --rm phpcfdi/csf-scraper schema ``` **Desde la entrada estándar (stdin):** + ```shell -cat mi-archivo.pdf | docker run --rm -i phpcfdi/csf-scraper - +cat mi-archivo.pdf | docker run --rm -i phpcfdi/csf-scraper obtain - ``` **Ver ayuda:** + ```shell -docker run --rm phpcfdi/csf-scraper --help +docker run --rm phpcfdi/csf-scraper help ``` ## Soporte diff --git a/bin/csf-scraper b/bin/csf-scraper new file mode 100755 index 0000000..ddd9460 --- /dev/null +++ 
b/bin/csf-scraper @@ -0,0 +1,105 @@ +#!/usr/bin/env php + [argumentos]" . PHP_EOL; + echo PHP_EOL; + echo 'Comandos:' . PHP_EOL; + echo ' obtain - Obtener datos por ID CIF y RFC' . PHP_EOL; + echo ' obtain - Obtener datos desde un archivo PDF local, URL o stdin con "-"' . PHP_EOL; + echo ' schema - Muestra el esquema JSON de la salida' . PHP_EOL; + echo ' help - Muestra este mensaje de ayuda' . PHP_EOL; + echo PHP_EOL; + echo 'Ejemplos:' . PHP_EOL; + echo " $binary obtain 14111045399 UNA2907227Y5" . PHP_EOL; + echo " $binary obtain ./constancia.pdf" . PHP_EOL; + echo " $binary obtain https://siafweb.ib.unam.mx/CSF.pdf" . PHP_EOL; + echo " cat ./constancia.pdf | $binary obtain -" . PHP_EOL; + echo " $binary schema" . PHP_EOL; +} + +foreach ($argv as $arg) { + if (in_array($arg, ['-h', '--help', 'help'])) { + showUsage(); + exit(0); + } +} + +if ($argc < 2) { + showUsage(); + exit(0); +} + +$command = $argv[1]; + +// Manejo de comandos explícitos +if ('schema' === $command || '--json-schema' === $command) { + echo file_get_contents(__DIR__ . '/../docs/schemas/csf.schema.json'); + exit(0); +} + +$scraper = Scraper::create(); + +try { + if ('obtain' === $command) { + if (3 === $argc) { + // Un solo argumento después de obtain: asumimos ruta/url/stdin + $person = obtainFromSource($argv[2], $scraper); + } elseif (4 === $argc) { + // Dos argumentos después de obtain: asumimos id-cif y rfc + $idCif = $argv[2]; + $rfcValue = $argv[3]; + $person = $scraper->obtainFromRfcAndCif(Rfc::parse($rfcValue), $idCif); + } else { + throw new Exception('El comando obtain requiere [id-cif rfc] o [archivo|url|-]'); + } + } else { + throw new Exception("Comando '$command' no reconocido. Usa help para ver la lista de comandos."); + } + + echo json_encode($person, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . PHP_EOL; +} catch (Exception $e) { + fwrite(STDERR, 'Error: ' . $e->getMessage() . 
PHP_EOL); + exit(1); +} + +function obtainFromSource(string $input, Scraper $scraper): PhpCfdi\CsfScraper\PersonaMoral|PhpCfdi\CsfScraper\PersonaFisica +{ + $isUrl = filter_var($input, FILTER_VALIDATE_URL); + $isStdin = '-' === $input; + $tempFile = null; + + try { + if ($isUrl || $isStdin) { + $tempFile = tempnam(sys_get_temp_dir(), 'csf_'); + $content = $isStdin ? file_get_contents('php://stdin') : file_get_contents($input); + if (false === $content || '' === $content) { + $source = $isStdin ? 'stdin' : "la URL: $input"; + throw new Exception("No se pudo obtener el contenido desde $source"); + } + file_put_contents($tempFile, $content); + $path = $tempFile; + } else { + $path = $input; + if (! file_exists($path)) { + throw new Exception("El archivo no existe en la ruta: $path"); + } + } + + return $scraper->obtainFromPdfPath($path); + } finally { + if ($tempFile && file_exists($tempFile)) { + unlink($tempFile); + } + } +} diff --git a/bin/scraper.php b/bin/scraper.php deleted file mode 100755 index 578e00a..0000000 --- a/bin/scraper.php +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env php -" . PHP_EOL; - echo " Opciones:" . PHP_EOL; - echo " archivo.pdf : Ruta al archivo PDF local" . PHP_EOL; - echo " url : URL pública al archivo PDF" . PHP_EOL; - echo " - : Leer el PDF desde la entrada estándar (stdin)" . PHP_EOL; - echo " --json-schema : Muestra el esquema JSON de la salida" . PHP_EOL; - echo " -h, --help : Muestra este mensaje de ayuda" . PHP_EOL; - echo PHP_EOL; - echo "Ejemplos:" . PHP_EOL; - echo " php bin/scraper.php constancia.pdf" . PHP_EOL; - echo " php bin/scraper.php https://dominio.com/mi_constancia.pdf" . PHP_EOL; - echo " cat constancia.pdf | php bin/scraper.php -" . PHP_EOL; - echo " php bin/scraper.php --json-schema" . PHP_EOL; -} - -if ($argc < 2 || in_array($argv[1], ['-h', '--help'])) { - showUsage(); - exit(0); -} - -if ($argv[1] === '--json-schema') { - echo file_get_contents(__DIR__ . 
'/../docs/schemas/csf.schema.json'); - exit(0); -} - -$input = $argv[1]; -$isUrl = filter_var($input, FILTER_VALIDATE_URL); -$isStdin = $input === '-'; -$tempFile = null; - -try { - if ($isUrl || $isStdin) { - $tempFile = tempnam(sys_get_temp_dir(), 'csf_'); - $content = $isStdin ? file_get_contents('php://stdin') : file_get_contents($input); - if ($content === false || $content === '') { - $source = $isStdin ? 'stdin' : "la URL: $input"; - throw new Exception("No se pudo obtener el contenido desde $source"); - } - file_put_contents($tempFile, $content); - $path = $tempFile; - } else { - $path = $input; - if (!file_exists($path)) { - throw new Exception("El archivo no existe en la ruta: $path"); - } - } - - $scraper = Scraper::create(); - $person = $scraper->obtainFromPdfPath($path); - - echo json_encode($person, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . PHP_EOL; -} catch (Exception $e) { - fwrite(STDERR, "Error: " . $e->getMessage() . PHP_EOL); - exit(1); -} finally { - if ($tempFile && file_exists($tempFile)) { - unlink($tempFile); - } -} diff --git a/tests/Integration/CliTest.php b/tests/Integration/CliTest.php new file mode 100644 index 0000000..091a729 --- /dev/null +++ b/tests/Integration/CliTest.php @@ -0,0 +1,74 @@ +run(); + return $process; + } + + public function test_command_help(): void + { + $process = $this->runCli(['help']); + + $this->assertTrue($process->isSuccessful()); + $this->assertStringContainsString('Uso:', $process->getOutput()); + $this->assertStringContainsString('obtain ', $process->getOutput()); + } + + public function test_command_help_legacy(): void + { + $process = $this->runCli(['--help']); + + $this->assertTrue($process->isSuccessful()); + $this->assertStringContainsString('Uso:', $process->getOutput()); + } + + public function test_command_help_in_any_position(): void + { + $process = $this->runCli(['obtain', '--help']); + + $this->assertTrue($process->isSuccessful()); + $this->assertStringContainsString('Uso:', 
$process->getOutput()); + } + + public function test_command_schema(): void + { + $process = $this->runCli(['schema']); + + $this->assertTrue($process->isSuccessful()); + $this->assertJson($process->getOutput()); + $this->assertStringContainsString('csf.schema.json', $process->getOutput()); + } + + public function test_obtain_missing_arguments(): void + { + $process = $this->runCli(['obtain']); + + $this->assertFalse($process->isSuccessful()); + $this->assertStringContainsString('Error: El comando obtain requiere [id-cif rfc] o [archivo|url|-]', $process->getErrorOutput()); + } + + public function test_obtain_local_file(): void + { + $path = $this->filePath('csf-without-cif.pdf'); + $process = $this->runCli(['obtain', $path]); + + // Expect failure because it doesn't have CIF ID, but it confirms the file was read and processed + $this->assertFalse($process->isSuccessful()); + $this->assertStringContainsString('Cannot obtain cif from given PDF', $process->getErrorOutput()); + } +} From e37930a37cae9b376d430e5f480528484ab93f21 Mon Sep 17 00:00:00 2001 From: David Regla Date: Tue, 10 Feb 2026 05:00:35 +0000 Subject: [PATCH 09/10] =?UTF-8?q?feat:=20a=C3=B1adir=20timeout=20configura?= =?UTF-8?q?ble=20para=20peticiones=20HTTP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 10 ++++++++++ bin/csf-scraper | 20 ++++++++++++++++++-- src/Scraper.php | 3 ++- tests/Integration/CliTest.php | 10 ++++++++++ tests/Unit/ScraperTest.php | 9 +++++++++ 5 files changed, 49 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8beda89..1625652 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,16 @@ cat mi-archivo.pdf | docker run --rm -i phpcfdi/csf-scraper obtain - docker run --rm phpcfdi/csf-scraper help ``` +### Opciones Globales + +* `--timeout `: Configura el tiempo máximo de espera para las peticiones HTTP al SAT (por defecto: 10 segundos). 
+ +Ejemplo con timeout personalizado: + +```shell +docker run --rm phpcfdi/csf-scraper obtain 14111045399 UNA2907227Y5 --timeout 20 +``` + ## Soporte Puedes obtener soporte abriendo un ticket en Github. diff --git a/bin/csf-scraper b/bin/csf-scraper index ddd9460..4dbcb33 100755 --- a/bin/csf-scraper +++ b/bin/csf-scraper @@ -20,14 +20,30 @@ function showUsage(): void echo ' schema - Muestra el esquema JSON de la salida' . PHP_EOL; echo ' help - Muestra este mensaje de ayuda' . PHP_EOL; echo PHP_EOL; + echo 'Opciones:' . PHP_EOL; + echo ' --timeout - Tiempo máximo de espera para peticiones HTTP (default: 10)' . PHP_EOL; + echo PHP_EOL; echo 'Ejemplos:' . PHP_EOL; - echo " $binary obtain 14111045399 UNA2907227Y5" . PHP_EOL; + echo " $binary obtain 14111045399 UNA2907227Y5 --timeout 20" . PHP_EOL; echo " $binary obtain ./constancia.pdf" . PHP_EOL; echo " $binary obtain https://siafweb.ib.unam.mx/CSF.pdf" . PHP_EOL; echo " cat ./constancia.pdf | $binary obtain -" . PHP_EOL; echo " $binary schema" . 
PHP_EOL; } +$timeout = 10.0; +$clean_argv = []; +for ($i = 0; $i < $argc; $i++) { + if ('--timeout' === $argv[$i] && isset($argv[$i + 1])) { + $timeout = (float) $argv[$i + 1]; + $i++; + continue; + } + $clean_argv[] = $argv[$i]; +} +$argv = $clean_argv; +$argc = count($argv); + foreach ($argv as $arg) { if (in_array($arg, ['-h', '--help', 'help'])) { showUsage(); @@ -48,7 +64,7 @@ if ('schema' === $command || '--json-schema' === $command) { exit(0); } -$scraper = Scraper::create(); +$scraper = Scraper::create($timeout); try { if ('obtain' === $command) { diff --git a/src/Scraper.php b/src/Scraper.php index 8d28be3..5b8e5c9 100644 --- a/src/Scraper.php +++ b/src/Scraper.php @@ -34,10 +34,11 @@ public function getClient(): ClientInterface /** * Factory method to create a scraper object with configuration that simply works */ - public static function create(): self + public static function create(float $timeout = 10.0): self { return new self(new Client([ 'curl' => [CURLOPT_SSL_CIPHER_LIST => 'DEFAULT@SECLEVEL=1'], + 'timeout' => $timeout, ])); } diff --git a/tests/Integration/CliTest.php b/tests/Integration/CliTest.php index 091a729..59590ca 100644 --- a/tests/Integration/CliTest.php +++ b/tests/Integration/CliTest.php @@ -45,6 +45,16 @@ public function test_command_help_in_any_position(): void $this->assertStringContainsString('Uso:', $process->getOutput()); } + public function test_obtain_with_timeout(): void + { + $process = $this->runCli(['obtain', '12345678', 'RFC010101AAA', '--timeout', '5']); + + // We expect it to fail because the RFC/CIF are fake, but it should NOT fail because of the option + $this->assertFalse($process->isSuccessful()); + $this->assertStringContainsString('Error:', $process->getErrorOutput()); + $this->assertStringNotContainsString('no reconocido', $process->getErrorOutput()); + } + public function test_command_schema(): void { $process = $this->runCli(['schema']); diff --git a/tests/Unit/ScraperTest.php b/tests/Unit/ScraperTest.php index 
eda7abb..fa4bf80 100644 --- a/tests/Unit/ScraperTest.php +++ b/tests/Unit/ScraperTest.php @@ -26,5 +26,14 @@ public function test_create_with_specific_openssl_cipher_list(): void [CURLOPT_SSL_CIPHER_LIST => 'DEFAULT@SECLEVEL=1'], $client->getConfig('curl'), ); + $this->assertSame(10.0, $client->getConfig('timeout')); + } + + public function test_create_with_timeout(): void + { + $scraper = Scraper::create(5.0); + $client = $scraper->getClient(); + + $this->assertSame(5.0, $client->getConfig('timeout')); } } From 301328664b27ca7b412ec043bb718fad2c9cb033 Mon Sep 17 00:00:00 2001 From: David Regla Date: Tue, 10 Feb 2026 08:27:16 +0000 Subject: [PATCH 10/10] chore: Definir herramienta CLI como vendor binary --- composer.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/composer.json b/composer.json index 3479612..94a43ad 100644 --- a/composer.json +++ b/composer.json @@ -90,5 +90,8 @@ "dev:coverage": "DEV: run phpunit with xdebug and storage coverage in build/coverage/html/", "dev:fix-style": "DEV: fix code style errors using composer-normalize, php-cs-fixer and phpcbf", "dev:test": "DEV: run @dev:check-style, phpunit and phpstan" - } + }, + "bin": [ + "bin/csf-scraper" + ] }