Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Paths excluded from the Docker build context.
# The Dockerfile only copies composer.json, src, bin and docs into the image,
# so none of the entries below are needed to build it.

# Repository and tooling directories
.git
.github
.phive
build
# Individual documents under docs/ (the rest of docs/ is still sent to the build)
docs/CHANGELOG.md
docs/TODO.md
docs/SEMVER.md
# Tests and locally installed dependencies (vendor is rebuilt inside the image)
tests
vendor
# Development tool configuration and the lock file
.php-cs-fixer.dist.php
composer.lock
phpcs.xml.dist
phpstan.neon.dist
phpunit.xml.dist
sonar-project.properties
# Docker files themselves and top-level project documents
Dockerfile
.dockerignore
README.md
LICENSE
CONTRIBUTING.md
CODE_OF_CONDUCT.md
54 changes: 54 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: Docker

# Build and publish the Docker image when a release is published.
# The `release` event only supports the `types` activity filter; `branches`
# and `tags` filters belong to push/pull_request events, so the "v*" tag
# restriction is enforced with a job-level condition instead.
on:
  release:
    types: [ "published" ]


# Actions
# docker/setup-buildx-action@v3 https://github.com/marketplace/actions/docker-setup-buildx
# docker/login-action@v3 https://github.com/marketplace/actions/docker-login
# docker/metadata-action@v5 https://github.com/marketplace/actions/docker-metadata-action
# docker/build-push-action@v6 https://github.com/marketplace/actions/build-and-push-docker-images

jobs:
  docker:
    name: Docker image
    runs-on: ubuntu-latest
    # Only releases whose tag starts with "v" produce an image.
    if: startsWith(github.ref_name, 'v')

    steps:
      - name: Checkout
        uses: actions/checkout@v6

      # The tag name is passed through an environment variable rather than
      # being interpolated into the script, so it is never parsed as shell code.
      - name: Set application version
        env:
          APP_VERSION: ${{ github.ref_name }}
        run: sed -i "s#@box_git_version@#${APP_VERSION}#" bin/*

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # Derives image tags (X.Y.Z and X.Y) and labels from the release tag.
      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ github.repository }}
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}

      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
62 changes: 62 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Stage 1: install production Composer dependencies
FROM composer:2 AS builder

WORKDIR /app

# Only the manifest is copied, so this layer is rebuilt only when it changes
COPY composer.json ./

# Install dependencies (no dev packages, no plugins/scripts executed)
RUN composer install \
    --no-dev \
    --no-interaction \
    --no-plugins \
    --no-scripts \
    --prefer-dist \
    --optimize-autoloader

# Stage 2: Production image
FROM php:8.2-alpine

LABEL maintainer="PhpCfdi"
LABEL description="Production image for csf-scraper library"

# Runtime system dependency: poppler-utils provides pdftotext
RUN apk add --no-cache poppler-utils

# curl, mbstring and dom are already compiled into the official php image;
# installing them again with docker-php-ext-install loads each module twice
# ("Module already loaded" warnings) and requires -dev packages in the final
# image. Only opcache has to be added.
RUN docker-php-ext-install opcache

# Configure PHP for production
RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini"

# Create a non-root user
RUN addgroup -S appgroup && adduser -S appuser -G appgroup

WORKDIR /app

# Copy code and vendor from builder
COPY --from=builder /app/vendor /app/vendor
COPY ./src /app/src
COPY ./bin /app/bin
COPY ./docs /app/docs
COPY ./composer.json /app/composer.json

# Adjust permissions
RUN chown -R appuser:appgroup /app

# Switch to non-root user
USER appuser

# Essential for the library (pdftotext is part of poppler-utils)
ENV PATH="/usr/bin:${PATH}"

ENTRYPOINT ["php", "/app/bin/csf-scraper"]
60 changes: 60 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,66 @@ Ejemplo de salida de `json_encode($person)` para persona física:
}
```

### Uso con Docker

Esta librería incluye un `Dockerfile` para construir una imagen que funciona como una herramienta de línea de comandos (CLI). El contenedor devuelve el resultado en formato **JSON** a través de la salida estándar (stdout), lo que facilita su integración con otras herramientas como `jq`.

#### Construir la imagen

```bash
docker build -t phpcfdi/csf-scraper:latest .
```

#### Ejecutar extracción

Puedes pasar una ruta local, una URL pública o directamente el ID CIF y RFC:

**Mediante ID CIF y RFC (Flujo principal):**

```shell
docker run --rm phpcfdi/csf-scraper obtain 19040141021 DIM8701081LA
```

**Desde un archivo local:**

```shell
docker run --rm -v "$(pwd)/mi-archivo.pdf:/csf.pdf" phpcfdi/csf-scraper obtain /csf.pdf
```

**Desde una URL:**

```shell
docker run --rm phpcfdi/csf-scraper obtain https://dominio.com/archivo.pdf
```

**Obtener el esquema JSON de salida:**

```shell
docker run --rm phpcfdi/csf-scraper schema
```

**Desde la entrada estándar (stdin):**

```shell
cat mi-archivo.pdf | docker run --rm -i phpcfdi/csf-scraper obtain -
```

**Ver ayuda:**

```shell
docker run --rm phpcfdi/csf-scraper help
```

### Opciones Globales

* `--timeout <segundos>`: Configura el tiempo máximo de espera para las peticiones HTTP al SAT (por defecto: 10 segundos).

Ejemplo con timeout personalizado:

```shell
docker run --rm phpcfdi/csf-scraper obtain 14111045399 UNA2907227Y5 --timeout 20
```

## Soporte

Puedes obtener soporte abriendo un ticket en GitHub.
Expand Down
121 changes: 121 additions & 0 deletions bin/csf-scraper
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env php
<?php

declare(strict_types=1);

require __DIR__ . '/../vendor/autoload.php';

use PhpCfdi\CsfScraper\Scraper;
use PhpCfdi\Rfc\Rfc;

/**
 * Print the command-line help text to standard output.
 *
 * The program name shown in the text is the base name of $_SERVER['PHP_SELF'].
 * Every line of the help text is terminated with PHP_EOL.
 */
function showUsage(): void
{
    $program = basename($_SERVER['PHP_SELF']);

    // One entry per output line; empty strings are blank separator lines.
    $helpLines = [
        "Uso: $program <comando> [argumentos]",
        '',
        'Comandos:',
        ' obtain <id-cif> <rfc> - Obtener datos por ID CIF y RFC',
        ' obtain <archivo|url|-> - Obtener datos desde un archivo PDF local, URL o stdin con "-"',
        ' schema - Muestra el esquema JSON de la salida',
        ' help - Muestra este mensaje de ayuda',
        '',
        'Opciones:',
        ' --timeout <segundos> - Tiempo máximo de espera para peticiones HTTP (default: 10)',
        '',
        'Ejemplos:',
        " $program obtain 14111045399 UNA2907227Y5 --timeout 20",
        " $program obtain ./constancia.pdf",
        " $program obtain https://siafweb.ib.unam.mx/CSF.pdf",
        " cat ./constancia.pdf | $program obtain -",
        " $program schema",
    ];

    foreach ($helpLines as $helpLine) {
        echo $helpLine . PHP_EOL;
    }
}

// ---------------------------------------------------------------------------
// Script entry point.
//
// 1. Extracts the global "--timeout <seconds>" option from the arguments.
// 2. Handles the help and schema commands, which do not need a scraper.
// 3. Runs the "obtain" command and prints the result as JSON on stdout.
//
// Errors are written to stderr and reported with exit code 1.
// ---------------------------------------------------------------------------

// Default HTTP timeout in seconds; overridden by --timeout.
$timeout = 10.0;
$clean_argv = [];
for ($i = 0; $i < $argc; $i++) {
    if ('--timeout' !== $argv[$i]) {
        $clean_argv[] = $argv[$i];
        continue;
    }

    // Reject a missing or invalid value instead of silently casting it to 0.0
    // or leaving "--timeout" among the positional arguments.
    $rawTimeout = $argv[$i + 1] ?? null;
    if (
        null === $rawTimeout
        || ! is_numeric($rawTimeout)
        || ! is_finite((float) $rawTimeout)
        || (float) $rawTimeout <= 0
    ) {
        fwrite(STDERR, 'Error: La opción --timeout requiere un número de segundos mayor que cero' . PHP_EOL);
        exit(1);
    }

    $timeout = (float) $rawTimeout;
    $i++; // skip the value that was just consumed
}
$argv = $clean_argv;
$argc = count($argv);

// Any help flag among the arguments (script name excluded) shows the usage text.
foreach (array_slice($argv, 1) as $arg) {
    if (in_array($arg, ['-h', '--help', 'help'], true)) {
        showUsage();
        exit(0);
    }
}

// No command given: show the usage text.
if ($argc < 2) {
    showUsage();
    exit(0);
}

$command = $argv[1];

// Explicit commands that do not require the scraper
if ('schema' === $command || '--json-schema' === $command) {
    $schema = file_get_contents(__DIR__ . '/../docs/schemas/csf.schema.json');
    if (false === $schema) {
        // Do not report success when the schema could not be read.
        fwrite(STDERR, 'Error: No se pudo leer el esquema JSON' . PHP_EOL);
        exit(1);
    }
    echo $schema;
    exit(0);
}

try {
    // Created inside the try block so a failure is reported like any other error.
    $scraper = Scraper::create($timeout);

    if ('obtain' === $command) {
        if (3 === $argc) {
            // A single argument after obtain: treat it as path/url/stdin
            $person = obtainFromSource($argv[2], $scraper);
        } elseif (4 === $argc) {
            // Two arguments after obtain: treat them as id-cif and rfc
            $idCif = $argv[2];
            $rfcValue = $argv[3];
            $person = $scraper->obtainFromRfcAndCif(Rfc::parse($rfcValue), $idCif);
        } else {
            throw new Exception('El comando obtain requiere [id-cif rfc] o [archivo|url|-]');
        }
    } else {
        throw new Exception("Comando '$command' no reconocido. Usa help para ver la lista de comandos.");
    }

    // JSON_THROW_ON_ERROR turns an encoding failure into an exception instead
    // of printing an empty line and exiting with success.
    echo json_encode($person, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR) . PHP_EOL;
} catch (Throwable $e) {
    // Throwable also covers Error/TypeError, which Exception would let escape.
    fwrite(STDERR, 'Error: ' . $e->getMessage() . PHP_EOL);
    exit(1);
}

/**
 * Obtain the person data from a PDF given as a local path, a URL or stdin.
 *
 * - "-" reads the PDF content from standard input.
 * - A value accepted by FILTER_VALIDATE_URL is downloaded with file_get_contents().
 * - Anything else is treated as a path to a local file.
 *
 * Content read from stdin or a URL is stored in a temporary file, which is
 * always removed before the function returns or throws.
 *
 * @param string $input Local path, URL, or "-" for stdin.
 * @param Scraper $scraper Scraper used to parse the PDF file.
 *
 * @return PhpCfdi\CsfScraper\PersonaMoral|PhpCfdi\CsfScraper\PersonaFisica
 *
 * @throws Exception When the source cannot be read, is empty, is not a readable
 *                   regular file, or the temporary file cannot be created or written.
 */
function obtainFromSource(string $input, Scraper $scraper): PhpCfdi\CsfScraper\PersonaMoral|PhpCfdi\CsfScraper\PersonaFisica
{
    $isStdin = '-' === $input;
    $isUrl = ! $isStdin && false !== filter_var($input, FILTER_VALIDATE_URL);

    // Local file: validate it and hand the path straight to the scraper.
    if (! $isStdin && ! $isUrl) {
        if (! file_exists($input)) {
            throw new Exception("El archivo no existe en la ruta: $input");
        }
        // file_exists() is also true for directories and unreadable files.
        if (! is_file($input) || ! is_readable($input)) {
            throw new Exception("La ruta no es un archivo legible: $input");
        }

        return $scraper->obtainFromPdfPath($input);
    }

    // Remote or piped content: read it first so that no temporary file is
    // created when there is nothing to store.
    $content = file_get_contents($isStdin ? 'php://stdin' : $input);
    if (false === $content || '' === $content) {
        $source = $isStdin ? 'stdin' : "la URL: $input";
        throw new Exception("No se pudo obtener el contenido desde $source");
    }

    // tempnam() returns false on failure; passing that on would raise a TypeError.
    $tempFile = tempnam(sys_get_temp_dir(), 'csf_');
    if (false === $tempFile) {
        throw new Exception('No se pudo crear un archivo temporal');
    }

    try {
        if (false === file_put_contents($tempFile, $content)) {
            throw new Exception('No se pudo escribir el archivo temporal');
        }

        return $scraper->obtainFromPdfPath($tempFile);
    } finally {
        // Always clean up, whether parsing succeeded or threw.
        if (file_exists($tempFile)) {
            unlink($tempFile);
        }
    }
}
5 changes: 4 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,8 @@
"dev:coverage": "DEV: run phpunit with xdebug and storage coverage in build/coverage/html/",
"dev:fix-style": "DEV: fix code style errors using composer-normalize, php-cs-fixer and phpcbf",
"dev:test": "DEV: run @dev:check-style, phpunit and phpstan"
}
},
"bin": [
"bin/csf-scraper"
]
}
Loading