diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..688ca65 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,21 @@ +.git +.github +.phive +build +docs/CHANGELOG.md +docs/TODO.md +docs/SEMVER.md +tests +vendor +.php-cs-fixer.dist.php +composer.lock +phpcs.xml.dist +phpstan.neon.dist +phpunit.xml.dist +sonar-project.properties +Dockerfile +.dockerignore +README.md +LICENSE +CONTRIBUTING.md +CODE_OF_CONDUCT.md diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000..bf8b2e5 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,54 @@ +name: Docker + +on: + release: + branches: [ "main" ] + types: [ "published" ] + tags: [ "v*" ] + + +# Actions +# docker/setup-buildx-action@v3 https://github.com/marketplace/actions/docker-setup-buildx +# docker/login-action@v3 https://github.com/marketplace/actions/docker-login +# docker/metadata-action@v5 https://github.com/marketplace/actions/docker-metadata-action +# docker/build-push-action@v6 https://github.com/marketplace/actions/build-and-push-docker-images + +jobs: + docker: + name: Docker image + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set application version + run: sed -i "s#@box_git_version@#${{ github.ref_name }}#" bin/* + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ github.repository }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2e5a189 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,62 @@ +# Stage 1: Build dependencies +FROM composer:2 AS builder + +WORKDIR /app + +# Copy composer files for better caching +COPY composer.json ./ + +# Install dependencies +RUN composer install \ + --no-dev \ + --no-interaction \ + --no-plugins \ + --no-scripts \ + --prefer-dist \ + --optimize-autoloader + +# Stage 2: Production image +FROM php:8.2-alpine + +LABEL maintainer="PhpCfdi" +LABEL description="Producción image for csf-scraper library" + +# Install system dependencies and PHP extensions +RUN apk add --no-cache \ + poppler-utils \ + libxml2-dev \ + oniguruma-dev \ + libcurl \ + curl-dev + +RUN docker-php-ext-install \ + curl \ + mbstring \ + opcache \ + dom + +# Configure PHP for production +RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini" + +# Create a non-root user +RUN addgroup -S appgroup && adduser -S appuser -G appgroup + +WORKDIR /app + +# Copy code and vendor from builder +COPY --from=builder /app/vendor /app/vendor +COPY ./src /app/src +COPY ./bin /app/bin +COPY ./docs /app/docs +COPY ./composer.json /app/composer.json + +# Adjust permissions +RUN chown -R appuser:appgroup /app + +# Switch to non-root user +USER appuser + +# Essential for the library (pdftotext is part of poppler-utils) +ENV PATH="/usr/bin:${PATH}" + +ENTRYPOINT ["php", "/app/bin/csf-scraper"] diff --git a/README.md b/README.md index e5e7a37..1625652 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,66 @@ Ejemplo de salida de `json_encode($person)` para persona física: } ``` +### Uso con Docker + +Esta librería incluye un `Dockerfile` optimizado que funciona como una herramienta de línea de comandos (CLI). 
La imagen devuelve el resultado en formato **JSON** a través de la salida estándar (stdout), facilitando su integración con otras herramientas como `jq`. + +#### Construir la imagen + +```bash +docker build -t phpcfdi/csf-scraper . +``` + +#### Ejecutar extracción + +Puedes pasar el ID CIF y RFC, una ruta local, una URL pública o el PDF por la entrada estándar (`-`): + +**Mediante ID CIF y RFC (Flujo principal):** + +```shell +docker run --rm phpcfdi/csf-scraper obtain 19040141021 DIM8701081LA +``` + +**Desde un archivo local:** + +```shell +docker run --rm -v "$(pwd)/mi-archivo.pdf:/csf.pdf" phpcfdi/csf-scraper obtain /csf.pdf +``` + +**Desde una URL:** + +```shell +docker run --rm phpcfdi/csf-scraper obtain https://dominio.com/archivo.pdf +``` + +**Obtener el esquema JSON de salida:** + +```shell +docker run --rm phpcfdi/csf-scraper schema +``` + +**Desde la entrada estándar (stdin):** + +```shell +cat mi-archivo.pdf | docker run --rm -i phpcfdi/csf-scraper obtain - +``` + +**Ver ayuda:** + +```shell +docker run --rm phpcfdi/csf-scraper help +``` + +### Opciones globales + +* `--timeout <segundos>`: Configura el tiempo máximo de espera para las peticiones HTTP al SAT (por defecto: 10 segundos). + +Ejemplo con timeout personalizado: + +```shell +docker run --rm phpcfdi/csf-scraper obtain 14111045399 UNA2907227Y5 --timeout 20 +``` + ## Soporte Puedes obtener soporte abriendo un ticket en Github. diff --git a/bin/csf-scraper b/bin/csf-scraper new file mode 100755 index 0000000..4dbcb33 --- /dev/null +++ b/bin/csf-scraper @@ -0,0 +1,121 @@ +#!/usr/bin/env php [argumentos]" . PHP_EOL; + echo PHP_EOL; + echo 'Comandos:' . PHP_EOL; + echo ' obtain - Obtener datos por ID CIF y RFC' . PHP_EOL; + echo ' obtain - Obtener datos desde un archivo PDF local, URL o stdin con "-"' . PHP_EOL; + echo ' schema - Muestra el esquema JSON de la salida' . PHP_EOL; + echo ' help - Muestra este mensaje de ayuda' . PHP_EOL; + echo PHP_EOL; + echo 'Opciones:' .
PHP_EOL; + echo ' --timeout - Tiempo máximo de espera para peticiones HTTP (default: 10)' . PHP_EOL; + echo PHP_EOL; + echo 'Ejemplos:' . PHP_EOL; + echo " $binary obtain 14111045399 UNA2907227Y5 --timeout 20" . PHP_EOL; + echo " $binary obtain ./constancia.pdf" . PHP_EOL; + echo " $binary obtain https://siafweb.ib.unam.mx/CSF.pdf" . PHP_EOL; + echo " cat ./constancia.pdf | $binary obtain -" . PHP_EOL; + echo " $binary schema" . PHP_EOL; +} + +$timeout = 10.0; +$clean_argv = []; +for ($i = 0; $i < $argc; $i++) { + if ('--timeout' === $argv[$i] && isset($argv[$i + 1])) { + $timeout = (float) $argv[$i + 1]; + $i++; + continue; + } + $clean_argv[] = $argv[$i]; +} +$argv = $clean_argv; +$argc = count($argv); + +foreach ($argv as $arg) { + if (in_array($arg, ['-h', '--help', 'help'])) { + showUsage(); + exit(0); + } +} + +if ($argc < 2) { + showUsage(); + exit(0); +} + +$command = $argv[1]; + +// Manejo de comandos explícitos +if ('schema' === $command || '--json-schema' === $command) { + echo file_get_contents(__DIR__ . '/../docs/schemas/csf.schema.json'); + exit(0); +} + +$scraper = Scraper::create($timeout); + +try { + if ('obtain' === $command) { + if (3 === $argc) { + // Un solo argumento después de obtain: asumimos ruta/url/stdin + $person = obtainFromSource($argv[2], $scraper); + } elseif (4 === $argc) { + // Dos argumentos después de obtain: asumimos id-cif y rfc + $idCif = $argv[2]; + $rfcValue = $argv[3]; + $person = $scraper->obtainFromRfcAndCif(Rfc::parse($rfcValue), $idCif); + } else { + throw new Exception('El comando obtain requiere [id-cif rfc] o [archivo|url|-]'); + } + } else { + throw new Exception("Comando '$command' no reconocido. Usa help para ver la lista de comandos."); + } + + echo json_encode($person, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . PHP_EOL; +} catch (Exception $e) { + fwrite(STDERR, 'Error: ' . $e->getMessage() . 
PHP_EOL); + exit(1); +} + +function obtainFromSource(string $input, Scraper $scraper): PhpCfdi\CsfScraper\PersonaMoral|PhpCfdi\CsfScraper\PersonaFisica +{ + $isUrl = filter_var($input, FILTER_VALIDATE_URL); + $isStdin = '-' === $input; + $tempFile = null; + + try { + if ($isUrl || $isStdin) { + $tempFile = tempnam(sys_get_temp_dir(), 'csf_'); + $content = $isStdin ? file_get_contents('php://stdin') : file_get_contents($input); + if (false === $content || '' === $content) { + $source = $isStdin ? 'stdin' : "la URL: $input"; + throw new Exception("No se pudo obtener el contenido desde $source"); + } + file_put_contents($tempFile, $content); + $path = $tempFile; + } else { + $path = $input; + if (! file_exists($path)) { + throw new Exception("El archivo no existe en la ruta: $path"); + } + } + + return $scraper->obtainFromPdfPath($path); + } finally { + if ($tempFile && file_exists($tempFile)) { + unlink($tempFile); + } + } +} diff --git a/composer.json b/composer.json index 3479612..94a43ad 100644 --- a/composer.json +++ b/composer.json @@ -90,5 +90,8 @@ "dev:coverage": "DEV: run phpunit with xdebug and storage coverage in build/coverage/html/", "dev:fix-style": "DEV: fix code style errors using composer-normalize, php-cs-fixer and phpcbf", "dev:test": "DEV: run @dev:check-style, phpunit and phpstan" - } + }, + "bin": [ + "bin/csf-scraper" + ] } diff --git a/docs/schemas/csf.schema.json b/docs/schemas/csf.schema.json new file mode 100644 index 0000000..0cec6a4 --- /dev/null +++ b/docs/schemas/csf.schema.json @@ -0,0 +1,155 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://raw.githubusercontent.com/phpcfdi/csf-scraper/main/docs/schemas/csf.schema.json", + "title": "Persona CSF Data", + "description": "Datos fiscales extraídos de una Constancia de Situación Fiscal del SAT (Física o Moral)", + "type": "object", + "oneOf": [ + { + "title": "Persona Física", + "required": [ + "rfc", + "id_cif", + "curp", + "nombre", + "apellido_paterno", + 
"situacion_contribuyente", + "regimenes" + ], + "properties": { + "curp": { + "type": "string" + }, + "nombre": { + "type": "string" + }, + "apellido_paterno": { + "type": "string" + }, + "apellido_materno": { + "type": "string" + }, + "fecha_nacimiento": { + "$ref": "#/definitions/dateTime" + } + } + }, + { + "title": "Persona Moral", + "required": [ + "rfc", + "id_cif", + "razon_social", + "regimen_de_capital", + "situacion_contribuyente", + "regimenes" + ], + "properties": { + "razon_social": { + "type": "string" + }, + "regimen_de_capital": { + "type": "string" + }, + "fecha_constitucion": { + "$ref": "#/definitions/dateTime" + } + } + } + ], + "properties": { + "rfc": { + "type": "string", + "pattern": "^[A-Z&Ñ]{3,4}[0-9]{2}(0[1-9]|1[012])(0[1-9]|[12][0-9]|3[01])[A-Z0-9]{2}[0-9A]$" + }, + "id_cif": { + "type": "string" + }, + "fecha_inicio_operaciones": { + "$ref": "#/definitions/dateTime" + }, + "situacion_contribuyente": { + "type": "string" + }, + "fecha_ultimo_cambio_situacion": { + "$ref": "#/definitions/dateTime" + }, + "entidad_federativa": { + "type": "string" + }, + "municipio_delegacion": { + "type": "string" + }, + "colonia": { + "type": "string" + }, + "tipo_vialidad": { + "type": "string" + }, + "nombre_vialidad": { + "type": "string" + }, + "numero_exterior": { + "type": "string" + }, + "numero_interior": { + "type": "string" + }, + "codigo_postal": { + "type": "string" + }, + "correo_electronico": { + "type": "string" + }, + "al": { + "type": "string" + }, + "regimenes": { + "type": "array", + "items": { + "type": "object", + "required": [ + "regimen", + "regimen_id" + ], + "properties": { + "regimen": { + "type": "string" + }, + "regimen_id": { + "type": "string" + }, + "fecha_alta": { + "$ref": "#/definitions/dateTime" + } + } + } + }, + "extra_data": { + "type": "object", + "additionalProperties": true + } + }, + "definitions": { + "dateTime": { + "type": "object", + "required": [ + "date", + "timezone_type", + "timezone" + ], + "properties": 
{ + "date": { + "type": "string", + "format": "date-time" + }, + "timezone_type": { + "type": "integer" + }, + "timezone": { + "type": "string" + } + } + } + } +} \ No newline at end of file diff --git a/src/PdfReader/PdfToText.php b/src/PdfReader/PdfToText.php index debc891..239535b 100644 --- a/src/PdfReader/PdfToText.php +++ b/src/PdfReader/PdfToText.php @@ -46,6 +46,6 @@ public function extract(string $path): string /** @return list */ public function buildCommand(string $pdfFile): array { - return [$this->pdftotext, '-eol', 'unix', '-raw', '-q', $pdfFile, '-']; + return [$this->pdftotext, '-eol', 'unix', '-nopgbrk', '-raw', '-q', $pdfFile, '-']; } } diff --git a/src/Scraper.php b/src/Scraper.php index 8d28be3..5b8e5c9 100644 --- a/src/Scraper.php +++ b/src/Scraper.php @@ -34,10 +34,11 @@ public function getClient(): ClientInterface /** * Factory method to create a scraper object with configuration that simply works */ - public static function create(): self + public static function create(float $timeout = 10.0): self { return new self(new Client([ 'curl' => [CURLOPT_SSL_CIPHER_LIST => 'DEFAULT@SECLEVEL=1'], + 'timeout' => $timeout, ])); } diff --git a/tests/Integration/CliTest.php b/tests/Integration/CliTest.php new file mode 100644 index 0000000..59590ca --- /dev/null +++ b/tests/Integration/CliTest.php @@ -0,0 +1,84 @@ +run(); + return $process; + } + + public function test_command_help(): void + { + $process = $this->runCli(['help']); + + $this->assertTrue($process->isSuccessful()); + $this->assertStringContainsString('Uso:', $process->getOutput()); + $this->assertStringContainsString('obtain ', $process->getOutput()); + } + + public function test_command_help_legacy(): void + { + $process = $this->runCli(['--help']); + + $this->assertTrue($process->isSuccessful()); + $this->assertStringContainsString('Uso:', $process->getOutput()); + } + + public function test_command_help_in_any_position(): void + { + $process = $this->runCli(['obtain', '--help']); + + 
$this->assertTrue($process->isSuccessful()); + $this->assertStringContainsString('Uso:', $process->getOutput()); + } + + public function test_obtain_with_timeout(): void + { + $process = $this->runCli(['obtain', '12345678', 'RFC010101AAA', '--timeout', '5']); + + // We expect it to fail because the RFC/CIF are fake, but it should NOT fail because of the option + $this->assertFalse($process->isSuccessful()); + $this->assertStringContainsString('Error:', $process->getErrorOutput()); + $this->assertStringNotContainsString('no reconocido', $process->getErrorOutput()); + } + + public function test_command_schema(): void + { + $process = $this->runCli(['schema']); + + $this->assertTrue($process->isSuccessful()); + $this->assertJson($process->getOutput()); + $this->assertStringContainsString('csf.schema.json', $process->getOutput()); + } + + public function test_obtain_missing_arguments(): void + { + $process = $this->runCli(['obtain']); + + $this->assertFalse($process->isSuccessful()); + $this->assertStringContainsString('Error: El comando obtain requiere [id-cif rfc] o [archivo|url|-]', $process->getErrorOutput()); + } + + public function test_obtain_local_file(): void + { + $path = $this->filePath('csf-without-cif.pdf'); + $process = $this->runCli(['obtain', $path]); + + // Expect failure because it doesn't have CIF ID, but it confirms the file was read and processed + $this->assertFalse($process->isSuccessful()); + $this->assertStringContainsString('Cannot obtain cif from given PDF', $process->getErrorOutput()); + } +} diff --git a/tests/Unit/ScraperTest.php b/tests/Unit/ScraperTest.php index eda7abb..fa4bf80 100644 --- a/tests/Unit/ScraperTest.php +++ b/tests/Unit/ScraperTest.php @@ -26,5 +26,14 @@ public function test_create_with_specific_openssl_cipher_list(): void [CURLOPT_SSL_CIPHER_LIST => 'DEFAULT@SECLEVEL=1'], $client->getConfig('curl'), ); + $this->assertSame(10.0, $client->getConfig('timeout')); + } + + public function test_create_with_timeout(): void + { + 
$scraper = Scraper::create(5.0); + $client = $scraper->getClient(); + + $this->assertSame(5.0, $client->getConfig('timeout')); } }