diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5eb15d8..235298b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,22 +14,10 @@ jobs: fail-fast: false matrix: php_version: - - '8.0' - - '8.1' - - '8.2' - - '8.3' - '8.4' dependencies: - 'default' include: - - php_version: '8.0' - dependencies: 'lowest' - - php_version: '8.1' - dependencies: 'lowest' - - php_version: '8.2' - dependencies: 'lowest' - - php_version: '8.3' - dependencies: 'lowest' - php_version: '8.4' dependencies: 'lowest' steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 947d5f6..e6712d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ This project adheres to [Semantic Versioning](http://semver.org/). ## Unreleased +* Use native `HTMLDocument` parsing instead of masterminds/html5 +* Drop support for PHP < 8.4 (to allow us to use native HTML parsing) + +## 1.3.3 / 2026-03-10 + +* Fix handling and reporting parse errors when parsing an html table. + ## 1.3.2 / 2025-05-06 * Support PHP 8.4 (thanks @mharmuth) diff --git a/composer.json b/composer.json index f46a2ed..48efb25 100644 --- a/composer.json +++ b/composer.json @@ -13,8 +13,7 @@ "minimum-stability": "stable", "require": { "behat/gherkin": ">=2.0.0 <5.0.0", - "masterminds/html5": "^2.7.5", - "php": "~8.0.0 || ~8.1.0 || ~8.2.0 || ~8.3.0 || ~8.4.0", + "php": ">= 8.4 < 8.5", "ext-dom": "*", "ext-SimpleXML": "*", "ext-libxml": "*" diff --git a/src/TableParser/HTML/HTMLStringTableParser.php b/src/TableParser/HTML/HTMLStringTableParser.php index d309d97..0b56345 100644 --- a/src/TableParser/HTML/HTMLStringTableParser.php +++ b/src/TableParser/HTML/HTMLStringTableParser.php @@ -7,9 +7,13 @@ namespace Ingenerator\BehatTableAssert\TableParser\HTML; +use Behat\Gherkin\Node\TableNode; +use Dom\HTMLDocument; +use Dom\HTMLElement; use Ingenerator\BehatTableAssert\TableNode\PaddedTableNode; use LibXMLError; use Masterminds\HTML5; +use function Dom\import_simplexml; /** * Parses an HTML string for a element into a TableNode. The table must have a single row @@ -90,33 +94,43 @@ public function parse($html) */ protected function parseHTMLString($html) { - $old_use_internal_errors = \libxml_use_internal_errors(TRUE); try { - $html5 = new HTML5(); - $dom = $html5->loadHTML( - '' - .'' - .''.\trim($html).'' - .'' + set_error_handler( + fn($errno, $errstr) => throw new \InvalidArgumentException( + sprintf("Invalid HTML: %s\n\n===HTML===\n%s", $errstr, $html), + ), ); - $table_elem = $dom->getElementsByTagName('body')->item(0)->firstChild; - $table = \simplexml_import_dom($table_elem); - if ($errors = \libxml_get_errors()) { - $this->throwInvalidHTMLException($html, $errors); - } + $dom = HTMLDocument::createFromString( + sprintf( + <<<'HTML' + + + + %s + + HTML, + trim($html), + ), + ); } finally { - \libxml_clear_errors(); - \libxml_use_internal_errors($old_use_internal_errors); + restore_error_handler(); + } + $table_elem = $dom->getElementsByTagName('body')->item(0)->firstChild; + + if (!$table_elem instanceof HTMLElement) { + throw new \InvalidArgumentException( + sprintf("Expected html root element but got %s\n\n===HTML===\n%s", get_debug_type($table_elem), $html), + ); } + $table = \simplexml_import_dom($table_elem); return $table; } /** - * @param string $html - * @param LibXMLError[] $errors + * @deprecated no longer called by the library */ protected function throwInvalidHTMLException($html, $errors) { @@ -224,7 +238,7 @@ protected function findCellTextValues(\SimpleXMLElement $table_row) */ protected function parseCellText(\SimpleXmlElement $cell) { - $text = \trim(\preg_replace('/\s+/', ' ', \dom_import_simplexml($cell)->textContent)); + $text = \trim(\preg_replace('/\s+/', ' ', import_simplexml($cell)->textContent)); if ($prefix = (string) $cell['data-behat-table-prefix']) { $text = $prefix.' '.$text; diff --git a/test/TableParser/HTML/HTMLStringTableParserTest.php b/test/TableParser/HTML/HTMLStringTableParserTest.php index 2303175..d29b8df 100644 --- a/test/TableParser/HTML/HTMLStringTableParserTest.php +++ b/test/TableParser/HTML/HTMLStringTableParserTest.php @@ -35,37 +35,16 @@ public function test_it_throws_when_parsing_non_or_empty_string($value) } /** - * @testWith ["random", false] - * ["random", true] - * ["
", false] - * ["
1
", false] + * @testWith ["random text", "Expected html root element but got Dom\\Text"] + * ["
<17sd>illegal tag name
", "Invalid HTML: Dom\\HTMLDocument::createFromString(): tokenizer error invalid-first-character-of-tag-name"] + * ["
", "Expected a but got div"] */ - public function test_it_always_restores_state_of_libxml_error_handling( - $html, - $use_errors_before - ) { - $old_setting = \libxml_use_internal_errors($use_errors_before); - try { - $this->newSubject()->parse($html); - } catch (\Exception $e) { /* ignore */ - } - $errors_after = \libxml_get_errors(); - $use_errors_after = \libxml_use_internal_errors($old_setting); - - $this->assertSame([], $errors_after, 'Should clear libxml errors'); - $this->assertEquals( - $use_errors_before, - $use_errors_after, - 'Should restore libxml_use_internal_errors' - ); - } - - public function test_it_throws_when_parsing_html_that_is_not_a_table() + public function test_it_throws_when_parsing_html_that_is_not_a_table_or_not_valid(string $input, string $expect_msg) { $this->expectException(\InvalidArgumentException::class); - $this->expectExceptionMessage('Expected a
'); + $this->expectExceptionMessage($expect_msg); - $this->newSubject()->parse('
'); + $this->newSubject()->parse($input); } public function test_it_throws_when_parsing_table_without_thead() @@ -224,7 +203,7 @@ public function provider_valid_html_tables() '
'. ''. ''. - ''. + ''. '
HeaderDate
Cell1
Cell1
', [ ['Header', 'Date'],