From 3251c29d43cb42278814955675408a04ac6a2252 Mon Sep 17 00:00:00 2001 From: Rene Schwietzke Date: Sun, 7 Sep 2025 14:16:17 +0200 Subject: [PATCH 1/3] Rework of HtmlElements to become immutable for the elements but still caching access patterns for the new HTMLELements class, hence the usage pattern is similar but we get proper caching without messing up the state --- .../org/htmlunit/cyberneko/HTMLElements.java | 190 +++++++++--------- .../cyberneko/HTMLElementsCollection.java | 165 +++++++++++++++ 2 files changed, 259 insertions(+), 96 deletions(-) create mode 100644 src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLElements.java b/src/main/java/org/htmlunit/cyberneko/HTMLElements.java index b03b36be..3d1b0880 100644 --- a/src/main/java/org/htmlunit/cyberneko/HTMLElements.java +++ b/src/main/java/org/htmlunit/cyberneko/HTMLElements.java @@ -15,18 +15,21 @@ */ package org.htmlunit.cyberneko; -import java.util.HashMap; import java.util.Locale; import org.htmlunit.cyberneko.util.FastHashMap; /** - * Collection of HTML element information. + * Collection of HTML element information. Parts if it was in the original + * HtmlElements, now HtmlElementsCollection. It has been changed to allow + * modifications at runtime in the sense of caching any lookup results, + * especially for not found elements to speed things up. * * @author Andy Clark * @author Ahmed Ashour * @author Marc Guillemot * @author Ronald Brill + * @author Rene Schwietzke */ public class HTMLElements { @@ -35,6 +38,8 @@ public class HTMLElements { // NOTE: The element codes *must* start with 0 and increment in // sequence. The parent and closes references depends on // this assumption. -Ac + // Note 2: This codes are here to maintain compatibility with + // existing code despite having the main use in HtmlElementsCollection. public static final short A = 0; public static final short ABBR = A + 1; @@ -183,26 +188,43 @@ public class HTMLElements { public static final short XML = WBR + 1; public static final short XMP = XML + 1; public static final short UNKNOWN = XMP + 1; - - // information - - /** No such element. */ - public final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY}, null); - - // these fields became private to avoid exposing them for indirect modification - // this cannot be final because HtmlUnit might add to that - private Element[] elementsByCode_; - - // keep the list here for later modification - private final HashMap elementsByNameForReference_ = new HashMap<>(); - - // this is a optimized version which will be later queried - private final FastHashMap elementsByNameOptimized_ = new FastHashMap<>(311, 0.70f); + + // holds the original elements to query against + private final HTMLElementsCollection htmlElementsCollection_; // this map helps us to know what elements we don't have and speed things up - private final FastHashMap unknownElements_ = new FastHashMap<>(11, 0.70f); + private final FastHashMap unknownElements_ = new FastHashMap<>(17, 0.50f); + /** + * Create a new lookup instance based on the default elements. This instance + * features a cache for not found elements to speed things up. + * + * @param htmlElements the elements to lookup against + */ public HTMLElements() { + this.htmlElementsCollection_ = HTMLElementsCollection.DEFAULT; + } + + /** + * Create a new lookup instance based on the given elements. This instance + * features a cache for not found elements to speed things up. + * + * @param htmlElements the elements to lookup against + */ + public HTMLElements(final HTMLElementsCollection htmlElements) { + this.htmlElementsCollection_ = htmlElements; + } + + /** + * Creates the default setup of HTML elements. We cannot statically set + * them up because some elements refer to others and these references + * might be changed later when adding custom elements. + * + * This is not for public use. + * + * @return the array of arrays of default elements + */ + static Element[][] setupDefaultHTMElements() { final Element[][] elementsArray = new Element[26][]; // // @@ -550,102 +572,65 @@ public HTMLElements() { // XMP new Element(XMP, "XMP", Element.SPECIAL, BODY, new short[] {P}), }; - - // keep contiguous list of elements for lookups by code - for (final Element[] elements : elementsArray) { - if (elements != null) { - for (final Element element : elements) { - this.elementsByNameForReference_.put(element.name, element); - } - } - } - - // setup optimized versions - setupOptimizedVersions(); - } - - public void setElement(final Element element) { - this.elementsByNameForReference_.put(element.name, element); - - // rebuild the information "trees" - setupOptimizedVersions(); - } - - private void setupOptimizedVersions() { - // we got x amount of elements + 1 unknown - // put that into an array instead of a map, that - // is a faster look up and avoids equals - // ATTENTION: Due to some HtmlUnit custom tag handling that overwrites our - // list here, we might get a list with holes, so check the range first - final int size = elementsByNameForReference_.values().stream().mapToInt(v -> v.code).max().getAsInt(); - elementsByCode_ = new Element[Math.max(size, NO_SUCH_ELEMENT.code) + 1]; - elementsByNameForReference_.values().forEach(v -> elementsByCode_[v.code] = v); - elementsByCode_[NO_SUCH_ELEMENT.code] = NO_SUCH_ELEMENT; - - // initialize cross references to parent elements - for (final Element element : elementsByCode_) { - if (element != null) { - defineParents(element); - } - } - // get us a second version that is lowercase stringified to - // reduce lookup overhead - for (final Element element : elementsByCode_) { - // we might have holes due to HtmlUnitNekoHtmlParser - if (element != null) { - elementsByNameOptimized_.put(element.name.toLowerCase(Locale.ROOT), element); - } - } - } - - private void defineParents(final Element element) { - if (element.parentCodes_ != null) { - element.parent = new Element[element.parentCodes_.length]; - for (int j = 0; j < element.parentCodes_.length; j++) { - element.parent[j] = elementsByCode_[element.parentCodes_[j]]; - } - element.parentCodes_ = null; - } + + return elementsArray; } /** - * @return the element information for the specified element code. + * Lookup table for elements by code. There is not range check applied + * for the sake of performance. Java will check that anyway, so we + * don't have to. * * @param code The element code. + * @return the element information for the specified element code. */ public final Element getElement(final short code) { - return elementsByCode_[code]; + return this.htmlElementsCollection_.getElement(code); } /** - * @return the element information for the specified element name. - * + * Finds an element by its name and returns the element information. If + * the element is not known, a new element with the UNKNOWN code is + * returned. + * * @param ename The element name. + * @return the element information for the specified element name or an new instance with the + * UNKNOWN code if not found. */ public final Element getElement(final String ename) { - Element element = getElement(ename, NO_SUCH_ELEMENT); - if (element == NO_SUCH_ELEMENT) { - element = new Element(UNKNOWN, ename.toUpperCase(Locale.ROOT), - NO_SUCH_ELEMENT.flags, NO_SUCH_ELEMENT.parentCodes_, NO_SUCH_ELEMENT.closes); - element.parent = NO_SUCH_ELEMENT.parent; - element.parentCodes_ = NO_SUCH_ELEMENT.parentCodes_; + Element element = getElement(ename, this.htmlElementsCollection_.NO_SUCH_ELEMENT); + + if (element == this.htmlElementsCollection_.NO_SUCH_ELEMENT) { + element = new Element(UNKNOWN, + ename.toUpperCase(Locale.ROOT), + this.htmlElementsCollection_.NO_SUCH_ELEMENT.flags, + this.htmlElementsCollection_.NO_SUCH_ELEMENT.parentCodes_, + this.htmlElementsCollection_.NO_SUCH_ELEMENT.closes); + element.parent = this.htmlElementsCollection_.NO_SUCH_ELEMENT.parent; + element.parentCodes_ = this.htmlElementsCollection_.NO_SUCH_ELEMENT.parentCodes_; } + return element; } - /** - * @return the element information for the specified element name. - * + * Looks up an element by its name and returns the element information. If the + * current form of the name is not found, it tries again with the lowercase form. + * If still not found, the given default element is returned. + * * @param ename The element name. * @param element The default element to return if not found. + * + * @return the element information for the specified element name. */ public final Element getElement(final String ename, final Element element) { // check the current form casing first, which is mostly lowercase only - Element r = elementsByNameOptimized_.get(ename); + Element r = this.htmlElementsCollection_.lookupElement(ename); + + // we have not found it, so it might feature different casing if (r == null) { // check first if we know that we don't know and avoid the // lowercasing later - if (unknownElements_.get(ename) != null) { + if (this.unknownElements_.get(ename) != null) { // we added it to the cache, so we know it has been // queried once unsuccessfully before return element; @@ -656,14 +641,15 @@ public final Element getElement(final String ename, final Element element) { // good HTML is mostly all lowercase in the first place so this is the // fallback for atypical HTML // we also have not seen that element missing yet - r = elementsByNameOptimized_.get(ename.toLowerCase(Locale.ROOT)); + r = this.htmlElementsCollection_.lookupElement(ename.toLowerCase(Locale.ROOT)); // remember that we had a miss if (r == null) { - unknownElements_.put(ename, Boolean.TRUE); + this.unknownElements_.put(ename, Boolean.TRUE); return element; } } + return r; } @@ -698,6 +684,12 @@ public static class Element { /** The element code. */ public final short code; + /** + * hash code base on the name, kept here to avoid one extra memory + * access as well as the extra logic in String.hashCode() + */ + private final int hashCode_; + /** The element name. */ public final String name; @@ -707,7 +699,10 @@ public static class Element { /** Informational flags. */ public final int flags; - /** Parent elements. */ + /** + * Parent elements, having that open is dangerous but for + * legacy reasons we won't change that right now + */ public Element[] parent; /** The bounding element code. */ @@ -718,7 +713,7 @@ public static class Element { /** Parent elements. */ short[] parentCodes_; - + /** * Constructs an element object. * @@ -782,6 +777,9 @@ public Element(final short code, final String name, final int flags, this.parent = null; this.bounds = bounds; this.closes = closes; + // the name is never null, if so, the lowercase would have blown + // up already + this.hashCode_ = name.hashCode(); } /** @@ -844,11 +842,11 @@ public boolean closes(final short tag) { } /** - * @return a hash code for this object. + * @return the hash code for this object. */ @Override public int hashCode() { - return name.hashCode(); + return this.hashCode_; } /** @@ -858,7 +856,7 @@ public int hashCode() { public boolean equals(final Object o) { if (o instanceof Element) { final Element e = (Element) o; - return lowercaseName.equals(e.name) || name.equals(e.name); + return name.equals(e.name) || lowercaseName.equals(e.name); } return false; } diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java b/src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java new file mode 100644 index 00000000..247a1ae7 --- /dev/null +++ b/src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2002-2009 Andy Clark, Marc Guillemot + * Copyright (c) 2017-2024 Ronald Brill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.htmlunit.cyberneko; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; + +import org.htmlunit.cyberneko.HTMLElements.Element; +import org.htmlunit.cyberneko.util.FastHashMap; + +/** + * Collection of HTML element information. This is an immutable object + * so it is safe to share between multiple threads. But one must not + * continue to use any custom elements after adding it to this collection. + * + * This class was originally in parts HtmlElements. To allow reuse and safe + * use in a concurrent environment, it was split into this class and the new + * HtmlElements. In parts to make the API less breaking. + * + * @author Andy Clark + * @author Ahmed Ashour + * @author Marc Guillemot + * @author Ronald Brill + * @author Rene Schwietzke + */ +public class HTMLElementsCollection { + + /** A default reusable instance */ + public static final HTMLElementsCollection DEFAULT = new HTMLElementsCollection(); + + /** No such element. */ + public final Element NO_SUCH_ELEMENT = new Element(HTMLElements.UNKNOWN, "", Element.CONTAINER, new short[]{HTMLElements.BODY}, null); + + // information + + // these fields became private to avoid exposing them for indirect modification + // It cannot be final because we know only later the needed size. + private Element[] elementsByCode_; + + // keep the list here for later modification + private final HashMap elementsByNameForReference_ = new HashMap<>(); + + // this is a optimized version which will be later queried, sparsely populated to avoid too many collisions + private final FastHashMap elementsByNameOptimized_ = new FastHashMap<>(311, 0.50f); + + /** + * Creates a new HTMLElements with all default objects only. If you need that, + * use the static {@link #DEFAULT} instance. + */ + private HTMLElementsCollection() { + this(Collections.emptyList()); + } + + /** + * Creates a new HTMLElements with one extra custom element. + * + * @param customElement our custom element to add + */ + public HTMLElementsCollection(Element customElement) { + this(Collections.singletonList(customElement)); + } + + /** + * Creates a new HTMLElements with all default objects plus our custom ones. + */ + public HTMLElementsCollection(List customElements) { + final Element[][] elementsArray = HTMLElements.setupDefaultHTMElements(); + + // keep contiguous list of elements for lookups by name + for (final Element[] elements : elementsArray) { + if (elements != null) { + for (final Element element : elements) { + this.elementsByNameForReference_.put(element.name, element); + } + } + } + // add our custom elements + for (final Element customElement : customElements) { + this.elementsByNameForReference_.put(customElement.name, customElement); + } + + // setup an optimized versions with all references to parents and + // some optimized lookup structures + setupOptimizedVersions(); + } + + private void setupOptimizedVersions() { + // we got x amount of elements + 1 unknown + // put that into an array instead of a map, that + // is a faster look up and avoids equals + + // ATTENTION: Due to some HtmlUnit custom tag handling that overwrites our + // list here, we might get a list with holes, so check the range first + final int size = elementsByNameForReference_.values().stream().mapToInt(v -> v.code).max().getAsInt(); + + elementsByCode_ = new Element[Math.max(size, NO_SUCH_ELEMENT.code) + 1]; + elementsByNameForReference_.values().forEach(v -> elementsByCode_[v.code] = v); + elementsByCode_[NO_SUCH_ELEMENT.code] = NO_SUCH_ELEMENT; + + // initialize cross references to parent elements + for (final Element element : elementsByCode_) { + if (element != null) { + defineParents(element); + } + } + + // get us a second version that is lowercase to + // reduce lookup overhead + for (final Element element : elementsByCode_) { + // we might have holes due to HtmlUnitNekoHtmlParser + if (element != null) { + elementsByNameOptimized_.put(element.name.toLowerCase(Locale.ROOT), element); + } + } + } + + private void defineParents(final Element element) { + if (element.parentCodes_ != null) { + element.parent = new Element[element.parentCodes_.length]; + + for (int j = 0; j < element.parentCodes_.length; j++) { + element.parent[j] = elementsByCode_[element.parentCodes_[j]]; + } + + element.parentCodes_ = null; + } + } + + /** + * Lookup table for elements by code. There is no range check applied + * for the sake of performance. Java will check that anyway, so we + * don't have to. + * + * @param code The element code. + * @return the element information for the specified element code. + */ + final Element getElement(final short code) { + return elementsByCode_[code]; + } + + /** + * Lookup the element by name, returns null if not found. + * + * @param ename the name of the element to lookup + * @return the element or null if not found + */ + final Element lookupElement(final String ename) { + return elementsByNameOptimized_.get(ename); + } +} From 8edcb07e4d4d4c19b022ffc1c9bebe997b0c9ca1 Mon Sep 17 00:00:00 2001 From: Rene Schwietzke Date: Sun, 7 Sep 2025 14:16:29 +0200 Subject: [PATCH 2/3] Comment added --- src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java b/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java index 277b8bef..ed219e3d 100644 --- a/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java +++ b/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java @@ -126,6 +126,7 @@ public class HTMLConfiguration extends ParserConfigurationSettings implements XM /** Namespace binder. */ private final NamespaceBinder namespaceBinder_ = new NamespaceBinder(this); + /** HTML elements to be used */ private final HTMLElements htmlElements_; /** Default constructor. */ From 73319b35142650d94bc428585d52899c7c507f0a Mon Sep 17 00:00:00 2001 From: Rene Schwietzke Date: Fri, 12 Sep 2025 22:48:11 +0200 Subject: [PATCH 3/3] Pom for debugging --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 9acdc5bf..b6d22861 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.htmlunit neko-htmlunit - 4.16.0 + 4.17.0-rschwietzke HtmlUnit NekoHtml HtmlUnit @@ -339,4 +339,4 @@ Sean Smith - \ No newline at end of file +