Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions c_src/lazy_html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1048,6 +1048,137 @@ std::vector<fine::Term> tag(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {

FINE_NIF(tag, 0);

// ---------------------------------------------------------------------------
// DOM mutation operations
// ---------------------------------------------------------------------------

// Remove all elements matching a CSS selector from the DOM tree.
//
// Uses the same CSS selector infrastructure as query(). The work is split
// into phases because the tree must not be modified while the selector
// engine is traversing it, and because parent pointers must still be valid
// while we decide what to drop:
//
//   1. Collect every matching node (deduplicated across roots).
//   2. Reduce the matches to the top-most ones. A match nested inside
//      another match disappears together with its ancestor, so it must not
//      be destroyed on its own: unlinking it after the ancestor has been
//      freed would read and write through a dangling parent pointer.
//   3. Scrub the LazyHTML root vector of every root that is a match or
//      lives inside a matched subtree, so no root points at freed memory.
//   4. Destroy each top-most match together with its whole subtree.
//
// Throws std::runtime_error when the parser or selector engine cannot be
// initialised or when the selector search fails.
//
// NOTE(review): other LazyHTML resources that share this document and still
// reference removed nodes are not tracked here and become dangling.
ExLazyHTML dom_remove(ErlNifEnv *env, ExLazyHTML ex_lazy_html,
                      ErlNifBinary css_selector) {
  auto parser = lxb_css_parser_create();
  auto status = lxb_css_parser_init(parser, NULL);
  if (status != LXB_STATUS_OK) {
    throw std::runtime_error("failed to create css parser");
  }
  auto parser_guard =
      ScopeGuard([&]() { lxb_css_parser_destroy(parser, true); });

  auto css_selector_list = parse_css_selector(parser, css_selector);
  auto css_selector_list_guard = ScopeGuard(
      [&]() { lxb_css_selector_list_destroy_memory(css_selector_list); });

  auto selectors = lxb_selectors_create();
  status = lxb_selectors_init(selectors);
  if (status != LXB_STATUS_OK) {
    throw std::runtime_error("failed to create selectors");
  }
  auto selectors_guard =
      ScopeGuard([&]() { lxb_selectors_destroy(selectors, true); });

  lxb_selectors_opt_set(selectors, static_cast<lxb_selectors_opt_t>(
                                       LXB_SELECTORS_OPT_MATCH_FIRST |
                                       LXB_SELECTORS_OPT_MATCH_ROOT));

  // Phase 1: Collect nodes to remove. `seen` deduplicates nodes that are
  // reachable from more than one root.
  auto matched = std::vector<lxb_dom_node_t *>();
  auto seen = std::unordered_set<lxb_dom_node_t *>();

  struct FindCtx {
    std::vector<lxb_dom_node_t *> *matched;
    std::unordered_set<lxb_dom_node_t *> *seen;
  };

  auto ctx = FindCtx{&matched, &seen};

  for (auto node : ex_lazy_html.resource->nodes) {
    status = lxb_selectors_find(
        selectors, node, css_selector_list,
        [](lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
           void *ctx) -> lxb_status_t {
          auto find_ctx = static_cast<FindCtx *>(ctx);
          if (find_ctx->seen->insert(node).second) {
            find_ctx->matched->push_back(node);
          }
          return LXB_STATUS_OK;
        },
        &ctx);
    if (status != LXB_STATUS_OK) {
      throw std::runtime_error("failed to run find");
    }
  }

  // True when any ancestor of `node` is itself a matched node. Only valid
  // while nothing has been destroyed yet (it follows parent pointers).
  auto has_matched_ancestor = [&seen](lxb_dom_node_t *node) {
    for (auto ancestor = node->parent; ancestor != NULL;
         ancestor = ancestor->parent) {
      if (seen.count(ancestor) > 0) {
        return true;
      }
    }
    return false;
  };

  // Phase 2: Keep only the top-most matches; nested matches are released
  // as part of their ancestor's subtree.
  auto subtree_roots = std::vector<lxb_dom_node_t *>();
  for (auto node : matched) {
    if (!has_matched_ancestor(node)) {
      subtree_roots.push_back(node);
    }
  }

  // Phase 3: Scrub the LazyHTML root node vector. A root is dropped when
  // it is a match or sits anywhere below one, since both are about to be
  // freed.
  auto &nodes = ex_lazy_html.resource->nodes;
  nodes.erase(std::remove_if(nodes.begin(), nodes.end(),
                             [&](lxb_dom_node_t *n) {
                               return seen.count(n) > 0 ||
                                      has_matched_ancestor(n);
                             }),
              nodes.end());

  // Phase 4: Unlink each top-most match from the tree and free it together
  // with all of its descendants.
  for (auto node : subtree_roots) {
    lxb_dom_node_destroy_deep(node);
  }

  return ex_lazy_html;
}

FINE_NIF(dom_remove, ERL_NIF_DIRTY_JOB_CPU_BOUND);

// Strip the attribute called `name` from every element in the node set and
// from every element beneath those nodes. Non-element nodes are left
// untouched. Returns the same LazyHTML handle that was passed in.
ExLazyHTML dom_remove_attribute(ErlNifEnv *env, ExLazyHTML ex_lazy_html,
                                ErlNifBinary name) {
  // One callback serves both the root and the descendant walk; `ctx`
  // carries the attribute name as an ErlNifBinary.
  auto strip_attribute = [](lxb_dom_node_t *candidate,
                            void *ctx) -> lexbor_action_t {
    if (candidate->type == LXB_DOM_NODE_TYPE_ELEMENT) {
      auto attr_name = static_cast<ErlNifBinary *>(ctx);
      lxb_dom_element_remove_attribute(lxb_dom_interface_element(candidate),
                                       attr_name->data, attr_name->size);
    }
    return LEXBOR_ACTION_OK;
  };

  for (auto root : ex_lazy_html.resource->nodes) {
    // The root is handled explicitly, then the walk takes care of the
    // descendants.
    strip_attribute(root, &name);
    lxb_dom_node_simple_walk(root, strip_attribute, &name);
  }

  return ex_lazy_html;
}

FINE_NIF(dom_remove_attribute, ERL_NIF_DIRTY_JOB_CPU_BOUND);

// Set a named attribute on all element nodes in the set.
//
// Only the nodes held directly by the LazyHTML handle are touched;
// descendants are not visited and non-element nodes are skipped. An
// existing attribute with the same name has its value replaced.
//
// Throws std::runtime_error when lexbor reports that the attribute could
// not be set (a NULL result), instead of silently leaving the node set
// partially updated.
ExLazyHTML dom_set_attribute(ErlNifEnv *env, ExLazyHTML ex_lazy_html,
                             ErlNifBinary name, ErlNifBinary value) {
  for (auto node : ex_lazy_html.resource->nodes) {
    if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
      continue;
    }

    auto element = lxb_dom_interface_element(node);
    auto attr = lxb_dom_element_set_attribute(element, name.data, name.size,
                                              value.data, value.size);
    if (attr == NULL) {
      throw std::runtime_error("failed to set attribute");
    }
  }

  return ex_lazy_html;
}

FINE_NIF(dom_set_attribute, 0);

} // namespace lazy_html

FINE_INIT("Elixir.LazyHTML.NIF");
76 changes: 76 additions & 0 deletions lib/lazy_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,82 @@ defmodule LazyHTML do
LazyHTML.NIF.tag(lazy_html)
end

@doc ~S'''
Deletes every element matched by the CSS selector from the DOM tree.

The native DOM is changed in place, so later operations (including
`to_html/1`) see the tree without the removed elements. This avoids
round-tripping through `to_tree/1` for tree transformation workloads
such as HTML sanitization or content stripping.

Selectors are evaluated by the same CSS engine as `query/2`, so
comma-separated selector lists work here as well.

> #### Warning {: .warning}
>
> Because the underlying DOM is mutated, any `%LazyHTML{}` value
> obtained earlier via `query/2` that references a removed node
> becomes invalid. Call `remove/2` before querying, or query again
> after the removal.

## Examples

    iex> doc = LazyHTML.from_fragment("<div><script>x</script><p>keep</p></div>")
    iex> doc = LazyHTML.remove(doc, "script")
    iex> LazyHTML.to_html(doc)
    "<div><p>keep</p></div>"

Multiple selectors can be combined:

    iex> doc = LazyHTML.from_fragment("<div><script>x</script><style>y</style><p>keep</p></div>")
    iex> doc = LazyHTML.remove(doc, "script, style")
    iex> LazyHTML.to_html(doc)
    "<div><p>keep</p></div>"

'''
@spec remove(t(), String.t()) :: t()
def remove(%LazyHTML{} = lazy_html, selector) when is_binary(selector),
  do: LazyHTML.NIF.dom_remove(lazy_html, selector)

@doc ~S'''
Strips the attribute called `name` from every element node in the set
and from all of their descendants.

The native DOM is changed in place. Elements that do not carry the
attribute are left as they are.

## Examples

    iex> doc = LazyHTML.from_fragment(~s(<p style="color:red" class="x">hi</p>))
    iex> doc = LazyHTML.remove_attribute(doc, "style")
    iex> LazyHTML.to_html(doc)
    ~s(<p class="x">hi</p>)

'''
@spec remove_attribute(t(), String.t()) :: t()
def remove_attribute(%LazyHTML{} = lazy_html, name) when is_binary(name),
  do: LazyHTML.NIF.dom_remove_attribute(lazy_html, name)

@doc ~S'''
Assigns `value` to the attribute `name` on every element node in the set.

An attribute that is already present has its value replaced. The
native DOM is changed in place.

## Examples

    iex> doc = LazyHTML.from_fragment(~s(<a href="/x">link</a>))
    iex> doc = LazyHTML.set_attribute(doc, "rel", "nofollow")
    iex> LazyHTML.to_html(doc)
    ~s(<a href="/x" rel="nofollow">link</a>)

'''
@spec set_attribute(t(), String.t(), String.t()) :: t()
def set_attribute(%LazyHTML{} = lazy_html, name, value)
    when is_binary(name) and is_binary(value),
    do: LazyHTML.NIF.dom_set_attribute(lazy_html, name, value)

@doc ~S"""
Escapes the given string to make a valid HTML text.

Expand Down
3 changes: 3 additions & 0 deletions lib/lazy_html/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ defmodule LazyHTML.NIF do
# Placeholder clauses for the native functions. Each one only calls err!/0,
# so invoking a stub directly raises instead of returning a value.
# NOTE(review): presumably these are replaced by the C++ functions registered
# via FINE_NIF once the native library is loaded - confirm against the loader.
def tag(_lazy_html), do: err!()
def nodes(_lazy_html), do: err!()
def num_nodes(_lazy_html), do: err!()
# DOM mutation entry points; arities mirror the arguments the public
# LazyHTML wrappers pass through.
def dom_remove(_lazy_html, _css_selector), do: err!()
def dom_remove_attribute(_lazy_html, _name), do: err!()
def dom_set_attribute(_lazy_html, _name, _value), do: err!()

# Shared body of every stub: signals that the NIF is not loaded.
defp err!(), do: :erlang.nif_error(:not_loaded)
end
90 changes: 90 additions & 0 deletions test/lazy_html_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -633,4 +633,94 @@ defmodule LazyHTMLTest do
"""
end
end

describe "remove/2" do
  test "removes elements matching a simple selector" do
    html =
      "<div><script>bad</script><p>good</p></div>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("script")
      |> LazyHTML.to_html()

    assert html == "<div><p>good</p></div>"
  end

  test "removes elements matching a compound selector" do
    html =
      "<div><script>a</script><style>b</style><p>keep</p></div>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("script, style")
      |> LazyHTML.to_html()

    assert html == "<div><p>keep</p></div>"
  end

  test "removes nested matching elements" do
    html =
      "<div><div><script>deep</script></div><p>keep</p></div>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("script")
      |> LazyHTML.to_html()

    assert html == "<div><div></div><p>keep</p></div>"
  end

  test "removes elements matching attribute selectors" do
    html =
      ~s(<div><p hidden>hidden</p><p>visible</p></div>)
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("[hidden]")
      |> LazyHTML.to_html()

    assert html == "<div><p>visible</p></div>"
  end

  test "removes root nodes that match" do
    html =
      "<script>bad</script><p>good</p>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("script")
      |> LazyHTML.to_html()

    assert html == "<p>good</p>"
  end

  test "no-op when nothing matches" do
    html =
      "<p>hello</p>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("script")
      |> LazyHTML.to_html()

    assert html == "<p>hello</p>"
  end

  test "subsequent queries reflect removal" do
    doc =
      "<div><script>x</script><p>keep</p></div>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove("script")

    # The removed element must be gone for queries, while siblings stay reachable.
    assert doc |> LazyHTML.query("script") |> Enum.to_list() == []
    assert doc |> LazyHTML.query("p") |> LazyHTML.text() == "keep"
  end
end

describe "remove_attribute/2" do
  test "removes a named attribute from elements" do
    html =
      ~s(<p style="color:red" class="x">hi</p>)
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove_attribute("style")
      |> LazyHTML.to_html()

    assert html == ~s(<p class="x">hi</p>)
  end

  test "removes attribute from nested elements" do
    html =
      ~s(<div style="a"><p style="b">text</p></div>)
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove_attribute("style")
      |> LazyHTML.to_html()

    assert html == "<div><p>text</p></div>"
  end

  test "no-op when attribute doesn't exist" do
    html =
      "<p>hello</p>"
      |> LazyHTML.from_fragment()
      |> LazyHTML.remove_attribute("style")
      |> LazyHTML.to_html()

    assert html == "<p>hello</p>"
  end
end

describe "set_attribute/3" do
  test "sets a new attribute" do
    html =
      ~s(<a href="/x">link</a>)
      |> LazyHTML.from_fragment()
      |> LazyHTML.set_attribute("rel", "nofollow")
      |> LazyHTML.to_html()

    assert html =~ ~s(rel="nofollow")
    assert html =~ ~s(href="/x")
  end

  test "overwrites an existing attribute" do
    html =
      ~s(<p class="old">text</p>)
      |> LazyHTML.from_fragment()
      |> LazyHTML.set_attribute("class", "new")
      |> LazyHTML.to_html()

    assert html == ~s(<p class="new">text</p>)
  end

  test "sets attribute on multiple nodes from query" do
    doc = LazyHTML.from_fragment("<a>one</a><a>two</a>")

    # The result is deliberately discarded: the assertion below checks that
    # the change made through the query result is visible via `doc`.
    doc
    |> LazyHTML.query("a")
    |> LazyHTML.set_attribute("target", "_blank")

    html = LazyHTML.to_html(doc)
    assert html =~ ~s(<a target="_blank">one</a>)
    assert html =~ ~s(<a target="_blank">two</a>)
  end
end
end