Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x/net/html: add Node.{Ancestors,ChildNodes,Descendants}() #215

Closed
wants to merge 17 commits into from
Closed
7 changes: 1 addition & 6 deletions html/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,11 @@ example, to process each anchor node in depth-first order:
if err != nil {
// ...
}
var f func(*html.Node)
f = func(n *html.Node) {
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.Data == "a" {
// Do something with n...
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)

The relevant specifications include:
https://html.spec.whatwg.org/multipage/syntax.html and
Expand Down
13 changes: 6 additions & 7 deletions html/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.23

// This example demonstrates parsing HTML data and walking the resulting tree.
package html_test

Expand All @@ -11,6 +13,7 @@ import (
"strings"

"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

func ExampleParse() {
Expand All @@ -19,21 +22,17 @@ func ExampleParse() {
if err != nil {
log.Fatal(err)
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.DataAtom == atom.A {
for _, a := range n.Attr {
if a.Key == "href" {
fmt.Println(a.Val)
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)

// Output:
// foo
// /bar/baz
Expand Down
56 changes: 56 additions & 0 deletions html/iter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.23

package html

import "iter"

// Ancestors returns an iterator over the ancestors of n, starting with n.Parent.
//
// Mutating a Node or its parents while iterating may have unexpected results.
func (n *Node) Ancestors() iter.Seq[*Node] {
_ = n.Parent // eager nil check

return func(yield func(*Node) bool) {
for p := n.Parent; p != nil && yield(p); p = p.Parent {
}
}
}

// ChildNodes returns an iterator over the immediate children of n,
// starting with n.FirstChild.
//
// Mutating a Node or its children while iterating may have unexpected results.
func (n *Node) ChildNodes() iter.Seq[*Node] {
_ = n.FirstChild // eager nil check

return func(yield func(*Node) bool) {
for c := n.FirstChild; c != nil && yield(c); c = c.NextSibling {
}
}

}

// Descendants returns an iterator over all nodes recursively beneath
// n, excluding n itself. Nodes are visited in depth-first preorder.
//
// Mutating a Node or its descendants while iterating may have unexpected results.
func (n *Node) Descendants() iter.Seq[*Node] {
_ = n.FirstChild // eager nil check

return func(yield func(*Node) bool) {
n.descendants(yield)
}
}

func (n *Node) descendants(yield func(*Node) bool) bool {
for c := range n.ChildNodes() {
if !yield(c) || !c.descendants(yield) {
return false
}
}
return true
}
96 changes: 96 additions & 0 deletions html/iter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.23

package html

import (
"strings"
"testing"
)

func TestNode_ChildNodes(t *testing.T) {
tests := []struct {
in string
want string
}{
{"", ""},
{"<a></a>", "a"},
{"a", "a"},
{"<a></a><!--b-->", "a b"},
{"a<b></b>c", "a b c"},
{"a<b><!--c--></b>d", "a b d"},
{"<a><b>c<!--d-->e</b></a>f<!--g--><h>i</h>", "a f g h"},
}
for _, test := range tests {
doc, err := Parse(strings.NewReader(test.in))
if err != nil {
t.Fatal(err)
}
// Drill to <html><head></head><body>
n := doc.FirstChild.FirstChild.NextSibling
var results []string
for c := range n.ChildNodes() {
results = append(results, c.Data)
}
if got := strings.Join(results, " "); got != test.want {
t.Errorf("ChildNodes = %q, want %q", got, test.want)
}
}
}

func TestNode_Descendants(t *testing.T) {
tests := []struct {
in string
want string
}{
{"", ""},
{"<a></a>", "a"},
{"<a><b></b></a>", "a b"},
{"<a>b</a>", "a b"},
{"<a><!--b--></a>", "a b"},
{"<a>b<c></c>d</a>", "a b c d"},
{"<a>b<c><!--d--></c>e</a>", "a b c d e"},
{"<a><b><c>d<!--e-->f</c></b>g<!--h--><i>j</i></a>", "a b c d e f g h i j"},
}
for _, test := range tests {
doc, err := Parse(strings.NewReader(test.in))
if err != nil {
t.Fatal(err)
}
// Drill to <html><head></head><body>
n := doc.FirstChild.FirstChild.NextSibling
var results []string
for c := range n.Descendants() {
results = append(results, c.Data)
}
if got := strings.Join(results, " "); got != test.want {
t.Errorf("Descendants = %q; want: %q", got, test.want)
}
}
}

func TestNode_Ancestors(t *testing.T) {
for _, size := range []int{0, 1, 2, 10, 100, 10_000} {
n := buildChain(size)
nParents := 0
for _ = range n.Ancestors() {
nParents++
}
if nParents != size {
t.Errorf("number of Ancestors = %d; want: %d", nParents, size)
}
}
}

func buildChain(size int) *Node {
child := new(Node)
for range size {
parent := child
child = new(Node)
parent.AppendChild(child)
}
return child
}
4 changes: 4 additions & 0 deletions html/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ var scopeMarker = Node{Type: scopeMarkerNode}
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
//
// Node trees may be navigated using the link fields (Parent,
// FirstChild, and so on) or a range loop over iterators such as
// [Node.Descendants].
//
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
// "svg" is short for "http://www.w3.org/2000/svg".
Expand Down