Skip to content

Commit 511cc3a

Browse files
earthboundkid authored and gopherbot committed
html: add Node.{Ancestors,ChildNodes,Descendants}()
Adds iterators for the parents, immediate children, and all children of a Node respectively.

Fixes golang/go#62113

Change-Id: Iab015872cc3a20fe5e7cae3bc90b89cba68cc3f8
GitHub-Last-Rev: d99de58
GitHub-Pull-Request: #215
Reviewed-on: https://go-review.googlesource.com/c/net/+/594195
Reviewed-by: Ian Lance Taylor <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Auto-Submit: Ian Lance Taylor <[email protected]>
Reviewed-by: Damien Neil <[email protected]>
1 parent 4783315 commit 511cc3a

File tree

5 files changed

+163
-13
lines changed

5 files changed

+163
-13
lines changed

html/doc.go

+1-6
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,11 @@ example, to process each anchor node in depth-first order:
7878
if err != nil {
7979
// ...
8080
}
81-
var f func(*html.Node)
82-
f = func(n *html.Node) {
81+
for n := range doc.Descendants() {
8382
if n.Type == html.ElementNode && n.Data == "a" {
8483
// Do something with n...
8584
}
86-
for c := n.FirstChild; c != nil; c = c.NextSibling {
87-
f(c)
88-
}
8985
}
90-
f(doc)
9186
9287
The relevant specifications include:
9388
https://html.spec.whatwg.org/multipage/syntax.html and

html/example_test.go

+6-7
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
//go:build go1.23
6+
57
// This example demonstrates parsing HTML data and walking the resulting tree.
68
package html_test
79

@@ -11,6 +13,7 @@ import (
1113
"strings"
1214

1315
"golang.org/x/net/html"
16+
"golang.org/x/net/html/atom"
1417
)
1518

1619
func ExampleParse() {
@@ -19,21 +22,17 @@ func ExampleParse() {
1922
if err != nil {
2023
log.Fatal(err)
2124
}
22-
var f func(*html.Node)
23-
f = func(n *html.Node) {
24-
if n.Type == html.ElementNode && n.Data == "a" {
25+
for n := range doc.Descendants() {
26+
if n.Type == html.ElementNode && n.DataAtom == atom.A {
2527
for _, a := range n.Attr {
2628
if a.Key == "href" {
2729
fmt.Println(a.Val)
2830
break
2931
}
3032
}
3133
}
32-
for c := n.FirstChild; c != nil; c = c.NextSibling {
33-
f(c)
34-
}
3534
}
36-
f(doc)
35+
3736
// Output:
3837
// foo
3938
// /bar/baz

html/iter.go

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build go1.23
6+
7+
package html
8+
9+
import "iter"
10+
11+
// Ancestors returns an iterator over the ancestors of n, starting with n.Parent and following Parent links upward until a nil Parent is reached; n itself is not yielded.
12+
//
13+
// Mutating a Node or its parents while iterating may have unexpected results.
14+
func (n *Node) Ancestors() iter.Seq[*Node] {
15+
_ = n.Parent // eager nil check: a nil n panics here, when Ancestors is called, rather than when the returned iterator is first used
16+
17+
return func(yield func(*Node) bool) {
18+
for p := n.Parent; p != nil && yield(p); p = p.Parent { // stop when Parent is nil or as soon as yield returns false
19+
}
20+
}
21+
}
22+
23+
// ChildNodes returns an iterator over the immediate children of n,
24+
// starting with n.FirstChild and following NextSibling links; deeper descendants are not visited.
25+
//
26+
// Mutating a Node or its children while iterating may have unexpected results.
27+
func (n *Node) ChildNodes() iter.Seq[*Node] {
28+
_ = n.FirstChild // eager nil check: a nil n panics here, when ChildNodes is called, rather than when the returned iterator is first used
29+
30+
return func(yield func(*Node) bool) {
31+
for c := n.FirstChild; c != nil && yield(c); c = c.NextSibling { // stop after the last sibling or as soon as yield returns false
32+
}
33+
}
34+
35+
}
36+
37+
// Descendants returns an iterator over all nodes recursively beneath
38+
// n, excluding n itself. Nodes are visited in depth-first preorder: each node is yielded before its own children.
39+
//
40+
// Mutating a Node or its descendants while iterating may have unexpected results.
41+
func (n *Node) Descendants() iter.Seq[*Node] {
42+
_ = n.FirstChild // eager nil check: a nil n panics here, when Descendants is called, rather than when the returned iterator is first used
43+
44+
return func(yield func(*Node) bool) {
45+
n.descendants(yield) // the bool result only steers the recursion; it is deliberately discarded at the top level
46+
}
47+
}
48+
49+
func (n *Node) descendants(yield func(*Node) bool) bool { // recursive helper for Descendants; reports false once yield has asked to stop
50+
for c := range n.ChildNodes() {
51+
if !yield(c) || !c.descendants(yield) { // yield c before walking its subtree (preorder); a false from either ends the walk
52+
return false
53+
}
54+
}
55+
return true // every node beneath n was yielded without a stop request
56+
}

html/iter_test.go

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build go1.23
6+
7+
package html
8+
9+
import (
10+
"strings"
11+
"testing"
12+
)
13+
14+
func TestNode_ChildNodes(t *testing.T) { // checks the Data of the nodes ChildNodes yields for the node reached below, in iteration order
15+
tests := []struct {
16+
in string // HTML text handed to Parse
17+
want string // expected Data values of the yielded nodes, joined with single spaces
18+
}{
19+
{"", ""},
20+
{"<a></a>", "a"},
21+
{"a", "a"},
22+
{"<a></a><!--b-->", "a b"},
23+
{"a<b></b>c", "a b c"},
24+
{"a<b><!--c--></b>d", "a b d"},
25+
{"<a><b>c<!--d-->e</b></a>f<!--g--><h>i</h>", "a f g h"},
26+
}
27+
for _, test := range tests {
28+
doc, err := Parse(strings.NewReader(test.in))
29+
if err != nil {
30+
t.Fatal(err)
31+
}
32+
// Drill down to <body>: doc.FirstChild is <html>, its FirstChild is <head>, and <head>'s NextSibling is <body>.
33+
n := doc.FirstChild.FirstChild.NextSibling
34+
var results []string
35+
for c := range n.ChildNodes() {
36+
results = append(results, c.Data)
37+
}
38+
if got := strings.Join(results, " "); got != test.want {
39+
t.Errorf("ChildNodes = %q, want %q", got, test.want)
40+
}
41+
}
42+
}
43+
44+
func TestNode_Descendants(t *testing.T) { // checks the Data of the nodes Descendants yields for the node reached below, in iteration order
45+
tests := []struct {
46+
in string // HTML text handed to Parse
47+
want string // expected Data values of the yielded nodes, joined with single spaces
48+
}{
49+
{"", ""},
50+
{"<a></a>", "a"},
51+
{"<a><b></b></a>", "a b"},
52+
{"<a>b</a>", "a b"},
53+
{"<a><!--b--></a>", "a b"},
54+
{"<a>b<c></c>d</a>", "a b c d"},
55+
{"<a>b<c><!--d--></c>e</a>", "a b c d e"},
56+
{"<a><b><c>d<!--e-->f</c></b>g<!--h--><i>j</i></a>", "a b c d e f g h i j"},
57+
}
58+
for _, test := range tests {
59+
doc, err := Parse(strings.NewReader(test.in))
60+
if err != nil {
61+
t.Fatal(err)
62+
}
63+
// Drill down to <body>: doc.FirstChild is <html>, its FirstChild is <head>, and <head>'s NextSibling is <body>.
64+
n := doc.FirstChild.FirstChild.NextSibling
65+
var results []string
66+
for c := range n.Descendants() {
67+
results = append(results, c.Data)
68+
}
69+
if got := strings.Join(results, " "); got != test.want {
70+
t.Errorf("Descendants = %q; want: %q", got, test.want)
71+
}
72+
}
73+
}
74+
75+
func TestNode_Ancestors(t *testing.T) { // checks that Ancestors yields exactly one node per level above the node returned by buildChain
76+
for _, size := range []int{0, 1, 2, 10, 100, 10_000} {
77+
n := buildChain(size)
78+
nParents := 0
79+
for range n.Ancestors() { // only the count matters, so no iteration variable is declared
80+
nParents++
81+
}
82+
if nParents != size {
83+
t.Errorf("number of Ancestors = %d; want: %d", nParents, size)
84+
}
85+
}
86+
}
87+
88+
func buildChain(size int) *Node { // builds a single-child chain of size+1 nodes and returns the last node appended
89+
child := new(Node)
90+
for range size {
91+
parent := child
92+
child = new(Node)
93+
parent.AppendChild(child) // presumably sets child.Parent = parent; TestNode_Ancestors relies on that
94+
}
95+
return child // the deepest node, with size nodes above it
96+
}

html/node.go

+4
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ var scopeMarker = Node{Type: scopeMarkerNode}
3838
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
3939
// is the atom for Data, or zero if Data is not a known tag name.
4040
//
41+
// Node trees may be navigated using the link fields (Parent,
42+
// FirstChild, and so on) or a range loop over iterators such as
43+
// [Node.Descendants].
44+
//
4145
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
4246
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
4347
// "svg" is short for "http://www.w3.org/2000/svg".

0 commit comments

Comments
 (0)