kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/html/html.go (about) 1 /* 2 * Copyright 2014 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Package html is a set of utilities for manipulating html Nodes. 18 package html // import "kythe.io/kythe/go/util/html" 19 20 import ( 21 "bytes" 22 "fmt" 23 24 "kythe.io/kythe/go/util/log" 25 26 "golang.org/x/net/html" 27 ) 28 29 // Decoration is a template for an HTML node that should span some textual 30 // offsets. 31 type Decoration struct { 32 // 0-based character offsets for the span of the decoration. 33 Start, End int 34 35 // Template to use as the HTML decoration Node. 36 Node *html.Node 37 } 38 39 // Decorate will apply the given slice of Decorations to the HTML tree starting 40 // at root. 41 func Decorate(root *html.Node, decor []Decoration) { 42 offsets := textualOffsets(root) 43 44 for _, d := range decor { 45 nodes := nodeRange(root, offsets, d.Start, d.End) 46 47 decorNode := copyNode(d.Node) 48 parent := nodes[0].Parent 49 parent.InsertBefore(decorNode, nodes[0]) 50 for _, n := range nodes { 51 parent.RemoveChild(n) 52 decorNode.AppendChild(n) 53 } 54 offsets.update(decorNode, d.Start) 55 } 56 } 57 58 // nodeOffsets is a structure storing the 0-based character offsets at the 59 // beginning and ending of html.Nodes. 60 type nodeOffsets struct { 61 starts map[*html.Node]int 62 ends map[*html.Node]int 63 } 64 65 // Bounds returns the starting and ending offset of n. 66 func (o *nodeOffsets) Bounds(n *html.Node) (int, int) { 67 return o.starts[n], o.ends[n] 68 } 69 70 // textualOffsets returns nodeOffsets for every html.Node in the tree starting 71 // at root. 72 func textualOffsets(root *html.Node) *nodeOffsets { 73 offsets := &nodeOffsets{make(map[*html.Node]int), make(map[*html.Node]int)} 74 offsets.update(root, 0) 75 return offsets 76 } 77 78 // update updates the nodeOffsets for every html.Node starting at root, assuming 79 // offset is the starting offset for the root html.Node. 80 func (o *nodeOffsets) update(root *html.Node, offset int) int { 81 o.starts[root] = offset 82 83 if root.Type == html.TextNode { 84 offset += len(root.Data) 85 } else { 86 for n := root.FirstChild; n != nil; n = n.NextSibling { 87 offset = o.update(n, offset) 88 } 89 } 90 91 o.ends[root] = offset 92 return offset 93 } 94 95 // nodeRange returns an ordered slice of sibling html.Nodes that span exactly 96 // the range between the given start and end offsets. If necessary, nodeRange 97 // will split html.Nodes so that the returned range is precise. 98 func nodeRange(root *html.Node, offsets *nodeOffsets, start, end int) []*html.Node { 99 if rootStart, rootEnd := offsets.Bounds(root); rootStart > start || rootEnd < end { 100 log.Fatalf("nodeRange: root %d%q Node (%d → %d) does not contain range %d → %d", 101 root.Type, root.Data, rootStart, rootEnd, start, end) 102 } 103 104 var nodes []*html.Node 105 for n := root.FirstChild; n != nil; n = n.NextSibling { 106 nStart, nEnd := offsets.Bounds(n) 107 if nStart < end && nEnd >= start { 108 nodes = append(nodes, n) 109 } else if nStart > end { 110 break 111 } 112 } 113 114 if len(nodes) == 1 && nodes[0] != root { 115 return nodeRange(nodes[0], offsets, start, end) 116 } else if len(nodes) == 0 { 117 nodes = []*html.Node{root} 118 } 119 120 // Slice end nodes, if necessary 121 if rangeStart, _ := offsets.Bounds(nodes[0]); start != rangeStart { 122 _, m := sliceNode(offsets, nodes[0], start) 123 nodes = append([]*html.Node{m}, nodes[1:]...) 124 } 125 if rangeEnd, _ := offsets.Bounds(nodes[len(nodes)-1]); end != rangeEnd { 126 n, _ := sliceNode(offsets, nodes[len(nodes)-1], end) 127 nodes = append(nodes[:len(nodes)-1], n) 128 } 129 130 return nodes 131 } 132 133 // sliceNode returns the two halves of the HTML tree starting at node after 134 // splitting it at the given textual offset. 135 func sliceNode(offsets *nodeOffsets, node *html.Node, offset int) (*html.Node, *html.Node) { 136 origStart, origEnd := offsets.Bounds(node) 137 if origStart > offset || origEnd < offset { 138 log.Fatalf("sliceNode: offset %d out of node's span (%d → %d)", offset, origStart, origEnd) 139 } 140 141 n, m := copyNode(node), copyNode(node) 142 parent := node.Parent 143 if parent != nil { 144 parent.InsertBefore(n, node) 145 parent.InsertBefore(m, node) 146 parent.RemoveChild(node) 147 } 148 149 switch node.Type { 150 default: 151 log.Fatalf("Unhandled node kind: %d", node.Type) 152 case html.ElementNode: 153 child := node.FirstChild 154 for child != nil { 155 next := child.NextSibling 156 157 if _, end := offsets.Bounds(child); end <= offset { 158 node.RemoveChild(child) 159 n.AppendChild(child) 160 } else if start, _ := offsets.Bounds(child); start > offset { 161 node.RemoveChild(child) 162 m.AppendChild(child) 163 } else { 164 left, right := sliceNode(offsets, child, offset) 165 node.RemoveChild(left) 166 node.RemoveChild(right) 167 n.AppendChild(left) 168 m.AppendChild(right) 169 } 170 171 child = next 172 } 173 case html.TextNode: 174 mark := offset - origStart 175 n.Data = node.Data[:mark] 176 m.Data = node.Data[mark:] 177 } 178 179 if split := offsets.update(n, origStart); split != offset { 180 log.Fatalf("split %d ≠ %d", split, offset) 181 } 182 if newEnd := offsets.update(m, offset); newEnd != origEnd { 183 log.Fatalf("end %d ≠ %d", newEnd, origEnd) 184 } 185 186 return n, m 187 } 188 189 // PlainText returns the concatenation of the textual contents for the given 190 // html Nodes. 191 func PlainText(nodes ...*html.Node) string { 192 var text bytes.Buffer 193 for _, n := range nodes { 194 switch n.Type { 195 default: 196 log.Fatalf("PlainText: unhandled node kind: %d", n.Type) 197 case html.DocumentNode, html.ElementNode: 198 for child := n.FirstChild; child != nil; child = child.NextSibling { 199 fmt.Fprint(&text, PlainText(child)) 200 } 201 case html.TextNode: 202 fmt.Fprintf(&text, n.Data) 203 } 204 } 205 return text.String() 206 } 207 208 // copyNode returns a shallow copy of n excluding sibling/parent/child pointers. 209 func copyNode(n *html.Node) *html.Node { 210 return &html.Node{ 211 Type: n.Type, 212 Data: n.Data, 213 DataAtom: n.DataAtom, 214 Namespace: n.Namespace, 215 Attr: n.Attr, 216 } 217 } 218 219 // Zip returns the Node at the end of the specified path where path contains 220 // only the following characters: 221 // 222 // 'u' Parent 223 // 'f' FirstChild 224 // 'l' LastChild 225 // 'n' NextSibling 226 // 'p' PrevSibling 227 func Zip(n *html.Node, path string) (*html.Node, error) { 228 for i, step := range path { 229 if n == nil { 230 return nil, fmt.Errorf("ran into nil Node after %d steps: %q", i, path) 231 } 232 switch step { 233 case 'f': 234 n = n.FirstChild 235 case 'l': 236 n = n.LastChild 237 case 'n': 238 n = n.NextSibling 239 case 'p': 240 n = n.PrevSibling 241 case 'u': 242 n = n.Parent 243 default: 244 return nil, fmt.Errorf("invalid zip path (%q) rune: %q", path, step) 245 } 246 } 247 return n, nil 248 } 249 250 // MustZip delegates to Zip and panics on any error. 251 func MustZip(n *html.Node, path string) *html.Node { 252 res, err := Zip(n, path) 253 if err != nil { 254 panic(err) 255 } 256 return res 257 }