kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/html/html.go (about)

     1  /*
     2   * Copyright 2014 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Package html is a set of utilities for manipulating html Nodes.
    18  package html // import "kythe.io/kythe/go/util/html"
    19  
    20  import (
    21  	"bytes"
    22  	"fmt"
    23  
    24  	"kythe.io/kythe/go/util/log"
    25  
    26  	"golang.org/x/net/html"
    27  )
    28  
    29  // Decoration is a template for an HTML node that should span some textual
    30  // offsets.
    31  type Decoration struct {
    32  	// 0-based character offsets for the span of the decoration.
    33  	Start, End int
    34  
    35  	// Template to use as the HTML decoration Node.
    36  	Node *html.Node
    37  }
    38  
    39  // Decorate will apply the given slice of Decorations to the HTML tree starting
    40  // at root.
    41  func Decorate(root *html.Node, decor []Decoration) {
    42  	offsets := textualOffsets(root)
    43  
    44  	for _, d := range decor {
    45  		nodes := nodeRange(root, offsets, d.Start, d.End)
    46  
    47  		decorNode := copyNode(d.Node)
    48  		parent := nodes[0].Parent
    49  		parent.InsertBefore(decorNode, nodes[0])
    50  		for _, n := range nodes {
    51  			parent.RemoveChild(n)
    52  			decorNode.AppendChild(n)
    53  		}
    54  		offsets.update(decorNode, d.Start)
    55  	}
    56  }
    57  
    58  // nodeOffsets is a structure storing the 0-based character offsets at the
    59  // beginning and ending of html.Nodes.
    60  type nodeOffsets struct {
    61  	starts map[*html.Node]int
    62  	ends   map[*html.Node]int
    63  }
    64  
    65  // Bounds returns the starting and ending offset of n.
    66  func (o *nodeOffsets) Bounds(n *html.Node) (int, int) {
    67  	return o.starts[n], o.ends[n]
    68  }
    69  
    70  // textualOffsets returns nodeOffsets for every html.Node in the tree starting
    71  // at root.
    72  func textualOffsets(root *html.Node) *nodeOffsets {
    73  	offsets := &nodeOffsets{make(map[*html.Node]int), make(map[*html.Node]int)}
    74  	offsets.update(root, 0)
    75  	return offsets
    76  }
    77  
    78  // update updates the nodeOffsets for every html.Node starting at root, assuming
    79  // offset is the starting offset for the root html.Node.
    80  func (o *nodeOffsets) update(root *html.Node, offset int) int {
    81  	o.starts[root] = offset
    82  
    83  	if root.Type == html.TextNode {
    84  		offset += len(root.Data)
    85  	} else {
    86  		for n := root.FirstChild; n != nil; n = n.NextSibling {
    87  			offset = o.update(n, offset)
    88  		}
    89  	}
    90  
    91  	o.ends[root] = offset
    92  	return offset
    93  }
    94  
    95  // nodeRange returns an ordered slice of sibling html.Nodes that span exactly
    96  // the range between the given start and end offsets. If necessary, nodeRange
    97  // will split html.Nodes so that the returned range is precise.
    98  func nodeRange(root *html.Node, offsets *nodeOffsets, start, end int) []*html.Node {
    99  	if rootStart, rootEnd := offsets.Bounds(root); rootStart > start || rootEnd < end {
   100  		log.Fatalf("nodeRange: root %d%q Node (%d → %d) does not contain range %d → %d",
   101  			root.Type, root.Data, rootStart, rootEnd, start, end)
   102  	}
   103  
   104  	var nodes []*html.Node
   105  	for n := root.FirstChild; n != nil; n = n.NextSibling {
   106  		nStart, nEnd := offsets.Bounds(n)
   107  		if nStart < end && nEnd >= start {
   108  			nodes = append(nodes, n)
   109  		} else if nStart > end {
   110  			break
   111  		}
   112  	}
   113  
   114  	if len(nodes) == 1 && nodes[0] != root {
   115  		return nodeRange(nodes[0], offsets, start, end)
   116  	} else if len(nodes) == 0 {
   117  		nodes = []*html.Node{root}
   118  	}
   119  
   120  	// Slice end nodes, if necessary
   121  	if rangeStart, _ := offsets.Bounds(nodes[0]); start != rangeStart {
   122  		_, m := sliceNode(offsets, nodes[0], start)
   123  		nodes = append([]*html.Node{m}, nodes[1:]...)
   124  	}
   125  	if rangeEnd, _ := offsets.Bounds(nodes[len(nodes)-1]); end != rangeEnd {
   126  		n, _ := sliceNode(offsets, nodes[len(nodes)-1], end)
   127  		nodes = append(nodes[:len(nodes)-1], n)
   128  	}
   129  
   130  	return nodes
   131  }
   132  
   133  // sliceNode returns the two halves of the HTML tree starting at node after
   134  // splitting it at the given textual offset.
   135  func sliceNode(offsets *nodeOffsets, node *html.Node, offset int) (*html.Node, *html.Node) {
   136  	origStart, origEnd := offsets.Bounds(node)
   137  	if origStart > offset || origEnd < offset {
   138  		log.Fatalf("sliceNode: offset %d out of node's span (%d → %d)", offset, origStart, origEnd)
   139  	}
   140  
   141  	n, m := copyNode(node), copyNode(node)
   142  	parent := node.Parent
   143  	if parent != nil {
   144  		parent.InsertBefore(n, node)
   145  		parent.InsertBefore(m, node)
   146  		parent.RemoveChild(node)
   147  	}
   148  
   149  	switch node.Type {
   150  	default:
   151  		log.Fatalf("Unhandled node kind: %d", node.Type)
   152  	case html.ElementNode:
   153  		child := node.FirstChild
   154  		for child != nil {
   155  			next := child.NextSibling
   156  
   157  			if _, end := offsets.Bounds(child); end <= offset {
   158  				node.RemoveChild(child)
   159  				n.AppendChild(child)
   160  			} else if start, _ := offsets.Bounds(child); start > offset {
   161  				node.RemoveChild(child)
   162  				m.AppendChild(child)
   163  			} else {
   164  				left, right := sliceNode(offsets, child, offset)
   165  				node.RemoveChild(left)
   166  				node.RemoveChild(right)
   167  				n.AppendChild(left)
   168  				m.AppendChild(right)
   169  			}
   170  
   171  			child = next
   172  		}
   173  	case html.TextNode:
   174  		mark := offset - origStart
   175  		n.Data = node.Data[:mark]
   176  		m.Data = node.Data[mark:]
   177  	}
   178  
   179  	if split := offsets.update(n, origStart); split != offset {
   180  		log.Fatalf("split %d ≠ %d", split, offset)
   181  	}
   182  	if newEnd := offsets.update(m, offset); newEnd != origEnd {
   183  		log.Fatalf("end %d ≠ %d", newEnd, origEnd)
   184  	}
   185  
   186  	return n, m
   187  }
   188  
   189  // PlainText returns the concatenation of the textual contents for the given
   190  // html Nodes.
   191  func PlainText(nodes ...*html.Node) string {
   192  	var text bytes.Buffer
   193  	for _, n := range nodes {
   194  		switch n.Type {
   195  		default:
   196  			log.Fatalf("PlainText: unhandled node kind: %d", n.Type)
   197  		case html.DocumentNode, html.ElementNode:
   198  			for child := n.FirstChild; child != nil; child = child.NextSibling {
   199  				fmt.Fprint(&text, PlainText(child))
   200  			}
   201  		case html.TextNode:
   202  			fmt.Fprintf(&text, n.Data)
   203  		}
   204  	}
   205  	return text.String()
   206  }
   207  
   208  // copyNode returns a shallow copy of n excluding sibling/parent/child pointers.
   209  func copyNode(n *html.Node) *html.Node {
   210  	return &html.Node{
   211  		Type:      n.Type,
   212  		Data:      n.Data,
   213  		DataAtom:  n.DataAtom,
   214  		Namespace: n.Namespace,
   215  		Attr:      n.Attr,
   216  	}
   217  }
   218  
   219  // Zip returns the Node at the end of the specified path where path contains
   220  // only the following characters:
   221  //
   222  //	'u' Parent
   223  //	'f' FirstChild
   224  //	'l' LastChild
   225  //	'n' NextSibling
   226  //	'p' PrevSibling
   227  func Zip(n *html.Node, path string) (*html.Node, error) {
   228  	for i, step := range path {
   229  		if n == nil {
   230  			return nil, fmt.Errorf("ran into nil Node after %d steps: %q", i, path)
   231  		}
   232  		switch step {
   233  		case 'f':
   234  			n = n.FirstChild
   235  		case 'l':
   236  			n = n.LastChild
   237  		case 'n':
   238  			n = n.NextSibling
   239  		case 'p':
   240  			n = n.PrevSibling
   241  		case 'u':
   242  			n = n.Parent
   243  		default:
   244  			return nil, fmt.Errorf("invalid zip path (%q) rune: %q", path, step)
   245  		}
   246  	}
   247  	return n, nil
   248  }
   249  
   250  // MustZip delegates to Zip and panics on any error.
   251  func MustZip(n *html.Node, path string) *html.Node {
   252  	res, err := Zip(n, path)
   253  	if err != nil {
   254  		panic(err)
   255  	}
   256  	return res
   257  }