github.com/Kindred87/Obsidian@v0.0.0-20210809203756-86936424b848/retrieval/html/routing.go (about)

     1  package html
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  
     7  	"github.com/PuerkitoBio/goquery"
     8  	"golang.org/x/net/html"
     9  )
    10  
// Route represents a unidirectional series of steps connecting two nodes.
//
// Steps are navigated in order beginning at Start; a route produced by
// findRoute arrives at End once all of its steps have been taken.
type Route struct {
	Start *html.Node // node from which Steps are navigated
	End   *html.Node // node the route is meant to arrive at
	Steps []Step     // ordered navigation steps leading away from Start
}
    17  
    18  // AddStep appends a Step to the Steps field.
    19  func (r Route) AddStep(s Step) Route {
    20  	r.Steps = append(r.Steps, s)
    21  	return r
    22  }
    23  
    24  // AppendRoute appends the Steps field of a to the Step field of the caller.
    25  func (r Route) AppendRoute(a Route) Route {
    26  	r.Steps = append(r.Steps, a.Steps...)
    27  	return r
    28  }
    29  
// LastNodeInRoute returns the node resulting from navigating the caller's
// Steps from the caller's Start node.
//
// With no Steps the Start node itself is returned.  An error is returned when
// a nil node is reached while steps still remain to be taken; a nil node
// reached by the very last step is returned without an error.
func (r Route) LastNodeInRoute() (*html.Node, error) {
	return lastNodeInRoute(r.Start, r.Steps)
}
    35  
    36  // String returns a string representation of the Start, Steps, and End fields,
    37  // respectively.
    38  func (r Route) String() string {
    39  	out := r.Start.Data + " -> "
    40  
    41  	for _, s := range r.Steps {
    42  		switch s {
    43  		case Parent:
    44  			out += "Parent -> "
    45  		case NextSibling:
    46  			out += "NextSibling -> "
    47  		case FirstChild:
    48  			out += "FirstChild -> "
    49  		}
    50  	}
    51  
    52  	return out + r.End.Data
    53  }
    54  
// Step represents a navigation option between two nodes.
type Step int

// The navigation options understood when a route is walked.
const (
	// Parent moves from the current node to its Parent.  It is the zero
	// value of Step.
	Parent Step = iota
	// NextSibling moves from the current node to its NextSibling.
	NextSibling
	// FirstChild moves from the current node to its FirstChild.
	FirstChild
)
    63  
// routeGroup contains information necessary for routing between and within arbitrary groups
// of nodes.
//
// nodeGroupRouting populates the fields in declaration order; the later fields are derived
// from the earlier ones.
type routeGroup struct {
	First         []Route    // routes from the first group's first node to each of its other nodes
	Second        []Route    // routes from the second group's first node to each of its other nodes
	Grandparent   *html.Node // node where the lockstep climbs from both groups' first nodes meet
	ParentClass   string     // class of the node just below Grandparent on each group's path to it
	ParentToGroup []Step     // steps from the node just below Grandparent to the first group's first node
}
    73  
    74  // childrenOf returns the nodes routed to from the given parent according to the contents
    75  // of the routeGroup's First field.  The caller is expected to handle nil nodes since they
    76  // indicate a missing field and are thus valid.
    77  //
    78  // The first node returned is the root of the group.
    79  //
    80  // An error is returned if the root node of the group is nil.
    81  func (rg *routeGroup) childrenOf(parent *html.Node) ([]*html.Node, error) {
    82  	nStore, err := lastNodeInRoute(parent, rg.ParentToGroup)
    83  	if err != nil {
    84  		return nil, err
    85  	} else if nStore == nil {
    86  		return nil, fmt.Errorf("root of child group is nil for node (%v)", parent)
    87  	}
    88  
    89  	children := []*html.Node{nStore}
    90  
    91  	for i := 0; i < len(rg.First); i++ {
    92  		nStore, _ = lastNodeInRoute(children[0], rg.First[i].Steps)
    93  		children = append(children, nStore)
    94  	}
    95  	return children, nil
    96  }
    97  
    98  // lastNodeInRoute returns the node resulting from navigating from the given
    99  // start node along the given steps.
   100  func lastNodeInRoute(node *html.Node, steps []Step) (*html.Node, error) {
   101  	curr := node
   102  	for i, step := range steps {
   103  		if curr == nil {
   104  			return nil, fmt.Errorf("nil node found at step %d in route beginning with %v", i, node)
   105  		}
   106  
   107  		switch step {
   108  		case Parent:
   109  			curr = curr.Parent
   110  		case NextSibling:
   111  			curr = curr.NextSibling
   112  		case FirstChild:
   113  			curr = curr.FirstChild
   114  		}
   115  	}
   116  	return curr, nil
   117  }
   118  
   119  // nodeGroupRouting returns a collection of nodes with associations matching that of the
   120  // two groups of nodes provided.
   121  func nodeGroupRouting(sel [][]goquery.Selection) (routeGroup, error) {
   122  	var rGroup routeGroup
   123  
   124  	if r, err := infragroupRouting(sel[0]); err != nil {
   125  		return rGroup, err
   126  	} else {
   127  		rGroup.First = r
   128  	}
   129  
   130  	if r, err := infragroupRouting(sel[1]); err != nil {
   131  		return rGroup, err
   132  	} else {
   133  		rGroup.Second = r
   134  	}
   135  
   136  	if g, err := grandparentOf(rGroup); err != nil {
   137  		return rGroup, err
   138  	} else {
   139  		rGroup.Grandparent = g
   140  	}
   141  
   142  	if c, err := parentClassName(rGroup); err != nil {
   143  		return rGroup, err
   144  	} else {
   145  		rGroup.ParentClass = c
   146  	}
   147  
   148  	if s, err := parentToGroupSteps(rGroup); err != nil {
   149  		return rGroup, err
   150  	} else {
   151  		rGroup.ParentToGroup = s
   152  	}
   153  
   154  	return rGroup, nil
   155  }
   156  
   157  // infragroupRouting determines routes within a group between the first node and all others.
   158  func infragroupRouting(sel []goquery.Selection) (routes []Route, err error) {
   159  	// All routes based off of first selection
   160  	for i := 1; i < len(sel); i++ {
   161  		r, err := findRoute(Route{
   162  			Start: sel[0].Nodes[0],
   163  			End:   sel[i].Nodes[0],
   164  		})
   165  		if err != nil {
   166  			return routes, err
   167  		}
   168  		routes = append(routes, r)
   169  	}
   170  
   171  	return routes, err
   172  }
   173  
   174  // findRoute determines a route between a start and end node.
   175  func findRoute(route Route) (Route, error) {
   176  	if found, node := finalNodeInLayer(route); found {
   177  		return node, nil
   178  	} else if n, _ := route.LastNodeInRoute(); n.Parent != nil {
   179  		return findRoute(route.AddStep(Parent))
   180  
   181  	} else {
   182  		return route, errors.New("Could not find route between " +
   183  			route.Start.Data + " and " + route.End.Data)
   184  	}
   185  }
   186  
   187  // finalNodeInLayer searches for the route's final node in the layer containing the last node in
   188  // the given route's steps, as well as its child layers.
   189  func finalNodeInLayer(route Route) (bool, Route) {
   190  	// Progress is saved by adding steps to the route, with the last node in the
   191  	// saved steps being the entry point into the layer being checked.
   192  	for sibling, _ := route.LastNodeInRoute(); sibling != nil; sibling = sibling.NextSibling {
   193  		if sibling == route.End {
   194  			return true, route
   195  		}
   196  
   197  		if sibling.FirstChild != nil {
   198  			if found, route := finalNodeInLayer(route.AddStep(FirstChild)); found {
   199  				return true, route
   200  			}
   201  		}
   202  
   203  		route = route.AddStep(NextSibling)
   204  	}
   205  
   206  	return false, route
   207  }
   208  
   209  // grandparentOf searches for a common parent shared by the the given routeGroup's
   210  // first and second routes.
   211  func grandparentOf(routes routeGroup) (*html.Node, error) {
   212  	if len(routes.First) <= 0 || len(routes.Second) <= 0 {
   213  		return nil, errors.New("one or more given routes is empty")
   214  	}
   215  
   216  	first := routes.First[0].Start
   217  	second := routes.Second[0].Start
   218  
   219  	for {
   220  		if first == second {
   221  			break
   222  		} else if first == nil || second == nil {
   223  			return nil, errors.New("could not find a common grandparent")
   224  		} else {
   225  			first = first.Parent
   226  			second = second.Parent
   227  		}
   228  	}
   229  
   230  	return first, nil
   231  }
   232  
   233  // parentClassName identifies the class name of the parents of the given routeGroup's
   234  // first and second routes.  An error is returned should the class name differ between
   235  // the two parents.
   236  func parentClassName(routes routeGroup) (string, error) {
   237  	if len(routes.First) <= 0 || len(routes.Second) <= 0 {
   238  		return "", errors.New("one or more given routes is empty")
   239  	}
   240  
   241  	if routes.Grandparent == nil {
   242  		g, err := grandparentOf(routes)
   243  		if err != nil {
   244  			return "", fmt.Errorf("the grandparent is nil, %w", err)
   245  		}
   246  		routes.Grandparent = g
   247  	}
   248  
   249  	firstClass, err := parentChildClass(routes.First[0].Start, routes.Grandparent)
   250  	if err != nil {
   251  		return "", err
   252  	}
   253  
   254  	secondClass, err := parentChildClass(routes.Second[0].Start, routes.Grandparent)
   255  	if err != nil {
   256  		return "", err
   257  	}
   258  
   259  	if firstClass == secondClass {
   260  		return firstClass, nil
   261  	} else {
   262  		return "", fmt.Errorf("class names are mismatched: %s, %s", firstClass, secondClass)
   263  	}
   264  }
   265  
   266  // parentChildClass returns the class of the node serving as a's parent and b's child.
   267  func parentChildClass(a, b *html.Node) (string, error) {
   268  	node, err := secondToLastNode(a, b)
   269  	if err != nil {
   270  		return "", err
   271  	}
   272  
   273  	if found, class := classAttributeFrom(node.Attr); found {
   274  		return class, nil
   275  	}
   276  
   277  	return "", fmt.Errorf("failed to identify class of parent node of %s", a.Data)
   278  }
   279  
   280  // secondToLastNode returns the second to last node between a and b.
   281  func secondToLastNode(a, b *html.Node) (*html.Node, error) {
   282  	route := Route{
   283  		Start: a,
   284  		End:   b,
   285  	}
   286  
   287  	r, err := findRoute(route)
   288  	if err != nil {
   289  		return nil, err
   290  	}
   291  
   292  	// Remove grandparent step so that steps point to the grandparent's child.
   293  	r.Steps = r.Steps[:len(r.Steps)-1]
   294  
   295  	child, err := r.LastNodeInRoute()
   296  	if err != nil {
   297  		return nil, err
   298  	}
   299  
   300  	return child, nil
   301  }
   302  
   303  // parentToGroupSteps returns the navigation steps between the the first group's parent and
   304  // the first group itself.
   305  //
   306  // It is expected that all groups will share the same hierarchical structure.
   307  func parentToGroupSteps(routes routeGroup) ([]Step, error) {
   308  	if len(routes.First) <= 0 || len(routes.Second) <= 0 {
   309  		return nil, errors.New("one or more given routes is empty")
   310  	}
   311  
   312  	parent, err := secondToLastNode(routes.First[0].Start, routes.Grandparent)
   313  	if err != nil {
   314  		return nil, err
   315  	}
   316  
   317  	route := Route{
   318  		Start: parent,
   319  		End:   routes.First[0].Start,
   320  	}
   321  
   322  	r, err := findRoute(route)
   323  	if err != nil {
   324  		return nil, err
   325  	}
   326  
   327  	return r.Steps, nil
   328  }