github.com/Kindred87/Obsidian@v0.0.0-20210809203756-86936424b848/retrieval/html/routing.go (about) 1 package html 2 3 import ( 4 "errors" 5 "fmt" 6 7 "github.com/PuerkitoBio/goquery" 8 "golang.org/x/net/html" 9 ) 10 11 // Route represents a unidirectional series of steps connecting two nodes. 12 type Route struct { 13 Start *html.Node 14 End *html.Node 15 Steps []Step 16 } 17 18 // AddStep appends a Step to the Steps field. 19 func (r Route) AddStep(s Step) Route { 20 r.Steps = append(r.Steps, s) 21 return r 22 } 23 24 // AppendRoute appends the Steps field of a to the Step field of the caller. 25 func (r Route) AppendRoute(a Route) Route { 26 r.Steps = append(r.Steps, a.Steps...) 27 return r 28 } 29 30 // LastNodeInRoute returns the node resulting from navigating the caller's 31 // Steps from the caller's Start node. 32 func (r Route) LastNodeInRoute() (*html.Node, error) { 33 return lastNodeInRoute(r.Start, r.Steps) 34 } 35 36 // String returns a string representation of the Start, Steps, and End fields, 37 // respectively. 38 func (r Route) String() string { 39 out := r.Start.Data + " -> " 40 41 for _, s := range r.Steps { 42 switch s { 43 case Parent: 44 out += "Parent -> " 45 case NextSibling: 46 out += "NextSibling -> " 47 case FirstChild: 48 out += "FirstChild -> " 49 } 50 } 51 52 return out + r.End.Data 53 } 54 55 // Step represents a navigation option between two nodes. 56 type Step int 57 58 const ( 59 Parent Step = iota 60 NextSibling 61 FirstChild 62 ) 63 64 // routeGroup contains information necessary for routing between and within arbitrary groups 65 // of nodes. 66 type routeGroup struct { 67 First []Route 68 Second []Route 69 Grandparent *html.Node 70 ParentClass string 71 ParentToGroup []Step 72 } 73 74 // childrenOf returns the nodes routed to from the given parent according to the contents 75 // of the routeGroup's First field. The caller is expected to handle nil nodes since they 76 // indicate a missing field and are thus valid. 77 // 78 // The first node returned is the root of the group. 79 // 80 // An error is returned if the root node of the group is nil. 81 func (rg *routeGroup) childrenOf(parent *html.Node) ([]*html.Node, error) { 82 nStore, err := lastNodeInRoute(parent, rg.ParentToGroup) 83 if err != nil { 84 return nil, err 85 } else if nStore == nil { 86 return nil, fmt.Errorf("root of child group is nil for node (%v)", parent) 87 } 88 89 children := []*html.Node{nStore} 90 91 for i := 0; i < len(rg.First); i++ { 92 nStore, _ = lastNodeInRoute(children[0], rg.First[i].Steps) 93 children = append(children, nStore) 94 } 95 return children, nil 96 } 97 98 // lastNodeInRoute returns the node resulting from navigating from the given 99 // start node along the given steps. 100 func lastNodeInRoute(node *html.Node, steps []Step) (*html.Node, error) { 101 curr := node 102 for i, step := range steps { 103 if curr == nil { 104 return nil, fmt.Errorf("nil node found at step %d in route beginning with %v", i, node) 105 } 106 107 switch step { 108 case Parent: 109 curr = curr.Parent 110 case NextSibling: 111 curr = curr.NextSibling 112 case FirstChild: 113 curr = curr.FirstChild 114 } 115 } 116 return curr, nil 117 } 118 119 // nodeGroupRouting returns a collection of nodes with associations matching that of the 120 // two groups of nodes provided. 121 func nodeGroupRouting(sel [][]goquery.Selection) (routeGroup, error) { 122 var rGroup routeGroup 123 124 if r, err := infragroupRouting(sel[0]); err != nil { 125 return rGroup, err 126 } else { 127 rGroup.First = r 128 } 129 130 if r, err := infragroupRouting(sel[1]); err != nil { 131 return rGroup, err 132 } else { 133 rGroup.Second = r 134 } 135 136 if g, err := grandparentOf(rGroup); err != nil { 137 return rGroup, err 138 } else { 139 rGroup.Grandparent = g 140 } 141 142 if c, err := parentClassName(rGroup); err != nil { 143 return rGroup, err 144 } else { 145 rGroup.ParentClass = c 146 } 147 148 if s, err := parentToGroupSteps(rGroup); err != nil { 149 return rGroup, err 150 } else { 151 rGroup.ParentToGroup = s 152 } 153 154 return rGroup, nil 155 } 156 157 // infragroupRouting determines routes within a group between the first node and all others. 158 func infragroupRouting(sel []goquery.Selection) (routes []Route, err error) { 159 // All routes based off of first selection 160 for i := 1; i < len(sel); i++ { 161 r, err := findRoute(Route{ 162 Start: sel[0].Nodes[0], 163 End: sel[i].Nodes[0], 164 }) 165 if err != nil { 166 return routes, err 167 } 168 routes = append(routes, r) 169 } 170 171 return routes, err 172 } 173 174 // findRoute determines a route between a start and end node. 175 func findRoute(route Route) (Route, error) { 176 if found, node := finalNodeInLayer(route); found { 177 return node, nil 178 } else if n, _ := route.LastNodeInRoute(); n.Parent != nil { 179 return findRoute(route.AddStep(Parent)) 180 181 } else { 182 return route, errors.New("Could not find route between " + 183 route.Start.Data + " and " + route.End.Data) 184 } 185 } 186 187 // finalNodeInLayer searches for the route's final node in the layer containing the last node in 188 // the given route's steps, as well as its child layers. 189 func finalNodeInLayer(route Route) (bool, Route) { 190 // Progress is saved by adding steps to the route, with the last node in the 191 // saved steps being the entry point into the layer being checked. 192 for sibling, _ := route.LastNodeInRoute(); sibling != nil; sibling = sibling.NextSibling { 193 if sibling == route.End { 194 return true, route 195 } 196 197 if sibling.FirstChild != nil { 198 if found, route := finalNodeInLayer(route.AddStep(FirstChild)); found { 199 return true, route 200 } 201 } 202 203 route = route.AddStep(NextSibling) 204 } 205 206 return false, route 207 } 208 209 // grandparentOf searches for a common parent shared by the the given routeGroup's 210 // first and second routes. 211 func grandparentOf(routes routeGroup) (*html.Node, error) { 212 if len(routes.First) <= 0 || len(routes.Second) <= 0 { 213 return nil, errors.New("one or more given routes is empty") 214 } 215 216 first := routes.First[0].Start 217 second := routes.Second[0].Start 218 219 for { 220 if first == second { 221 break 222 } else if first == nil || second == nil { 223 return nil, errors.New("could not find a common grandparent") 224 } else { 225 first = first.Parent 226 second = second.Parent 227 } 228 } 229 230 return first, nil 231 } 232 233 // parentClassName identifies the class name of the parents of the given routeGroup's 234 // first and second routes. An error is returned should the class name differ between 235 // the two parents. 236 func parentClassName(routes routeGroup) (string, error) { 237 if len(routes.First) <= 0 || len(routes.Second) <= 0 { 238 return "", errors.New("one or more given routes is empty") 239 } 240 241 if routes.Grandparent == nil { 242 g, err := grandparentOf(routes) 243 if err != nil { 244 return "", fmt.Errorf("the grandparent is nil, %w", err) 245 } 246 routes.Grandparent = g 247 } 248 249 firstClass, err := parentChildClass(routes.First[0].Start, routes.Grandparent) 250 if err != nil { 251 return "", err 252 } 253 254 secondClass, err := parentChildClass(routes.Second[0].Start, routes.Grandparent) 255 if err != nil { 256 return "", err 257 } 258 259 if firstClass == secondClass { 260 return firstClass, nil 261 } else { 262 return "", fmt.Errorf("class names are mismatched: %s, %s", firstClass, secondClass) 263 } 264 } 265 266 // parentChildClass returns the class of the node serving as a's parent and b's child. 267 func parentChildClass(a, b *html.Node) (string, error) { 268 node, err := secondToLastNode(a, b) 269 if err != nil { 270 return "", err 271 } 272 273 if found, class := classAttributeFrom(node.Attr); found { 274 return class, nil 275 } 276 277 return "", fmt.Errorf("failed to identify class of parent node of %s", a.Data) 278 } 279 280 // secondToLastNode returns the second to last node between a and b. 281 func secondToLastNode(a, b *html.Node) (*html.Node, error) { 282 route := Route{ 283 Start: a, 284 End: b, 285 } 286 287 r, err := findRoute(route) 288 if err != nil { 289 return nil, err 290 } 291 292 // Remove grandparent step so that steps point to the grandparent's child. 293 r.Steps = r.Steps[:len(r.Steps)-1] 294 295 child, err := r.LastNodeInRoute() 296 if err != nil { 297 return nil, err 298 } 299 300 return child, nil 301 } 302 303 // parentToGroupSteps returns the navigation steps between the the first group's parent and 304 // the first group itself. 305 // 306 // It is expected that all groups will share the same hierarchical structure. 307 func parentToGroupSteps(routes routeGroup) ([]Step, error) { 308 if len(routes.First) <= 0 || len(routes.Second) <= 0 { 309 return nil, errors.New("one or more given routes is empty") 310 } 311 312 parent, err := secondToLastNode(routes.First[0].Start, routes.Grandparent) 313 if err != nil { 314 return nil, err 315 } 316 317 route := Route{ 318 Start: parent, 319 End: routes.First[0].Start, 320 } 321 322 r, err := findRoute(route) 323 if err != nil { 324 return nil, err 325 } 326 327 return r.Steps, nil 328 }