github.com/Kindred87/Obsidian@v0.0.0-20210809203756-86936424b848/retrieval/html/search.go (about) 1 package html 2 3 import ( 4 "errors" 5 "os" 6 "strconv" 7 8 "github.com/Kindred87/Obsidian/datasource" 9 "github.com/PuerkitoBio/goquery" 10 "golang.org/x/net/html" 11 ) 12 13 /* 14 CollectionSiblings identifies arbitrary collections of HTML nodes from a datasource and 15 returns the contents of both the identified collections and their inferred siblings. 16 17 An arbitrary collection of nodes could be [b, a, d] from a table organized as [a, b, c, d]. 18 19 Furthermore, with three collections of nodes in tables sharing parent p... 20 21 p 22 |> [a1, b1, c1, d1] 23 |> [a2, b2, c2, c2] 24 |> [a3, b3, c3, d3] 25 26 ... and the two arbitrary collections are given... 27 [b1, a1, d1] 28 [b2, a2, d2] 29 30 ...the values returned will be: 31 [b1, a1, d1] 32 [b2, a2, d2] 33 [b3, a3, d3] 34 35 Had the third table not been the child of p, its nodes would not have been inferred as a sibling. 36 37 Each searchGroups subarray should contain a group of values expected to be within a shared structure, such as a table or table row. 38 39 Value order as given will be maintained, regardless of how the associated nodes are organized in the datasource. 40 */ 41 func CollectionSiblings(alias string, searchGroups [][]string) ([]NodeList, error) { 42 if len(searchGroups) < 2 { 43 return nil, errors.New("Only " + strconv.Itoa(len(searchGroups)) + 44 " search groups were provided") 45 } 46 47 result, path, err := datasource.Find(alias) 48 if err != nil { 49 return nil, err 50 } else if !result { 51 return nil, errors.New("Could not locate datasource by the alias of " + alias) 52 } 53 54 fi, err := os.Open(path) 55 if err != nil { 56 return nil, err 57 } 58 59 sel, err := findNodeGroups(fi, searchGroups) 60 if err != nil { 61 return nil, err 62 } 63 64 routes, err := nodeGroupRouting(sel) 65 if err != nil { 66 return nil, err 67 } 68 69 nList, err := getSiblingContents(routes.Grandparent.FirstChild, routes, []NodeList{}) 70 if err != nil { 71 return nil, err 72 } 73 return nList, nil 74 } 75 76 // findNodeGroups returns HTML nodes containing the given values. 77 // 78 // The backing nodes of the returned selections are not shared between calls. 79 func findNodeGroups(fi *os.File, toFind [][]string) (selections [][]goquery.Selection, err error) { 80 doc, err := goquery.NewDocumentFromReader(fi) 81 if err != nil { 82 return nil, err 83 } 84 85 // Reset the read position in case the file was previously read. 86 if _, err := fi.Seek(0, 0); err != nil { 87 return nil, err 88 } 89 90 for _, t := range toFind { 91 if result, err := selectionsContaining(doc, t); err != nil { 92 // Result is ignored in order to make it abundantly clear that 93 // the user's query could not be satisfied in full. 94 return nil, err 95 } else { 96 selections = append(selections, result) 97 } 98 } 99 100 return selections, nil 101 } 102 103 // selectionsContaining retrieves selections with data matching values within toFind. 104 // If a value within toFind is not found, an error is returned in addition to the selections 105 // for all values that could be found. 106 func selectionsContaining(doc *goquery.Document, toFind []string) ([]goquery.Selection, error) { 107 var selections []goquery.Selection 108 109 found := make(map[string]bool) 110 111 for _, val := range toFind { 112 doc.Find("*:contains('" + val + "')").Each(func(i int, s *goquery.Selection) { 113 if len(s.Nodes) > 0 && s.Nodes[0].Data == val && !found[val] { 114 selections = append(selections, *s) 115 found[val] = true 116 } 117 }) 118 119 } 120 121 for _, v := range toFind { 122 if !found[v] { 123 return selections, errors.New("Could not find value: " + v) 124 } 125 } 126 127 return selections, nil 128 } 129 130 // getSiblingContents selectively returns the contents of all sibling node collections within a layer. 131 // Content selection is determined by the given routeGroup. 132 // 133 // The initial call should provide the first node of the layer. This will usually be the 134 // first child of a grandparent node. 135 func getSiblingContents(curr *html.Node, rg routeGroup, nodes []NodeList) ([]NodeList, error) { 136 if curr == nil { 137 return nodes, nil 138 } 139 140 if found, class := classAttributeFrom(curr.Attr); found && class == rg.ParentClass { 141 children, err := listChildrenOf(curr, rg) 142 if err != nil { 143 return nodes, err 144 } 145 146 nodes = append(nodes, children) 147 148 } 149 return getSiblingContents(curr.NextSibling, rg, nodes) 150 } 151 152 // listChildrenOf returns children routed to from the given parent. 153 func listChildrenOf(parent *html.Node, rg routeGroup) (nList NodeList, err error) { 154 children, err := rg.childrenOf(parent) 155 if err != nil { 156 return nList, err 157 } 158 159 for i, child := range children { 160 if child == nil { 161 missingNode := Node{ 162 ID: i, 163 Data: "N/A", 164 } 165 nList.Nodes = append(nList.Nodes, missingNode) 166 } else { 167 nList.Nodes = append(nList.Nodes, nodeFromNode(i, child)) 168 } 169 } 170 171 return nList, nil 172 } 173 174 // findNodes identifies nodes containing any of the values within toFind. 175 func findNodes(fi *os.File, toFind []string) ([]goquery.Selection, error) { 176 doc, err := goquery.NewDocumentFromReader(fi) 177 if err != nil { 178 return nil, err 179 } 180 181 // Reset the read position in case the file was previously read. 182 if _, err := fi.Seek(0, 0); err != nil { 183 return nil, err 184 } 185 186 sel, err := selectionsContaining(doc, toFind) 187 if err != nil { 188 return nil, err 189 } 190 191 return sel, nil 192 }