github.com/Kindred87/Obsidian@v0.0.0-20210809203756-86936424b848/retrieval/html/search.go (about)

     1  package html
     2  
     3  import (
     4  	"errors"
     5  	"os"
     6  	"strconv"
     7  
     8  	"github.com/Kindred87/Obsidian/datasource"
     9  	"github.com/PuerkitoBio/goquery"
    10  	"golang.org/x/net/html"
    11  )
    12  
    13  /*
    14  CollectionSiblings identifies arbitrary collections of HTML nodes from a datasource and
    15  returns the contents of both the identified collections and their inferred siblings.
    16  
    17  An arbitrary collection of nodes could be [b, a, d] from a table organized as [a, b, c, d].
    18  
    19  Furthermore, with three collections of nodes in tables sharing parent p...
    20  
    21   p
    22   |> [a1, b1, c1, d1]
    23   |> [a2, b2, c2, c2]
    24   |> [a3, b3, c3, d3]
    25  
    26  ... and the two arbitrary collections are given...
    27   [b1, a1, d1]
    28   [b2, a2, d2]
    29  
    30  ...the values returned will be:
    31   [b1, a1, d1]
    32   [b2, a2, d2]
    33   [b3, a3, d3]
    34  
    35  Had the third table not been the child of p, its nodes would not have been inferred as a sibling.
    36  
    37  Each searchGroups subarray should contain a group of values expected to be within a shared structure, such as a table or table row.
    38  
    39  Value order as given will be maintained, regardless of how the associated nodes are organized in the datasource.
    40  */
    41  func CollectionSiblings(alias string, searchGroups [][]string) ([]NodeList, error) {
    42  	if len(searchGroups) < 2 {
    43  		return nil, errors.New("Only " + strconv.Itoa(len(searchGroups)) +
    44  			" search groups were provided")
    45  	}
    46  
    47  	result, path, err := datasource.Find(alias)
    48  	if err != nil {
    49  		return nil, err
    50  	} else if !result {
    51  		return nil, errors.New("Could not locate datasource by the alias of " + alias)
    52  	}
    53  
    54  	fi, err := os.Open(path)
    55  	if err != nil {
    56  		return nil, err
    57  	}
    58  
    59  	sel, err := findNodeGroups(fi, searchGroups)
    60  	if err != nil {
    61  		return nil, err
    62  	}
    63  
    64  	routes, err := nodeGroupRouting(sel)
    65  	if err != nil {
    66  		return nil, err
    67  	}
    68  
    69  	nList, err := getSiblingContents(routes.Grandparent.FirstChild, routes, []NodeList{})
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  	return nList, nil
    74  }
    75  
    76  // findNodeGroups returns HTML nodes containing the given values.
    77  //
    78  // The backing nodes of the returned selections are not shared between calls.
    79  func findNodeGroups(fi *os.File, toFind [][]string) (selections [][]goquery.Selection, err error) {
    80  	doc, err := goquery.NewDocumentFromReader(fi)
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  
    85  	// Reset the read position in case the file was previously read.
    86  	if _, err := fi.Seek(0, 0); err != nil {
    87  		return nil, err
    88  	}
    89  
    90  	for _, t := range toFind {
    91  		if result, err := selectionsContaining(doc, t); err != nil {
    92  			// Result is ignored in order to make it abundantly clear that
    93  			// the user's query could not be satisfied in full.
    94  			return nil, err
    95  		} else {
    96  			selections = append(selections, result)
    97  		}
    98  	}
    99  
   100  	return selections, nil
   101  }
   102  
   103  // selectionsContaining retrieves selections with data matching values within toFind.
   104  // If a value within toFind is not found, an error is returned in addition to the selections
   105  // for all values that could be found.
   106  func selectionsContaining(doc *goquery.Document, toFind []string) ([]goquery.Selection, error) {
   107  	var selections []goquery.Selection
   108  
   109  	found := make(map[string]bool)
   110  
   111  	for _, val := range toFind {
   112  		doc.Find("*:contains('" + val + "')").Each(func(i int, s *goquery.Selection) {
   113  			if len(s.Nodes) > 0 && s.Nodes[0].Data == val && !found[val] {
   114  				selections = append(selections, *s)
   115  				found[val] = true
   116  			}
   117  		})
   118  
   119  	}
   120  
   121  	for _, v := range toFind {
   122  		if !found[v] {
   123  			return selections, errors.New("Could not find value: " + v)
   124  		}
   125  	}
   126  
   127  	return selections, nil
   128  }
   129  
   130  // getSiblingContents selectively returns the contents of all sibling node collections within a layer.
   131  // Content selection is determined by the given routeGroup.
   132  //
   133  // The initial call should provide the first node of the layer.  This will usually be the
   134  // first child of a grandparent node.
   135  func getSiblingContents(curr *html.Node, rg routeGroup, nodes []NodeList) ([]NodeList, error) {
   136  	if curr == nil {
   137  		return nodes, nil
   138  	}
   139  
   140  	if found, class := classAttributeFrom(curr.Attr); found && class == rg.ParentClass {
   141  		children, err := listChildrenOf(curr, rg)
   142  		if err != nil {
   143  			return nodes, err
   144  		}
   145  
   146  		nodes = append(nodes, children)
   147  
   148  	}
   149  	return getSiblingContents(curr.NextSibling, rg, nodes)
   150  }
   151  
   152  // listChildrenOf returns children routed to from the given parent.
   153  func listChildrenOf(parent *html.Node, rg routeGroup) (nList NodeList, err error) {
   154  	children, err := rg.childrenOf(parent)
   155  	if err != nil {
   156  		return nList, err
   157  	}
   158  
   159  	for i, child := range children {
   160  		if child == nil {
   161  			missingNode := Node{
   162  				ID:   i,
   163  				Data: "N/A",
   164  			}
   165  			nList.Nodes = append(nList.Nodes, missingNode)
   166  		} else {
   167  			nList.Nodes = append(nList.Nodes, nodeFromNode(i, child))
   168  		}
   169  	}
   170  
   171  	return nList, nil
   172  }
   173  
   174  // findNodes identifies nodes containing any of the values within toFind.
   175  func findNodes(fi *os.File, toFind []string) ([]goquery.Selection, error) {
   176  	doc, err := goquery.NewDocumentFromReader(fi)
   177  	if err != nil {
   178  		return nil, err
   179  	}
   180  
   181  	// Reset the read position in case the file was previously read.
   182  	if _, err := fi.Seek(0, 0); err != nil {
   183  		return nil, err
   184  	}
   185  
   186  	sel, err := selectionsContaining(doc, toFind)
   187  	if err != nil {
   188  		return nil, err
   189  	}
   190  
   191  	return sel, nil
   192  }