github.com/vlifesystems/rulehunter@v0.0.0-20180501090014-673078aa4a83/html/helpers_test.go (about)

     1  package html
     2  
     3  import (
     4  	"bytes"
     5  	"io/ioutil"
     6  	"regexp"
     7  	"strings"
     8  
     9  	"golang.org/x/net/html"
    10  )
    11  
    12  func getReportUrls(filename string) ([]string, error) {
    13  	urls, err := getUrls(filename)
    14  	if err != nil {
    15  		return []string{}, err
    16  	}
    17  	reportUrls := make([]string, len(urls))
    18  	numReportUrls := 0
    19  	for _, url := range urls {
    20  		isCategoryIndexUrl, err := regexp.MatchString("category\\/[^/]+\\/$", url)
    21  		if err != nil {
    22  			return []string{}, err
    23  		}
    24  		if (strings.HasPrefix(url, "reports/category/") && !isCategoryIndexUrl) ||
    25  			(strings.HasPrefix(url, "reports/nocategory/") &&
    26  				len(url) > len("reports/nocategory")) {
    27  			reportUrls[numReportUrls] = url
    28  			numReportUrls++
    29  		}
    30  	}
    31  	return reportUrls[:numReportUrls], nil
    32  }
    33  
    34  func getUrls(filename string) ([]string, error) {
    35  	urls := make([]string, 0)
    36  	text, err := ioutil.ReadFile(filename)
    37  	if err != nil {
    38  		return urls, err
    39  	}
    40  	b := bytes.NewBuffer(text)
    41  	z := html.NewTokenizer(b)
    42  
    43  	for {
    44  		tt := z.Next()
    45  
    46  		switch tt {
    47  		case html.ErrorToken:
    48  			return urls, nil
    49  		case html.StartTagToken:
    50  			t := z.Token()
    51  
    52  			if t.Data == "a" { // Is Anchor
    53  				if url := getHref(t); len(url) > 0 {
    54  					urls = append(urls, url)
    55  				}
    56  			}
    57  		}
    58  	}
    59  }
    60  
    61  func getHref(t html.Token) string {
    62  	for _, a := range t.Attr {
    63  		if a.Key == "href" {
    64  			return a.Val
    65  		}
    66  	}
    67  	return ""
    68  }
    69  
    70  func getInnerText(z *html.Tokenizer) string {
    71  	tt := z.Next()
    72  	if tt == html.TextToken {
    73  		return string(z.Text())
    74  	}
    75  	return ""
    76  }
    77  
    78  func getH1(filename string) (string, error) {
    79  	text, err := ioutil.ReadFile(filename)
    80  	if err != nil {
    81  		return "", err
    82  	}
    83  	b := bytes.NewBuffer(text)
    84  	z := html.NewTokenizer(b)
    85  
    86  	for {
    87  		tt := z.Next()
    88  
    89  		switch tt {
    90  		case html.ErrorToken:
    91  			return "", nil
    92  		case html.StartTagToken:
    93  			t := z.Token()
    94  
    95  			if t.Data == "h1" { // Is h1 header
    96  				return getInnerText(z), nil
    97  			}
    98  		}
    99  	}
   100  }