github.com/vlifesystems/rulehunter@v0.0.0-20180501090014-673078aa4a83/html/helpers_test.go (about) 1 package html 2 3 import ( 4 "bytes" 5 "io/ioutil" 6 "regexp" 7 "strings" 8 9 "golang.org/x/net/html" 10 ) 11 12 func getReportUrls(filename string) ([]string, error) { 13 urls, err := getUrls(filename) 14 if err != nil { 15 return []string{}, err 16 } 17 reportUrls := make([]string, len(urls)) 18 numReportUrls := 0 19 for _, url := range urls { 20 isCategoryIndexUrl, err := regexp.MatchString("category\\/[^/]+\\/$", url) 21 if err != nil { 22 return []string{}, err 23 } 24 if (strings.HasPrefix(url, "reports/category/") && !isCategoryIndexUrl) || 25 (strings.HasPrefix(url, "reports/nocategory/") && 26 len(url) > len("reports/nocategory")) { 27 reportUrls[numReportUrls] = url 28 numReportUrls++ 29 } 30 } 31 return reportUrls[:numReportUrls], nil 32 } 33 34 func getUrls(filename string) ([]string, error) { 35 urls := make([]string, 0) 36 text, err := ioutil.ReadFile(filename) 37 if err != nil { 38 return urls, err 39 } 40 b := bytes.NewBuffer(text) 41 z := html.NewTokenizer(b) 42 43 for { 44 tt := z.Next() 45 46 switch tt { 47 case html.ErrorToken: 48 return urls, nil 49 case html.StartTagToken: 50 t := z.Token() 51 52 if t.Data == "a" { // Is Anchor 53 if url := getHref(t); len(url) > 0 { 54 urls = append(urls, url) 55 } 56 } 57 } 58 } 59 } 60 61 func getHref(t html.Token) string { 62 for _, a := range t.Attr { 63 if a.Key == "href" { 64 return a.Val 65 } 66 } 67 return "" 68 } 69 70 func getInnerText(z *html.Tokenizer) string { 71 tt := z.Next() 72 if tt == html.TextToken { 73 return string(z.Text()) 74 } 75 return "" 76 } 77 78 func getH1(filename string) (string, error) { 79 text, err := ioutil.ReadFile(filename) 80 if err != nil { 81 return "", err 82 } 83 b := bytes.NewBuffer(text) 84 z := html.NewTokenizer(b) 85 86 for { 87 tt := z.Next() 88 89 switch tt { 90 case html.ErrorToken: 91 return "", nil 92 case html.StartTagToken: 93 t := z.Token() 94 95 if t.Data == "h1" { // Is h1 header 96 return getInnerText(z), nil 97 } 98 } 99 } 100 }