github.com/instill-ai/component@v0.16.0-beta/pkg/connector/website/v0/scrape_website.go

package website

import (
	"fmt"
	"math/rand"
	"net/http"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"github.com/instill-ai/component/pkg/connector/util"
)

type PageInfo struct {
	Link     string `json:"link"`
	Title    string `json:"title"`
	LinkText string `json:"link_text"`
	LinkHTML string `json:"link_html"`
}

// ScrapeWebsiteInput defines the input of the scrape website task
type ScrapeWebsiteInput struct {
	// TargetURL: The URL of the website to scrape.
	TargetURL string `json:"target_url"`
	// AllowedDomains: The list of allowed domains to scrape.
	AllowedDomains []string `json:"allowed_domains"`
	// MaxK: The maximum number of pages to scrape.
	MaxK int `json:"max_k"`
	// IncludeLinkText: Whether to include the scraped text of the scraped web page.
	IncludeLinkText *bool `json:"include_link_text"`
	// IncludeLinkHTML: Whether to include the scraped HTML of the scraped web page.
	IncludeLinkHTML *bool `json:"include_link_html"`
}

// ScrapeWebsiteOutput defines the output of the scrape website task
type ScrapeWebsiteOutput struct {
	// Pages: The list of pages that were scraped.
	Pages []PageInfo `json:"pages"`
}

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// randomString generates a random string of 10 to 19 letters.
func randomString() string {
	b := make([]byte, rand.Intn(10)+10)
	for i := range b {
		b[i] = letterBytes[rand.Intn(len(letterBytes))]
	}
	return string(b)
}

// stripQueryAndTrailingSlash removes query parameters and any trailing '/'
// from a URL.
func stripQueryAndTrailingSlash(u *url.URL) *url.URL {
	// Remove query parameters by setting RawQuery to an empty string.
	u.RawQuery = ""

	// Remove the trailing '/' from the path.
	u.Path = strings.TrimSuffix(u.Path, "/")

	return u
}

// existsInSlice checks whether a string exists in a slice.
func existsInSlice(slice []string, item string) bool {
	for _, s := range slice {
		if s == item {
			return true // Item already exists in the slice.
		}
	}
	return false // Item doesn't exist in the slice.
}

// getHTMLPageDoc fetches a webpage and returns its *goquery.Document.
func getHTMLPageDoc(url string) (*goquery.Document, error) {
	// Request the HTML page.
	client := &http.Client{Transport: &http.Transport{
		DisableKeepAlives: true,
	}}
	res, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// Load the HTML document.
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, err
	}

	return doc, nil
}

// Scrape crawls a website starting from TargetURL and returns a slice of
// PageInfo, one entry per scraped page.
func Scrape(input ScrapeWebsiteInput) (ScrapeWebsiteOutput, error) {
	output := ScrapeWebsiteOutput{}

	// Default the optional include flags to false.
	if input.IncludeLinkHTML == nil {
		b := false
		input.IncludeLinkHTML = &b
	}
	if input.IncludeLinkText == nil {
		b := false
		input.IncludeLinkText = &b
	}
	// Treat a negative MaxK as 0, which means no page limit.
	if input.MaxK < 0 {
		input.MaxK = 0
	}

	pageLinks := []string{}

	c := colly.NewCollector()
	if len(input.AllowedDomains) > 0 {
		c.AllowedDomains = input.AllowedDomains
	}
	c.AllowURLRevisit = false

	// On every <a> element that has an href attribute, call the callback.
	// It won't be called if an error occurs.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		_ = c.Visit(e.Request.AbsoluteURL(link))
	})

	// Set the error handler.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	c.OnRequest(func(r *colly.Request) {
		// Stop once MaxK pages have been collected.
		if input.MaxK > 0 && len(output.Pages) >= input.MaxK {
			r.Abort()
			return
		}

		// Set a random user agent to avoid being blocked by websites.
		r.Headers.Set("User-Agent", randomString())
		// Strip query parameters and any trailing '/' from the URL.
		strippedURL := stripQueryAndTrailingSlash(r.URL)
		// Check whether the URL has already been scraped.
		if !existsInSlice(pageLinks, strippedURL.String()) {
			// Record the URL so it isn't scraped again.
			pageLinks = append(pageLinks, strippedURL.String())
			// Scrape the webpage information.
			doc, err := getHTMLPageDoc(strippedURL.String())
			if err != nil {
				fmt.Printf("Error parsing %s: %v\n", strippedURL.String(), err)
				return
			}
			page := PageInfo{}
			title := util.ScrapeWebpageTitle(doc)
			page.Title = title
			page.Link = strippedURL.String()

			if *input.IncludeLinkHTML || *input.IncludeLinkText {
				html, err := util.ScrapeWebpageHTML(doc)
				if err != nil {
					fmt.Printf("Error scraping HTML from %s: %v\n", strippedURL.String(), err)
					return
				}

				if *input.IncludeLinkHTML {
					page.LinkHTML = html
				}

				if *input.IncludeLinkText {
					markdown, err := util.ScrapeWebpageHTMLToMarkdown(html)
					if err != nil {
						fmt.Printf("Error scraping text from %s: %v\n", strippedURL.String(), err)
						return
					}
					page.LinkText = markdown
				}
			}
			output.Pages = append(output.Pages, page)
		}
	})

	// Start scraping, defaulting to HTTPS when no scheme is given.
	if !strings.HasPrefix(input.TargetURL, "http://") && !strings.HasPrefix(input.TargetURL, "https://") {
		input.TargetURL = "https://" + input.TargetURL
	}
	_ = c.Visit(input.TargetURL)

	return output, nil
}
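
// Usage sketch (illustrative only, not part of the connector): a caller
// importing this package as "website" could drive the task roughly as below.
// The URL, domain, and MaxK values are hypothetical.
//
//	input := website.ScrapeWebsiteInput{
//		TargetURL:      "example.com", // "https://" is prepended automatically
//		AllowedDomains: []string{"example.com"},
//		MaxK:           10, // stop after 10 pages; 0 means no limit
//	}
//	output, err := website.Scrape(input)
//	if err != nil {
//		// handle the error
//	}
//	for _, page := range output.Pages {
//		fmt.Println(page.Link, page.Title)
//	}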