github.com/instill-ai/component@v0.16.0-beta/pkg/connector/website/v0/scrape_website.go

package website

import (
	"fmt"
	"math/rand"
	"net/http"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"github.com/instill-ai/component/pkg/connector/util"
)

// PageInfo contains the metadata scraped from a single web page
type PageInfo struct {
	Link     string `json:"link"`
	Title    string `json:"title"`
	LinkText string `json:"link_text"`
	LinkHTML string `json:"link_html"`
}

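// Serialized to JSON, a PageInfo looks like the following (the values are
// illustrative, not taken from a real scrape):
//
//	{"link": "https://example.com", "title": "Example Domain", "link_text": "...", "link_html": "..."}
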
// ScrapeWebsiteInput defines the input of the scrape website task
type ScrapeWebsiteInput struct {
	// TargetURL: The URL of the website to scrape.
	TargetURL string `json:"target_url"`
	// AllowedDomains: The list of domains the crawler is allowed to visit.
	AllowedDomains []string `json:"allowed_domains"`
	// MaxK: The maximum number of pages to scrape.
	MaxK int `json:"max_k"`
	// IncludeLinkText: Whether to include the text (as Markdown) of each scraped page.
	IncludeLinkText *bool `json:"include_link_text"`
	// IncludeLinkHTML: Whether to include the raw HTML of each scraped page.
	IncludeLinkHTML *bool `json:"include_link_html"`
}

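// Constructing an input typically looks like the following sketch (the URL,
// domain, and limit are illustrative; note that the include flags are
// pointers, so they must be set via a variable):
//
//	includeText := true
//	in := ScrapeWebsiteInput{
//		TargetURL:       "https://example.com",
//		AllowedDomains:  []string{"example.com"},
//		MaxK:            10,
//		IncludeLinkText: &includeText,
//	}
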
// ScrapeWebsiteOutput defines the output of the scrape website task
type ScrapeWebsiteOutput struct {
	// Pages: The list of pages that were scraped.
	Pages []PageInfo `json:"pages"`
}

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// randomString generates a random string of 10 to 19 letters
func randomString() string {
	b := make([]byte, rand.Intn(10)+10)
	for i := range b {
		b[i] = letterBytes[rand.Intn(len(letterBytes))]
	}
	return string(b)
}

// stripQueryAndTrailingSlash removes query parameters and the trailing '/'
// from a URL. Note that it modifies the URL in place and returns it.
func stripQueryAndTrailingSlash(u *url.URL) *url.URL {
	// Remove query parameters by setting RawQuery to an empty string
	u.RawQuery = ""

	// Remove the trailing '/' from the path
	u.Path = strings.TrimSuffix(u.Path, "/")

	return u
}

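// The normalization can be illustrated as follows (the URL is hypothetical):
//
//	u, _ := url.Parse("https://example.com/docs/?page=2")
//	stripQueryAndTrailingSlash(u)
//	// u.String() == "https://example.com/docs"
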
// existsInSlice checks if a string exists in a slice
func existsInSlice(slice []string, item string) bool {
	for _, s := range slice {
		if s == item {
			return true // Item already exists in the slice
		}
	}
	return false // Item was not found in the slice
}

// getHTMLPageDoc fetches a webpage and returns its parsed *goquery.Document
func getHTMLPageDoc(url string) (*goquery.Document, error) {
	// Request the HTML page.
	client := &http.Client{Transport: &http.Transport{
		DisableKeepAlives: true,
	}}
	res, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching %s: status code %d", url, res.StatusCode)
	}

	// Load the HTML document
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, err
	}

	return doc, nil
}

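// A minimal sketch of querying the returned document directly (the URL and
// selector are illustrative):
//
//	doc, err := getHTMLPageDoc("https://example.com")
//	if err == nil {
//		fmt.Println(doc.Find("title").First().Text())
//	}
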
// Scrape crawls a website starting from the target URL and returns the
// scraped pages as a ScrapeWebsiteOutput
func Scrape(input ScrapeWebsiteInput) (ScrapeWebsiteOutput, error) {
	output := ScrapeWebsiteOutput{}

	if input.IncludeLinkHTML == nil {
		b := false
		input.IncludeLinkHTML = &b
	}
	if input.IncludeLinkText == nil {
		b := false
		input.IncludeLinkText = &b
	}
	if input.MaxK < 0 {
		input.MaxK = 0
	}

	pageLinks := []string{}

	c := colly.NewCollector()
	if len(input.AllowedDomains) > 0 {
		c.AllowedDomains = input.AllowedDomains
	}
	c.AllowURLRevisit = false

	// On every a element which has an href attribute, call the callback.
	// It won't be called if an error occurs.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		_ = c.Visit(e.Request.AbsoluteURL(link))
	})

	// Set error handler
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	c.OnRequest(func(r *colly.Request) {

		// Stop issuing new requests once the page limit has been reached
		if input.MaxK > 0 && len(output.Pages) >= input.MaxK {
			r.Abort()
			return
		}

		// Set a random user agent to avoid being blocked by websites
		r.Headers.Set("User-Agent", randomString())
		// Strip query parameters and the trailing '/' from the URL
		strippedURL := stripQueryAndTrailingSlash(r.URL)
		// Check if the URL already exists in the slice
		if !existsInSlice(pageLinks, strippedURL.String()) {
			// Add the URL to the slice if it doesn't already exist
			pageLinks = append(pageLinks, strippedURL.String())
			// Scrape the webpage information
			doc, err := getHTMLPageDoc(strippedURL.String())
			if err != nil {
				fmt.Printf("Error parsing %s: %v\n", strippedURL.String(), err)
				return
			}
			page := PageInfo{}
			title := util.ScrapeWebpageTitle(doc)
			page.Title = title
			page.Link = strippedURL.String()

			if *input.IncludeLinkHTML || *input.IncludeLinkText {
				html, err := util.ScrapeWebpageHTML(doc)
				if err != nil {
					fmt.Printf("Error scraping HTML from %s: %v\n", strippedURL.String(), err)
					return
				}

				if *input.IncludeLinkHTML {
					page.LinkHTML = html
				}

				if *input.IncludeLinkText {
					markdown, err := util.ScrapeWebpageHTMLToMarkdown(html)
					if err != nil {
						fmt.Printf("Error scraping text from %s: %v\n", strippedURL.String(), err)
						return
					}
					page.LinkText = markdown
				}
			}
			output.Pages = append(output.Pages, page)
		}
	})

	// Start scraping; default to HTTPS when the target URL has no scheme
	if !strings.HasPrefix(input.TargetURL, "http://") && !strings.HasPrefix(input.TargetURL, "https://") {
		input.TargetURL = "https://" + input.TargetURL
	}
	_ = c.Visit(input.TargetURL)

	return output, nil
}
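
// Example usage of Scrape (a minimal sketch; the calling code is illustrative
// and not part of this package):
//
//	out, err := Scrape(ScrapeWebsiteInput{
//		TargetURL:      "example.com", // the https:// scheme is added automatically
//		AllowedDomains: []string{"example.com"},
//		MaxK:           5,
//	})
//	if err != nil {
//		return err
//	}
//	for _, page := range out.Pages {
//		fmt.Println(page.Link, page.Title)
//	}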