github.com/instill-ai/component@v0.16.0-beta/pkg/connector/googlesearch/v0/search.go (about) 1 package googlesearch 2 3 import ( 4 "fmt" 5 "log" 6 "net/http" 7 8 "github.com/PuerkitoBio/goquery" 9 "google.golang.org/api/customsearch/v1" 10 11 "github.com/instill-ai/component/pkg/connector/util" 12 ) 13 14 const ( 15 // MaxResultsPerPage is the default max number of search results per page 16 MaxResultsPerPage = 10 17 // MaxResults is the maximum number of search results 18 MaxResults = 100 19 ) 20 21 // Min returns the smaller of x or y. 22 func Min(x, y int) int { 23 if x > y { 24 return y 25 } 26 return x 27 } 28 29 // SearchInput defines the input of the search task 30 type SearchInput struct { 31 // Query: The search query. 32 Query string `json:"query"` 33 34 // TopK: The number of search results to return. 35 TopK *int `json:"top_k,omitempty"` 36 37 // IncludeLinkText: Whether to include the scraped text of the search web page result. 38 IncludeLinkText *bool `json:"include_link_text,omitempty"` 39 40 // IncludeLinkHTML: Whether to include the scraped HTML of the search web page result. 41 IncludeLinkHTML *bool `json:"include_link_html,omitempty"` 42 } 43 44 type Result struct { 45 // Title: The title of the search result, in plain text. 46 Title string `json:"title"` 47 48 // Link: The full URL to which the search result is pointing, e.g. 49 // http://www.example.com/foo/bar. 50 Link string `json:"link"` 51 52 // Snippet: The snippet of the search result, in plain text. 53 Snippet string `json:"snippet"` 54 55 // LinkText: The scraped text of the search web page result, in plain text. 56 LinkText string `json:"link_text"` 57 58 // LinkHTML: The full raw HTML of the search web page result. 59 LinkHTML string `json:"link_html"` 60 } 61 62 // SearchOutput defines the output of the search task 63 type SearchOutput struct { 64 // Results: The search results. 65 Results []*Result `json:"results"` 66 } 67 68 // Scrape the search results if needed 69 func scrapeSearchResults(searchResults *customsearch.Search, includeLinkText, includeLinkHTML bool) ([]*Result, error) { 70 results := []*Result{} 71 for _, item := range searchResults.Items { 72 linkText, linkHTML := "", "" 73 if includeLinkText || includeLinkHTML { 74 // Make an HTTP GET request to the web page 75 client := &http.Client{Transport: &http.Transport{ 76 DisableKeepAlives: true, 77 }} 78 response, err := client.Get(item.Link) 79 if err != nil { 80 log.Printf("Error making HTTP GET request to %s: %v", item.Link, err) 81 continue 82 } 83 defer response.Body.Close() 84 85 // Parse the HTML content 86 doc, err := goquery.NewDocumentFromReader(response.Body) 87 if err != nil { 88 fmt.Printf("Error parsing %s: %v", item.Link, err) 89 } 90 91 if includeLinkHTML { 92 linkHTML, err = util.ScrapeWebpageHTML(doc) 93 if err != nil { 94 log.Printf("Error scraping HTML from %s: %v", item.Link, err) 95 } 96 } 97 98 if includeLinkText { 99 linkHTML, err = util.ScrapeWebpageHTML(doc) 100 if err != nil { 101 log.Printf("Error scraping HTML from %s: %v", item.Link, err) 102 } 103 linkText, err = util.ScrapeWebpageHTMLToMarkdown(linkHTML) 104 if err != nil { 105 log.Printf("Error scraping text from %s: %v", item.Link, err) 106 } 107 } 108 109 } 110 111 results = append(results, &Result{ 112 Title: item.Title, 113 Link: item.Link, 114 Snippet: item.Snippet, 115 LinkText: linkText, 116 LinkHTML: linkHTML, 117 }) 118 } 119 return results, nil 120 } 121 122 // Search the web using Google Custom Search API and scrape the results if needed 123 func search(cseListCall *customsearch.CseListCall, input SearchInput) (SearchOutput, error) { 124 output := SearchOutput{} 125 126 if input.TopK == nil { 127 defaultTopK := int(MaxResultsPerPage) 128 input.TopK = &defaultTopK 129 } 130 if *input.TopK <= 0 || int64(*input.TopK) > MaxResults { 131 return output, fmt.Errorf("top_k must be between 1 and %d", MaxResults) 132 } 133 134 if input.IncludeLinkHTML == nil { 135 defaultValue := false 136 input.IncludeLinkHTML = &defaultValue 137 } 138 if input.IncludeLinkText == nil { 139 defaultValue := false 140 input.IncludeLinkText = &defaultValue 141 } 142 143 // Make the search request 144 results := []*Result{} 145 146 for start := 1; start <= *input.TopK; start += MaxResultsPerPage { 147 searchNum := Min(*input.TopK-start+1, MaxResultsPerPage) 148 searchResults, err := cseListCall.Q(input.Query).Start(int64(start)).Num(int64(searchNum)).Do() 149 if err != nil { 150 return output, err 151 } 152 rs, err := scrapeSearchResults(searchResults, *input.IncludeLinkText, *input.IncludeLinkHTML) 153 if err != nil { 154 return output, err 155 } 156 results = append(results, rs...) 157 } 158 output.Results = results 159 160 return output, nil 161 }