github.com/instill-ai/component@v0.16.0-beta/pkg/connector/googlesearch/v0/search.go (about)

     1  package googlesearch
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"net/http"
     7  
     8  	"github.com/PuerkitoBio/goquery"
     9  	"google.golang.org/api/customsearch/v1"
    10  
    11  	"github.com/instill-ai/component/pkg/connector/util"
    12  )
    13  
    14  const (
    15  	// MaxResultsPerPage is the default max number of search results per page
    16  	MaxResultsPerPage = 10
    17  	// MaxResults is the maximum number of search results
    18  	MaxResults = 100
    19  )
    20  
    21  // Min returns the smaller of x or y.
    22  func Min(x, y int) int {
    23  	if x > y {
    24  		return y
    25  	}
    26  	return x
    27  }
    28  
    29  // SearchInput defines the input of the search task
    30  type SearchInput struct {
    31  	// Query: The search query.
    32  	Query string `json:"query"`
    33  
    34  	// TopK: The number of search results to return.
    35  	TopK *int `json:"top_k,omitempty"`
    36  
    37  	// IncludeLinkText: Whether to include the scraped text of the search web page result.
    38  	IncludeLinkText *bool `json:"include_link_text,omitempty"`
    39  
    40  	// IncludeLinkHTML: Whether to include the scraped HTML of the search web page result.
    41  	IncludeLinkHTML *bool `json:"include_link_html,omitempty"`
    42  }
    43  
    44  type Result struct {
    45  	// Title: The title of the search result, in plain text.
    46  	Title string `json:"title"`
    47  
    48  	// Link: The full URL to which the search result is pointing, e.g.
    49  	// http://www.example.com/foo/bar.
    50  	Link string `json:"link"`
    51  
    52  	// Snippet: The snippet of the search result, in plain text.
    53  	Snippet string `json:"snippet"`
    54  
    55  	// LinkText: The scraped text of the search web page result, in plain text.
    56  	LinkText string `json:"link_text"`
    57  
    58  	// LinkHTML: The full raw HTML of the search web page result.
    59  	LinkHTML string `json:"link_html"`
    60  }
    61  
    62  // SearchOutput defines the output of the search task
    63  type SearchOutput struct {
    64  	// Results: The search results.
    65  	Results []*Result `json:"results"`
    66  }
    67  
    68  // Scrape the search results if needed
    69  func scrapeSearchResults(searchResults *customsearch.Search, includeLinkText, includeLinkHTML bool) ([]*Result, error) {
    70  	results := []*Result{}
    71  	for _, item := range searchResults.Items {
    72  		linkText, linkHTML := "", ""
    73  		if includeLinkText || includeLinkHTML {
    74  			// Make an HTTP GET request to the web page
    75  			client := &http.Client{Transport: &http.Transport{
    76  				DisableKeepAlives: true,
    77  			}}
    78  			response, err := client.Get(item.Link)
    79  			if err != nil {
    80  				log.Printf("Error making HTTP GET request to %s: %v", item.Link, err)
    81  				continue
    82  			}
    83  			defer response.Body.Close()
    84  
    85  			// Parse the HTML content
    86  			doc, err := goquery.NewDocumentFromReader(response.Body)
    87  			if err != nil {
    88  				fmt.Printf("Error parsing %s: %v", item.Link, err)
    89  			}
    90  
    91  			if includeLinkHTML {
    92  				linkHTML, err = util.ScrapeWebpageHTML(doc)
    93  				if err != nil {
    94  					log.Printf("Error scraping HTML from %s: %v", item.Link, err)
    95  				}
    96  			}
    97  
    98  			if includeLinkText {
    99  				linkHTML, err = util.ScrapeWebpageHTML(doc)
   100  				if err != nil {
   101  					log.Printf("Error scraping HTML from %s: %v", item.Link, err)
   102  				}
   103  				linkText, err = util.ScrapeWebpageHTMLToMarkdown(linkHTML)
   104  				if err != nil {
   105  					log.Printf("Error scraping text from %s: %v", item.Link, err)
   106  				}
   107  			}
   108  
   109  		}
   110  
   111  		results = append(results, &Result{
   112  			Title:    item.Title,
   113  			Link:     item.Link,
   114  			Snippet:  item.Snippet,
   115  			LinkText: linkText,
   116  			LinkHTML: linkHTML,
   117  		})
   118  	}
   119  	return results, nil
   120  }
   121  
   122  // Search the web using Google Custom Search API and scrape the results if needed
   123  func search(cseListCall *customsearch.CseListCall, input SearchInput) (SearchOutput, error) {
   124  	output := SearchOutput{}
   125  
   126  	if input.TopK == nil {
   127  		defaultTopK := int(MaxResultsPerPage)
   128  		input.TopK = &defaultTopK
   129  	}
   130  	if *input.TopK <= 0 || int64(*input.TopK) > MaxResults {
   131  		return output, fmt.Errorf("top_k must be between 1 and %d", MaxResults)
   132  	}
   133  
   134  	if input.IncludeLinkHTML == nil {
   135  		defaultValue := false
   136  		input.IncludeLinkHTML = &defaultValue
   137  	}
   138  	if input.IncludeLinkText == nil {
   139  		defaultValue := false
   140  		input.IncludeLinkText = &defaultValue
   141  	}
   142  
   143  	// Make the search request
   144  	results := []*Result{}
   145  
   146  	for start := 1; start <= *input.TopK; start += MaxResultsPerPage {
   147  		searchNum := Min(*input.TopK-start+1, MaxResultsPerPage)
   148  		searchResults, err := cseListCall.Q(input.Query).Start(int64(start)).Num(int64(searchNum)).Do()
   149  		if err != nil {
   150  			return output, err
   151  		}
   152  		rs, err := scrapeSearchResults(searchResults, *input.IncludeLinkText, *input.IncludeLinkHTML)
   153  		if err != nil {
   154  			return output, err
   155  		}
   156  		results = append(results, rs...)
   157  	}
   158  	output.Results = results
   159  
   160  	return output, nil
   161  }