github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/unfurl/scrape_generic.go (about)

     1  package unfurl
     2  
     3  import (
     4  	"context"
     5  	"io"
     6  	"strings"
     7  	"time"
     8  
     9  	"github.com/keybase/client/go/libkb"
    10  
    11  	"github.com/gocolly/colly/v2"
    12  	"github.com/keybase/client/go/chat/attachments"
    13  	"github.com/keybase/client/go/protocol/chat1"
    14  )
    15  
    16  func fullURL(hostname, path string) string {
    17  	if strings.HasPrefix(path, "http") {
    18  		return path
    19  	} else if strings.HasPrefix(path, "//") {
    20  		return "http:" + path
    21  	} else {
    22  		return "http://" + hostname + path
    23  	}
    24  }
    25  
    26  func (s *Scraper) setAndParsePubTime(ctx context.Context, content string, generic *scoredGenericRaw, score int) {
    27  	s.Debug(ctx, "scrapeGeneric: pubdate: %s", content)
    28  	formats := []string{
    29  		"2006-01-02T15:04:05Z",
    30  		"20060102",
    31  	}
    32  	var t time.Time
    33  	var err error
    34  	for _, f := range formats {
    35  		if t, err = time.Parse(f, content); err != nil {
    36  			s.Debug(ctx, "scrapeGeneric: failed to parse pubdate: format: %s err: %s", f, err)
    37  		} else {
    38  			break
    39  		}
    40  	}
    41  	if err != nil {
    42  		s.Debug(ctx, "scrapeGeneric: failed to parse pubdate with any format")
    43  	} else {
    44  		publishTime := int(t.Unix())
    45  		s.Debug(ctx, "scrapeGeneric: success: %d", publishTime)
    46  		generic.setPublishTime(&publishTime, score)
    47  	}
    48  }
    49  
    50  func (s *Scraper) setAttr(ctx context.Context, attr, hostname, domain string, generic *scoredGenericRaw,
    51  	e *colly.HTMLElement) {
    52  	ranker, ok := attrRankMap[attr]
    53  	if !ok { // invalid attribute, ignore
    54  		return
    55  	}
    56  	contents := ranker.content(e)
    57  	score := ranker.score(domain, e)
    58  	for _, content := range contents {
    59  		content = strings.Trim(content, " ")
    60  		if content == "" {
    61  			continue
    62  		}
    63  		switch ranker.setter {
    64  		case setTitle:
    65  			generic.setTitle(content, score)
    66  		case setURL:
    67  			url := fullURL(hostname, content)
    68  			generic.setURL(url, score)
    69  		case setSiteName:
    70  			generic.setSiteName(content, score)
    71  		case setFaviconURL:
    72  			url := fullURL(hostname, content)
    73  			generic.setFaviconURL(&url, score)
    74  		case setImageURL:
    75  			url := fullURL(hostname, content)
    76  			generic.setImageURL(&url, score)
    77  		case setPublishTime:
    78  			s.setAndParsePubTime(ctx, content, generic, score)
    79  		case setDescription:
    80  			generic.setDescription(&content, score)
    81  		case setVideo:
    82  			generic.setVideo(content, score)
    83  		}
    84  	}
    85  }
    86  
    87  type bodyReadResetter struct {
    88  	io.ReadCloser
    89  }
    90  
    91  func (b bodyReadResetter) Reset() error {
    92  	return nil
    93  }
    94  
    95  func (s *Scraper) tryAppleTouchIcon(ctx context.Context, generic *scoredGenericRaw, uri, domain string) {
    96  	path, err := GetDefaultAppleTouchURL(uri)
    97  	if err != nil {
    98  		s.Debug(ctx, "tryAppleTouchIcon: failed to get Apple touch URL: %s", err)
    99  		return
   100  	}
   101  	resp, err := libkb.ProxyHTTPGet(s.G().ExternalG(), s.G().Env, path, "UnfurlScraper")
   102  	if err != nil {
   103  		s.Debug(ctx, "tryAppleTouchIcon: failed to read Apple touch icon: %s", err)
   104  		return
   105  	}
   106  	defer resp.Body.Close()
   107  	if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
   108  		s.Debug(ctx, "tryAppleTouchIcon: found Apple touch icon at known path")
   109  		mimeType, err := attachments.DetectMIMEType(ctx, bodyReadResetter{ReadCloser: resp.Body},
   110  			"apple-touch-icon.png")
   111  		if err != nil {
   112  			s.Debug(ctx, "tryAppleTouchIcon: failed to get MIME type from response: %s", err)
   113  			return
   114  		}
   115  		if mimeType != "image/png" {
   116  			s.Debug(ctx, "tryAppleTouchIcon: response not a PNG: %s", mimeType)
   117  			return
   118  		}
   119  		generic.setFaviconURL(&path, getAppleTouchFaviconScoreFromPath())
   120  	}
   121  }
   122  
   123  func (s *Scraper) addGenericScraperToCollector(ctx context.Context, c *colly.Collector,
   124  	generic *scoredGenericRaw, uri, domain string) error {
   125  	// default favicon location as a fallback
   126  	defaultFaviconURL, err := GetDefaultFaviconURL(uri)
   127  	if err != nil {
   128  		return err
   129  	}
   130  	hostname, err := GetHostname(uri)
   131  	if err != nil {
   132  		return err
   133  	}
   134  	generic.setURL(uri, 0)
   135  	generic.setSiteName(domain, 0)
   136  	generic.setFaviconURL(&defaultFaviconURL, 0)
   137  
   138  	c.OnResponse(func(r *colly.Response) {
   139  		contentType := r.Headers.Get("content-type")
   140  		mediaType, _, _ := strings.Cut(contentType, ";")
   141  		mediaType = strings.TrimSpace(strings.ToLower(mediaType))
   142  		if mediaType == "image/jpeg" || mediaType == "image/png" || mediaType == "image/gif" {
   143  			generic.ImageUrl = &uri
   144  		}
   145  	})
   146  	// Run the Colly scraper
   147  	c.OnHTML("head title", func(e *colly.HTMLElement) {
   148  		s.setAttr(ctx, "title", hostname, domain, generic, e)
   149  	})
   150  	c.OnHTML("head link[rel][href]", func(e *colly.HTMLElement) {
   151  		rel := strings.ToLower(e.Attr("rel"))
   152  		if strings.Contains(rel, "apple-touch-icon") {
   153  			s.setAttr(ctx, "apple-touch-icon", hostname, domain, generic, e)
   154  		} else if strings.Contains(rel, "shortcut icon") {
   155  			s.setAttr(ctx, "shortcut icon", hostname, domain, generic, e)
   156  		} else if strings.Contains(rel, "icon") &&
   157  			(e.Attr("type") == "image/x-icon" || e.Attr("type") == "image/png") {
   158  			s.setAttr(ctx, "icon", hostname, domain, generic, e)
   159  		}
   160  	})
   161  	c.OnHTML("meta[content][name]", func(e *colly.HTMLElement) {
   162  		attr := strings.ToLower(e.Attr("name"))
   163  		s.setAttr(ctx, attr, hostname, domain, generic, e)
   164  	})
   165  	c.OnHTML("meta[content][property]", func(e *colly.HTMLElement) {
   166  		attr := strings.ToLower(e.Attr("property"))
   167  		s.setAttr(ctx, attr, hostname, domain, generic, e)
   168  	})
   169  	return nil
   170  }
   171  
   172  func (s *Scraper) isValidGenericScrape(generic chat1.UnfurlGenericRaw) bool {
   173  	return len(generic.Title) > 0 || (generic.Description != nil && len(*generic.Description) > 0) ||
   174  		generic.ImageUrl != nil || generic.Video != nil
   175  }
   176  
   177  func (s *Scraper) exportGenericResult(generic *scoredGenericRaw) (res chat1.UnfurlRaw, err error) {
   178  	// Check to make sure we have a legit unfurl that is useful
   179  	if !s.isValidGenericScrape(generic.UnfurlGenericRaw) {
   180  		return res, newUnfurlPermanentError("not enough information to display")
   181  	}
   182  	return chat1.NewUnfurlRawWithGeneric(generic.UnfurlGenericRaw), nil
   183  }
   184  
   185  func (s *Scraper) scrapeGeneric(ctx context.Context, uri, domain string) (res chat1.UnfurlRaw, err error) {
   186  	// setup some defaults with score 0 and hope we can find better info.
   187  	generic := new(scoredGenericRaw)
   188  	c := s.makeCollector()
   189  	if err = s.addGenericScraperToCollector(ctx, c, generic, uri, domain); err != nil {
   190  		return res, err
   191  	}
   192  	if err := c.Visit(uri); err != nil {
   193  		return res, err
   194  	}
   195  	// Try to get Apple touch icon from known URL if we are going to use one that is worse
   196  	if generic.faviconURLScore < getAppleTouchFaviconScoreFromPath() {
   197  		s.Debug(ctx, "scrapeGeneric: favicon score below Apple touch score, trying to find it")
   198  		s.tryAppleTouchIcon(ctx, generic, uri, domain)
   199  	}
   200  	return s.exportGenericResult(generic)
   201  }