github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/unfurl/scrape_generic.go (about) 1 package unfurl 2 3 import ( 4 "context" 5 "io" 6 "strings" 7 "time" 8 9 "github.com/keybase/client/go/libkb" 10 11 "github.com/gocolly/colly/v2" 12 "github.com/keybase/client/go/chat/attachments" 13 "github.com/keybase/client/go/protocol/chat1" 14 ) 15 16 func fullURL(hostname, path string) string { 17 if strings.HasPrefix(path, "http") { 18 return path 19 } else if strings.HasPrefix(path, "//") { 20 return "http:" + path 21 } else { 22 return "http://" + hostname + path 23 } 24 } 25 26 func (s *Scraper) setAndParsePubTime(ctx context.Context, content string, generic *scoredGenericRaw, score int) { 27 s.Debug(ctx, "scrapeGeneric: pubdate: %s", content) 28 formats := []string{ 29 "2006-01-02T15:04:05Z", 30 "20060102", 31 } 32 var t time.Time 33 var err error 34 for _, f := range formats { 35 if t, err = time.Parse(f, content); err != nil { 36 s.Debug(ctx, "scrapeGeneric: failed to parse pubdate: format: %s err: %s", f, err) 37 } else { 38 break 39 } 40 } 41 if err != nil { 42 s.Debug(ctx, "scrapeGeneric: failed to parse pubdate with any format") 43 } else { 44 publishTime := int(t.Unix()) 45 s.Debug(ctx, "scrapeGeneric: success: %d", publishTime) 46 generic.setPublishTime(&publishTime, score) 47 } 48 } 49 50 func (s *Scraper) setAttr(ctx context.Context, attr, hostname, domain string, generic *scoredGenericRaw, 51 e *colly.HTMLElement) { 52 ranker, ok := attrRankMap[attr] 53 if !ok { // invalid attribute, ignore 54 return 55 } 56 contents := ranker.content(e) 57 score := ranker.score(domain, e) 58 for _, content := range contents { 59 content = strings.Trim(content, " ") 60 if content == "" { 61 continue 62 } 63 switch ranker.setter { 64 case setTitle: 65 generic.setTitle(content, score) 66 case setURL: 67 url := fullURL(hostname, content) 68 generic.setURL(url, score) 69 case setSiteName: 70 generic.setSiteName(content, score) 71 case setFaviconURL: 72 url := fullURL(hostname, content) 73 generic.setFaviconURL(&url, score) 74 case setImageURL: 75 url := fullURL(hostname, content) 76 generic.setImageURL(&url, score) 77 case setPublishTime: 78 s.setAndParsePubTime(ctx, content, generic, score) 79 case setDescription: 80 generic.setDescription(&content, score) 81 case setVideo: 82 generic.setVideo(content, score) 83 } 84 } 85 } 86 87 type bodyReadResetter struct { 88 io.ReadCloser 89 } 90 91 func (b bodyReadResetter) Reset() error { 92 return nil 93 } 94 95 func (s *Scraper) tryAppleTouchIcon(ctx context.Context, generic *scoredGenericRaw, uri, domain string) { 96 path, err := GetDefaultAppleTouchURL(uri) 97 if err != nil { 98 s.Debug(ctx, "tryAppleTouchIcon: failed to get Apple touch URL: %s", err) 99 return 100 } 101 resp, err := libkb.ProxyHTTPGet(s.G().ExternalG(), s.G().Env, path, "UnfurlScraper") 102 if err != nil { 103 s.Debug(ctx, "tryAppleTouchIcon: failed to read Apple touch icon: %s", err) 104 return 105 } 106 defer resp.Body.Close() 107 if resp.StatusCode >= 200 && resp.StatusCode <= 299 { 108 s.Debug(ctx, "tryAppleTouchIcon: found Apple touch icon at known path") 109 mimeType, err := attachments.DetectMIMEType(ctx, bodyReadResetter{ReadCloser: resp.Body}, 110 "apple-touch-icon.png") 111 if err != nil { 112 s.Debug(ctx, "tryAppleTouchIcon: failed to get MIME type from response: %s", err) 113 return 114 } 115 if mimeType != "image/png" { 116 s.Debug(ctx, "tryAppleTouchIcon: response not a PNG: %s", mimeType) 117 return 118 } 119 generic.setFaviconURL(&path, getAppleTouchFaviconScoreFromPath()) 120 } 121 } 122 123 func (s *Scraper) addGenericScraperToCollector(ctx context.Context, c *colly.Collector, 124 generic *scoredGenericRaw, uri, domain string) error { 125 // default favicon location as a fallback 126 defaultFaviconURL, err := GetDefaultFaviconURL(uri) 127 if err != nil { 128 return err 129 } 130 hostname, err := GetHostname(uri) 131 if err != nil { 132 return err 133 } 134 generic.setURL(uri, 0) 135 generic.setSiteName(domain, 0) 136 generic.setFaviconURL(&defaultFaviconURL, 0) 137 138 c.OnResponse(func(r *colly.Response) { 139 contentType := r.Headers.Get("content-type") 140 mediaType, _, _ := strings.Cut(contentType, ";") 141 mediaType = strings.TrimSpace(strings.ToLower(mediaType)) 142 if mediaType == "image/jpeg" || mediaType == "image/png" || mediaType == "image/gif" { 143 generic.ImageUrl = &uri 144 } 145 }) 146 // Run the Colly scraper 147 c.OnHTML("head title", func(e *colly.HTMLElement) { 148 s.setAttr(ctx, "title", hostname, domain, generic, e) 149 }) 150 c.OnHTML("head link[rel][href]", func(e *colly.HTMLElement) { 151 rel := strings.ToLower(e.Attr("rel")) 152 if strings.Contains(rel, "apple-touch-icon") { 153 s.setAttr(ctx, "apple-touch-icon", hostname, domain, generic, e) 154 } else if strings.Contains(rel, "shortcut icon") { 155 s.setAttr(ctx, "shortcut icon", hostname, domain, generic, e) 156 } else if strings.Contains(rel, "icon") && 157 (e.Attr("type") == "image/x-icon" || e.Attr("type") == "image/png") { 158 s.setAttr(ctx, "icon", hostname, domain, generic, e) 159 } 160 }) 161 c.OnHTML("meta[content][name]", func(e *colly.HTMLElement) { 162 attr := strings.ToLower(e.Attr("name")) 163 s.setAttr(ctx, attr, hostname, domain, generic, e) 164 }) 165 c.OnHTML("meta[content][property]", func(e *colly.HTMLElement) { 166 attr := strings.ToLower(e.Attr("property")) 167 s.setAttr(ctx, attr, hostname, domain, generic, e) 168 }) 169 return nil 170 } 171 172 func (s *Scraper) isValidGenericScrape(generic chat1.UnfurlGenericRaw) bool { 173 return len(generic.Title) > 0 || (generic.Description != nil && len(*generic.Description) > 0) || 174 generic.ImageUrl != nil || generic.Video != nil 175 } 176 177 func (s *Scraper) exportGenericResult(generic *scoredGenericRaw) (res chat1.UnfurlRaw, err error) { 178 // Check to make sure we have a legit unfurl that is useful 179 if !s.isValidGenericScrape(generic.UnfurlGenericRaw) { 180 return res, newUnfurlPermanentError("not enough information to display") 181 } 182 return chat1.NewUnfurlRawWithGeneric(generic.UnfurlGenericRaw), nil 183 } 184 185 func (s *Scraper) scrapeGeneric(ctx context.Context, uri, domain string) (res chat1.UnfurlRaw, err error) { 186 // setup some defaults with score 0 and hope we can find better info. 187 generic := new(scoredGenericRaw) 188 c := s.makeCollector() 189 if err = s.addGenericScraperToCollector(ctx, c, generic, uri, domain); err != nil { 190 return res, err 191 } 192 if err := c.Visit(uri); err != nil { 193 return res, err 194 } 195 // Try to get Apple touch icon from known URL if we are going to use one that is worse 196 if generic.faviconURLScore < getAppleTouchFaviconScoreFromPath() { 197 s.Debug(ctx, "scrapeGeneric: favicon score below Apple touch score, trying to find it") 198 s.tryAppleTouchIcon(ctx, generic, uri, domain) 199 } 200 return s.exportGenericResult(generic) 201 }