github.com/keybase/client/go@v0.0.0-20240309051027-028f7c731f8b/chat/unfurl/scrape_generic_scoring.go (about) 1 package unfurl 2 3 import ( 4 "fmt" 5 "strconv" 6 "strings" 7 8 "github.com/gocolly/colly/v2" 9 "github.com/keybase/client/go/protocol/chat1" 10 ) 11 12 // Contents are scored based on source. Higher scores win but falsey values 13 // always loose. 14 const ( 15 defaultScore = 1 16 defaultArticleScore = 8 17 defaultTwitterScore = 10 18 defaultOpenGraphScore = 11 19 ) 20 21 type setterType int 22 23 const ( 24 setTitle setterType = iota 25 setURL 26 setSiteName 27 setFaviconURL 28 setImageURL 29 setPublishTime 30 setDescription 31 setVideo 32 ) 33 34 func getOpenGraphScore(domain string, e *colly.HTMLElement) int { 35 // TODO: change score based on domain? 36 return defaultOpenGraphScore 37 } 38 39 func getTwitterScore(domain string, e *colly.HTMLElement) int { 40 // TODO: change score based on domain? 41 return defaultTwitterScore 42 } 43 44 func getArticleScore(domain string, e *colly.HTMLElement) int { 45 // TODO: change score based on domain? 46 return defaultArticleScore 47 } 48 49 func getDefaultScore(domain string, e *colly.HTMLElement) int { 50 return defaultScore 51 } 52 53 func getFaviconMultiplier(e *colly.HTMLElement) int { 54 // 192x192 55 sizes := strings.Split(e.Attr("sizes"), "x") 56 width, err := strconv.Atoi(sizes[0]) 57 if err != nil { 58 return 1 59 } 60 height, err := strconv.Atoi(sizes[1]) 61 if err != nil { 62 return 1 63 } 64 return width + height 65 } 66 67 // Favor apple-touch-icon over other favicons, try to get the highest 68 // resolution. 69 func getAppleTouchFaviconScore(domain string, e *colly.HTMLElement) int { 70 return (getDefaultScore(domain, e) + 1) * getFaviconMultiplier(e) 71 } 72 73 func getAppleTouchFaviconScoreFromPath() int { 74 return defaultScore * 384 75 } 76 77 // Metadata to describe how to extra and score content and which field this 78 // attribute describes 79 type attrRanker struct { 80 content func(e *colly.HTMLElement) []string 81 score func(domain string, e *colly.HTMLElement) int 82 setter setterType 83 } 84 85 func getHrefAttr(e *colly.HTMLElement) []string { 86 return []string{e.Attr("href")} 87 } 88 89 func getContentAttr(e *colly.HTMLElement) []string { 90 return []string{e.Attr("content")} 91 } 92 93 func getHrefAndContentAttr(e *colly.HTMLElement) []string { 94 return append(getHrefAttr(e), getContentAttr(e)...) 95 } 96 97 func getOpenGraphVideo(e *colly.HTMLElement) []string { 98 url := e.Attr("content") 99 if len(url) == 0 { 100 return nil 101 } 102 mimeType := "video/mp4" 103 var height, width *int 104 heightStr, _ := 105 e.DOM.SiblingsFiltered("meta[content][property=\"og:video:height\"]").Eq(0).Attr("content") 106 if h, err := strconv.Atoi(heightStr); err == nil && h > 0 { 107 height = &h 108 } 109 widthStr, _ := e.DOM.SiblingsFiltered("meta[content][property=\"og:video:width\"]").Eq(0).Attr("content") 110 if w, err := strconv.Atoi(widthStr); err == nil && w > 0 { 111 width = &w 112 } 113 typeStr, ok := e.DOM.SiblingsFiltered("meta[content][property=\"og:video:type\"]").Eq(0).Attr("content") 114 if ok { 115 mimeType = typeStr 116 } 117 if height == nil || width == nil { 118 return nil 119 } 120 return []string{fmt.Sprintf("%s %d %d %s", url, *height, *width, mimeType)} 121 } 122 123 // Map of supported attributes/tags 124 var attrRankMap = map[string]attrRanker{ 125 // title 126 "title": { 127 content: func(e *colly.HTMLElement) []string { return []string{e.Text} }, 128 score: getDefaultScore, 129 setter: setTitle, 130 }, 131 "twitter:title": { 132 content: getContentAttr, 133 score: getTwitterScore, 134 setter: setTitle, 135 }, 136 "og:title": { 137 content: getContentAttr, 138 score: getOpenGraphScore, 139 setter: setTitle, 140 }, 141 142 // siteName 143 "application-name": { 144 content: getContentAttr, 145 score: getDefaultScore, 146 setter: setSiteName, 147 }, 148 "og:site_name": { 149 content: getContentAttr, 150 score: getOpenGraphScore, 151 setter: setSiteName, 152 }, 153 154 // favicon 155 "shortcut icon": { 156 content: getHrefAttr, 157 score: getDefaultScore, 158 setter: setFaviconURL, 159 }, 160 "icon": { 161 content: getHrefAttr, 162 score: getDefaultScore, 163 setter: setFaviconURL, 164 }, 165 "apple-touch-icon": { 166 content: getHrefAttr, 167 score: getAppleTouchFaviconScore, 168 setter: setFaviconURL, 169 }, 170 171 // imageUrl 172 "twitter:image": { 173 content: getHrefAndContentAttr, 174 score: getTwitterScore, 175 setter: setImageURL, 176 }, 177 "og:image": { 178 content: getHrefAndContentAttr, 179 score: getOpenGraphScore, 180 setter: setImageURL, 181 }, 182 183 // video 184 "og:video": { 185 content: getOpenGraphVideo, 186 score: getOpenGraphScore, 187 setter: setVideo, 188 }, 189 190 // publishTime 191 "lastmod": { 192 content: getContentAttr, 193 score: getDefaultScore, 194 setter: setPublishTime, 195 }, 196 "pubdate": { 197 content: getContentAttr, 198 score: getArticleScore, 199 setter: setPublishTime, 200 }, 201 "og:pubdate": { 202 content: getContentAttr, 203 score: getOpenGraphScore, 204 setter: setPublishTime, 205 }, 206 "pdate": { 207 content: getContentAttr, 208 score: getDefaultScore, 209 setter: setPublishTime, 210 }, 211 "article.published": { 212 content: getContentAttr, 213 score: getArticleScore, 214 setter: setPublishTime, 215 }, 216 "article:published": { 217 content: getContentAttr, 218 score: getArticleScore, 219 setter: setPublishTime, 220 }, 221 "datePublished": { 222 content: getContentAttr, 223 score: getArticleScore, 224 setter: setPublishTime, 225 }, 226 227 // description 228 "description": { 229 content: getContentAttr, 230 score: getDefaultScore, 231 setter: setDescription, 232 }, 233 "twitter:description": { 234 content: getContentAttr, 235 score: getTwitterScore, 236 setter: setDescription, 237 }, 238 "og:description": { 239 content: getContentAttr, 240 score: getOpenGraphScore, 241 setter: setDescription, 242 }, 243 } 244 245 // Score each attribute we parse from the webpage. If we encounter multiple 246 // sources we can use the highest rated one. 247 type scoredGenericRaw struct { 248 chat1.UnfurlGenericRaw 249 titleScore int 250 urlScore int 251 siteNameScore int 252 faviconURLScore int 253 imageURLScore int 254 videoScore int 255 publishTimeScore int 256 descriptionScore int 257 } 258 259 func (g *scoredGenericRaw) setTitle(title string, score int) { 260 if score > g.titleScore || g.Title == "" { 261 g.Title = title 262 g.titleScore = score 263 } 264 } 265 266 func (g *scoredGenericRaw) setURL(url string, score int) { 267 if score > g.urlScore || g.Url == "" { 268 g.Url = url 269 g.urlScore = score 270 } 271 } 272 273 func (g *scoredGenericRaw) setSiteName(siteName string, score int) { 274 if score > g.siteNameScore || g.SiteName == "" { 275 g.SiteName = siteName 276 g.siteNameScore = score 277 } 278 } 279 280 func (g *scoredGenericRaw) setFaviconURL(faviconURL *string, score int) { 281 if score > g.faviconURLScore || g.FaviconUrl == nil { 282 g.FaviconUrl = faviconURL 283 g.faviconURLScore = score 284 } 285 } 286 287 func (g *scoredGenericRaw) setImageURL(imageURL *string, score int) { 288 if score > g.imageURLScore || g.ImageUrl == nil { 289 g.ImageUrl = imageURL 290 g.imageURLScore = score 291 } 292 } 293 294 func (g *scoredGenericRaw) setVideo(videoDesc string, score int) { 295 if score > g.videoScore || g.Video == nil { 296 parts := strings.Split(videoDesc, " ") 297 height, _ := strconv.Atoi(parts[1]) 298 width, _ := strconv.Atoi(parts[2]) 299 g.Video = &chat1.UnfurlVideo{ 300 Url: parts[0], 301 MimeType: parts[3], 302 Height: height, 303 Width: width, 304 } 305 g.videoScore = score 306 } 307 } 308 309 func (g *scoredGenericRaw) setPublishTime(publishTime *int, score int) { 310 if score > g.publishTimeScore || g.PublishTime == nil || (g.PublishTime != nil && publishTime != nil && *publishTime > *g.PublishTime) { 311 g.PublishTime = publishTime 312 g.publishTimeScore = score 313 } 314 } 315 316 func (g *scoredGenericRaw) setDescription(description *string, score int) { 317 if score > g.descriptionScore || g.Description == nil { 318 g.Description = description 319 g.descriptionScore = score 320 } 321 }