github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/unfurl/scrape_generic_scoring.go (about)

     1  package unfurl
     2  
     3  import (
     4  	"fmt"
     5  	"strconv"
     6  	"strings"
     7  
     8  	"github.com/gocolly/colly/v2"
     9  	"github.com/keybase/client/go/protocol/chat1"
    10  )
    11  
// Contents are scored based on source. Higher scores win but falsey values
// always lose.
const (
	defaultScore          = 1  // baseline score, returned by getDefaultScore
	defaultArticleScore   = 8  // returned by getArticleScore
	defaultTwitterScore   = 10 // returned by getTwitterScore
	defaultOpenGraphScore = 11 // returned by getOpenGraphScore; highest of the fixed scores
)
    20  
// setterType identifies which field of the scraped result an attrRanker
// populates.
type setterType int

// One setterType value per settable field. Each name matches the
// scoredGenericRaw setter method of the same name.
const (
	setTitle setterType = iota
	setURL
	setSiteName
	setFaviconURL
	setImageURL
	setPublishTime
	setDescription
	setVideo
)
    33  
// getOpenGraphScore returns the score for content taken from Open Graph
// ("og:*") tags. Both arguments are currently unused; the result is always
// defaultOpenGraphScore.
func getOpenGraphScore(domain string, e *colly.HTMLElement) int {
	// TODO: change score based on domain?
	return defaultOpenGraphScore
}
    38  
// getTwitterScore returns the score for content taken from "twitter:*" tags.
// Both arguments are currently unused; the result is always
// defaultTwitterScore.
func getTwitterScore(domain string, e *colly.HTMLElement) int {
	// TODO: change score based on domain?
	return defaultTwitterScore
}
    43  
// getArticleScore returns the score for content taken from article-style
// publish-date tags. Both arguments are currently unused; the result is always
// defaultArticleScore.
func getArticleScore(domain string, e *colly.HTMLElement) int {
	// TODO: change score based on domain?
	return defaultArticleScore
}
    48  
// getDefaultScore returns the baseline score, defaultScore. Both arguments are
// unused; they are present so the function fits the attrRanker score
// signature.
func getDefaultScore(domain string, e *colly.HTMLElement) int {
	return defaultScore
}
    52  
    53  func getFaviconMultiplier(e *colly.HTMLElement) int {
    54  	// 192x192
    55  	sizes := strings.Split(e.Attr("sizes"), "x")
    56  	width, err := strconv.Atoi(sizes[0])
    57  	if err != nil {
    58  		return 1
    59  	}
    60  	height, err := strconv.Atoi(sizes[1])
    61  	if err != nil {
    62  		return 1
    63  	}
    64  	return width + height
    65  }
    66  
    67  // Favor apple-touch-icon over other favicons, try to get the highest
    68  // resolution.
    69  func getAppleTouchFaviconScore(domain string, e *colly.HTMLElement) int {
    70  	return (getDefaultScore(domain, e) + 1) * getFaviconMultiplier(e)
    71  }
    72  
// getAppleTouchFaviconScoreFromPath returns a fixed score of defaultScore
// scaled by 384. No element is inspected.
// NOTE(review): 384 presumably stands in for the width+height multiplier of a
// 192x192 icon (see getFaviconMultiplier) — confirm against the caller.
func getAppleTouchFaviconScoreFromPath() int {
	return defaultScore * 384
}
    76  
// Metadata to describe how to extract and score content and which field this
// attribute describes
type attrRanker struct {
	// content returns the candidate values read from the element.
	content func(e *colly.HTMLElement) []string
	// score returns the rank of this source for the given domain and element.
	score func(domain string, e *colly.HTMLElement) int
	// setter selects which result field the content is destined for.
	setter setterType
}
    84  
    85  func getHrefAttr(e *colly.HTMLElement) []string {
    86  	return []string{e.Attr("href")}
    87  }
    88  
    89  func getContentAttr(e *colly.HTMLElement) []string {
    90  	return []string{e.Attr("content")}
    91  }
    92  
    93  func getHrefAndContentAttr(e *colly.HTMLElement) []string {
    94  	return append(getHrefAttr(e), getContentAttr(e)...)
    95  }
    96  
    97  func getOpenGraphVideo(e *colly.HTMLElement) []string {
    98  	url := e.Attr("content")
    99  	if len(url) == 0 {
   100  		return nil
   101  	}
   102  	mimeType := "video/mp4"
   103  	var height, width *int
   104  	heightStr, _ :=
   105  		e.DOM.SiblingsFiltered("meta[content][property=\"og:video:height\"]").Eq(0).Attr("content")
   106  	if h, err := strconv.Atoi(heightStr); err == nil && h > 0 {
   107  		height = &h
   108  	}
   109  	widthStr, _ := e.DOM.SiblingsFiltered("meta[content][property=\"og:video:width\"]").Eq(0).Attr("content")
   110  	if w, err := strconv.Atoi(widthStr); err == nil && w > 0 {
   111  		width = &w
   112  	}
   113  	typeStr, ok := e.DOM.SiblingsFiltered("meta[content][property=\"og:video:type\"]").Eq(0).Attr("content")
   114  	if ok {
   115  		mimeType = typeStr
   116  	}
   117  	if height == nil || width == nil {
   118  		return nil
   119  	}
   120  	return []string{fmt.Sprintf("%s %d %d %s", url, *height, *width, mimeType)}
   121  }
   122  
// Map of supported attributes/tags. Each key names a tag or attribute value
// and maps to the attrRanker that says how to read its content, how to score
// it, and which field it feeds. Entries are grouped below by target field.
// NOTE(review): how a key is matched against the page (tag name vs. meta
// name/property vs. link rel) is decided by the caller — confirm there.
var attrRankMap = map[string]attrRanker{
	// title
	"title": {
		// Uses the element text rather than an attribute.
		content: func(e *colly.HTMLElement) []string { return []string{e.Text} },
		score:   getDefaultScore,
		setter:  setTitle,
	},
	"twitter:title": {
		content: getContentAttr,
		score:   getTwitterScore,
		setter:  setTitle,
	},
	"og:title": {
		content: getContentAttr,
		score:   getOpenGraphScore,
		setter:  setTitle,
	},

	// siteName
	"application-name": {
		content: getContentAttr,
		score:   getDefaultScore,
		setter:  setSiteName,
	},
	"og:site_name": {
		content: getContentAttr,
		score:   getOpenGraphScore,
		setter:  setSiteName,
	},

	// favicon
	"shortcut icon": {
		content: getHrefAttr,
		score:   getDefaultScore,
		setter:  setFaviconURL,
	},
	"icon": {
		content: getHrefAttr,
		score:   getDefaultScore,
		setter:  setFaviconURL,
	},
	"apple-touch-icon": {
		content: getHrefAttr,
		// Score scales with the icon's "sizes" attribute.
		score:  getAppleTouchFaviconScore,
		setter: setFaviconURL,
	},

	// imageUrl
	"twitter:image": {
		// Both href and content are offered as candidates.
		content: getHrefAndContentAttr,
		score:   getTwitterScore,
		setter:  setImageURL,
	},
	"og:image": {
		content: getHrefAndContentAttr,
		score:   getOpenGraphScore,
		setter:  setImageURL,
	},

	// video
	"og:video": {
		// Content is a "<url> <height> <width> <mimeType>" descriptor.
		content: getOpenGraphVideo,
		score:   getOpenGraphScore,
		setter:  setVideo,
	},

	// publishTime
	"lastmod": {
		content: getContentAttr,
		score:   getDefaultScore,
		setter:  setPublishTime,
	},
	"pubdate": {
		content: getContentAttr,
		score:   getArticleScore,
		setter:  setPublishTime,
	},
	"og:pubdate": {
		content: getContentAttr,
		score:   getOpenGraphScore,
		setter:  setPublishTime,
	},
	"pdate": {
		content: getContentAttr,
		score:   getDefaultScore,
		setter:  setPublishTime,
	},
	"article.published": {
		content: getContentAttr,
		score:   getArticleScore,
		setter:  setPublishTime,
	},
	"article:published": {
		content: getContentAttr,
		score:   getArticleScore,
		setter:  setPublishTime,
	},
	"datePublished": {
		content: getContentAttr,
		score:   getArticleScore,
		setter:  setPublishTime,
	},

	// description
	"description": {
		content: getContentAttr,
		score:   getDefaultScore,
		setter:  setDescription,
	},
	"twitter:description": {
		content: getContentAttr,
		score:   getTwitterScore,
		setter:  setDescription,
	},
	"og:description": {
		content: getContentAttr,
		score:   getOpenGraphScore,
		setter:  setDescription,
	},
}
   244  
// Score each attribute we parse from the webpage. If we encounter multiple
// sources we can use the highest rated one.
//
// scoredGenericRaw embeds the raw unfurl result and, for every field that has
// a setter, remembers the score of the value currently stored so a later
// candidate can be compared against it.
type scoredGenericRaw struct {
	chat1.UnfurlGenericRaw
	titleScore       int // score of the current Title
	urlScore         int // score of the current Url
	siteNameScore    int // score of the current SiteName
	faviconURLScore  int // score of the current FaviconUrl
	imageURLScore    int // score of the current ImageUrl
	videoScore       int // score of the current Video
	publishTimeScore int // score of the current PublishTime
	descriptionScore int // score of the current Description
}
   258  
   259  func (g *scoredGenericRaw) setTitle(title string, score int) {
   260  	if score > g.titleScore || g.Title == "" {
   261  		g.Title = title
   262  		g.titleScore = score
   263  	}
   264  }
   265  
   266  func (g *scoredGenericRaw) setURL(url string, score int) {
   267  	if score > g.urlScore || g.Url == "" {
   268  		g.Url = url
   269  		g.urlScore = score
   270  	}
   271  }
   272  
   273  func (g *scoredGenericRaw) setSiteName(siteName string, score int) {
   274  	if score > g.siteNameScore || g.SiteName == "" {
   275  		g.SiteName = siteName
   276  		g.siteNameScore = score
   277  	}
   278  }
   279  
   280  func (g *scoredGenericRaw) setFaviconURL(faviconURL *string, score int) {
   281  	if score > g.faviconURLScore || g.FaviconUrl == nil {
   282  		g.FaviconUrl = faviconURL
   283  		g.faviconURLScore = score
   284  	}
   285  }
   286  
   287  func (g *scoredGenericRaw) setImageURL(imageURL *string, score int) {
   288  	if score > g.imageURLScore || g.ImageUrl == nil {
   289  		g.ImageUrl = imageURL
   290  		g.imageURLScore = score
   291  	}
   292  }
   293  
   294  func (g *scoredGenericRaw) setVideo(videoDesc string, score int) {
   295  	if score > g.videoScore || g.Video == nil {
   296  		parts := strings.Split(videoDesc, " ")
   297  		height, _ := strconv.Atoi(parts[1])
   298  		width, _ := strconv.Atoi(parts[2])
   299  		g.Video = &chat1.UnfurlVideo{
   300  			Url:      parts[0],
   301  			MimeType: parts[3],
   302  			Height:   height,
   303  			Width:    width,
   304  		}
   305  		g.videoScore = score
   306  	}
   307  }
   308  
   309  func (g *scoredGenericRaw) setPublishTime(publishTime *int, score int) {
   310  	if score > g.publishTimeScore || g.PublishTime == nil || (g.PublishTime != nil && publishTime != nil && *publishTime > *g.PublishTime) {
   311  		g.PublishTime = publishTime
   312  		g.publishTimeScore = score
   313  	}
   314  }
   315  
   316  func (g *scoredGenericRaw) setDescription(description *string, score int) {
   317  	if score > g.descriptionScore || g.Description == nil {
   318  		g.Description = description
   319  		g.descriptionScore = score
   320  	}
   321  }