github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/unfurl/scraper.go (about)

     1  package unfurl
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"github.com/gocolly/colly/v2"
     8  	"github.com/keybase/client/go/chat/globals"
     9  	"github.com/keybase/client/go/chat/utils"
    10  	"github.com/keybase/client/go/libkb"
    11  	"github.com/keybase/client/go/protocol/chat1"
    12  	"github.com/keybase/go-framed-msgpack-rpc/rpc"
    13  )
    14  
// userAgent is the User-Agent value handed to colly.UserAgent for every
// collector built by makeCollector.
const userAgent = "Mozilla/5.0 (compatible; KeybaseBot; +https://keybase.io)"
    16  
// Scraper fetches raw unfurl data for URIs (see Scrape) and keeps the
// results in an in-memory cache.
type Scraper struct {
	globals.Contextified
	utils.DebugLabeler
	// cache stores successful Scrape results, keyed by the scraped URI.
	cache *unfurlCache
}
    22  
    23  func NewScraper(g *globals.Context) *Scraper {
    24  	return &Scraper{
    25  		Contextified: globals.NewContextified(g),
    26  		DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "Scraper", false),
    27  		cache:        newUnfurlCache(),
    28  	}
    29  }
    30  
    31  func (s *Scraper) makeCollector() *colly.Collector {
    32  	c := colly.NewCollector(
    33  		colly.UserAgent(userAgent),
    34  	)
    35  	c.SetRequestTimeout(time.Second * 30)
    36  	var record *rpc.NetworkInstrumenter
    37  	c.OnRequest(func(r *colly.Request) {
    38  		r.Headers.Set("connection", "keep-alive")
    39  		r.Headers.Set("upgrade-insecure-requests", "1")
    40  		record = rpc.NewNetworkInstrumenter(s.G().ExternalG().RemoteNetworkInstrumenterStorage, "UnfurlScraper")
    41  	})
    42  	c.OnResponse(func(r *colly.Response) {
    43  		if err := record.RecordAndFinish(context.TODO(), int64(len(r.Body))); err != nil {
    44  			s.Debug(context.TODO(), "colly OnResponse: unable to instrument network request %s, %s", record, err)
    45  		}
    46  	})
    47  	if s.G().Env.GetProxyType() != libkb.NoProxy {
    48  		err := c.SetProxy(libkb.BuildProxyAddressWithProtocol(s.G().Env.GetProxyType(), s.G().Env.GetProxy()))
    49  		if err != nil {
    50  			s.Debug(context.TODO(), "makeCollector: error setting proxy: %+v", err)
    51  		}
    52  	}
    53  	return c
    54  }
    55  
    56  func (s *Scraper) Scrape(ctx context.Context, uri string, forceTyp *chat1.UnfurlType) (res chat1.UnfurlRaw, err error) {
    57  	defer s.Trace(ctx, nil, "Scrape")()
    58  	// Check if we have a cached valued
    59  	if item, valid := s.cache.get(uri); valid {
    60  		s.Debug(ctx, "Scape: using cached value")
    61  		return item.data.(chat1.UnfurlRaw), nil
    62  	}
    63  	defer func() {
    64  		if err == nil {
    65  			s.cache.put(uri, res)
    66  		}
    67  	}()
    68  
    69  	domain, err := GetDomain(uri)
    70  	if err != nil {
    71  		return res, err
    72  	}
    73  
    74  	var unfurlTyp chat1.UnfurlType
    75  	if forceTyp != nil {
    76  		unfurlTyp = *forceTyp
    77  	} else {
    78  		unfurlTyp = ClassifyDomain(domain)
    79  	}
    80  
    81  	switch unfurlTyp {
    82  	case chat1.UnfurlType_GENERIC:
    83  		return s.scrapeGeneric(ctx, uri, domain)
    84  	case chat1.UnfurlType_GIPHY:
    85  		return s.scrapeGiphy(ctx, uri)
    86  	case chat1.UnfurlType_MAPS:
    87  		return s.scrapeMap(ctx, uri)
    88  	default:
    89  		return s.scrapeGeneric(ctx, uri, domain)
    90  	}
    91  }