github.com/keybase/client/go@v0.0.0-20240424154521-52f30ea26cb1/chat/unfurl/scraper.go (about) 1 package unfurl 2 3 import ( 4 "context" 5 6 "github.com/gocolly/colly/v2" 7 "github.com/keybase/client/go/chat/globals" 8 "github.com/keybase/client/go/chat/utils" 9 "github.com/keybase/client/go/libkb" 10 "github.com/keybase/client/go/protocol/chat1" 11 "github.com/keybase/go-framed-msgpack-rpc/rpc" 12 ) 13 14 const userAgent = "Mozilla/5.0 (compatible; KeybaseBot; +https://keybase.io)" 15 16 type Scraper struct { 17 globals.Contextified 18 utils.DebugLabeler 19 cache *unfurlCache 20 } 21 22 func NewScraper(g *globals.Context) *Scraper { 23 return &Scraper{ 24 Contextified: globals.NewContextified(g), 25 DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "Scraper", false), 26 cache: newUnfurlCache(), 27 } 28 } 29 30 func (s *Scraper) makeCollector() *colly.Collector { 31 c := colly.NewCollector( 32 colly.UserAgent(userAgent), 33 ) 34 var record *rpc.NetworkInstrumenter 35 c.OnRequest(func(r *colly.Request) { 36 r.Headers.Set("connection", "keep-alive") 37 r.Headers.Set("upgrade-insecure-requests", "1") 38 record = rpc.NewNetworkInstrumenter(s.G().ExternalG().RemoteNetworkInstrumenterStorage, "UnfurlScraper") 39 }) 40 c.OnResponse(func(r *colly.Response) { 41 if err := record.RecordAndFinish(context.TODO(), int64(len(r.Body))); err != nil { 42 s.Debug(context.TODO(), "colly OnResponse: unable to instrument network request %s, %s", record, err) 43 } 44 }) 45 if s.G().Env.GetProxyType() != libkb.NoProxy { 46 err := c.SetProxy(libkb.BuildProxyAddressWithProtocol(s.G().Env.GetProxyType(), s.G().Env.GetProxy())) 47 if err != nil { 48 s.Debug(context.TODO(), "makeCollector: error setting proxy: %+v", err) 49 } 50 } 51 return c 52 } 53 54 func (s *Scraper) Scrape(ctx context.Context, uri string, forceTyp *chat1.UnfurlType) (res chat1.UnfurlRaw, err error) { 55 defer s.Trace(ctx, nil, "Scrape")() 56 // Check if we have a cached valued 57 if item, valid := s.cache.get(uri); valid { 58 s.Debug(ctx, "Scape: using cached value") 59 return item.data.(chat1.UnfurlRaw), nil 60 } 61 defer func() { 62 if err == nil { 63 s.cache.put(uri, res) 64 } 65 }() 66 67 domain, err := GetDomain(uri) 68 if err != nil { 69 return res, err 70 } 71 72 var unfurlTyp chat1.UnfurlType 73 if forceTyp != nil { 74 unfurlTyp = *forceTyp 75 } else { 76 unfurlTyp = ClassifyDomain(domain) 77 } 78 79 switch unfurlTyp { 80 case chat1.UnfurlType_GENERIC: 81 return s.scrapeGeneric(ctx, uri, domain) 82 case chat1.UnfurlType_GIPHY: 83 return s.scrapeGiphy(ctx, uri) 84 case chat1.UnfurlType_MAPS: 85 return s.scrapeMap(ctx, uri) 86 default: 87 return s.scrapeGeneric(ctx, uri, domain) 88 } 89 }