github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/unfurl/scraper.go (about) 1 package unfurl 2 3 import ( 4 "context" 5 "time" 6 7 "github.com/gocolly/colly/v2" 8 "github.com/keybase/client/go/chat/globals" 9 "github.com/keybase/client/go/chat/utils" 10 "github.com/keybase/client/go/libkb" 11 "github.com/keybase/client/go/protocol/chat1" 12 "github.com/keybase/go-framed-msgpack-rpc/rpc" 13 ) 14 15 const userAgent = "Mozilla/5.0 (compatible; KeybaseBot; +https://keybase.io)" 16 17 type Scraper struct { 18 globals.Contextified 19 utils.DebugLabeler 20 cache *unfurlCache 21 } 22 23 func NewScraper(g *globals.Context) *Scraper { 24 return &Scraper{ 25 Contextified: globals.NewContextified(g), 26 DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "Scraper", false), 27 cache: newUnfurlCache(), 28 } 29 } 30 31 func (s *Scraper) makeCollector() *colly.Collector { 32 c := colly.NewCollector( 33 colly.UserAgent(userAgent), 34 ) 35 c.SetRequestTimeout(time.Second * 30) 36 var record *rpc.NetworkInstrumenter 37 c.OnRequest(func(r *colly.Request) { 38 r.Headers.Set("connection", "keep-alive") 39 r.Headers.Set("upgrade-insecure-requests", "1") 40 record = rpc.NewNetworkInstrumenter(s.G().ExternalG().RemoteNetworkInstrumenterStorage, "UnfurlScraper") 41 }) 42 c.OnResponse(func(r *colly.Response) { 43 if err := record.RecordAndFinish(context.TODO(), int64(len(r.Body))); err != nil { 44 s.Debug(context.TODO(), "colly OnResponse: unable to instrument network request %s, %s", record, err) 45 } 46 }) 47 if s.G().Env.GetProxyType() != libkb.NoProxy { 48 err := c.SetProxy(libkb.BuildProxyAddressWithProtocol(s.G().Env.GetProxyType(), s.G().Env.GetProxy())) 49 if err != nil { 50 s.Debug(context.TODO(), "makeCollector: error setting proxy: %+v", err) 51 } 52 } 53 return c 54 } 55 56 func (s *Scraper) Scrape(ctx context.Context, uri string, forceTyp *chat1.UnfurlType) (res chat1.UnfurlRaw, err error) { 57 defer s.Trace(ctx, nil, "Scrape")() 58 // Check if we have a cached valued 59 if item, valid := s.cache.get(uri); valid { 60 s.Debug(ctx, "Scape: using cached value") 61 return item.data.(chat1.UnfurlRaw), nil 62 } 63 defer func() { 64 if err == nil { 65 s.cache.put(uri, res) 66 } 67 }() 68 69 domain, err := GetDomain(uri) 70 if err != nil { 71 return res, err 72 } 73 74 var unfurlTyp chat1.UnfurlType 75 if forceTyp != nil { 76 unfurlTyp = *forceTyp 77 } else { 78 unfurlTyp = ClassifyDomain(domain) 79 } 80 81 switch unfurlTyp { 82 case chat1.UnfurlType_GENERIC: 83 return s.scrapeGeneric(ctx, uri, domain) 84 case chat1.UnfurlType_GIPHY: 85 return s.scrapeGiphy(ctx, uri) 86 case chat1.UnfurlType_MAPS: 87 return s.scrapeMap(ctx, uri) 88 default: 89 return s.scrapeGeneric(ctx, uri, domain) 90 } 91 }