github.com/pinpoint-apm/pinpoint-go-agent@v1.4.1-0.20240110120318-a50c2eb18c8c/agent.go (about)

     1  package pinpoint
     2  
     3  import (
     4  	"errors"
     5  	"io"
     6  	"strconv"
     7  	"sync"
     8  	"sync/atomic"
     9  	"testing"
    10  	"time"
    11  
    12  	lru "github.com/hashicorp/golang-lru"
    13  	pb "github.com/pinpoint-apm/pinpoint-go-agent/protobuf"
    14  	"github.com/spaolacci/murmur3"
    15  )
    16  
// init runs the package initializers initLogger, initConfig and initGoroutine
// (all defined elsewhere in the package), in that order, and then installs the
// no-op agent as the global agent so GetAgent is usable before NewAgent is called.
func init() {
	initLogger()
	initConfig()
	initGoroutine()
	globalAgent = NoopAgent()
}
    23  
// agent is the Agent implementation built by NewAgent and NewTestAgent.
// It bundles the application identity, the gRPC clients, the queues consumed
// by the worker goroutines, the trace sampler and the metadata caches.
type agent struct {
	// Identity, read from the configuration (CfgAppName, CfgAppType,
	// CfgAgentID, CfgAgentName) when the agent is created.
	appName   string
	appType   int32
	agentID   string
	agentName string

	// startTime is the creation time in milliseconds since the Unix epoch.
	// sequence is incremented atomically by generateTransactionId.
	// The *Grpc clients are created by connectGrpcServer; the channels are the
	// queues drained by the worker goroutines; sampler is (re)built by newSampler.
	startTime   int64
	sequence    int64
	agentGrpc   *agentGrpc
	spanGrpc    *spanGrpc
	statGrpc    *statGrpc
	cmdGrpc     *cmdGrpc
	spanChan    chan *span
	metaChan    chan interface{}
	urlStatChan chan *urlStat
	statChan    chan *pb.PStatMessage
	sampler     traceSampler

	// LRU caches mapping a name/SQL/API key to the id (or uid) already assigned
	// to it, together with the atomically incremented id generators.
	errorCache  *lru.Cache
	errorIdGen  int32
	sqlCache    *lru.Cache
	sqlIdGen    int32
	sqlUidCache *lru.Cache
	apiCache    *lru.Cache
	apiIdGen    int32

	// config is the configuration the agent was created with.
	// wg tracks the worker goroutines started by connectGrpcServer.
	// enable is true while the workers are running; shutdown is set by Shutdown.
	config   *Config
	wg       sync.WaitGroup
	enable   bool
	shutdown bool
}
    55  
// apiMeta is the metadata item queued by cacheSpanApi: the id assigned to an
// API together with its descriptor and API type.
type apiMeta struct {
	id         int32
	descriptor string
	apiType    int
}
    61  
// stringMeta is the metadata item queued by cacheError: the id assigned to an
// error name, which is carried in funcName.
type stringMeta struct {
	id       int32
	funcName string
}
    66  
// sqlMeta is the metadata item queued by cacheSql: the id assigned to a SQL
// string together with the string itself.
type sqlMeta struct {
	id  int32
	sql string
}
    71  
// sqlUidMeta is the metadata item queued by cacheSqlUid: the 128-bit murmur3
// hash of a SQL string (uid) together with the string itself.
type sqlUidMeta struct {
	uid []byte
	sql string
}
    76  
// exceptionMeta is the metadata item queued by enqueueExceptionMeta. It carries
// the transaction id and span id of the span, the URL template of the request
// ("NULL" when the span has no URL statistics) and the span's error chain.
type exceptionMeta struct {
	txId        TransactionId
	spanId      int64
	uriTemplate string
	exceptions  []*exception
}
    83  
// exception is one element of exceptionMeta.exceptions: an exception id paired
// with the error and call stack it refers to.
type exception struct {
	exceptionId int64
	callstack   *errorWithCallStack
}
    88  
const (
	// cacheSize is the capacity of each LRU metadata cache owned by an agent.
	cacheSize = 1024
	// defaultQueueSize is not referenced in this file; presumably the default
	// for CfgSpanQueueSize — TODO confirm against the configuration code.
	defaultQueueSize = 1024
)
    93  
// globalAgent is the package-wide agent returned by GetAgent. It is the no-op
// agent after init and after Shutdown, and the created agent after NewAgent or
// NewTestAgent succeeds.
var globalAgent Agent
    95  
// GetAgent returns a global Agent created by NewAgent.
// Before NewAgent has succeeded, and again after Shutdown of an enabled agent,
// it returns the no-op agent.
func GetAgent() Agent {
	return globalAgent
}
   100  
   101  // NewAgent creates an Agent and spawns goroutines that manage spans and statistical data.
   102  // The generated Agent is maintained globally and only one instance is retained.
   103  // The provided config is generated by NewConfig and an error is returned if it is nil.
   104  //
   105  // example:
   106  //
   107  //	opts := []pinpoint.ConfigOption{
   108  //	  pinpoint.WithAppName("GoTestApp"),
   109  //	  pinpoint.WithConfigFile(os.Getenv("HOME") + "/tmp/pinpoint-config.yaml"),
   110  //	}
   111  //	cfg, err := pinpoint.NewConfig(opts...)
   112  //	agent, err := pinpoint.NewAgent(cfg)
   113  func NewAgent(config *Config) (Agent, error) {
   114  	if globalAgent != NoopAgent() {
   115  		return globalAgent, errors.New("agent is already created")
   116  	}
   117  	if config == nil {
   118  		return NoopAgent(), errors.New("configuration is missing")
   119  	}
   120  
   121  	logger.setup(config)
   122  	if err := config.checkNameAndID(); err != nil {
   123  		return NoopAgent(), err
   124  	}
   125  	if !config.Bool(CfgEnable) {
   126  		return NoopAgent(), nil
   127  	}
   128  
   129  	Log("agent").Infof("new pinpoint agent")
   130  	config.printConfigString()
   131  
   132  	agent := &agent{
   133  		appName:     config.String(CfgAppName),
   134  		appType:     int32(config.Int(CfgAppType)),
   135  		agentID:     config.String(CfgAgentID),
   136  		agentName:   config.String(CfgAgentName),
   137  		startTime:   time.Now().UnixNano() / int64(time.Millisecond),
   138  		spanChan:    make(chan *span, config.Int(CfgSpanQueueSize)),
   139  		metaChan:    make(chan interface{}, config.Int(CfgSpanQueueSize)),
   140  		urlStatChan: make(chan *urlStat, config.Int(CfgSpanQueueSize)),
   141  		statChan:    make(chan *pb.PStatMessage, config.Int(CfgSpanQueueSize)),
   142  		config:      config,
   143  	}
   144  
   145  	var err error
   146  	if agent.errorCache, err = lru.New(cacheSize); err != nil {
   147  		return NoopAgent(), err
   148  	}
   149  	if agent.sqlCache, err = lru.New(cacheSize); err != nil {
   150  		return NoopAgent(), err
   151  	}
   152  	if agent.sqlUidCache, err = lru.New(cacheSize); err != nil {
   153  		return NoopAgent(), err
   154  	}
   155  	if agent.apiCache, err = lru.New(cacheSize); err != nil {
   156  		return NoopAgent(), err
   157  	}
   158  
   159  	agent.newSampler()
   160  	samplingOpts := []string{CfgSamplingType, CfgSamplingCounterRate, CfgSamplingPercentRate, CfgSamplingNewThroughput, CfgSamplingContinueThroughput}
   161  	config.AddReloadCallback(samplingOpts, agent.newSampler)
   162  	config.AddReloadCallback([]string{CfgLogLevel}, logger.reloadLevel)
   163  	config.AddReloadCallback([]string{CfgLogOutput, CfgLogMaxSize}, logger.reloadOutput)
   164  
   165  	if !config.offGrpc {
   166  		go agent.connectGrpcServer()
   167  	}
   168  	globalAgent = agent
   169  	return agent, nil
   170  }
   171  
   172  func (agent *agent) newSampler() {
   173  	config := agent.config
   174  	var baseSampler sampler
   175  	if config.String(CfgSamplingType) == samplingTypeCounter {
   176  		baseSampler = newRateSampler(config.Int(CfgSamplingCounterRate))
   177  	} else {
   178  		baseSampler = newPercentSampler(config.Float(CfgSamplingPercentRate))
   179  	}
   180  
   181  	if config.Int(CfgSamplingNewThroughput) > 0 || config.Int(CfgSamplingContinueThroughput) > 0 {
   182  		agent.sampler = newThroughputLimitTraceSampler(baseSampler, config.Int(CfgSamplingNewThroughput),
   183  			config.Int(CfgSamplingContinueThroughput))
   184  	} else {
   185  		agent.sampler = newBasicTraceSampler(baseSampler)
   186  	}
   187  }
   188  
   189  func (agent *agent) connectGrpcServer() {
   190  	var err error
   191  
   192  	if agent.agentGrpc, err = newAgentGrpc(agent); err != nil {
   193  		return
   194  	}
   195  	if !agent.agentGrpc.registerAgentWithRetry() {
   196  		return
   197  	}
   198  	if agent.spanGrpc, err = newSpanGrpc(agent); err != nil {
   199  		return
   200  	}
   201  	if agent.statGrpc, err = newStatGrpc(agent); err != nil {
   202  		return
   203  	}
   204  	if agent.cmdGrpc, err = newCommandGrpc(agent); err != nil {
   205  		return
   206  	}
   207  
   208  	agent.enable = true
   209  	go agent.sendPingWorker()
   210  	go agent.sendSpanWorker()
   211  	go agent.runCommandService()
   212  	go agent.sendMetaWorker()
   213  	go agent.collectAgentStatWorker()
   214  	go agent.collectUrlStatWorker()
   215  	go agent.sendUrlStatWorker()
   216  	go agent.sendStatsWorker()
   217  	agent.wg.Add(8)
   218  }
   219  
// Shutdown stops the agent. It always sets the shutdown flag. If the agent is
// enabled it then disables it, restores the no-op agent as the global agent,
// closes the worker queues and the command gRPC channel, waits for the worker
// goroutines to finish and finally closes the remaining gRPC channels.
// If the agent is not enabled, nothing beyond setting the flag is done.
func (agent *agent) Shutdown() {
	agent.shutdown = true
	Log("agent").Infof("shutdown pinpoint agent")

	if !agent.enable {
		return
	}

	agent.enable = false
	globalAgent = NoopAgent()

	// Closing the queues ends the range loops of the workers reading from them.
	close(agent.spanChan)
	close(agent.metaChan)
	close(agent.urlStatChan)
	close(agent.statChan)

	// To terminate the listening state of the command stream,
	// close the command grpc channel first.
	if agent.cmdGrpc != nil {
		agent.cmdGrpc.close()
	}

	// Wait for every worker goroutine to call wg.Done.
	agent.wg.Wait()

	// The workers are gone, so the channels they used can be closed now.
	if agent.agentGrpc != nil {
		agent.agentGrpc.close()
	}
	if agent.spanGrpc != nil {
		agent.spanGrpc.close()
	}
	if agent.statGrpc != nil {
		agent.statGrpc.close()
	}
}
   254  
   255  func (agent *agent) NewSpanTracer(operation string, rpcName string) Tracer {
   256  	var tracer Tracer
   257  
   258  	if agent.enable {
   259  		reader := &noopDistributedTracingContextReader{}
   260  		tracer = agent.NewSpanTracerWithReader(operation, rpcName, reader)
   261  	} else {
   262  		tracer = NoopTracer()
   263  	}
   264  	return tracer
   265  }
   266  
   267  func (agent *agent) NewSpanTracerWithReader(operation string, rpcName string, reader DistributedTracingContextReader) Tracer {
   268  	if !agent.enable || reader == nil {
   269  		return NoopTracer()
   270  	}
   271  
   272  	sampled := reader.Get(HeaderSampled)
   273  	if sampled == "s0" {
   274  		incrUnSampleCont()
   275  		return newUnSampledSpan(agent, rpcName)
   276  	}
   277  
   278  	tid := reader.Get(HeaderTraceId)
   279  	if tid == "" {
   280  		return agent.samplingSpan(func() bool { return agent.sampler.isNewSampled() }, operation, rpcName, reader)
   281  	} else {
   282  		return agent.samplingSpan(func() bool { return agent.sampler.isContinueSampled() }, operation, rpcName, reader)
   283  	}
   284  }
   285  
   286  func (agent *agent) samplingSpan(samplingFunc func() bool, operation string, rpcName string, reader DistributedTracingContextReader) Tracer {
   287  	if samplingFunc() {
   288  		tracer := newSampledSpan(agent, operation, rpcName)
   289  		tracer.Extract(reader)
   290  		return tracer
   291  	} else {
   292  		return newUnSampledSpan(agent, rpcName)
   293  	}
   294  }
   295  
   296  func (agent *agent) generateTransactionId() TransactionId {
   297  	atomic.AddInt64(&agent.sequence, 1)
   298  	return TransactionId{agent.agentID, agent.startTime, agent.sequence}
   299  }
   300  
// Enable reports whether the agent is currently enabled. The flag is set once
// the gRPC setup in connectGrpcServer has completed (or by NewTestAgent) and
// cleared by Shutdown.
func (agent *agent) Enable() bool {
	return agent.enable
}
   304  
// Config returns the configuration the agent was created with.
func (agent *agent) Config() *Config {
	return agent.config
}
   308  
   309  func (agent *agent) sendPingWorker() {
   310  	Log("agent").Infof("start ping goroutine")
   311  	defer agent.wg.Done()
   312  	stream := agent.agentGrpc.newPingStreamWithRetry()
   313  
   314  	for agent.enable {
   315  		err := stream.sendPing()
   316  		if err != nil {
   317  			if err != io.EOF {
   318  				Log("agent").Errorf("send ping - %v", err)
   319  			}
   320  
   321  			stream.close()
   322  			stream = agent.agentGrpc.newPingStreamWithRetry()
   323  		}
   324  
   325  		time.Sleep(60 * time.Second)
   326  	}
   327  
   328  	stream.close()
   329  	Log("agent").Infof("end ping goroutine")
   330  }
   331  
   332  func (agent *agent) sendSpanWorker() {
   333  	Log("agent").Infof("start span goroutine")
   334  	defer agent.wg.Done()
   335  
   336  	var (
   337  		skipOldSpan  = bool(false)
   338  		skipBaseTime time.Time
   339  	)
   340  
   341  	stream := agent.spanGrpc.newSpanStreamWithRetry()
   342  	for span := range agent.spanChan {
   343  		if !agent.enable {
   344  			break
   345  		}
   346  
   347  		if skipOldSpan {
   348  			if span.startTime.Before(skipBaseTime) {
   349  				continue //skip old span
   350  			} else {
   351  				skipOldSpan = false
   352  			}
   353  		}
   354  
   355  		err := stream.sendSpan(span)
   356  		if err != nil {
   357  			if err != io.EOF {
   358  				Log("agent").Errorf("send span - %v", err)
   359  			}
   360  
   361  			stream.close()
   362  			stream = agent.spanGrpc.newSpanStreamWithRetry()
   363  
   364  			skipOldSpan = true
   365  			skipBaseTime = time.Now().Add(-time.Second * 1)
   366  		}
   367  	}
   368  
   369  	stream.close()
   370  	Log("agent").Infof("end span goroutine")
   371  }
   372  
   373  func (agent *agent) enqueueSpan(span *span) bool {
   374  	if !agent.enable {
   375  		return false
   376  	}
   377  
   378  	select {
   379  	case agent.spanChan <- span:
   380  		return true
   381  	default:
   382  		break
   383  	}
   384  
   385  	<-agent.spanChan
   386  	return false
   387  }
   388  
   389  func (agent *agent) sendMetaWorker() {
   390  	Log("agent").Infof("start meta goroutine")
   391  	defer agent.wg.Done()
   392  
   393  	for md := range agent.metaChan {
   394  		if !agent.enable {
   395  			break
   396  		}
   397  
   398  		var success bool
   399  		switch md.(type) {
   400  		case apiMeta:
   401  			api := md.(apiMeta)
   402  			success = agent.agentGrpc.sendApiMetadataWithRetry(api.id, api.descriptor, -1, api.apiType)
   403  			break
   404  		case stringMeta:
   405  			str := md.(stringMeta)
   406  			success = agent.agentGrpc.sendStringMetadataWithRetry(str.id, str.funcName)
   407  			break
   408  		case sqlMeta:
   409  			sql := md.(sqlMeta)
   410  			success = agent.agentGrpc.sendSqlMetadataWithRetry(sql.id, sql.sql)
   411  			break
   412  		case sqlUidMeta:
   413  			sql := md.(sqlUidMeta)
   414  			success = agent.agentGrpc.sendSqlUidMetadataWithRetry(sql.uid, sql.sql)
   415  			break
   416  		case exceptionMeta:
   417  			em := md.(exceptionMeta)
   418  			success = agent.agentGrpc.sendExceptionMetadataWithRetry(&em)
   419  		}
   420  
   421  		if !success {
   422  			agent.deleteMetaCache(md)
   423  		}
   424  	}
   425  
   426  	Log("agent").Infof("end meta goroutine")
   427  }
   428  
   429  func (agent *agent) deleteMetaCache(md interface{}) {
   430  	switch md.(type) {
   431  	case apiMeta:
   432  		api := md.(apiMeta)
   433  		key := api.descriptor + "_" + strconv.Itoa(api.apiType)
   434  		agent.apiCache.Remove(key)
   435  		break
   436  	case stringMeta:
   437  		agent.errorCache.Remove(md.(stringMeta).funcName)
   438  		break
   439  	case sqlMeta:
   440  		agent.sqlCache.Remove(md.(sqlMeta).sql)
   441  		break
   442  	case sqlUidMeta:
   443  		agent.sqlUidCache.Remove(md.(sqlUidMeta).sql)
   444  		break
   445  	case exceptionMeta:
   446  		break
   447  	}
   448  }
   449  
   450  func (agent *agent) tryEnqueueMeta(md interface{}) bool {
   451  	if !agent.enable {
   452  		return false
   453  	}
   454  
   455  	select {
   456  	case agent.metaChan <- md:
   457  		return true
   458  	default:
   459  		break
   460  	}
   461  
   462  	<-agent.metaChan
   463  	return false
   464  }
   465  
   466  func (agent *agent) cacheError(errorName string) int32 {
   467  	if !agent.enable {
   468  		return 0
   469  	}
   470  
   471  	if v, ok := agent.errorCache.Peek(errorName); ok {
   472  		return v.(int32)
   473  	}
   474  
   475  	id := atomic.AddInt32(&agent.errorIdGen, 1)
   476  	agent.errorCache.Add(errorName, id)
   477  
   478  	md := stringMeta{
   479  		id:       id,
   480  		funcName: errorName,
   481  	}
   482  	agent.tryEnqueueMeta(md)
   483  
   484  	Log("agent").Infof("cache error id: %d, %s", id, errorName)
   485  	return id
   486  }
   487  
   488  func (agent *agent) cacheSql(sql string) int32 {
   489  	if !agent.enable {
   490  		return 0
   491  	}
   492  
   493  	if v, ok := agent.sqlCache.Peek(sql); ok {
   494  		return v.(int32)
   495  	}
   496  
   497  	id := atomic.AddInt32(&agent.sqlIdGen, 1)
   498  	agent.sqlCache.Add(sql, id)
   499  
   500  	md := sqlMeta{
   501  		id:  id,
   502  		sql: sql,
   503  	}
   504  	agent.tryEnqueueMeta(md)
   505  
   506  	Log("agent").Infof("cache sql id: %d, %s", id, sql)
   507  	return id
   508  }
   509  
   510  func (agent *agent) cacheSqlUid(sql string) []byte {
   511  	if !agent.enable {
   512  		return nil
   513  	}
   514  
   515  	if v, ok := agent.sqlUidCache.Peek(sql); ok {
   516  		return v.([]byte)
   517  	}
   518  
   519  	hash := murmur3.New128()
   520  	hash.Write([]byte(sql))
   521  	uid := hash.Sum(nil)
   522  	agent.sqlUidCache.Add(sql, uid)
   523  
   524  	md := sqlUidMeta{
   525  		uid: uid,
   526  		sql: sql,
   527  	}
   528  	agent.tryEnqueueMeta(md)
   529  
   530  	Log("agent").Infof("cache sql uid: %#v, %s", uid, sql)
   531  	return uid
   532  }
   533  
   534  func (agent *agent) cacheSpanApi(descriptor string, apiType int) int32 {
   535  	if !agent.enable {
   536  		return 0
   537  	}
   538  
   539  	key := descriptor + "_" + strconv.Itoa(apiType)
   540  
   541  	if v, ok := agent.apiCache.Peek(key); ok {
   542  		return v.(int32)
   543  	}
   544  
   545  	id := atomic.AddInt32(&agent.apiIdGen, 1)
   546  	agent.apiCache.Add(key, id)
   547  
   548  	md := apiMeta{}
   549  	md.id = id
   550  	md.descriptor = descriptor
   551  	md.apiType = apiType
   552  	agent.tryEnqueueMeta(md)
   553  
   554  	Log("agent").Infof("cache api id: %d, %s", id, key)
   555  	return id
   556  }
   557  
   558  func (agent *agent) enqueueExceptionMeta(span *span) {
   559  	if !agent.enable || !agent.config.errorTraceCallStack {
   560  		return
   561  	}
   562  
   563  	md := exceptionMeta{
   564  		txId:       span.txId,
   565  		spanId:     span.spanId,
   566  		exceptions: span.errorChains,
   567  	}
   568  	if span.urlStat != nil {
   569  		md.uriTemplate = span.urlStat.Url
   570  	} else {
   571  		md.uriTemplate = "NULL"
   572  	}
   573  
   574  	agent.tryEnqueueMeta(md)
   575  	Log("agent").Debugf("enqueue exception meta: %v", md)
   576  }
   577  
   578  func (agent *agent) enqueueUrlStat(stat *urlStat) bool {
   579  	if !agent.enable {
   580  		return false
   581  	}
   582  
   583  	select {
   584  	case agent.urlStatChan <- stat:
   585  		return true
   586  	default:
   587  		break
   588  	}
   589  
   590  	<-agent.urlStatChan
   591  	Log("agent").Tracef("url stat channel - max capacity reached or closed")
   592  	return false
   593  }
   594  
   595  func (agent *agent) collectUrlStatWorker() {
   596  	Log("agent").Infof("start collect uri stat goroutine")
   597  	defer agent.wg.Done()
   598  
   599  	agent.initUrlStat()
   600  
   601  	for uri := range agent.urlStatChan {
   602  		if !agent.enable {
   603  			break
   604  		}
   605  		snapshot := agent.currentUrlStatSnapshot()
   606  		snapshot.add(uri)
   607  	}
   608  
   609  	Log("agent").Infof("end collect uri stat goroutine")
   610  }
   611  
   612  func (agent *agent) sendUrlStatWorker() {
   613  	Log("agent").Infof("start send uri stat goroutine")
   614  	defer agent.wg.Done()
   615  
   616  	interval := 30 * time.Second
   617  	time.Sleep(interval)
   618  
   619  	for agent.enable {
   620  		if agent.config.collectUrlStat {
   621  			snapshot := agent.takeUrlStatSnapshot()
   622  			agent.enqueueStat(makePAgentUriStat(snapshot))
   623  		}
   624  		time.Sleep(interval)
   625  	}
   626  
   627  	Log("agent").Infof("end send uri stat goroutine")
   628  }
   629  
   630  func (agent *agent) enqueueStat(stat *pb.PStatMessage) bool {
   631  	select {
   632  	case agent.statChan <- stat:
   633  		return true
   634  	default:
   635  		break
   636  	}
   637  
   638  	<-agent.statChan
   639  	return false
   640  }
   641  
   642  func (agent *agent) sendStatsWorker() {
   643  	Log("agent").Infof("start send stats goroutine")
   644  	defer agent.wg.Done()
   645  
   646  	stream := agent.statGrpc.newStatStreamWithRetry()
   647  	for stats := range agent.statChan {
   648  		if !agent.enable {
   649  			break
   650  		}
   651  
   652  		err := stream.sendStats(stats)
   653  		if err != nil {
   654  			if err != io.EOF {
   655  				Log("stats").Errorf("send stats - %v", err)
   656  			}
   657  
   658  			stream.close()
   659  			stream = agent.statGrpc.newStatStreamWithRetry()
   660  		}
   661  	}
   662  	stream.close()
   663  
   664  	Log("agent").Infof("end send stats goroutine")
   665  }
   666  
   667  func NewTestAgent(config *Config, t *testing.T) (Agent, error) {
   668  	config.offGrpc = true
   669  	logger.setup(config)
   670  
   671  	agent := &agent{
   672  		appName:     config.String(CfgAppName),
   673  		appType:     int32(config.Int(CfgAppType)),
   674  		agentID:     config.String(CfgAgentID),
   675  		agentName:   config.String(CfgAgentName),
   676  		startTime:   time.Now().UnixNano() / int64(time.Millisecond),
   677  		spanChan:    make(chan *span, config.Int(CfgSpanQueueSize)),
   678  		metaChan:    make(chan interface{}, config.Int(CfgSpanQueueSize)),
   679  		urlStatChan: make(chan *urlStat, config.Int(CfgSpanQueueSize)),
   680  		statChan:    make(chan *pb.PStatMessage, config.Int(CfgSpanQueueSize)),
   681  		config:      config,
   682  	}
   683  	agent.errorCache, _ = lru.New(cacheSize)
   684  	agent.sqlCache, _ = lru.New(cacheSize)
   685  	agent.sqlUidCache, _ = lru.New(cacheSize)
   686  	agent.apiCache, _ = lru.New(cacheSize)
   687  	agent.newSampler()
   688  
   689  	agent.agentGrpc = newMockAgentGrpc(agent, t)
   690  	//agent.spanGrpc = newMockSpanGrpc(agent, t)
   691  	//agent.statGrpc = newMockStatGrpc(agent, t)
   692  
   693  	globalAgent = agent
   694  	agent.enable = true
   695  
   696  	return agent, nil
   697  }