github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/agent/session.go (about)

     1  package agent
     2  
     3  import (
     4  	"fmt"
     5  	logger2 "github.com/pyroscope-io/pyroscope/pkg/agent/log"
     6  	"github.com/pyroscope-io/pyroscope/pkg/util/alignedticker"
     7  	"os"
     8  	"sync"
     9  	"time"
    10  
    11  	// revive:disable:blank-imports Depending on configuration these packages may or may not be used.
    12  	//   That's why we do a blank import here and then packages themselves register with the rest of the code.
    13  
    14  	_ "github.com/pyroscope-io/pyroscope/pkg/agent/debugspy"
    15  	_ "github.com/pyroscope-io/pyroscope/pkg/agent/dotnetspy"
    16  	_ "github.com/pyroscope-io/pyroscope/pkg/agent/ebpfspy"
    17  	_ "github.com/pyroscope-io/pyroscope/pkg/agent/gospy"
    18  	_ "github.com/pyroscope-io/pyroscope/pkg/agent/phpspy"
    19  	"github.com/pyroscope-io/pyroscope/pkg/agent/upstream"
    20  	"github.com/pyroscope-io/pyroscope/pkg/flameql"
    21  	"github.com/pyroscope-io/pyroscope/pkg/storage/segment"
    22  	"github.com/pyroscope-io/pyroscope/pkg/util/process"
    23  	"github.com/pyroscope-io/pyroscope/pkg/util/throttle"
    24  
    25  	// revive:enable:blank-imports
    26  
    27  	"github.com/pyroscope-io/pyroscope/pkg/agent/spy"
    28  	"github.com/pyroscope-io/pyroscope/pkg/structs/transporttrie"
    29  )
    30  
    31  // Each Session can deal with:
    32  // * multiple processes (one main process and zero or more subprocesses)
    33  // * multiple profile types (cpu, mem, etc)
    34  // * multiple names (app.cpu{} or app.cpu{controller=foo}) (one at a time)
    35  
    36  /*
    37                  PROCESSES
    38              ┌─────┬─────┬─────┐
    39              │pid 1│pid 2│pid 3│
    40              └──┬──┴──┬──┴──┬──┘
    41                 │     │     │          NAMES/TAGS
    42                 │     │     │            ┌─app.cpu{}
    43               0 ▼   1 ▼   2 ▼            │     ┌─app.cpu{controller=bar}
    44              ┌─────┬─────┬─────┐      ┌─────┬─────┐     ┌──────┐
    45       0 cpu  │     │     │     │ ───► │     │     │ ──► │      │
    46              └─────┴─────┴─────┘      └─────┴─────┘     │      │
    47  PROFILE TYPES      SPIES                 TRIES     ──► │server│
    48              ┌─────┬─────┬─────┐      ┌─────┬─────┐     │      │
    49       1 mem  │     │     │     │ ───► │     │     │ ──► │      │
    50              └─────┴─────┴─────┘      └─────┴─────┘     └──────┘
    51  */
    52  // type process struct {
    53  // 	pid            int
    54  // 	spies          []*spy.Spy
    55  // 	errorThrottler *throttle.Throttler
    56  // }
    57  
// errorThrottlerPeriod is the period passed to throttle.New when a session's
// throttler is created; takeSnapshot routes its error logging through that
// throttler.
const errorThrottlerPeriod = 10 * time.Second
    59  
// ProfileSession is a single profiling session. It owns the spies attached to
// the profiled processes, accumulates their samples into tries keyed by
// application name, and hands those tries to the upstream on every upload
// tick (see takeSnapshots, takeSnapshot and uploadTries).
type ProfileSession struct {
	// configuration, doesn't change
	//
	// Everything here except noForkDetection is copied from SessionConfig (or
	// from the spyFactory argument) in NewSessionWithSpyFactory.
	// noForkDetection is not assigned anywhere in this file; while it is false
	// and clibIntegration is true, reset stops the session as soon as
	// os.Getpid() no longer matches pid.
	upstream         upstream.Upstream
	spyName          string
	sampleRate       uint32
	profileTypes     []spy.ProfileType
	uploadRate       time.Duration
	disableGCRuns    bool
	withSubprocesses bool
	clibIntegration  bool
	spyFactory       SpyFactory
	noForkDetection  bool
	pid              int

	// logger receives diagnostics and throttler rate-limits the snapshot error
	// messages. stopCh is closed at most once (guarded by stopOnce) to make
	// takeSnapshots return. trieMutex is held by takeSnapshot, ChangeName,
	// reset and Stop.
	logger    logger2.Logger
	throttler *throttle.Throttler
	stopOnce  sync.Once
	stopCh    chan struct{}
	trieMutex sync.Mutex

	// these things do change:
	// appName is replaced by ChangeName; startTimeTruncated is advanced by
	// reset and stays zero until reset has run once.
	appName            string
	startTimeTruncated time.Time

	// these slices / maps keep track of processes, spies, and tries
	// see comment about multiple dimensions above
	spies map[int][]spy.Spy // key: pid; slice index: position used as profile type index
	// The map key is the app name; the slice index matches profileTypes (see
	// uploadTries). previousTries holds the last trie seen for cumulative
	// profile types so that the next upload can send a diff.
	previousTries map[string][]*transporttrie.Trie
	tries         map[string][]*transporttrie.Trie
}
    91  
// SpyFactory creates the spies for the process identified by pid.
// takeSnapshot uses the position of each returned spy as an index into the
// per-profile-type tries, so the slice is presumably expected to hold one spy
// per configured profile type, in the same order.
type SpyFactory func(pid int) ([]spy.Spy, error)
    93  
// SessionConfig carries the settings used to build a ProfileSession.
//
// The embedded Upstream and Logger become the session's upload target and
// logger. AppName and Tags are merged into the session's application name by
// NewSessionWithSpyFactory. SpyName, ProfilingTypes, SampleRate,
// DisableGCRuns, Logger and PHPSpyArgs are also forwarded to every spy started
// by NewGenericSpyFactory. Pid is the main process to profile;
// WithSubprocesses makes reset look for its subprocesses as well.
// ClibIntegration enables the fork check performed in reset. UploadRate is
// the interval between uploads.
type SessionConfig struct {
	upstream.Upstream
	logger2.Logger
	AppName          string
	Tags             map[string]string
	ProfilingTypes   []spy.ProfileType
	DisableGCRuns    bool
	SpyName          string
	SampleRate       uint32
	UploadRate       time.Duration
	Pid              int
	WithSubprocesses bool
	ClibIntegration  bool
	PHPSpyArgs       string
}
   109  
   110  func NewSession(c SessionConfig) (*ProfileSession, error) {
   111  	return NewSessionWithSpyFactory(c, NewGenericSpyFactory(c))
   112  }
   113  
   114  func NewSessionWithSpyFactory(c SessionConfig, spyFactory SpyFactory) (*ProfileSession, error) {
   115  	appName, err := mergeTagsWithAppName(c.AppName, c.Tags)
   116  	if err != nil {
   117  		return nil, err
   118  	}
   119  
   120  	ps := &ProfileSession{
   121  		upstream:         c.Upstream,
   122  		appName:          appName,
   123  		spyName:          c.SpyName,
   124  		profileTypes:     c.ProfilingTypes,
   125  		disableGCRuns:    c.DisableGCRuns,
   126  		sampleRate:       c.SampleRate,
   127  		uploadRate:       c.UploadRate,
   128  		pid:              c.Pid,
   129  		spies:            make(map[int][]spy.Spy),
   130  		stopCh:           make(chan struct{}),
   131  		withSubprocesses: c.WithSubprocesses,
   132  		clibIntegration:  c.ClibIntegration,
   133  		logger:           c.Logger,
   134  		throttler:        throttle.New(errorThrottlerPeriod),
   135  		spyFactory:       spyFactory,
   136  
   137  		// string is appName, int is index in pids
   138  		previousTries: make(map[string][]*transporttrie.Trie),
   139  		tries:         make(map[string][]*transporttrie.Trie),
   140  	}
   141  
   142  	ps.initializeTries(ps.appName)
   143  
   144  	return ps, nil
   145  }
   146  
   147  func NewGenericSpyFactory(c SessionConfig) SpyFactory {
   148  	return func(pid int) ([]spy.Spy, error) {
   149  		var res []spy.Spy
   150  
   151  		sf, err := spy.StartFunc(c.SpyName)
   152  		if err != nil {
   153  			return res, err
   154  		}
   155  
   156  		for _, pt := range c.ProfilingTypes {
   157  			params := spy.InitParams{
   158  				Pid:           pid,
   159  				ProfileType:   pt,
   160  				SampleRate:    c.SampleRate,
   161  				DisableGCRuns: c.DisableGCRuns,
   162  				Logger:        c.Logger,
   163  				PHPSpyArgs:    c.PHPSpyArgs,
   164  			}
   165  			s, err := sf(params)
   166  
   167  			if err != nil {
   168  				return res, err
   169  			}
   170  			res = append(res, s)
   171  		}
   172  		return res, nil
   173  	}
   174  }
   175  
   176  func addSuffix(name string, ptype spy.ProfileType) (string, error) {
   177  	k, err := segment.ParseKey(name)
   178  	if err != nil {
   179  		return "", err
   180  	}
   181  	k.Add("__name__", k.AppName()+"."+string(ptype))
   182  	return k.Normalized(), nil
   183  }
   184  
   185  // mergeTagsWithAppName validates user input and merges explicitly specified
   186  // tags with tags from app name.
   187  //
   188  // App name may be in the full form including tags (app.name{foo=bar,baz=qux}).
   189  // Returned application name is always short, any tags that were included are
   190  // moved to tags map. When merged with explicitly provided tags (config/CLI),
   191  // last take precedence.
   192  //
   193  // App name may be an empty string. Tags must not contain reserved keys,
   194  // the map is modified in place.
   195  func mergeTagsWithAppName(appName string, tags map[string]string) (string, error) {
   196  	k, err := segment.ParseKey(appName)
   197  	if err != nil {
   198  		return "", err
   199  	}
   200  	for tagKey, tagValue := range tags {
   201  		if flameql.IsTagKeyReserved(tagKey) {
   202  			continue
   203  		}
   204  		if err = flameql.ValidateTagKey(tagKey); err != nil {
   205  			return "", err
   206  		}
   207  		k.Add(tagKey, tagValue)
   208  	}
   209  	return k.Normalized(), nil
   210  }
   211  
   212  func (ps *ProfileSession) takeSnapshots() {
   213  	var samplingCh <-chan time.Time
   214  	if ps.areSpiesResettable() {
   215  		samplingCh = make(chan time.Time) // will never fire
   216  	} else {
   217  		ticker := time.NewTicker(time.Second / time.Duration(ps.sampleRate))
   218  		defer ticker.Stop()
   219  		samplingCh = ticker.C
   220  	}
   221  	uploadTicker := alignedticker.NewAlignedTicker(ps.uploadRate)
   222  	defer uploadTicker.Stop()
   223  	for {
   224  		select {
   225  		case endTimeTruncated := <-uploadTicker.C:
   226  			ps.resetSpies()
   227  			ps.takeSnapshot()
   228  			ps.reset(endTimeTruncated)
   229  		case <-samplingCh:
   230  			ps.takeSnapshot()
   231  		case <-ps.stopCh:
   232  			ps.stopSpies()
   233  			return
   234  		}
   235  	}
   236  }
   237  
   238  func (ps *ProfileSession) stopSpies() {
   239  	for _, sarr := range ps.spies {
   240  		for _, s := range sarr {
   241  			s.Stop()
   242  		}
   243  	}
   244  }
   245  
   246  func (ps *ProfileSession) takeSnapshot() {
   247  	ps.trieMutex.Lock()
   248  	defer ps.trieMutex.Unlock()
   249  
   250  	pidsToRemove := []int{}
   251  	for pid, sarr := range ps.spies {
   252  		for i, s := range sarr {
   253  			labelsCache := map[string]string{}
   254  			err := s.Snapshot(func(labels *spy.Labels, stack []byte, v uint64) error {
   255  				appName := ps.appName
   256  				if labels != nil {
   257  					if newAppName, ok := labelsCache[labels.ID()]; ok {
   258  						appName = newAppName
   259  					} else {
   260  						newAppName, err := mergeTagsWithAppName(appName, labels.Tags())
   261  						if err != nil {
   262  							return fmt.Errorf("error setting tags: %w", err)
   263  						}
   264  						appName = newAppName
   265  						labelsCache[labels.ID()] = appName
   266  					}
   267  				}
   268  				if len(stack) > 0 {
   269  					if _, ok := ps.tries[appName]; !ok {
   270  						ps.initializeTries(appName)
   271  					}
   272  					ps.tries[appName][i].Insert(stack, v, true)
   273  				}
   274  				return nil
   275  			})
   276  			if err != nil {
   277  				if pid >= 0 && !process.Exists(pid) {
   278  					ps.logger.Debugf("error taking snapshot: PID %d: process doesn't exist?", pid)
   279  					pidsToRemove = append(pidsToRemove, pid)
   280  				} else {
   281  					ps.throttler.Run(func(skipped int) {
   282  						if skipped > 0 {
   283  							ps.logger.Errorf("error taking snapshot: %v, %d messages skipped due to throttling", err, skipped)
   284  						} else {
   285  							ps.logger.Errorf("error taking snapshot: %v", err)
   286  						}
   287  					})
   288  				}
   289  			}
   290  		}
   291  	}
   292  	for _, pid := range pidsToRemove {
   293  		for _, s := range ps.spies[pid] {
   294  			s.Stop()
   295  		}
   296  		delete(ps.spies, pid)
   297  	}
   298  }
   299  
   300  func (ps *ProfileSession) areSpiesResettable() bool {
   301  	for _, sarr := range ps.spies {
   302  		for _, s := range sarr {
   303  			if _, ok := s.(spy.Resettable); ok {
   304  				return true
   305  			}
   306  		}
   307  	}
   308  	return false
   309  }
   310  
   311  func (ps *ProfileSession) resetSpies() {
   312  	for _, sarr := range ps.spies {
   313  		for _, s := range sarr {
   314  			if sr, ok := s.(spy.Resettable); ok {
   315  				sr.Reset()
   316  			}
   317  		}
   318  	}
   319  }
   320  
   321  func (ps *ProfileSession) initializeSpies(pid int) ([]spy.Spy, error) {
   322  	return ps.spyFactory(pid)
   323  }
   324  
   325  func (ps *ProfileSession) ChangeName(newName string) error {
   326  	ps.trieMutex.Lock()
   327  	defer ps.trieMutex.Unlock()
   328  
   329  	var err error
   330  	newName, err = mergeTagsWithAppName(newName, map[string]string{})
   331  	if err != nil {
   332  		return err
   333  	}
   334  
   335  	ps.appName = newName
   336  	ps.initializeTries(ps.appName)
   337  
   338  	return nil
   339  }
   340  
   341  func (ps *ProfileSession) initializeTries(appName string) {
   342  	if _, ok := ps.previousTries[appName]; !ok {
   343  		// TODO Only set the trie if it's not already set
   344  		ps.previousTries[appName] = []*transporttrie.Trie{}
   345  		ps.tries[appName] = []*transporttrie.Trie{}
   346  		for i := 0; i < len(ps.profileTypes); i++ {
   347  			ps.previousTries[appName] = append(ps.previousTries[appName], nil)
   348  			ps.tries[appName] = append(ps.tries[appName], transporttrie.New())
   349  		}
   350  	}
   351  }
   352  
   353  // SetTags - add new tags to the session.
   354  func (ps *ProfileSession) SetTags(tags map[string]string) error {
   355  	newName, err := mergeTagsWithAppName(ps.appName, tags)
   356  	if err != nil {
   357  		return err
   358  	}
   359  	return ps.ChangeName(newName)
   360  }
   361  
   362  // SetTag - add a new tag to the session.
   363  func (ps *ProfileSession) SetTag(key, val string) error {
   364  	return ps.SetTags(map[string]string{key: val})
   365  }
   366  
   367  // RemoveTags - remove tags from the session.
   368  func (ps *ProfileSession) RemoveTags(keys ...string) error {
   369  	removals := make(map[string]string)
   370  	for _, key := range keys {
   371  		// 'Adding' a key with an empty string triggers a key removal.
   372  		removals[key] = ""
   373  	}
   374  	newName, err := mergeTagsWithAppName(ps.appName, removals)
   375  	if err != nil {
   376  		return err
   377  	}
   378  	return ps.ChangeName(newName)
   379  }
   380  
   381  func (ps *ProfileSession) Start() error {
   382  	ps.reset(time.Now().Truncate(ps.uploadRate))
   383  
   384  	pid := ps.pid
   385  	spies, err := ps.initializeSpies(pid)
   386  	if err != nil {
   387  		return err
   388  	}
   389  
   390  	ps.spies[pid] = spies
   391  
   392  	go ps.takeSnapshots()
   393  	return nil
   394  }
   395  
   396  // the difference between stop and reset is that reset stops current session
   397  // and then instantly starts a new one
   398  func (ps *ProfileSession) reset(endTimeTruncated time.Time) {
   399  	ps.trieMutex.Lock()
   400  	defer ps.trieMutex.Unlock()
   401  
   402  	// if the process was forked the spy will keep profiling the old process. That's usually not what you want
   403  	//   so in that case we stop the profiling session early
   404  	if ps.clibIntegration && !ps.noForkDetection && ps.isForked() {
   405  		ps.logger.Debugf("fork detected, stopping the session")
   406  		ps.stopOnce.Do(func() {
   407  			close(ps.stopCh)
   408  		})
   409  		return
   410  	}
   411  
   412  	// upload the read data to server
   413  	if !ps.startTimeTruncated.IsZero() {
   414  		ps.uploadTries(endTimeTruncated)
   415  	}
   416  
   417  	// reset the start time
   418  	ps.startTimeTruncated = endTimeTruncated
   419  
   420  	if ps.withSubprocesses {
   421  		ps.addSubprocesses()
   422  	}
   423  }
   424  
   425  func (ps *ProfileSession) Stop() {
   426  	ps.trieMutex.Lock()
   427  	defer ps.trieMutex.Unlock()
   428  
   429  	ps.stopOnce.Do(func() {
   430  		// TODO: wait for stopCh consumer to finish!
   431  		close(ps.stopCh)
   432  		// before stopping, upload the tries
   433  		if !ps.startTimeTruncated.IsZero() {
   434  			ps.uploadTries(ps.startTimeTruncated.Add(ps.uploadRate))
   435  		} // was never started
   436  	})
   437  }
   438  
   439  func (ps *ProfileSession) uploadTries(endTimeTruncated time.Time) {
   440  	for name, tarr := range ps.tries {
   441  		for i, trie := range tarr {
   442  			profileType := ps.profileTypes[i]
   443  			skipUpload := false
   444  
   445  			if trie != nil {
   446  				endTime := endTimeTruncated
   447  				startTime := endTime.Add(-ps.uploadRate)
   448  
   449  				uploadTrie := trie
   450  				if profileType.IsCumulative() {
   451  					previousTrie := ps.previousTries[name][i]
   452  					if previousTrie == nil {
   453  						skipUpload = true
   454  					} else {
   455  						// TODO: Diff doesn't remove empty branches. We need to add that at some point
   456  						uploadTrie = trie.Diff(previousTrie)
   457  					}
   458  				}
   459  
   460  				if !skipUpload && !uploadTrie.IsEmpty() {
   461  					nameWithSuffix, _ := addSuffix(name, profileType)
   462  					ps.upstream.Upload(&upstream.UploadJob{
   463  						Name:            nameWithSuffix,
   464  						StartTime:       startTime,
   465  						EndTime:         endTime,
   466  						SpyName:         ps.spyName,
   467  						SampleRate:      ps.sampleRate,
   468  						Units:           profileType.Units(),
   469  						AggregationType: profileType.AggregationType(),
   470  						Trie:            uploadTrie,
   471  					})
   472  				}
   473  				if profileType.IsCumulative() {
   474  					ps.previousTries[name][i] = trie
   475  				}
   476  			}
   477  			ps.tries[name][i] = transporttrie.New()
   478  		}
   479  	}
   480  }
   481  
   482  func (ps *ProfileSession) isForked() bool {
   483  	return os.Getpid() != ps.pid
   484  }
   485  
   486  func (ps *ProfileSession) addSubprocesses() {
   487  	newPids := findAllSubprocesses(ps.pid)
   488  	for _, newPid := range newPids {
   489  		if _, ok := ps.spies[newPid]; !ok {
   490  			newSpies, err := ps.initializeSpies(newPid)
   491  			if err != nil {
   492  				if ps.logger != nil {
   493  					ps.logger.Errorf("failed to initialize a spy %d [%s]", newPid, ps.spyName)
   494  				}
   495  			} else {
   496  				if ps.logger != nil {
   497  					ps.logger.Debugf("started spy for subprocess %d [%s]", newPid, ps.spyName)
   498  				}
   499  				ps.spies[newPid] = newSpies
   500  			}
   501  		}
   502  	}
   503  }