github.com/cloudwego/kitex@v0.9.0/pkg/profiler/profiler.go (about)

     1  /*
     2   * Copyright 2022 CloudWeGo Authors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package profiler
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"runtime/pprof"
    25  	"sort"
    26  	"strings"
    27  	"sync"
    28  	"time"
    29  
    30  	"github.com/google/pprof/profile"
    31  
    32  	"github.com/cloudwego/kitex/pkg/klog"
    33  )
    34  
    35  type profilerContextKey struct{}
    36  
    37  type Profiler interface {
    38  	Run(ctx context.Context) (err error)
    39  	Stop()
    40  	Pause()
    41  	Resume()
    42  	Prepare(ctx context.Context) context.Context
    43  	Tag(ctx context.Context, tags ...string) context.Context
    44  	Untag(ctx context.Context)
    45  	Lookup(ctx context.Context, key string) (string, bool)
    46  }
    47  
    48  type Processor func(profiles []*TagsProfile) error
    49  
    50  func LogProcessor(profiles []*TagsProfile) error {
    51  	if len(profiles) == 0 {
    52  		return nil
    53  	}
    54  	klog.Infof("KITEX: profiler collect %d records", len(profiles))
    55  	for _, p := range profiles {
    56  		if p.Key != "" {
    57  			klog.Infof("KITEX: profiler - %s %.2f%%", p.Key, p.Percent*100)
    58  		} else {
    59  			klog.Infof("KITEX: profiler - type=default %.2f%%", p.Percent*100)
    60  		}
    61  	}
    62  	klog.Info("---------------------------------")
    63  	return nil
    64  }
    65  
    66  func NewProfiler(processor Processor, interval, window time.Duration) *profiler {
    67  	if processor == nil {
    68  		processor = LogProcessor
    69  	}
    70  	return &profiler{
    71  		stateCond: sync.NewCond(&sync.Mutex{}),
    72  		processor: processor,
    73  		interval:  interval,
    74  		window:    window,
    75  	}
    76  }
    77  
    78  var _ Profiler = (*profiler)(nil)
    79  
    80  const (
    81  	// state changes:
    82  	//   running => pausing => paused => resuming => running
    83  	//           => stopped
    84  	stateRunning  = 0
    85  	statePausing  = 1
    86  	statePaused   = 2
    87  	stateResuming = 3
    88  	stateStopped  = 4
    89  )
    90  
    91  type profiler struct {
    92  	data      bytes.Buffer // protobuf
    93  	state     int
    94  	stateCond *sync.Cond
    95  	// settings
    96  	processor Processor
    97  	interval  time.Duration // sleep time between every profiling window
    98  	window    time.Duration // profiling in the window, go pprof collect stack profile every 10ms
    99  }
   100  
   101  // Tag current goroutine with tagged ctx
   102  // it's used for reuse goroutine scenario
   103  func Tag(ctx context.Context) {
   104  	if pc, ok := ctx.Value(profilerContextKey{}).(*profilerContext); ok {
   105  		pc.profiler.Tag(ctx)
   106  	}
   107  }
   108  
   109  // Untag current goroutine with tagged ctx
   110  // it's used for reuse goroutine scenario
   111  func Untag(ctx context.Context) {
   112  	if pc, ok := ctx.Value(profilerContextKey{}).(*profilerContext); ok {
   113  		pc.profiler.Untag(ctx)
   114  	}
   115  }
   116  
   117  type profilerContext struct {
   118  	profiler Profiler
   119  	untagCtx context.Context
   120  	tags     []string
   121  }
   122  
   123  func newProfilerContext(profiler Profiler) *profilerContext {
   124  	return &profilerContext{
   125  		profiler: profiler,
   126  		tags:     make([]string, 0, 12),
   127  	}
   128  }
   129  
   130  // Prepare the profiler context
   131  func (p *profiler) Prepare(ctx context.Context) context.Context {
   132  	if c := ctx.Value(profilerContextKey{}); c != nil {
   133  		return ctx
   134  	}
   135  	return context.WithValue(ctx, profilerContextKey{}, newProfilerContext(p))
   136  }
   137  
   138  // State return current profiler state
   139  func (p *profiler) State() (state int) {
   140  	p.stateCond.L.Lock()
   141  	state = p.state
   142  	p.stateCond.L.Unlock()
   143  	return state
   144  }
   145  
   146  // Stop the profiler
   147  func (p *profiler) Stop() {
   148  	if p.State() == stateStopped {
   149  		return
   150  	}
   151  	// stateRunning => stateStopped
   152  	p.stateChange(stateRunning, stateStopped)
   153  }
   154  
   155  // Pause the profiler.
   156  // The profiler has been paused when Pause() return
   157  func (p *profiler) Pause() {
   158  	if p.State() == statePaused {
   159  		return
   160  	}
   161  	// stateRunning => statePausing
   162  	p.stateChange(stateRunning, statePausing)
   163  	// => statePaused
   164  	p.stateWait(statePaused)
   165  }
   166  
   167  // Resume the profiler.
   168  // The profiler has been resumed when Resume() return
   169  func (p *profiler) Resume() {
   170  	if p.State() == stateRunning {
   171  		return
   172  	}
   173  	// statePaused => stateResuming
   174  	p.stateChange(statePaused, stateResuming)
   175  	// => stateRunning
   176  	p.stateWait(stateRunning)
   177  }
   178  
   179  // Run start analyse the pprof data with interval and window settings
   180  func (p *profiler) Run(ctx context.Context) (err error) {
   181  	var profiles []*TagsProfile
   182  	timer := time.NewTimer(0)
   183  	for {
   184  		// check state
   185  		state := p.State()
   186  		switch state {
   187  		case stateRunning: // do nothing
   188  		case statePausing: // pause the loop
   189  			p.stateChange(statePausing, statePaused) // wake up Pause()
   190  			klog.Info("KITEX: profiler paused")
   191  			p.stateChange(stateResuming, stateRunning) // wake up Resume()
   192  			klog.Info("KITEX: profiler resumed")
   193  			continue
   194  		case statePaused, stateResuming: // actually, no such case
   195  			continue
   196  		case stateStopped: // end the loop
   197  			klog.Info("KITEX: profiler stopped")
   198  			return nil
   199  		}
   200  
   201  		// wait for an interval time to reduce the cost of profiling
   202  		if p.interval > 0 {
   203  			timer.Reset(p.interval)
   204  			<-timer.C
   205  		}
   206  
   207  		// start profiler
   208  		if err = p.startProfile(); err != nil {
   209  			klog.Errorf("KITEX: profiler start profile failed: %v", err)
   210  			// the reason of startProfile failed maybe because of there is a running pprof program.
   211  			// so we need to wait a fixed time for it.
   212  			time.Sleep(time.Second * 30)
   213  			continue
   214  		}
   215  
   216  		// wait for a window time to collect pprof data
   217  		if p.window > 0 {
   218  			timer.Reset(p.window)
   219  			<-timer.C
   220  		}
   221  
   222  		// stop profiler
   223  		p.stopProfile()
   224  
   225  		// analyse and process pprof data
   226  		profiles, err = p.analyse()
   227  		if err != nil {
   228  			klog.Errorf("KITEX: profiler analyse failed: %v", err)
   229  			continue
   230  		}
   231  		err = p.processor(profiles)
   232  		if err != nil {
   233  			klog.Errorf("KITEX: profiler controller process failed: %v", err)
   234  			continue
   235  		}
   236  	}
   237  }
   238  
   239  // Tag current goroutine with tags
   240  // If ctx already tagged, append the existed tags
   241  func (p *profiler) Tag(ctx context.Context, tags ...string) context.Context {
   242  	pctx, ok := ctx.Value(profilerContextKey{}).(*profilerContext)
   243  	if !ok {
   244  		pctx = newProfilerContext(p)
   245  		ctx = context.WithValue(ctx, profilerContextKey{}, pctx)
   246  	}
   247  	if pctx.untagCtx == nil {
   248  		pctx.untagCtx = ctx
   249  	}
   250  	pctx.tags = append(pctx.tags, tags...)
   251  	// do not return pprof ctx
   252  	pprof.SetGoroutineLabels(pprof.WithLabels(context.Background(), pprof.Labels(pctx.tags...)))
   253  	return ctx
   254  }
   255  
   256  // Untag current goroutine
   257  // Only untag if ctx already tagged, will not clear the goroutine labels if not tagged by profiler
   258  func (p *profiler) Untag(ctx context.Context) {
   259  	if pc, ok := ctx.Value(profilerContextKey{}).(*profilerContext); ok && pc.untagCtx != nil {
   260  		// if ctx have untagCtx, that means the current goroutine created by a tagged goroutine
   261  		// we need to untag the goroutine when finished
   262  		// else, do nothing
   263  		pprof.SetGoroutineLabels(pc.untagCtx)
   264  	}
   265  }
   266  
   267  func (p *profiler) Lookup(ctx context.Context, key string) (string, bool) {
   268  	return pprof.Label(ctx, key)
   269  }
   270  
   271  func (p *profiler) stateChange(from, to int) {
   272  	p.stateCond.L.Lock()
   273  	for p.state != from { // wait state to from first
   274  		p.stateCond.Wait()
   275  	}
   276  	p.state = to
   277  	p.stateCond.L.Unlock()
   278  	p.stateCond.Broadcast()
   279  }
   280  
   281  func (p *profiler) stateWait(to int) {
   282  	p.stateCond.L.Lock()
   283  	for p.state != to {
   284  		p.stateCond.Wait()
   285  	}
   286  	p.stateCond.L.Unlock()
   287  }
   288  
   289  func (p *profiler) startProfile() error {
   290  	p.data.Reset()
   291  	return pprof.StartCPUProfile(&p.data)
   292  }
   293  
   294  func (p *profiler) stopProfile() {
   295  	pprof.StopCPUProfile()
   296  }
   297  
   298  func (p *profiler) analyse() ([]*TagsProfile, error) {
   299  	// parse protobuf data
   300  	pf, err := profile.ParseData(p.data.Bytes())
   301  	if err != nil {
   302  		return nil, err
   303  	}
   304  
   305  	// filter cpu value index
   306  	sampleIdx := -1
   307  	for idx, st := range pf.SampleType {
   308  		if st.Type == "cpu" {
   309  			sampleIdx = idx
   310  			break
   311  		}
   312  	}
   313  	if sampleIdx < 0 {
   314  		return nil, errors.New("profiler: sample type not found")
   315  	}
   316  
   317  	// calculate every sample expense
   318  	counter := map[string]*TagsProfile{} // map[tagsKey]funcProfile
   319  	var total int64
   320  	for _, sm := range pf.Sample {
   321  		value := sm.Value[sampleIdx]
   322  		tags := labelToTags(sm.Label)
   323  		tagsKey := tagsToKey(tags)
   324  		tp, ok := counter[tagsKey]
   325  		if !ok {
   326  			tp = &TagsProfile{}
   327  			counter[tagsKey] = tp
   328  			tp.Key = tagsKey
   329  			tp.Tags = tags
   330  		}
   331  		tp.Value += value
   332  		total += value
   333  	}
   334  
   335  	// compensate value with duration date
   336  	durationRate := float64((p.interval + p.window) / p.window)
   337  	profiles := make([]*TagsProfile, 0, len(counter)) // flat to array
   338  	for _, l := range counter {
   339  		l.Percent = float64(l.Value) / float64(total)
   340  		l.Value = int64(durationRate * float64(l.Value))
   341  		profiles = append(profiles, l)
   342  	}
   343  	return profiles, nil
   344  }
   345  
   346  // TagsProfile is the stats result group by tag key
   347  type TagsProfile struct {
   348  	Key     string   // eg: a=1,b=2
   349  	Tags    []string // eg: ["a", "1", "b", "2"]
   350  	Value   int64    // pprof cpu times
   351  	Percent float64  // <= 1.0
   352  }
   353  
   354  func labelToTags(label map[string][]string) []string {
   355  	tags := make([]string, 0, len(label)*2)
   356  	for k, v := range label {
   357  		tags = append(tags, k, strings.Join(v, ","))
   358  	}
   359  	return tags
   360  }
   361  
   362  func tagsToKey(tags []string) string {
   363  	if len(tags)%2 != 0 {
   364  		return ""
   365  	}
   366  	tagsPair := make([]string, 0, len(tags)/2)
   367  	for i := 0; i < len(tags); i += 2 {
   368  		tagsPair = append(tagsPair, fmt.Sprintf("%s=%s", tags[i], tags[i+1]))
   369  	}
   370  	// sort tags to make it a unique key
   371  	sort.Strings(tagsPair)
   372  	return strings.Join(tagsPair, "|")
   373  }