github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/profiler/profiler.go (about)

     1  package profiler
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"math/rand"
    10  	"os"
    11  	"path/filepath"
    12  	"runtime"
    13  	"runtime/pprof"
    14  	"time"
    15  
    16  	"github.com/google/pprof/profile"
    17  	"github.com/rs/zerolog"
    18  	"go.uber.org/atomic"
    19  	"go.uber.org/multierr"
    20  	pb "google.golang.org/genproto/googleapis/devtools/cloudprofiler/v2"
    21  
    22  	"github.com/onflow/flow-go/engine"
    23  )
    24  
    25  type timedProfileFunc func(io.Writer, time.Duration) error
    26  type profileDef struct {
    27  	profileName string
    28  	profileType pb.ProfileType
    29  	profileFunc timedProfileFunc
    30  }
    31  
    32  // ProfilerConfig profiler parameters.
    33  type ProfilerConfig struct {
    34  	Enabled         bool
    35  	UploaderEnabled bool
    36  
    37  	Dir      string
    38  	Interval time.Duration
    39  	Duration time.Duration
    40  }
    41  
    42  type AutoProfiler struct {
    43  	unit     *engine.Unit
    44  	dir      string
    45  	log      zerolog.Logger
    46  	interval time.Duration
    47  	duration time.Duration
    48  
    49  	uploader Uploader
    50  	enabled  *atomic.Bool
    51  
    52  	// used to trigger a profile run for a given duration
    53  	trigger chan time.Duration
    54  }
    55  
    56  // New creates a new AutoProfiler instance performing profiling every interval for duration.
    57  func New(log zerolog.Logger, uploader Uploader, cfg ProfilerConfig) (*AutoProfiler, error) {
    58  
    59  	err := os.MkdirAll(cfg.Dir, os.ModePerm)
    60  	if err != nil {
    61  		return nil, fmt.Errorf("could not create profile dir %v: %w", cfg.Dir, err)
    62  	}
    63  
    64  	// add 50% jitter to the interval
    65  	jitter := time.Duration(rand.Int63n(int64(cfg.Interval)))
    66  	interval := cfg.Interval/2 + jitter
    67  
    68  	p := &AutoProfiler{
    69  		unit:     engine.NewUnit(),
    70  		log:      log.With().Str("component", "profiler").Logger(),
    71  		dir:      cfg.Dir,
    72  		interval: interval,
    73  		duration: cfg.Duration,
    74  		uploader: uploader,
    75  		enabled:  atomic.NewBool(cfg.Enabled),
    76  		trigger:  make(chan time.Duration),
    77  	}
    78  
    79  	go p.runForever()
    80  
    81  	return p, nil
    82  }
    83  
    84  // SetEnabled sets whether the profiler is active.
    85  // No errors are expected during normal operation.
    86  func (p *AutoProfiler) SetEnabled(enabled bool) error {
    87  	p.enabled.Store(enabled)
    88  	return nil
    89  }
    90  
    91  // Enabled returns the current enabled state of the profiler.
    92  func (p *AutoProfiler) Enabled() bool {
    93  	return p.enabled.Load()
    94  }
    95  
    96  // TriggerRun manually triggers a profile run if one is not already running.
    97  func (p *AutoProfiler) TriggerRun(d time.Duration) error {
    98  	select {
    99  	case p.trigger <- d:
   100  		return nil
   101  	default:
   102  		return errors.New("profiling is already in progress")
   103  	}
   104  }
   105  
   106  func (p *AutoProfiler) runForever() {
   107  	t := time.NewTicker(p.interval)
   108  	defer t.Stop()
   109  
   110  	for {
   111  		select {
   112  		case <-t.C:
   113  			if p.Enabled() {
   114  				p.runOnce(p.duration)
   115  			}
   116  		case d := <-p.trigger:
   117  			p.runOnce(d)
   118  		case <-p.unit.Quit():
   119  			return
   120  		}
   121  	}
   122  }
   123  
   124  func (p *AutoProfiler) Ready() <-chan struct{} {
   125  	if p.Enabled() {
   126  		p.log.Info().Dur("duration", p.duration).Time("nextRunAt", time.Now().Add(p.interval)).Msg("AutoProfiler has started")
   127  	} else {
   128  		p.log.Info().Msg("AutoProfiler has started, profiler is disabled")
   129  	}
   130  	return p.unit.Ready()
   131  }
   132  
   133  func (p *AutoProfiler) Done() <-chan struct{} {
   134  	return p.unit.Done()
   135  }
   136  
   137  func (p *AutoProfiler) runOnce(d time.Duration) {
   138  	startTime := time.Now()
   139  	p.log.Info().Msg("starting profile trace")
   140  
   141  	for _, prof := range [...]profileDef{
   142  		{profileName: "goroutine", profileType: pb.ProfileType_THREADS, profileFunc: func(w io.Writer, _ time.Duration) error { return newProfileFunc("goroutine")(w) }},
   143  		{profileName: "heap", profileType: pb.ProfileType_HEAP, profileFunc: p.pprofHeap},
   144  		{profileName: "allocs", profileType: pb.ProfileType_HEAP_ALLOC, profileFunc: p.pprofAllocs},
   145  		{profileName: "block", profileType: pb.ProfileType_CONTENTION, profileFunc: p.pprofBlock},
   146  		{profileName: "cpu", profileType: pb.ProfileType_WALL, profileFunc: p.pprofCpu},
   147  	} {
   148  		path := filepath.Join(p.dir, fmt.Sprintf("%s-%s", prof.profileName, time.Now().Format(time.RFC3339)))
   149  
   150  		logger := p.log.With().Str("profileName", prof.profileName).Str("profilePath", path).Logger()
   151  		logger.Info().Str("file", path).Msg("capturing")
   152  
   153  		f, err := os.CreateTemp(p.dir, "profile")
   154  		if err != nil {
   155  			logger.Err(err).Msg("failed to create temp profile")
   156  			continue
   157  		}
   158  		logger = logger.With().Str("tempFile", f.Name()).Logger()
   159  
   160  		// Remove temp file if it still exists.
   161  		defer func(logger zerolog.Logger, tmpName string) {
   162  			if _, err := os.Stat(tmpName); errors.Is(err, os.ErrNotExist) {
   163  				return
   164  			}
   165  			if err := os.Remove(tmpName); err != nil {
   166  				logger.Warn().Err(err).Msg("failed to remove profile")
   167  			}
   168  		}(logger, f.Name())
   169  
   170  		err = p.pprof(f, prof.profileFunc, d)
   171  		if err != nil {
   172  			logger.Error().Err(err).Msg("failed to generate profile")
   173  			continue
   174  		}
   175  
   176  		// default CreateTemp permissions are 0600.
   177  		err = os.Chmod(f.Name(), 0644)
   178  		if err != nil {
   179  			logger.Error().Err(err).Msg("failed to set profile permissions")
   180  			continue
   181  		}
   182  
   183  		err = os.Rename(f.Name(), path)
   184  		if err != nil {
   185  			logger.Error().Err(err).Msg("failed to rename profile")
   186  			continue
   187  		}
   188  
   189  		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
   190  		defer cancel()
   191  
   192  		err = p.uploader.Upload(ctx, path, prof.profileType)
   193  		if err != nil {
   194  			logger.Warn().Err(err).Msg("failed to upload profile")
   195  			continue
   196  		}
   197  	}
   198  	p.log.Info().Dur("duration", time.Since(startTime)).Msg("finished profile trace")
   199  }
   200  
   201  func (p *AutoProfiler) pprof(f *os.File, fn timedProfileFunc, d time.Duration) (err error) {
   202  	defer func() {
   203  		multierr.AppendInto(&err, f.Close())
   204  	}()
   205  
   206  	return fn(f, d)
   207  }
   208  
   209  type profileFunc func(io.Writer) error
   210  
   211  func newProfileFunc(name string) profileFunc {
   212  	return func(w io.Writer) error {
   213  		return pprof.Lookup(name).WriteTo(w, 0)
   214  	}
   215  }
   216  
   217  func (p *AutoProfiler) goHeapProfile(sampleTypes ...string) (*profile.Profile, error) {
   218  	if len(sampleTypes) == 0 {
   219  		return nil, fmt.Errorf("no sample types specified")
   220  	}
   221  
   222  	// Forces the GC before taking each of the heap profiles and improves the profile accuracy.
   223  	// Autoprofiler runs very infrequently so performance impact is minimal.
   224  	runtime.GC()
   225  
   226  	buf := &bytes.Buffer{}
   227  	err := newProfileFunc("heap")(buf)
   228  	if err != nil {
   229  		return nil, fmt.Errorf("failed to generate allocs profile: %w", err)
   230  	}
   231  
   232  	prof, err := profile.Parse(buf)
   233  	if err != nil {
   234  		return nil, fmt.Errorf("failed to parse allocs profile: %w", err)
   235  	}
   236  	prof.TimeNanos = time.Now().UnixNano()
   237  
   238  	selectedSampleTypes := make([]int, 0, len(sampleTypes))
   239  	for _, name := range sampleTypes {
   240  		for i, sampleType := range prof.SampleType {
   241  			if sampleType.Type == name {
   242  				selectedSampleTypes = append(selectedSampleTypes, i)
   243  			}
   244  		}
   245  	}
   246  	if len(selectedSampleTypes) != len(sampleTypes) {
   247  		return nil, fmt.Errorf("failed to find all sample types: want: %+v, got: %+v", sampleTypes, prof.SampleType)
   248  	}
   249  
   250  	newSampleType := make([]*profile.ValueType, 0, len(selectedSampleTypes))
   251  	for _, j := range selectedSampleTypes {
   252  		newSampleType = append(newSampleType, prof.SampleType[j])
   253  	}
   254  	prof.SampleType = newSampleType
   255  	prof.DefaultSampleType = prof.SampleType[0].Type
   256  
   257  	for _, s := range prof.Sample {
   258  		newValue := make([]int64, 0, len(selectedSampleTypes))
   259  		for _, j := range selectedSampleTypes {
   260  			newValue = append(newValue, s.Value[j])
   261  		}
   262  		s.Value = newValue
   263  	}
   264  
   265  	// Merge profile with itself to remove empty samples.
   266  	prof, err = profile.Merge([]*profile.Profile{prof})
   267  	if err != nil {
   268  		return nil, fmt.Errorf("failed to merge profile: %w", err)
   269  	}
   270  
   271  	return prof, nil
   272  }
   273  
   274  // pprofHeap produces cumulative heap profile since the program start.
   275  func (p *AutoProfiler) pprofHeap(w io.Writer, _ time.Duration) error {
   276  	prof, err := p.goHeapProfile("inuse_objects", "inuse_space")
   277  	if err != nil {
   278  		return fmt.Errorf("failed to get heap profile: %w", err)
   279  	}
   280  
   281  	return prof.Write(w)
   282  }
   283  
   284  // pprofAllocs produces differential allocs profile for the given duration.
   285  func (p *AutoProfiler) pprofAllocs(w io.Writer, d time.Duration) (err error) {
   286  	p1, err := p.goHeapProfile("alloc_objects", "alloc_space")
   287  	if err != nil {
   288  		return fmt.Errorf("failed to get allocs profile: %w", err)
   289  	}
   290  
   291  	select {
   292  	case <-time.After(d):
   293  	case <-p.unit.Quit():
   294  		return context.Canceled
   295  	}
   296  
   297  	p2, err := p.goHeapProfile("alloc_objects", "alloc_space")
   298  	if err != nil {
   299  		return fmt.Errorf("failed to get allocs profile: %w", err)
   300  	}
   301  
   302  	// multiply values by -1 and merge to get differential profile.
   303  	p1.Scale(-1)
   304  	diff, err := profile.Merge([]*profile.Profile{p1, p2})
   305  	if err != nil {
   306  		return fmt.Errorf("failed to merge allocs profiles: %w", err)
   307  	}
   308  	diff.TimeNanos = time.Now().UnixNano()
   309  	diff.DurationNanos = d.Nanoseconds()
   310  
   311  	return diff.Write(w)
   312  }
   313  
   314  func (p *AutoProfiler) pprofBlock(w io.Writer, d time.Duration) error {
   315  	runtime.SetBlockProfileRate(100)
   316  	defer runtime.SetBlockProfileRate(0)
   317  
   318  	select {
   319  	case <-time.After(d):
   320  		return newProfileFunc("block")(w)
   321  	case <-p.unit.Quit():
   322  		return context.Canceled
   323  	}
   324  }
   325  
   326  func (p *AutoProfiler) pprofCpu(w io.Writer, d time.Duration) error {
   327  	err := pprof.StartCPUProfile(w)
   328  	if err != nil {
   329  		return fmt.Errorf("failed to start CPU profile: %w", err)
   330  	}
   331  	defer pprof.StopCPUProfile()
   332  
   333  	select {
   334  	case <-time.After(d):
   335  		return nil
   336  	case <-p.unit.Quit():
   337  		return context.Canceled
   338  	}
   339  }