github.com/grailbio/base@v0.0.11/file/s3file/metrics.go (about)

     1  package s3file
     2  
     3  import (
     4  	"expvar"
     5  	"flag"
     6  	"fmt"
     7  	"io"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/grailbio/base/log"
    13  )
    14  
    15  var (
    16  	metricAutologOnce   sync.Once
    17  	metricAutologPeriod = flag.Duration("s3file.metric_log_period", 0,
    18  		"Interval for logging S3 operation metrics. Zero disables logging.")
    19  )
    20  
    21  func metricAutolog() {
    22  	metricAutologOnce.Do(func() {
    23  		if period := *metricAutologPeriod; period > 0 {
    24  			go logMetricsLoop(period)
    25  		}
    26  	})
    27  }
    28  
    29  type metricOpMap struct{ m sync.Map }
    30  
    31  func (m *metricOpMap) Op(key string) *metricOp {
    32  	var init metricOp
    33  	got, _ := m.m.LoadOrStore(key, &init)
    34  	return got.(*metricOp)
    35  }
    36  
    37  func (m *metricOpMap) VisitAndReset(f func(string, *metricOp)) {
    38  	m.m.Range(func(key, value interface{}) bool {
    39  		m.m.Delete(key)
    40  		f(key.(string), value.(*metricOp))
    41  		return true
    42  	})
    43  }
    44  
    45  var (
    46  	metrics           metricOpMap
    47  	metricRemoteAddrs expvar.Map
    48  )
    49  
    50  type metricOp struct {
    51  	Count expvar.Int
    52  
    53  	Retry1 expvar.Int
    54  	Retry2 expvar.Int
    55  	Retry4 expvar.Int
    56  	Retry8 expvar.Int
    57  
    58  	DurationFast  expvar.Int
    59  	Duration1Ms   expvar.Int
    60  	Duration10Ms  expvar.Int
    61  	Duration100Ms expvar.Int
    62  	Duration1S    expvar.Int
    63  	Duration10S   expvar.Int
    64  	Duration100S  expvar.Int
    65  
    66  	Bytes expvar.Int
    67  }
    68  
    69  type metricOpProgress struct {
    70  	parent  *metricOp
    71  	start   time.Time
    72  	retries int // == 0 if first try succeeds
    73  }
    74  
    75  func (m *metricOp) Start() *metricOpProgress {
    76  	m.Count.Add(1)
    77  	return &metricOpProgress{m, time.Now(), 0}
    78  }
    79  
    80  func (m *metricOpProgress) Retry() { m.retries++ }
    81  
    82  func (m *metricOpProgress) Bytes(b int) { m.parent.Bytes.Add(int64(b)) }
    83  
    84  func (m *metricOpProgress) Done() {
    85  	switch {
    86  	case m.retries >= 8:
    87  		m.parent.Retry8.Add(1)
    88  	case m.retries >= 4:
    89  		m.parent.Retry4.Add(1)
    90  	case m.retries >= 2:
    91  		m.parent.Retry2.Add(1)
    92  	case m.retries >= 1:
    93  		m.parent.Retry1.Add(1)
    94  	}
    95  
    96  	took := time.Since(m.start)
    97  	switch {
    98  	case took > 100*time.Second:
    99  		m.parent.Duration100S.Add(1)
   100  	case took > 10*time.Second:
   101  		m.parent.Duration10S.Add(1)
   102  	case took > time.Second:
   103  		m.parent.Duration1S.Add(1)
   104  	case took > 100*time.Millisecond:
   105  		m.parent.Duration100Ms.Add(1)
   106  	case took > 10*time.Millisecond:
   107  		m.parent.Duration10Ms.Add(1)
   108  	case took > 1*time.Millisecond:
   109  		m.parent.Duration1Ms.Add(1)
   110  	default:
   111  		m.parent.DurationFast.Add(1)
   112  	}
   113  }
   114  
   115  func (m *metricOp) Write(w io.Writer, period time.Duration) (int, error) {
   116  	perMinute := 60 / period.Seconds()
   117  	return fmt.Fprintf(w, "n:%d r:%d/%d/%d/%d t:%d/%d/%d/%d/%d/%d/%d mib:%d [/min]",
   118  		int(float64(m.Count.Value())*perMinute),
   119  		int(float64(m.Retry1.Value())*perMinute),
   120  		int(float64(m.Retry2.Value())*perMinute),
   121  		int(float64(m.Retry4.Value())*perMinute),
   122  		int(float64(m.Retry8.Value())*perMinute),
   123  		int(float64(m.DurationFast.Value())*perMinute),
   124  		int(float64(m.Duration1Ms.Value())*perMinute),
   125  		int(float64(m.Duration10Ms.Value())*perMinute),
   126  		int(float64(m.Duration100Ms.Value())*perMinute),
   127  		int(float64(m.Duration1S.Value())*perMinute),
   128  		int(float64(m.Duration10S.Value())*perMinute),
   129  		int(float64(m.Duration100S.Value())*perMinute),
   130  		int(float64(m.Bytes.Value())/(1<<20)*perMinute),
   131  	)
   132  }
   133  
   134  func logMetricsLoop(period time.Duration) {
   135  	ticker := time.NewTicker(period)
   136  	defer ticker.Stop()
   137  	var buf strings.Builder
   138  	for {
   139  		select {
   140  		case <-ticker.C:
   141  			metrics.VisitAndReset(func(op string, metrics *metricOp) {
   142  				buf.Reset()
   143  				fmt.Fprintf(&buf, "s3file metrics: op:%s ", op)
   144  				_, _ = metrics.Write(&buf, period)
   145  				log.Print(buf.String())
   146  			})
   147  		}
   148  	}
   149  }