bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/collect/collect.go (about)

     1  // Package collect provides functions for sending data to OpenTSDB.
     2  //
     3  // The "collect" namespace is used (i.e., <root>.collect) to collect
     4  // program and queue metrics.
     5  package collect // import "bosun.org/collect"
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  	"net/http"
    11  	"net/url"
    12  	"runtime"
    13  	"sort"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	"bosun.org/util"
    19  
    20  	"bosun.org/metadata"
    21  	"bosun.org/opentsdb"
    22  )
    23  
    24  var (
    25  	// Freq is how often metrics are sent to OpenTSDB.
    26  	Freq = time.Second * 15
    27  
    28  	// MaxQueueLen is the maximum size of the queue, above which incoming data will
    29  	// be discarded. Defaults to about 150MB.
    30  	MaxQueueLen = 200000
    31  
    32  	// BatchSize is the maximum length of data points sent at once to OpenTSDB.
    33  	BatchSize = 500
    34  
    35  	// Debug enables debug logging.
    36  	Debug = false
    37  
    38  	// Print prints all datapoints to stdout instead of sending them.
    39  	Print = false
    40  
    41  	// DisableDefaultCollectors prevents the scollector self metrics from being
    42  	// generated.
    43  	DisableDefaultCollectors = false
    44  
    45  	// Tags is an opentsdb.TagSet used when sending self metrics.
    46  	Tags opentsdb.TagSet
    47  
    48  	// Whether or not to use NTLM authentication
    49  	UseNtlm bool = false
    50  
    51  	// DefaultClient can be used to override the HTTP client that will be used to make requests.
    52  	DefaultClient *http.Client = http.DefaultClient
    53  
    54  	// Dropped is the number of dropped data points due to a full queue.
    55  	dropped int64
    56  
    57  	// Dropped is the number of discarded data points due to being invalid
    58  	discarded int64
    59  
    60  	// Sent is the number of sent data points.
    61  	sent int64
    62  
    63  	// Authtoken is the token to use to communicate with bosun
    64  	AuthToken string
    65  
    66  	tchan               chan *opentsdb.DataPoint
    67  	tsdbURL             string
    68  	metricRoot          string
    69  	queue               []*opentsdb.DataPoint
    70  	qlock, mlock, slock sync.Mutex // Locks for queues, maps, stats.
    71  	counters            = make(map[string]*addMetric)
    72  	sets                = make(map[string]*setMetric)
    73  	puts                = make(map[string]*putMetric)
    74  	aggs                = make(map[string]*agMetric)
    75  
    76  	//DirectHandler is an http handler to invoke instead of actually making a network request
    77  	DirectHandler http.Handler
    78  )
    79  
    80  const (
    81  	descCollectAlloc             = "Total number of bytes allocated and still in use by the runtime (via runtime.ReadMemStats)."
    82  	descCollectDiscarded         = "Counter of discarded data points due to being invalid."
    83  	descCollectDropped           = "Counter of dropped data points due to the queue being full."
    84  	descCollectGoRoutines        = "Total number of goroutines that currently exist (via runtime.NumGoroutine)."
    85  	descCollectGcCpuFraction     = "fraction of CPU time used by GC"
    86  	descCollectTotalGCPause      = "Total GC Pause time in milliseconds"
    87  	descCollectPostBad           = "Counter of HTTP POST requests where resp.StatusCode != http.StatusNoContent."
    88  	descCollectPostBatchSize     = "Number of datapoints included in each batch."
    89  	descCollectPostCount         = "Counter of batches sent to the server."
    90  	descCollectPostDuration      = "How many milliseconds it took to send HTTP POST requests to the server."
    91  	descCollectPostError         = "Counter of errors received when sending a batch to the server."
    92  	descCollectPostRestore       = "Counter of data points restored from batches that could not be sent to the server."
    93  	descCollectPostTotalBytes    = "Total number of gzipped bytes sent to the server."
    94  	descCollectPostTotalDuration = "Total number of milliseconds it took to send an HTTP POST request to the server."
    95  	descCollectQueued            = "Total number of items currently queued and waiting to be sent to the server."
    96  	descCollectSent              = "Counter of data points sent to the server."
    97  )
    98  
    99  // InitChan is similar to Init, but uses the given channel instead of creating a
   100  // new one.
   101  func InitChan(tsdbhost *url.URL, root string, ch chan *opentsdb.DataPoint) error {
   102  	if tchan != nil {
   103  		return fmt.Errorf("cannot init twice")
   104  	}
   105  	if err := checkClean(root, "metric root"); err != nil {
   106  		return err
   107  	}
   108  	u, err := tsdbhost.Parse("/api/put")
   109  	if err != nil {
   110  		return err
   111  	}
   112  	if strings.HasPrefix(u.Host, ":") {
   113  		u.Host = "localhost" + u.Host
   114  	}
   115  	tsdbURL = u.String()
   116  	metricRoot = root + "."
   117  	tchan = ch
   118  	go queuer()
   119  	go send()
   120  	go collect()
   121  	if DisableDefaultCollectors {
   122  		return nil
   123  	}
   124  	Set("collect.dropped", Tags, func() (i interface{}) {
   125  		slock.Lock()
   126  		i = dropped
   127  		slock.Unlock()
   128  		return
   129  	})
   130  	Set("collect.discarded", Tags, func() (i interface{}) {
   131  		slock.Lock()
   132  		i = discarded
   133  		slock.Unlock()
   134  		return
   135  	})
   136  
   137  	Set("collect.sent", Tags, func() (i interface{}) {
   138  		slock.Lock()
   139  		i = sent
   140  		slock.Unlock()
   141  		return
   142  	})
   143  	Set("collect.queued", Tags, func() (i interface{}) {
   144  		qlock.Lock()
   145  		i = len(queue)
   146  		qlock.Unlock()
   147  		return
   148  	})
   149  	Set("collect.alloc", Tags, func() interface{} {
   150  		var ms runtime.MemStats
   151  		runtime.ReadMemStats(&ms)
   152  		return ms.Alloc
   153  	})
   154  	Set("collect.gc.cpu_fraction", Tags, func() interface{} {
   155  		var ms runtime.MemStats
   156  		runtime.ReadMemStats(&ms)
   157  		return ms.GCCPUFraction
   158  	})
   159  	Set("collect.gc.total_pause", Tags, func() interface{} {
   160  		var ms runtime.MemStats
   161  		runtime.ReadMemStats(&ms)
   162  		return ms.PauseTotalNs / uint64(time.Millisecond)
   163  	})
   164  	Set("collect.goroutines", Tags, func() interface{} {
   165  		return runtime.NumGoroutine()
   166  	})
   167  	AggregateMeta(metricRoot+"collect.post.batchsize", metadata.Count, descCollectPostBatchSize)
   168  	AggregateMeta(metricRoot+"collect.post.duration", metadata.MilliSecond, descCollectPostDuration)
   169  	metadata.AddMetricMeta(metricRoot+"collect.alloc", metadata.Gauge, metadata.Bytes, descCollectAlloc)
   170  	metadata.AddMetricMeta(metricRoot+"collect.goroutines", metadata.Gauge, metadata.Count, descCollectGoRoutines)
   171  	metadata.AddMetricMeta(metricRoot+"collect.gc.cpu_fraction", metadata.Gauge, metadata.Pct, descCollectGcCpuFraction)
   172  	metadata.AddMetricMeta(metricRoot+"collect.gc.total_pause", metadata.Counter, metadata.MilliSecond, descCollectTotalGCPause)
   173  	metadata.AddMetricMeta(metricRoot+"collect.post.bad_status", metadata.Counter, metadata.PerSecond, descCollectPostBad)
   174  	metadata.AddMetricMeta(metricRoot+"collect.post.count", metadata.Counter, metadata.PerSecond, descCollectPostCount)
   175  	metadata.AddMetricMeta(metricRoot+"collect.post.error", metadata.Counter, metadata.PerSecond, descCollectPostError)
   176  	metadata.AddMetricMeta(metricRoot+"collect.post.restore", metadata.Counter, metadata.PerSecond, descCollectPostRestore)
   177  	metadata.AddMetricMeta(metricRoot+"collect.post.total_bytes", metadata.Counter, metadata.Bytes, descCollectPostTotalBytes)
   178  	metadata.AddMetricMeta(metricRoot+"collect.post.total_duration", metadata.Counter, metadata.MilliSecond, descCollectPostTotalDuration)
   179  	metadata.AddMetricMeta(metricRoot+"collect.queued", metadata.Gauge, metadata.Item, descCollectQueued)
   180  	metadata.AddMetricMeta(metricRoot+"collect.sent", metadata.Counter, metadata.PerSecond, descCollectSent)
   181  	metadata.AddMetricMeta(metricRoot+"collect.dropped", metadata.Counter, metadata.PerSecond, descCollectDropped)
   182  	metadata.AddMetricMeta(metricRoot+"collect.discarded", metadata.Counter, metadata.PerSecond, descCollectDiscarded)
   183  	// Make sure these get zeroed out instead of going unknown on restart
   184  	Add("collect.post.error", Tags, 0)
   185  	Add("collect.post.bad_status", Tags, 0)
   186  	Add("collect.post.restore", Tags, 0)
   187  	return nil
   188  }
   189  
   190  // Init sets up the channels and the queue for sending data to OpenTSDB. It also
   191  // sets up the basename for all metrics.
   192  func Init(tsdbhost *url.URL, root string) error {
   193  	return InitChan(tsdbhost, root, make(chan *opentsdb.DataPoint))
   194  }
   195  
   196  type agMetric struct {
   197  	metric string
   198  	ts     opentsdb.TagSet
   199  	values []float64
   200  }
   201  
   202  func AggregateMeta(metric string, unit metadata.Unit, desc string) {
   203  	agStrings := []string{"avg", "count", "min", "median", "max", "95", "99"}
   204  	for _, ag := range agStrings {
   205  		if ag == "count" {
   206  			metadata.AddMetricMeta(metric+"_"+ag, metadata.Gauge, metadata.Count, "The number of samples per aggregation.")
   207  			continue
   208  		}
   209  		metadata.AddMetricMeta(metric+"_"+ag, metadata.Gauge, unit, desc)
   210  	}
   211  }
   212  
   213  func (am *agMetric) Process(now int64) {
   214  	var avg float64
   215  	for _, v := range am.values {
   216  		avg += v
   217  	}
   218  	avg /= float64(len(am.values))
   219  	extRoot := metricRoot + am.metric
   220  	tchan <- &opentsdb.DataPoint{
   221  		Metric:    extRoot + "_avg",
   222  		Timestamp: now,
   223  		Value:     avg,
   224  		Tags:      am.ts,
   225  	}
   226  	tchan <- &opentsdb.DataPoint{
   227  		Metric:    extRoot + "_count",
   228  		Timestamp: now,
   229  		Value:     len(am.values),
   230  		Tags:      am.ts,
   231  	}
   232  	sort.Float64s(am.values)
   233  	percentile := func(p float64) float64 {
   234  		if p <= 0 {
   235  			return am.values[0]
   236  		}
   237  		if p >= 1 {
   238  			return am.values[len(am.values)-1]
   239  		}
   240  		i := p * float64(len(am.values)-1)
   241  		i = math.Ceil(i)
   242  		return am.values[int(i)]
   243  	}
   244  	tchan <- &opentsdb.DataPoint{
   245  		Metric:    extRoot + "_min",
   246  		Timestamp: now,
   247  		Value:     percentile(0),
   248  		Tags:      am.ts,
   249  	}
   250  	tchan <- &opentsdb.DataPoint{
   251  		Metric:    extRoot + "_median",
   252  		Timestamp: now,
   253  		Value:     percentile(.5),
   254  		Tags:      am.ts,
   255  	}
   256  	tchan <- &opentsdb.DataPoint{
   257  		Metric:    extRoot + "_max",
   258  		Timestamp: now,
   259  		Value:     percentile(1),
   260  		Tags:      am.ts,
   261  	}
   262  	tchan <- &opentsdb.DataPoint{
   263  		Metric:    extRoot + "_95",
   264  		Timestamp: now,
   265  		Value:     percentile(.95),
   266  		Tags:      am.ts,
   267  	}
   268  	tchan <- &opentsdb.DataPoint{
   269  		Metric:    extRoot + "_99",
   270  		Timestamp: now,
   271  		Value:     percentile(.99),
   272  		Tags:      am.ts,
   273  	}
   274  }
   275  
   276  func Sample(metric string, ts opentsdb.TagSet, v float64) error {
   277  	if err := check(metric, &ts); err != nil {
   278  		return err
   279  	}
   280  	tss := metric + ts.String()
   281  	mlock.Lock()
   282  	if aggs[tss] == nil {
   283  		aggs[tss] = &agMetric{
   284  			metric: metric,
   285  			ts:     ts.Copy(),
   286  		}
   287  	}
   288  	aggs[tss].values = append(aggs[tss].values, v)
   289  	mlock.Unlock()
   290  	return nil
   291  }
   292  
   293  // StartTimer records the current time, and returns a function you can call to
   294  // record the end of your action.
   295  //
   296  // Typical usage would be:
   297  //    done := collect.StartTimer("myMetric", opentsdb.TagSet{})
   298  //    doMyThing()
   299  //    done()
   300  func StartTimer(metric string, ts opentsdb.TagSet) func() {
   301  	start := time.Now()
   302  	return func() {
   303  		d := time.Now().Sub(start) / time.Millisecond
   304  		Sample(metric, ts, float64(d))
   305  	}
   306  }
   307  
   308  type setMetric struct {
   309  	metric string
   310  	ts     opentsdb.TagSet
   311  	f      func() interface{}
   312  }
   313  
   314  // Set registers a callback for the given metric and tags, calling f immediately
   315  // before queueing data for send.
   316  func Set(metric string, ts opentsdb.TagSet, f func() interface{}) error {
   317  	if err := check(metric, &ts); err != nil {
   318  		return err
   319  	}
   320  	tss := metric + ts.String()
   321  	mlock.Lock()
   322  	sets[tss] = &setMetric{metric, ts.Copy(), f}
   323  	mlock.Unlock()
   324  	return nil
   325  }
   326  
   327  type addMetric struct {
   328  	metric string
   329  	ts     opentsdb.TagSet
   330  	value  int64
   331  }
   332  
   333  // Add takes a metric and increments a counter for that metric. The metric name
   334  // is appended to the basename specified in the Init function.
   335  func Add(metric string, ts opentsdb.TagSet, inc int64) error {
   336  	if err := check(metric, &ts); err != nil {
   337  		return err
   338  	}
   339  	tss := metric + ts.String()
   340  	mlock.Lock()
   341  	if counters[tss] == nil {
   342  		counters[tss] = &addMetric{
   343  			metric: metric,
   344  			ts:     ts.Copy(),
   345  		}
   346  	}
   347  	counters[tss].value += inc
   348  	mlock.Unlock()
   349  	return nil
   350  }
   351  
   352  func Get(metric string, ts opentsdb.TagSet) int64 {
   353  	var counter_value int64
   354  	if err := check(metric, &ts); err != nil {
   355  		return 0
   356  	}
   357  	tss := metric + ts.String()
   358  	mlock.Lock()
   359  	if counters[tss] != nil {
   360  		counter_value = counters[tss].value
   361  	} else {
   362  		counter_value = 0
   363  	}
   364  	mlock.Unlock()
   365  	return counter_value
   366  }
   367  
   368  type putMetric struct {
   369  	metric string
   370  	ts     opentsdb.TagSet
   371  	value  interface{}
   372  }
   373  
   374  // Put is useful for capturing "events" that have a gauge value. Subsequent
   375  // calls between the sending interval will overwrite previous calls.
   376  func Put(metric string, ts opentsdb.TagSet, v interface{}) error {
   377  	if err := check(metric, &ts); err != nil {
   378  		return err
   379  	}
   380  	tss := metric + ts.String()
   381  	mlock.Lock()
   382  	puts[tss] = &putMetric{metric, ts.Copy(), v}
   383  	mlock.Unlock()
   384  	return nil
   385  }
   386  
   387  func check(metric string, ts *opentsdb.TagSet) error {
   388  	if err := checkClean(metric, "metric"); err != nil {
   389  		return err
   390  	}
   391  	for k, v := range *ts {
   392  		if err := checkClean(k, "tagk"); err != nil {
   393  			return err
   394  		}
   395  		if err := checkClean(v, "tagv"); err != nil {
   396  			return err
   397  		}
   398  	}
   399  
   400  	if *ts == nil {
   401  		*ts = make(opentsdb.TagSet)
   402  	}
   403  	if host, present := (*ts)["host"]; !present {
   404  		(*ts)["host"] = util.GetHostManager().GetHostName()
   405  	} else if host == "" {
   406  		delete(*ts, "host")
   407  	}
   408  	return nil
   409  }
   410  
   411  func checkClean(s, t string) error {
   412  	if sc, err := opentsdb.Clean(s); s != sc || err != nil {
   413  		if err != nil {
   414  			return err
   415  		}
   416  		return fmt.Errorf("%s %s may only contain a to z, A to Z, 0 to 9, -, _, ., / or Unicode letters and may not be empty", t, s)
   417  	}
   418  	return nil
   419  }
   420  
   421  func collect() {
   422  	for {
   423  		time.Sleep(Freq)
   424  		flushData()
   425  	}
   426  }
   427  
   428  func flushData() {
   429  	mlock.Lock()
   430  	now := time.Now().Unix()
   431  	for _, c := range counters {
   432  		dp := &opentsdb.DataPoint{
   433  			Metric:    metricRoot + c.metric,
   434  			Timestamp: now,
   435  			Value:     c.value,
   436  			Tags:      c.ts,
   437  		}
   438  		tchan <- dp
   439  	}
   440  	for _, s := range sets {
   441  		dp := &opentsdb.DataPoint{
   442  			Metric:    metricRoot + s.metric,
   443  			Timestamp: now,
   444  			Value:     s.f(),
   445  			Tags:      s.ts,
   446  		}
   447  		tchan <- dp
   448  	}
   449  	for _, s := range puts {
   450  		dp := &opentsdb.DataPoint{
   451  			Metric:    metricRoot + s.metric,
   452  			Timestamp: now,
   453  			Value:     s.value,
   454  			Tags:      s.ts,
   455  		}
   456  		tchan <- dp
   457  	}
   458  	for _, am := range aggs {
   459  		am.Process(now)
   460  	}
   461  	puts = make(map[string]*putMetric)
   462  	aggs = make(map[string]*agMetric)
   463  	mlock.Unlock()
   464  }