// bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/collect/collect.go

// Package collect provides functions for sending data to OpenTSDB.
//
// The "collect" namespace is used (i.e., <root>.collect) to collect
// program and queue metrics.
package collect // import "bosun.org/collect"

import (
	"fmt"
	"math"
	"net/http"
	"net/url"
	"runtime"
	"sort"
	"strings"
	"sync"
	"time"

	"bosun.org/util"

	"bosun.org/metadata"
	"bosun.org/opentsdb"
)

var (
	// Freq is how often metrics are sent to OpenTSDB.
	Freq = time.Second * 15

	// MaxQueueLen is the maximum size of the queue, above which incoming data will
	// be discarded. Defaults to about 150MB.
	MaxQueueLen = 200000

	// BatchSize is the maximum length of data points sent at once to OpenTSDB.
	BatchSize = 500

	// Debug enables debug logging.
	Debug = false

	// Print prints all datapoints to stdout instead of sending them.
	Print = false

	// DisableDefaultCollectors prevents the scollector self metrics from being
	// generated.
	DisableDefaultCollectors = false

	// Tags is an opentsdb.TagSet used when sending self metrics.
	Tags opentsdb.TagSet

	// UseNtlm reports whether to use NTLM authentication.
	UseNtlm bool = false

	// DefaultClient can be used to override the HTTP client that will be used to make requests.
	DefaultClient *http.Client = http.DefaultClient

	// dropped is the number of data points dropped because the queue was full.
	// It is read under slock when reported as the collect.dropped self metric.
	dropped int64

	// discarded is the number of data points discarded for being invalid.
	// It is read under slock when reported as the collect.discarded self metric.
	discarded int64

	// sent is the number of sent data points.
	// It is read under slock when reported as the collect.sent self metric.
	sent int64

	// AuthToken is the token to use to communicate with bosun.
	AuthToken string

	// Package state set up by InitChan and shared by the collector functions:
	//   tchan      - channel that flushed data points are written to; nil until
	//                InitChan succeeds, which is also how double init is detected.
	//   tsdbURL    - string form of the "/api/put" URL resolved against the
	//                configured host.
	//   metricRoot - the metric root followed by ".", prepended to metric names.
	//   queue      - pending data points; its length is reported under qlock as
	//                collect.queued.
	//   counters, sets, puts, aggs - registered metrics, keyed by metric name
	//                concatenated with the tag set's String() form; accessed
	//                under mlock.
	tchan               chan *opentsdb.DataPoint
	tsdbURL             string
	metricRoot          string
	queue               []*opentsdb.DataPoint
	qlock, mlock, slock sync.Mutex // Locks for queues, maps, stats.
	counters            = make(map[string]*addMetric)
	sets                = make(map[string]*setMetric)
	puts                = make(map[string]*putMetric)
	aggs                = make(map[string]*agMetric)

	// DirectHandler is an http handler to invoke instead of actually making a network request.
	DirectHandler http.Handler
)

// Descriptions registered as metric metadata for the self metrics that
// InitChan sets up.
const (
	descCollectAlloc             = "Total number of bytes allocated and still in use by the runtime (via runtime.ReadMemStats)."
	descCollectDiscarded         = "Counter of discarded data points due to being invalid."
	descCollectDropped           = "Counter of dropped data points due to the queue being full."
	descCollectGoRoutines        = "Total number of goroutines that currently exist (via runtime.NumGoroutine)."
	descCollectGcCpuFraction     = "fraction of CPU time used by GC"
	descCollectTotalGCPause      = "Total GC Pause time in milliseconds"
	descCollectPostBad           = "Counter of HTTP POST requests where resp.StatusCode != http.StatusNoContent."
	descCollectPostBatchSize     = "Number of datapoints included in each batch."
	descCollectPostCount         = "Counter of batches sent to the server."
	descCollectPostDuration      = "How many milliseconds it took to send HTTP POST requests to the server."
	descCollectPostError         = "Counter of errors received when sending a batch to the server."
	descCollectPostRestore       = "Counter of data points restored from batches that could not be sent to the server."
	descCollectPostTotalBytes    = "Total number of gzipped bytes sent to the server."
	descCollectPostTotalDuration = "Total number of milliseconds it took to send an HTTP POST request to the server."
	descCollectQueued            = "Total number of items currently queued and waiting to be sent to the server."
	descCollectSent              = "Counter of data points sent to the server."
)

// InitChan is similar to Init, but uses the given channel instead of creating a
// new one.
101 func InitChan(tsdbhost *url.URL, root string, ch chan *opentsdb.DataPoint) error { 102 if tchan != nil { 103 return fmt.Errorf("cannot init twice") 104 } 105 if err := checkClean(root, "metric root"); err != nil { 106 return err 107 } 108 u, err := tsdbhost.Parse("/api/put") 109 if err != nil { 110 return err 111 } 112 if strings.HasPrefix(u.Host, ":") { 113 u.Host = "localhost" + u.Host 114 } 115 tsdbURL = u.String() 116 metricRoot = root + "." 117 tchan = ch 118 go queuer() 119 go send() 120 go collect() 121 if DisableDefaultCollectors { 122 return nil 123 } 124 Set("collect.dropped", Tags, func() (i interface{}) { 125 slock.Lock() 126 i = dropped 127 slock.Unlock() 128 return 129 }) 130 Set("collect.discarded", Tags, func() (i interface{}) { 131 slock.Lock() 132 i = discarded 133 slock.Unlock() 134 return 135 }) 136 137 Set("collect.sent", Tags, func() (i interface{}) { 138 slock.Lock() 139 i = sent 140 slock.Unlock() 141 return 142 }) 143 Set("collect.queued", Tags, func() (i interface{}) { 144 qlock.Lock() 145 i = len(queue) 146 qlock.Unlock() 147 return 148 }) 149 Set("collect.alloc", Tags, func() interface{} { 150 var ms runtime.MemStats 151 runtime.ReadMemStats(&ms) 152 return ms.Alloc 153 }) 154 Set("collect.gc.cpu_fraction", Tags, func() interface{} { 155 var ms runtime.MemStats 156 runtime.ReadMemStats(&ms) 157 return ms.GCCPUFraction 158 }) 159 Set("collect.gc.total_pause", Tags, func() interface{} { 160 var ms runtime.MemStats 161 runtime.ReadMemStats(&ms) 162 return ms.PauseTotalNs / uint64(time.Millisecond) 163 }) 164 Set("collect.goroutines", Tags, func() interface{} { 165 return runtime.NumGoroutine() 166 }) 167 AggregateMeta(metricRoot+"collect.post.batchsize", metadata.Count, descCollectPostBatchSize) 168 AggregateMeta(metricRoot+"collect.post.duration", metadata.MilliSecond, descCollectPostDuration) 169 metadata.AddMetricMeta(metricRoot+"collect.alloc", metadata.Gauge, metadata.Bytes, descCollectAlloc) 170 
metadata.AddMetricMeta(metricRoot+"collect.goroutines", metadata.Gauge, metadata.Count, descCollectGoRoutines) 171 metadata.AddMetricMeta(metricRoot+"collect.gc.cpu_fraction", metadata.Gauge, metadata.Pct, descCollectGcCpuFraction) 172 metadata.AddMetricMeta(metricRoot+"collect.gc.total_pause", metadata.Counter, metadata.MilliSecond, descCollectTotalGCPause) 173 metadata.AddMetricMeta(metricRoot+"collect.post.bad_status", metadata.Counter, metadata.PerSecond, descCollectPostBad) 174 metadata.AddMetricMeta(metricRoot+"collect.post.count", metadata.Counter, metadata.PerSecond, descCollectPostCount) 175 metadata.AddMetricMeta(metricRoot+"collect.post.error", metadata.Counter, metadata.PerSecond, descCollectPostError) 176 metadata.AddMetricMeta(metricRoot+"collect.post.restore", metadata.Counter, metadata.PerSecond, descCollectPostRestore) 177 metadata.AddMetricMeta(metricRoot+"collect.post.total_bytes", metadata.Counter, metadata.Bytes, descCollectPostTotalBytes) 178 metadata.AddMetricMeta(metricRoot+"collect.post.total_duration", metadata.Counter, metadata.MilliSecond, descCollectPostTotalDuration) 179 metadata.AddMetricMeta(metricRoot+"collect.queued", metadata.Gauge, metadata.Item, descCollectQueued) 180 metadata.AddMetricMeta(metricRoot+"collect.sent", metadata.Counter, metadata.PerSecond, descCollectSent) 181 metadata.AddMetricMeta(metricRoot+"collect.dropped", metadata.Counter, metadata.PerSecond, descCollectDropped) 182 metadata.AddMetricMeta(metricRoot+"collect.discarded", metadata.Counter, metadata.PerSecond, descCollectDiscarded) 183 // Make sure these get zeroed out instead of going unknown on restart 184 Add("collect.post.error", Tags, 0) 185 Add("collect.post.bad_status", Tags, 0) 186 Add("collect.post.restore", Tags, 0) 187 return nil 188 } 189 190 // Init sets up the channels and the queue for sending data to OpenTSDB. It also 191 // sets up the basename for all metrics. 
192 func Init(tsdbhost *url.URL, root string) error { 193 return InitChan(tsdbhost, root, make(chan *opentsdb.DataPoint)) 194 } 195 196 type agMetric struct { 197 metric string 198 ts opentsdb.TagSet 199 values []float64 200 } 201 202 func AggregateMeta(metric string, unit metadata.Unit, desc string) { 203 agStrings := []string{"avg", "count", "min", "median", "max", "95", "99"} 204 for _, ag := range agStrings { 205 if ag == "count" { 206 metadata.AddMetricMeta(metric+"_"+ag, metadata.Gauge, metadata.Count, "The number of samples per aggregation.") 207 continue 208 } 209 metadata.AddMetricMeta(metric+"_"+ag, metadata.Gauge, unit, desc) 210 } 211 } 212 213 func (am *agMetric) Process(now int64) { 214 var avg float64 215 for _, v := range am.values { 216 avg += v 217 } 218 avg /= float64(len(am.values)) 219 extRoot := metricRoot + am.metric 220 tchan <- &opentsdb.DataPoint{ 221 Metric: extRoot + "_avg", 222 Timestamp: now, 223 Value: avg, 224 Tags: am.ts, 225 } 226 tchan <- &opentsdb.DataPoint{ 227 Metric: extRoot + "_count", 228 Timestamp: now, 229 Value: len(am.values), 230 Tags: am.ts, 231 } 232 sort.Float64s(am.values) 233 percentile := func(p float64) float64 { 234 if p <= 0 { 235 return am.values[0] 236 } 237 if p >= 1 { 238 return am.values[len(am.values)-1] 239 } 240 i := p * float64(len(am.values)-1) 241 i = math.Ceil(i) 242 return am.values[int(i)] 243 } 244 tchan <- &opentsdb.DataPoint{ 245 Metric: extRoot + "_min", 246 Timestamp: now, 247 Value: percentile(0), 248 Tags: am.ts, 249 } 250 tchan <- &opentsdb.DataPoint{ 251 Metric: extRoot + "_median", 252 Timestamp: now, 253 Value: percentile(.5), 254 Tags: am.ts, 255 } 256 tchan <- &opentsdb.DataPoint{ 257 Metric: extRoot + "_max", 258 Timestamp: now, 259 Value: percentile(1), 260 Tags: am.ts, 261 } 262 tchan <- &opentsdb.DataPoint{ 263 Metric: extRoot + "_95", 264 Timestamp: now, 265 Value: percentile(.95), 266 Tags: am.ts, 267 } 268 tchan <- &opentsdb.DataPoint{ 269 Metric: extRoot + "_99", 270 
Timestamp: now, 271 Value: percentile(.99), 272 Tags: am.ts, 273 } 274 } 275 276 func Sample(metric string, ts opentsdb.TagSet, v float64) error { 277 if err := check(metric, &ts); err != nil { 278 return err 279 } 280 tss := metric + ts.String() 281 mlock.Lock() 282 if aggs[tss] == nil { 283 aggs[tss] = &agMetric{ 284 metric: metric, 285 ts: ts.Copy(), 286 } 287 } 288 aggs[tss].values = append(aggs[tss].values, v) 289 mlock.Unlock() 290 return nil 291 } 292 293 // StartTimer records the current time, and returns a function you can call to 294 // record the end of your action. 295 // 296 // Typical usage would be: 297 // done := collect.StartTimer("myMetric", opentsdb.TagSet{}) 298 // doMyThing() 299 // done() 300 func StartTimer(metric string, ts opentsdb.TagSet) func() { 301 start := time.Now() 302 return func() { 303 d := time.Now().Sub(start) / time.Millisecond 304 Sample(metric, ts, float64(d)) 305 } 306 } 307 308 type setMetric struct { 309 metric string 310 ts opentsdb.TagSet 311 f func() interface{} 312 } 313 314 // Set registers a callback for the given metric and tags, calling f immediately 315 // before queueing data for send. 316 func Set(metric string, ts opentsdb.TagSet, f func() interface{}) error { 317 if err := check(metric, &ts); err != nil { 318 return err 319 } 320 tss := metric + ts.String() 321 mlock.Lock() 322 sets[tss] = &setMetric{metric, ts.Copy(), f} 323 mlock.Unlock() 324 return nil 325 } 326 327 type addMetric struct { 328 metric string 329 ts opentsdb.TagSet 330 value int64 331 } 332 333 // Add takes a metric and increments a counter for that metric. The metric name 334 // is appended to the basename specified in the Init function. 
335 func Add(metric string, ts opentsdb.TagSet, inc int64) error { 336 if err := check(metric, &ts); err != nil { 337 return err 338 } 339 tss := metric + ts.String() 340 mlock.Lock() 341 if counters[tss] == nil { 342 counters[tss] = &addMetric{ 343 metric: metric, 344 ts: ts.Copy(), 345 } 346 } 347 counters[tss].value += inc 348 mlock.Unlock() 349 return nil 350 } 351 352 func Get(metric string, ts opentsdb.TagSet) int64 { 353 var counter_value int64 354 if err := check(metric, &ts); err != nil { 355 return 0 356 } 357 tss := metric + ts.String() 358 mlock.Lock() 359 if counters[tss] != nil { 360 counter_value = counters[tss].value 361 } else { 362 counter_value = 0 363 } 364 mlock.Unlock() 365 return counter_value 366 } 367 368 type putMetric struct { 369 metric string 370 ts opentsdb.TagSet 371 value interface{} 372 } 373 374 // Put is useful for capturing "events" that have a gauge value. Subsequent 375 // calls between the sending interval will overwrite previous calls. 376 func Put(metric string, ts opentsdb.TagSet, v interface{}) error { 377 if err := check(metric, &ts); err != nil { 378 return err 379 } 380 tss := metric + ts.String() 381 mlock.Lock() 382 puts[tss] = &putMetric{metric, ts.Copy(), v} 383 mlock.Unlock() 384 return nil 385 } 386 387 func check(metric string, ts *opentsdb.TagSet) error { 388 if err := checkClean(metric, "metric"); err != nil { 389 return err 390 } 391 for k, v := range *ts { 392 if err := checkClean(k, "tagk"); err != nil { 393 return err 394 } 395 if err := checkClean(v, "tagv"); err != nil { 396 return err 397 } 398 } 399 400 if *ts == nil { 401 *ts = make(opentsdb.TagSet) 402 } 403 if host, present := (*ts)["host"]; !present { 404 (*ts)["host"] = util.GetHostManager().GetHostName() 405 } else if host == "" { 406 delete(*ts, "host") 407 } 408 return nil 409 } 410 411 func checkClean(s, t string) error { 412 if sc, err := opentsdb.Clean(s); s != sc || err != nil { 413 if err != nil { 414 return err 415 } 416 return 
fmt.Errorf("%s %s may only contain a to z, A to Z, 0 to 9, -, _, ., / or Unicode letters and may not be empty", t, s) 417 } 418 return nil 419 } 420 421 func collect() { 422 for { 423 time.Sleep(Freq) 424 flushData() 425 } 426 } 427 428 func flushData() { 429 mlock.Lock() 430 now := time.Now().Unix() 431 for _, c := range counters { 432 dp := &opentsdb.DataPoint{ 433 Metric: metricRoot + c.metric, 434 Timestamp: now, 435 Value: c.value, 436 Tags: c.ts, 437 } 438 tchan <- dp 439 } 440 for _, s := range sets { 441 dp := &opentsdb.DataPoint{ 442 Metric: metricRoot + s.metric, 443 Timestamp: now, 444 Value: s.f(), 445 Tags: s.ts, 446 } 447 tchan <- dp 448 } 449 for _, s := range puts { 450 dp := &opentsdb.DataPoint{ 451 Metric: metricRoot + s.metric, 452 Timestamp: now, 453 Value: s.value, 454 Tags: s.ts, 455 } 456 tchan <- dp 457 } 458 for _, am := range aggs { 459 am.Process(now) 460 } 461 puts = make(map[string]*putMetric) 462 aggs = make(map[string]*agMetric) 463 mlock.Unlock() 464 }