github.com/google/cloudprober@v0.11.3/surfacers/prometheus/prometheus.go (about) 1 // Copyright 2017-2020 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 /* 16 Package prometheus provides a prometheus surfacer for Cloudprober. Prometheus 17 surfacer exports incoming metrics over a web interface in a format that 18 prometheus understands (http://prometheus.io). 19 20 This surfacer processes each incoming EventMetrics and holds the latest value 21 and timestamp for each metric in memory. These metrics are made available 22 through a web URL (default: /metrics), which Prometheus scrapes at a regular 23 interval. 24 25 Example /metrics page: 26 #TYPE sent counter 27 sent{ptype="dns",probe="vm-to-public-dns",dst="8.8.8.8"} 181299 1497330037000 28 sent{ptype="ping",probe="vm-to-public-dns",dst="8.8.4.4"} 362600 1497330037000 29 #TYPE rcvd counter 30 rcvd{ptype="dns",probe="vm-to-public-dns",dst="8.8.8.8"} 181234 1497330037000 31 rcvd{ptype="ping",probe="vm-to-public-dns",dst="8.8.4.4"} 362600 1497330037000 32 */ 33 package prometheus 34 35 import ( 36 "context" 37 "fmt" 38 "io" 39 "net/http" 40 "regexp" 41 "strconv" 42 "strings" 43 "time" 44 45 "github.com/google/cloudprober/logger" 46 "github.com/google/cloudprober/metrics" 47 "github.com/google/cloudprober/surfacers/common/options" 48 configpb "github.com/google/cloudprober/surfacers/prometheus/proto" 49 ) 50 51 // Prometheus metric and label names should match the following regular 52 // expressions. Since, "-" is commonly used in metric and label names, we 53 // replace it by "_". If a name still doesn't match the regular expression, we 54 // ignore it with a warning log message. 55 const ( 56 ValidMetricNameRegex = "^[a-zA-Z_:]([a-zA-Z0-9_:])*$" 57 ValidLabelNameRegex = "^[a-zA-Z_]([a-zA-Z0-9_])*$" 58 ) 59 60 const histogram = "histogram" 61 62 // queriesQueueSize defines how many queries can we queue before we start 63 // blocking on previous queries to finish. 64 const queriesQueueSize = 10 65 66 var ( 67 // Cache of EventMetric label to prometheus label mapping. We use it to 68 // quickly lookup if we have already seen a label and we have a prometheus 69 // label corresponding to it. 70 promLabelNames = make(map[string]string) 71 72 // Cache of EventMetric metric to prometheus metric mapping. We use it to 73 // quickly lookup if we have already seen a metric and we have a prometheus 74 // metric name corresponding to it. 75 promMetricNames = make(map[string]string) 76 ) 77 78 type promMetric struct { 79 typ string 80 data map[string]*dataPoint 81 dataKeys []string // To keep data keys ordered 82 } 83 84 type dataPoint struct { 85 value string 86 timestamp int64 87 } 88 89 // httpWriter is a wrapper for http.ResponseWriter that includes a channel 90 // to signal the completion of the writing of the response. 91 type httpWriter struct { 92 w http.ResponseWriter 93 doneChan chan struct{} 94 } 95 96 // PromSurfacer implements a prometheus surfacer for Cloudprober. PromSurfacer 97 // organizes metrics into a two-level data structure: 98 // 1. Metric name -> PromMetric data structure dict. 99 // 2. A PromMetric organizes data associated with a metric in a 100 // Data key -> Data point map, where data point consists of a value 101 // and timestamp. 102 // Data key represents a unique combination of metric name and labels. 103 type PromSurfacer struct { 104 c *configpb.SurfacerConf // Configuration 105 opts *options.Options 106 prefix string // Metrics prefix, e.g. "cloudprober_" 107 emChan chan *metrics.EventMetrics // Buffered channel to store incoming EventMetrics 108 metrics map[string]*promMetric // Metric name to promMetric mapping 109 metricNames []string // Metric names, to keep names ordered. 110 queryChan chan *httpWriter // Query channel 111 l *logger.Logger 112 113 // A handler that takes a promMetric and a dataKey and writes the 114 // corresponding metric string to the provided io.Writer. 115 dataWriter func(w io.Writer, pm *promMetric, dataKey string) 116 117 // Regexes for metric and label names. 118 metricNameRe *regexp.Regexp 119 labelNameRe *regexp.Regexp 120 } 121 122 // New returns a prometheus surfacer based on the config provided. It sets up a 123 // goroutine to process both the incoming EventMetrics and the web requests for 124 // the URL handler /metrics. 125 func New(ctx context.Context, config *configpb.SurfacerConf, opts *options.Options, l *logger.Logger) (*PromSurfacer, error) { 126 if config == nil { 127 config = &configpb.SurfacerConf{} 128 } 129 ps := &PromSurfacer{ 130 c: config, 131 opts: opts, 132 emChan: make(chan *metrics.EventMetrics, config.GetMetricsBufferSize()), 133 queryChan: make(chan *httpWriter, queriesQueueSize), 134 metrics: make(map[string]*promMetric), 135 prefix: config.GetMetricsPrefix(), 136 metricNameRe: regexp.MustCompile(ValidMetricNameRegex), 137 labelNameRe: regexp.MustCompile(ValidLabelNameRegex), 138 l: l, 139 } 140 141 if ps.c.GetIncludeTimestamp() { 142 ps.dataWriter = func(w io.Writer, pm *promMetric, k string) { 143 fmt.Fprintf(w, "%s %s %d\n", k, pm.data[k].value, pm.data[k].timestamp) 144 } 145 } else { 146 ps.dataWriter = func(w io.Writer, pm *promMetric, k string) { 147 fmt.Fprintf(w, "%s %s\n", k, pm.data[k].value) 148 } 149 } 150 151 // Start a goroutine to process the incoming EventMetrics as well as 152 // the incoming web queries. To avoid data access race conditions, we do 153 // one thing at a time. 154 go func() { 155 for { 156 select { 157 case <-ctx.Done(): 158 ps.l.Infof("Context canceled, stopping the input/output processing loop.") 159 return 160 case em := <-ps.emChan: 161 ps.record(em) 162 case hw := <-ps.queryChan: 163 ps.writeData(hw.w) 164 close(hw.doneChan) 165 } 166 } 167 }() 168 169 http.HandleFunc(ps.c.GetMetricsUrl(), func(w http.ResponseWriter, r *http.Request) { 170 // doneChan is used to track the completion of the response writing. This is 171 // required as response is written in a different goroutine. 172 doneChan := make(chan struct{}, 1) 173 ps.queryChan <- &httpWriter{w, doneChan} 174 <-doneChan 175 }) 176 177 l.Infof("Initialized prometheus exporter at the URL: %s", ps.c.GetMetricsUrl()) 178 return ps, nil 179 } 180 181 // Write queues the incoming data into a channel. This channel is watched by a 182 // goroutine that actually processes the data and updates the in-memory 183 // database. 184 func (ps *PromSurfacer) Write(_ context.Context, em *metrics.EventMetrics) { 185 select { 186 case ps.emChan <- em: 187 default: 188 ps.l.Errorf("PromSurfacer's write channel is full, dropping new data.") 189 } 190 } 191 192 func promType(em *metrics.EventMetrics) string { 193 switch em.Kind { 194 case metrics.CUMULATIVE: 195 return "counter" 196 case metrics.GAUGE: 197 return "gauge" 198 default: 199 return "unknown" 200 } 201 } 202 203 // promTime converts time.Time to Unix milliseconds. 204 func promTime(t time.Time) int64 { 205 return t.UnixNano() / (1000 * 1000) 206 } 207 208 func (ps *PromSurfacer) recordMetric(metricName, key, value string, em *metrics.EventMetrics, typ string) { 209 // Recognized metric 210 if pm := ps.metrics[metricName]; pm != nil { 211 // Recognized metric name and labels combination. 212 if pm.data[key] != nil { 213 pm.data[key].value = value 214 pm.data[key].timestamp = promTime(em.Timestamp) 215 return 216 } 217 pm.data[key] = &dataPoint{ 218 value: value, 219 timestamp: promTime(em.Timestamp), 220 } 221 pm.dataKeys = append(pm.dataKeys, key) 222 } else { 223 // Newly discovered metric name. 224 if typ == "" { 225 typ = promType(em) 226 } 227 ps.metrics[metricName] = &promMetric{ 228 typ: typ, 229 data: map[string]*dataPoint{ 230 key: &dataPoint{ 231 value: value, 232 timestamp: promTime(em.Timestamp), 233 }, 234 }, 235 dataKeys: []string{key}, 236 } 237 ps.metricNames = append(ps.metricNames, metricName) 238 } 239 return 240 } 241 242 // checkLabelName finds a prometheus label name for an incoming label. If label 243 // is found to be invalid even after some basic conversions, a zero string is 244 // returned. 245 func (ps *PromSurfacer) checkLabelName(k string) string { 246 // Before checking with regex, see if this label name is 247 // already known. This block will be entered only once per 248 // label name. 249 if promLabel, ok := promLabelNames[k]; ok { 250 return promLabel 251 } 252 253 ps.l.Infof("Checking validity of new label: %s", k) 254 // We'll come here only once per label name. 255 256 // Prometheus doesn't support "-" in metric names. 257 labelName := strings.Replace(k, "-", "_", -1) 258 if !ps.labelNameRe.MatchString(labelName) { 259 // Explicitly store a zero string so that we don't check it again. 260 promLabelNames[k] = "" 261 ps.l.Warningf("Ignoring invalid prometheus label name: %s", k) 262 return "" 263 } 264 promLabelNames[k] = labelName 265 return labelName 266 } 267 268 // promMetricName finds a prometheus metric name for an incoming metric. If metric 269 // is found to be invalid even after some basic conversions, a zero string is 270 // returned. 271 func (ps *PromSurfacer) promMetricName(k string) string { 272 k = ps.prefix + k 273 274 // Before checking with regex, see if this metric name is 275 // already known. This block will be entered only once per 276 // metric name. 277 if metricName, ok := promMetricNames[k]; ok { 278 return metricName 279 } 280 281 ps.l.Infof("Checking validity of new metric: %s", k) 282 // We'll come here only once per metric name. 283 284 // Prometheus doesn't support "-" in metric names. 285 metricName := strings.Replace(k, "-", "_", -1) 286 if !ps.metricNameRe.MatchString(metricName) { 287 // Explicitly store a zero string so that we don't check it again. 288 promMetricNames[k] = "" 289 ps.l.Warningf("Ignoring invalid prometheus metric name: %s", k) 290 return "" 291 } 292 promMetricNames[k] = metricName 293 return metricName 294 } 295 296 func dataKey(metricName string, labels []string) string { 297 return metricName + "{" + strings.Join(labels, ",") + "}" 298 } 299 300 // record processes the incoming EventMetrics and updates the in-memory 301 // database. 302 // 303 // Since prometheus doesn't support certain metrics.Value types, we handle them 304 // differently. 305 // 306 // metrics.Map value type: We break Map values into multiple data keys, with 307 // each map key corresponding to a label in the data key. 308 // For example, "resp-code map:code 200:45 500:2" gets converted into: 309 // resp-code{code=200} 45 310 // resp-code{code=500} 2 311 // 312 // metrics.String value type: We convert string value type into a data key with 313 // val="value" label. 314 // For example, "version cloudprober-20170608-RC00" gets converted into: 315 // version{val=cloudprober-20170608-RC00} 1 316 func (ps *PromSurfacer) record(em *metrics.EventMetrics) { 317 var labels []string 318 for _, k := range em.LabelsKeys() { 319 if labelName := ps.checkLabelName(k); labelName != "" { 320 labels = append(labels, labelName+"=\""+em.Label(k)+"\"") 321 } 322 } 323 324 for _, metricName := range em.MetricsKeys() { 325 if !ps.opts.AllowMetric(metricName) { 326 continue 327 } 328 pMetricName := ps.promMetricName(metricName) 329 if pMetricName == "" { 330 // No prometheus metric name found for this metric. 331 continue 332 } 333 val := em.Metric(metricName) 334 335 // Map values get expanded into metrics with extra label. 336 if mapVal, ok := val.(*metrics.Map); ok { 337 labelName := ps.checkLabelName(mapVal.MapName) 338 if labelName == "" { 339 continue 340 } 341 for _, k := range mapVal.Keys() { 342 labelsWithMap := append(labels, labelName+"=\""+k+"\"") 343 ps.recordMetric(pMetricName, dataKey(pMetricName, labelsWithMap), mapVal.GetKey(k).String(), em, "") 344 } 345 continue 346 } 347 348 // Distribution values get expanded into metrics with extra label "le". 349 if distVal, ok := val.(*metrics.Distribution); ok { 350 d := distVal.Data() 351 var val int64 352 ps.recordMetric(pMetricName, dataKey(pMetricName+"_sum", labels), strconv.FormatFloat(d.Sum, 'f', -1, 64), em, histogram) 353 ps.recordMetric(pMetricName, dataKey(pMetricName+"_count", labels), strconv.FormatInt(d.Count, 10), em, histogram) 354 for i := range d.LowerBounds { 355 val += d.BucketCounts[i] 356 var lb string 357 if i == len(d.LowerBounds)-1 { 358 lb = "+Inf" 359 } else { 360 lb = strconv.FormatFloat(d.LowerBounds[i+1], 'f', -1, 64) 361 } 362 labelsWithBucket := append(labels, "le=\""+lb+"\"") 363 ps.recordMetric(pMetricName, dataKey(pMetricName+"_bucket", labelsWithBucket), strconv.FormatInt(val, 10), em, histogram) 364 } 365 continue 366 } 367 368 // String values get converted into a label. 369 if _, ok := val.(metrics.String); ok { 370 newLabels := append(labels, "val="+val.String()) 371 ps.recordMetric(pMetricName, dataKey(pMetricName, newLabels), "1", em, "") 372 continue 373 } 374 375 // All other value types, mostly numerical types. 376 ps.recordMetric(pMetricName, dataKey(pMetricName, labels), val.String(), em, "") 377 } 378 } 379 380 // writeData writes metrics data on w io.Writer 381 func (ps *PromSurfacer) writeData(w io.Writer) { 382 for _, name := range ps.metricNames { 383 pm := ps.metrics[name] 384 fmt.Fprintf(w, "#TYPE %s %s\n", name, pm.typ) 385 for _, k := range pm.dataKeys { 386 ps.dataWriter(w, pm, k) 387 } 388 } 389 }