bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/scollector/collectors/riak.go (about)

     1  package collectors
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"net/http"
     7  	"net/url"
     8  	"strings"
     9  
    10  	"bosun.org/metadata"
    11  	"bosun.org/opentsdb"
    12  )
    13  
    14  var riakMeta = map[string]MetricMeta{
    15  	"pbc_connects_total": {
    16  		Metric:   "pbc_connections",
    17  		RateType: metadata.Counter,
    18  		Unit:     metadata.Connection,
    19  		Desc:     "Total number of Protocol Buffers connections made.",
    20  	},
    21  	"read_repairs_total": {
    22  		Metric:   "read_repairs",
    23  		RateType: metadata.Counter,
    24  		Unit:     metadata.Operation,
    25  		Desc:     "Total number of Read Repairs this node has coordinated.",
    26  	},
    27  	"read_repairs_primary_outofdate_count": {
    28  		Metric:   "read_repairs_primary_outofdate",
    29  		RateType: metadata.Counter,
    30  		Unit:     metadata.Operation,
    31  		Desc:     "Total number of read repair operations performed on primary vnodes due to stale replicas.",
    32  	},
    33  	"read_repairs_primary_notfound_count": {
    34  		Metric:   "read_repairs_primary_notfound",
    35  		RateType: metadata.Counter,
    36  		Unit:     metadata.Operation,
    37  		Desc:     "Total number of read repair operations performed on primary vnodes due to missing replicas.",
    38  	},
    39  	"read_repairs_fallback_outofdate_count": {
    40  		Metric:   "read_repairs_fallback_outofdate",
    41  		RateType: metadata.Counter,
    42  		Unit:     metadata.Operation,
    43  		Desc:     "Total number of read repair operations performed on fallback vnodes due to stale replicas.",
    44  	},
    45  	"read_repairs_fallback_notfound_count": {
    46  		Metric:   "read_repairs_fallback_notfound",
    47  		RateType: metadata.Counter,
    48  		Unit:     metadata.Operation,
    49  		Desc:     "Total number of read repair operations performed on fallback vnodes due to missing replicas.",
    50  	},
    51  	"coord_redirs_total": {
    52  		Metric:   "coord_redirs",
    53  		RateType: metadata.Counter,
    54  		Unit:     metadata.Operation,
    55  		Desc:     "Total number of requests this node has redirected to other nodes for coordination.",
    56  	},
    57  	"precommit_fail": {
    58  		Metric:   "precommit_fail",
    59  		RateType: metadata.Counter,
    60  		Unit:     metadata.Event,
    61  		Desc:     "Total number of pre-commit hook failures.",
    62  	},
    63  	"postcommit_fail": {
    64  		Metric:   "postcommit_fail",
    65  		RateType: metadata.Counter,
    66  		Unit:     metadata.Event,
    67  		Desc:     "Total number of post-commit hook failures.",
    68  	},
    69  	"executing_mappers": {
    70  		Metric:   "executing_mappers",
    71  		RateType: metadata.Gauge,
    72  		Unit:     metadata.Process,
    73  	},
    74  	"pipeline_create_count": {
    75  		Metric:   "pipeline.create.count",
    76  		RateType: metadata.Counter,
    77  		Unit:     metadata.Process,
    78  		Desc:     "The total number of pipelines created since the node was started.",
    79  	},
    80  	"pipeline_create_error_count": {
    81  		Metric:   "pipeline.create.errors",
    82  		RateType: metadata.Counter,
    83  		Unit:     metadata.Event,
    84  		Desc:     "The total number of pipeline creation errors since the node was started.",
    85  	},
    86  	"pipeline_active": {
    87  		Metric:   "active",
    88  		TagSet:   opentsdb.TagSet{"type": "pbc"},
    89  		RateType: metadata.Gauge,
    90  		Unit:     metadata.Process,
    91  		Desc:     "The number of pipelines active in the last 60 seconds.",
    92  	},
    93  	"index_fsm_active": {
    94  		Metric:   "active",
    95  		TagSet:   opentsdb.TagSet{"type": "index"},
    96  		RateType: metadata.Gauge,
    97  		Unit:     metadata.Process,
    98  		Desc:     "Number of active Secondary Index FSMs.",
    99  	},
   100  	"list_fsm_active": {
   101  		Metric:   "active",
   102  		TagSet:   opentsdb.TagSet{"type": "list"},
   103  		RateType: metadata.Gauge,
   104  		Unit:     metadata.Process,
   105  		Desc:     "Number of active Keylisting FSMs.",
   106  	},
   107  
   108  	"memory_total": {
   109  		Metric:   "memory",
   110  		TagSet:   opentsdb.TagSet{"type": "total"},
   111  		RateType: metadata.Gauge,
   112  		Unit:     metadata.Bytes,
   113  		Desc:     "Total allocated memory (sum of processes and system).",
   114  	},
   115  	"memory_processes": {
   116  		Metric:   "memory",
   117  		TagSet:   opentsdb.TagSet{"type": "processes"},
   118  		RateType: metadata.Gauge,
   119  		Unit:     metadata.Bytes,
   120  		Desc:     "Total amount of memory allocated for Erlang processes.",
   121  	},
   122  	"memory_processes_used": {
   123  		Metric:   "memory",
   124  		TagSet:   opentsdb.TagSet{"type": "processes_used"},
   125  		RateType: metadata.Gauge,
   126  		Unit:     metadata.Bytes,
   127  		Desc:     "Total amount of memory used by Erlang processes.",
   128  	},
   129  	"memory_system": {
   130  		Metric:   "memory",
   131  		TagSet:   opentsdb.TagSet{"type": "system"},
   132  		RateType: metadata.Gauge,
   133  		Unit:     metadata.Bytes,
   134  		Desc:     "Total allocated memory that is not directly related to an Erlang process.",
   135  	},
   136  	"memory_system_used": {
   137  		Metric:   "memory",
   138  		TagSet:   opentsdb.TagSet{"type": "system_used"},
   139  		RateType: metadata.Gauge,
   140  		Unit:     metadata.Bytes,
   141  	},
   142  	"memory_atom": {
   143  		Metric:   "memory",
   144  		TagSet:   opentsdb.TagSet{"type": "atom"},
   145  		RateType: metadata.Gauge,
   146  		Unit:     metadata.Bytes,
   147  		Desc:     "Total amount of memory currently allocated for atom storage.",
   148  	},
   149  	"memory_atom_used": {
   150  		Metric:   "memory",
   151  		TagSet:   opentsdb.TagSet{"type": "atom_used"},
   152  		RateType: metadata.Gauge,
   153  		Unit:     metadata.Bytes,
   154  		Desc:     "Total amount of memory currently used for atom storage.",
   155  	},
   156  	"memory_binary": {
   157  		Metric:   "memory",
   158  		TagSet:   opentsdb.TagSet{"type": "binary"},
   159  		RateType: metadata.Gauge,
   160  		Unit:     metadata.Bytes,
   161  		Desc:     "Total amount of memory used for binaries.",
   162  	},
   163  	"memory_code": {
   164  		Metric:   "memory",
   165  		TagSet:   opentsdb.TagSet{"type": "code"},
   166  		RateType: metadata.Gauge,
   167  		Unit:     metadata.Bytes,
   168  		Desc:     "Total amount of memory allocated for Erlang code.",
   169  	},
   170  	"memory_ets": {
   171  		Metric:   "memory",
   172  		TagSet:   opentsdb.TagSet{"type": "ets"},
   173  		RateType: metadata.Gauge,
   174  		Unit:     metadata.Bytes,
   175  		Desc:     "Total memory allocated for Erlang Term Storage.",
   176  	},
   177  	"mem_total": {
   178  		Metric:   "memory",
   179  		TagSet:   opentsdb.TagSet{"type": "available"},
   180  		RateType: metadata.Gauge,
   181  		Unit:     metadata.Bytes,
   182  		Desc:     "Total available system memory.",
   183  	},
   184  	"mem_allocated": {
   185  		Metric:   "memory",
   186  		TagSet:   opentsdb.TagSet{"type": "allocated"},
   187  		RateType: metadata.Gauge,
   188  		Unit:     metadata.Bytes,
   189  		Desc:     "Total memory allocated for this node.",
   190  	},
   191  
   192  	"vnode_index_reads_total": {
   193  		Metric:   "vnode.index.requests",
   194  		RateType: metadata.Counter,
   195  		Unit:     metadata.Operation,
   196  		Desc:     "Total number of local replicas participating in secondary index reads.",
   197  	},
   198  	"vnode_index_writes_total": {
   199  		Metric:   "vnode.index.requests",
   200  		TagSet:   opentsdb.TagSet{"type": "write"},
   201  		RateType: metadata.Counter,
   202  		Unit:     metadata.Operation,
   203  		Desc:     "Total number of local replicas participating in secondary index writes.",
   204  	},
   205  	"vnode_index_deletes_total": {
   206  		Metric:   "vnode.index.requests",
   207  		TagSet:   opentsdb.TagSet{"type": "delete"},
   208  		RateType: metadata.Counter,
   209  		Unit:     metadata.Operation,
   210  		Desc:     "Total number of local replicas participating in secondary index deletes.",
   211  	},
   212  	"vnode_index_writes_postings_total": {
   213  		Metric:   "vnode.index.requests",
   214  		TagSet:   opentsdb.TagSet{"type": "write_post"},
   215  		RateType: metadata.Counter,
   216  		Unit:     metadata.Operation,
   217  		Desc:     "Total number of individual secondary index values written.",
   218  	},
   219  	"vnode_index_deletes_postings_total": {
   220  		Metric:   "vnode.index.requests",
   221  		TagSet:   opentsdb.TagSet{"type": "delete_post"},
   222  		RateType: metadata.Counter,
   223  		Unit:     metadata.Operation,
   224  		Desc:     "Total number of individual secondary index values deleted.",
   225  	},
   226  
   227  	"vnode_gets_total": {
   228  		Metric:   "vnode.requests",
   229  		TagSet:   opentsdb.TagSet{"type": "get"},
   230  		RateType: metadata.Counter,
   231  		Unit:     metadata.Operation,
   232  		Desc:     "Total number of GETs coordinated by local vnodes.",
   233  	},
   234  	"vnode_puts_total": {
   235  		Metric:   "vnode.requests",
   236  		TagSet:   opentsdb.TagSet{"type": "put"},
   237  		RateType: metadata.Counter,
   238  		Unit:     metadata.Operation,
   239  		Desc:     "Total number of PUTS coordinated by local vnodes.",
   240  	},
   241  	"node_gets_total": {
   242  		Metric:   "node.requests",
   243  		TagSet:   opentsdb.TagSet{"type": "get"},
   244  		RateType: metadata.Counter,
   245  		Unit:     metadata.Operation,
   246  		Desc:     "Total number of GETs coordinated by this node, including GETs to non-local vnodes.",
   247  	},
   248  	"node_puts_total": {
   249  		Metric:   "node.requests",
   250  		TagSet:   opentsdb.TagSet{"type": "put"},
   251  		RateType: metadata.Counter,
   252  		Unit:     metadata.Operation,
   253  		Desc:     "Total number of PUTs coordinated by this node, including PUTs to non-local vnodes.",
   254  	},
   255  	"node_get_fsm_time_mean": {
   256  		Metric:   "node.latency.mean",
   257  		TagSet:   opentsdb.TagSet{"type": "get"},
   258  		RateType: metadata.Gauge,
   259  		Unit:     metadata.Second,
   260  		Desc:     "Mean time between reception of client GET request and subsequent response to client.",
   261  	},
   262  	"node_put_fsm_time_mean": {
   263  		Metric:   "node.latency.mean",
   264  		TagSet:   opentsdb.TagSet{"type": "put"},
   265  		RateType: metadata.Gauge,
   266  		Unit:     metadata.Second,
   267  		Desc:     "Mean time between reception of client PUT request and subsequent response to client.",
   268  	},
   269  	"node_get_fsm_time_median": {
   270  		Metric:   "node.latency.median",
   271  		TagSet:   opentsdb.TagSet{"type": "get"},
   272  		RateType: metadata.Gauge,
   273  		Unit:     metadata.Second,
   274  		Desc:     "Median time between reception of client GET request and subsequent response to client.",
   275  	},
   276  	"node_put_fsm_time_median": {
   277  		Metric:   "node.latency.median",
   278  		TagSet:   opentsdb.TagSet{"type": "put"},
   279  		RateType: metadata.Gauge,
   280  		Unit:     metadata.Second,
   281  		Desc:     "Median time between reception of client PUT request and subsequent response to client.",
   282  	},
   283  	"node_get_fsm_time_95": {
   284  		Metric:   "node.latency.95th",
   285  		TagSet:   opentsdb.TagSet{"type": "get"},
   286  		RateType: metadata.Gauge,
   287  		Unit:     metadata.Second,
   288  		Desc:     "95th percentile time between reception of client GET request and subsequent response to client.",
   289  	},
   290  	"node_put_fsm_time_95": {
   291  		Metric:   "node.latency.95th",
   292  		TagSet:   opentsdb.TagSet{"type": "put"},
   293  		RateType: metadata.Gauge,
   294  		Unit:     metadata.Second,
   295  		Desc:     "95th percentile time between reception of client PUT request and subsequent response to client.",
   296  	},
   297  	"node_get_fsm_time_99": {
   298  		Metric:   "node.latency.99th",
   299  		TagSet:   opentsdb.TagSet{"type": "get"},
   300  		RateType: metadata.Gauge,
   301  		Unit:     metadata.Second,
   302  		Desc:     "99th percentile time between reception of client GET request and subsequent response to client.",
   303  	},
   304  	"node_put_fsm_time_99": {
   305  		Metric:   "node.latency.99th",
   306  		TagSet:   opentsdb.TagSet{"type": "put"},
   307  		RateType: metadata.Gauge,
   308  		Unit:     metadata.Second,
   309  		Desc:     "99th percentile time between reception of client PUT request and subsequent response to client.",
   310  	},
   311  	"node_get_fsm_time_100": {
   312  		Metric:   "node.latency.100th",
   313  		TagSet:   opentsdb.TagSet{"type": "get"},
   314  		RateType: metadata.Gauge,
   315  		Unit:     metadata.Second,
   316  		Desc:     "100th percentile time between reception of client GET request and subsequent response to client.",
   317  	},
   318  	"node_put_fsm_time_100": {
   319  		Metric:   "node.latency.100th",
   320  		TagSet:   opentsdb.TagSet{"type": "put"},
   321  		RateType: metadata.Gauge,
   322  		Unit:     metadata.Second,
   323  		Desc:     "100th percentile time between reception of client PUT request and subsequent response to client.",
   324  	},
   325  	"node_get_fsm_objsize_mean": {
   326  		Metric:   "node.objsize.mean",
   327  		TagSet:   opentsdb.TagSet{"type": "get"},
   328  		RateType: metadata.Gauge,
   329  		Unit:     metadata.Bytes,
   330  		Desc:     "Mean object size encountered by this node within the last minute.",
   331  	},
   332  	"node_get_fsm_objsize_median": {
   333  		Metric:   "node.objsize.median",
   334  		TagSet:   opentsdb.TagSet{"type": "get"},
   335  		RateType: metadata.Gauge,
   336  		Unit:     metadata.Bytes,
   337  		Desc:     "Median object size encountered by this node within the last minute.",
   338  	},
   339  	"node_get_fsm_objsize_95": {
   340  		Metric:   "node.objsize.95th",
   341  		TagSet:   opentsdb.TagSet{"type": "get"},
   342  		RateType: metadata.Gauge,
   343  		Unit:     metadata.Bytes,
   344  		Desc:     "95th percentile object size encountered by this node within the last minute.",
   345  	},
   346  	"node_get_fsm_objsize_99": {
   347  		Metric:   "node.objsize.99th",
   348  		TagSet:   opentsdb.TagSet{"type": "get"},
   349  		RateType: metadata.Gauge,
   350  		Unit:     metadata.Bytes,
   351  		Desc:     "99th percentile object size encountered by this node within the last minute.",
   352  	},
   353  	"node_get_fsm_objsize_100": {
   354  		Metric:   "node.objsize.100th",
   355  		TagSet:   opentsdb.TagSet{"type": "get"},
   356  		RateType: metadata.Gauge,
   357  		Unit:     metadata.Bytes,
   358  		Desc:     "100th percentile object size encountered by this node within the last minute.",
   359  	},
   360  	"node_get_fsm_siblings_mean": {
   361  		Metric:   "node.siblings.mean",
   362  		TagSet:   opentsdb.TagSet{"type": "get"},
   363  		RateType: metadata.Gauge,
   364  		Unit:     metadata.Count,
   365  		Desc:     "Mean number of siblings encountered during all GET operations by this node within the last minute.",
   366  	},
   367  	"node_get_fsm_siblings_median": {
   368  		Metric:   "node.siblings.median",
   369  		TagSet:   opentsdb.TagSet{"type": "get"},
   370  		RateType: metadata.Gauge,
   371  		Unit:     metadata.Count,
   372  		Desc:     "Median number of siblings encountered during all GET operations by this node within the last minute.",
   373  	},
   374  	"node_get_fsm_siblings_95": {
   375  		Metric:   "node.siblings.95th",
   376  		TagSet:   opentsdb.TagSet{"type": "get"},
   377  		RateType: metadata.Gauge,
   378  		Unit:     metadata.Count,
   379  		Desc:     "95th percentile of siblings encountered during all GET operations by this node within the last minute.",
   380  	},
   381  	"node_get_fsm_siblings_99": {
   382  		Metric:   "node.siblings.99th",
   383  		TagSet:   opentsdb.TagSet{"type": "get"},
   384  		RateType: metadata.Gauge,
   385  		Unit:     metadata.Count,
   386  		Desc:     "99th percentile of siblings encountered during all GET operations by this node within the last minute.",
   387  	},
   388  	"node_get_fsm_siblings_100": {
   389  		Metric:   "node.siblings.100th",
   390  		TagSet:   opentsdb.TagSet{"type": "get"},
   391  		RateType: metadata.Gauge,
   392  		Unit:     metadata.Count,
   393  		Desc:     "100th percentile of siblings encountered during all GET operations by this node within the last minute.",
   394  	},
   395  	"node_get_fsm_rejected_total": {
   396  		Metric:   "node.requests.rejected",
   397  		TagSet:   opentsdb.TagSet{"type": "get"},
   398  		RateType: metadata.Counter,
   399  		Unit:     metadata.Event,
   400  		Desc:     "Total number of GET FSMs rejected by Sidejob's overload protection.",
   401  	},
   402  	"node_put_fsm_rejected_total": {
   403  		Metric:   "node.requests.rejected",
   404  		TagSet:   opentsdb.TagSet{"type": "put"},
   405  		RateType: metadata.Counter,
   406  		Unit:     metadata.Event,
   407  		Desc:     "Total number of PUT FSMs rejected by Sidejob's overload protection.",
   408  	},
   409  	"ring_num_partitions": {
   410  		Metric:   "ring_num_partitions",
   411  		RateType: metadata.Gauge,
   412  		Unit:     metadata.Count,
   413  		Desc:     "The number of partitions in the ring.",
   414  	},
   415  	"ring_creation_size": {
   416  		Metric:   "ring.creation_size",
   417  		RateType: metadata.Gauge,
   418  		Unit:     metadata.Count,
   419  		Desc:     "Ring size this cluster was created with.",
   420  	},
   421  	"cpu_nprocs": {
   422  		Metric:   "cpu.nprocs",
   423  		RateType: metadata.Gauge,
   424  		Unit:     metadata.Count,
   425  		Desc:     "Number of operating system processes.",
   426  	},
   427  	"cpu_avg1": {
   428  		Metric:   "cpu.avg1",
   429  		RateType: metadata.Gauge,
   430  		Unit:     metadata.Load,
   431  		Desc:     "The average number of active processes for the last 1 minute (equivalent to top(1) command’s load average when divided by 256()).",
   432  	},
   433  	"cpu_avg5": {
   434  		Metric:   "cpu.avg5",
   435  		RateType: metadata.Gauge,
   436  		Unit:     metadata.Load,
   437  		Desc:     "The average number of active processes for the last 5 minutes (equivalent to top(1) command’s load average when divided by 256()).",
   438  	},
   439  	"cpu_avg15": {
   440  		Metric:   "cpu.avg15",
   441  		RateType: metadata.Gauge,
   442  		Unit:     metadata.Load,
   443  		Desc:     "The average number of active processes for the last 15 minutes (equivalent to top(1) command’s load average when divided by 256()).",
   444  	},
   445  	"riak_search_vnodeq_total": {
   446  		Metric:   "search.vnodeq",
   447  		RateType: metadata.Counter,
   448  		Unit:     metadata.Event,
   449  		Desc:     "Total number of unprocessed messages all vnode message queues in the Riak Search subsystem have received on this node since it was started.",
   450  	},
   451  	"riak_search_vnodes_running": {
   452  		Metric:   "search.vnodes_running",
   453  		RateType: metadata.Gauge,
   454  		Unit:     metadata.Process,
   455  		Desc:     "Total number of vnodes currently running in the Riak Search subsystem.",
   456  	},
   457  }
   458  
   459  func init() {
   460  	collectors = append(collectors, &IntervalCollector{F: c_riak, Enable: enableRiak})
   461  }
   462  
   463  const (
   464  	localRiakURL string = "http://localhost:8098/stats"
   465  )
   466  
   467  func Riak(s string) error {
   468  	u, err := url.Parse(s)
   469  	if err != nil {
   470  		return err
   471  	}
   472  	collectors = append(collectors,
   473  		&IntervalCollector{
   474  			F: func() (opentsdb.MultiDataPoint, error) {
   475  				return riak(s)
   476  			},
   477  			name: fmt.Sprintf("riak-%s", u.Host),
   478  		})
   479  	return nil
   480  }
   481  
   482  func enableRiak() bool {
   483  	return enableURL(localRiakURL)()
   484  }
   485  
   486  func c_riak() (opentsdb.MultiDataPoint, error) {
   487  	return riak(localRiakURL)
   488  }
   489  
   490  func riak(s string) (opentsdb.MultiDataPoint, error) {
   491  	var md opentsdb.MultiDataPoint
   492  	res, err := http.Get(s)
   493  	if err != nil {
   494  		return nil, err
   495  	}
   496  	defer res.Body.Close()
   497  	var r map[string]interface{}
   498  	if err := json.NewDecoder(res.Body).Decode(&r); err != nil {
   499  		return nil, err
   500  	}
   501  	for k, v := range r {
   502  		if m, ok := riakMeta[k]; ok {
   503  			if v == "undefined" {
   504  				continue
   505  			}
   506  			if strings.HasPrefix(m.Metric, "node.latency") {
   507  				if nl, ok := v.(float64); ok {
   508  					v = nl / 1000000
   509  				} else {
   510  					err := fmt.Errorf("riak: bad integer %s in metric '%s'", v, m.Metric)
   511  					return nil, err
   512  				}
   513  			}
   514  			Add(&md, "riak."+m.Metric, v, m.TagSet, m.RateType, m.Unit, m.Desc)
   515  		} else if k == "connected_nodes" {
   516  			nodes, ok := v.([]interface{})
   517  			// 'connected_nodes' array can be empty
   518  			if !ok {
   519  				err := fmt.Errorf("riak: unexpected content or type for 'connected_nodes' metric array")
   520  				return nil, err
   521  			}
   522  			Add(&md, "riak.connected_nodes", len(nodes), nil, metadata.Gauge, metadata.Count, descConNodes)
   523  		} else if k == "ring_members" {
   524  			ringMembers, ok := v.([]interface{})
   525  			// at least one ring member must always exist
   526  			if !ok || len(ringMembers) < 1 {
   527  				err := fmt.Errorf("riak: unexpected content or type for 'ring_members' metric array")
   528  				return nil, err
   529  			}
   530  			Add(&md, "riak.ring_members", len(ringMembers), nil, metadata.Gauge, metadata.Count, descRingMembers)
   531  		}
   532  	}
   533  	return md, nil
   534  }
   535  
   536  const (
   537  	descConNodes    = "Count of nodes that this node is aware of at this time."
   538  	descRingMembers = "Count of nodes that are members of the ring."
   539  )