github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/state/metrics.go (about)

     1  // Copyright 2014 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package state
     5  
     6  import (
     7  	"encoding/json"
     8  	"fmt"
     9  	"sort"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/juju/charm/v12"
    14  	"github.com/juju/errors"
    15  	"github.com/juju/loggo"
    16  	"github.com/juju/mgo/v3"
    17  	"github.com/juju/mgo/v3/bson"
    18  	"github.com/juju/mgo/v3/txn"
    19  	"github.com/juju/names/v5"
    20  )
    21  
// metricsLogger is the logger used by the metrics code in this file; it is
// registered under the name "juju.state.metrics".
var metricsLogger = loggo.GetLogger("juju.state.metrics")
    23  
const (
	// CleanupAge is how long a metric batch is retained after it has been
	// marked as sent: SetSent and SetMetricBatchesSent record a delete-time
	// of the sent time plus CleanupAge, and CleanupOldMetrics removes sent
	// batches whose delete-time has been reached.
	CleanupAge = time.Hour * 24
)
    27  
// MetricBatch represents a batch of metrics reported from a unit.
// These will be received from the unit in batches.
// The main contents of each metric (key, value) are defined
// by the charm author and sent from the unit via a call to
// add-metric.
type MetricBatch struct {
	// st is the State the batch was created through or loaded from.
	st  *State
	// doc is the stored representation of the batch.
	doc metricBatchDoc
}
    37  
// metricBatchDoc is the document stored in the metrics collection for a
// single metric batch.
type metricBatchDoc struct {
	// UUID identifies the batch and is used directly as the document id.
	UUID           string    `bson:"_id"`
	// ModelUUID is the UUID of the model the batch was added to.
	ModelUUID      string    `bson:"model-uuid"`
	// Unit is the id of the reporting unit; it is left empty for batches
	// created by AddModelMetrics.
	Unit           string    `bson:"unit"`
	// CharmURL is the charm URL supplied with the batch; it is left empty
	// for batches created by AddModelMetrics.
	CharmURL       string    `bson:"charmurl"`
	// Sent is false when the batch is created and set to true by
	// SetSent / SetMetricBatchesSent.
	Sent           bool      `bson:"sent"`
	// DeleteTime is written together with Sent; CleanupOldMetrics removes
	// sent batches whose DeleteTime is not after the current time.
	DeleteTime     time.Time `bson:"delete-time"`
	// Created is the creation time supplied by the caller adding the batch.
	Created        time.Time `bson:"created"`
	// Metrics holds the individual metrics in the batch.
	Metrics        []Metric  `bson:"metrics"`
	// Credentials holds the owning application's metric credentials as they
	// were when the batch was added by AddMetrics.
	Credentials    []byte    `bson:"credentials"`
	// SLACredentials holds the SLA credential read from state when the batch
	// was added; the field is omitted from the document when empty.
	SLACredentials []byte    `bson:"sla-credentials,omitempty"`
}
    50  
// Metric represents a single Metric.
type Metric struct {
	// Key names the metric; together with Value it is checked against the
	// charm's declared metrics when a unit batch is validated.
	Key    string            `bson:"key"`
	// Value is the reported value, kept in string form.
	Value  string            `bson:"value"`
	// Time is the timestamp attached to the metric; UniqueMetrics uses it to
	// decide which value is the latest.
	Time   time.Time         `bson:"time"`
	// Labels are optional name/value pairs; metrics with the same Key but
	// different Labels are treated as distinct by UniqueMetrics. The field
	// is omitted from the stored document when empty.
	Labels map[string]string `bson:"labels,omitempty"`
}
    58  
    59  type byTime []Metric
    60  
    61  // Len implements sort.Interface.
    62  func (t byTime) Len() int { return len(t) }
    63  
    64  // Swap implements sort.Interface.
    65  func (t byTime) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
    66  
    67  // Less implements sort.Interface.
    68  func (t byTime) Less(i, j int) bool {
    69  	return t[i].Time.Before(t[j].Time)
    70  }
    71  
    72  type byKey []Metric
    73  
    74  // Len implements sort.Interface.
    75  func (t byKey) Len() int { return len(t) }
    76  
    77  // Swap implements sort.Interface.
    78  func (t byKey) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
    79  
    80  // Less implements sort.Interface.
    81  func (t byKey) Less(i, j int) bool {
    82  	if t[i].Key == t[j].Key {
    83  		return labelsKey(t[i].Labels) < labelsKey(t[j].Labels)
    84  	}
    85  	return t[i].Key < t[j].Key
    86  }
    87  
    88  func labelsKey(m map[string]string) string {
    89  	var result []string
    90  	for k, v := range m {
    91  		result = append(result, fmt.Sprintf("%s=%s", k, v))
    92  	}
    93  	sort.Strings(result)
    94  	return strings.Join(result, ",")
    95  }
    96  
    97  // validate checks that the MetricBatch contains valid metrics.
    98  func (m *MetricBatch) validate() error {
    99  	ch, err := m.st.Charm(m.doc.CharmURL)
   100  	if err != nil {
   101  		return errors.Trace(err)
   102  	}
   103  	chMetrics := ch.Metrics()
   104  	if chMetrics == nil {
   105  		return errors.Errorf("charm doesn't implement metrics")
   106  	}
   107  	for _, m := range m.doc.Metrics {
   108  		if err := chMetrics.ValidateMetric(m.Key, m.Value); err != nil {
   109  			return errors.Trace(err)
   110  		}
   111  	}
   112  	return nil
   113  }
   114  
// BatchParam contains the properties of the metrics batch used when creating a metrics
// batch with AddMetrics.
type BatchParam struct {
	// UUID becomes the id of the stored batch.
	UUID     string
	// CharmURL is parsed by AddMetrics and must be a valid charm URL.
	CharmURL string
	// Created is stored as the batch's creation time.
	Created  time.Time
	// Metrics are the metrics in the batch; AddMetrics rejects an empty list.
	Metrics  []Metric
	// Unit identifies the unit that reported the metrics.
	Unit     names.UnitTag
}
   124  
// ModelBatchParam contains the properties of a metric batch for a model.
// The model UUID is not part of the parameters: AddModelMetrics fills it in
// from the State it is called on.
type ModelBatchParam struct {
	// UUID becomes the id of the stored batch.
	UUID    string
	// Created is stored as the batch's creation time.
	Created time.Time
	// Metrics are the metrics in the batch; AddModelMetrics rejects an
	// empty list.
	Metrics []Metric
}
   132  
   133  // AddMetrics adds a new batch of metrics to the database.
   134  func (st *State) AddMetrics(batch BatchParam) (*MetricBatch, error) {
   135  	if len(batch.Metrics) == 0 {
   136  		return nil, errors.New("cannot add a batch of 0 metrics")
   137  	}
   138  	charmURL, err := charm.ParseURL(batch.CharmURL)
   139  	if err != nil {
   140  		return nil, errors.NewNotValid(err, "could not parse charm URL")
   141  	}
   142  
   143  	unit, err := st.Unit(batch.Unit.Id())
   144  	if err != nil {
   145  		return nil, errors.Trace(err)
   146  	}
   147  	application, err := unit.Application()
   148  	if err != nil {
   149  		return nil, errors.Trace(err)
   150  	}
   151  
   152  	slaCreds, err := st.SLACredential()
   153  	if err != nil {
   154  		return nil, errors.Trace(err)
   155  	}
   156  
   157  	metric := &MetricBatch{
   158  		st: st,
   159  		doc: metricBatchDoc{
   160  			UUID:           batch.UUID,
   161  			ModelUUID:      st.ModelUUID(),
   162  			Unit:           batch.Unit.Id(),
   163  			CharmURL:       charmURL.String(),
   164  			Sent:           false,
   165  			Created:        batch.Created,
   166  			Metrics:        batch.Metrics,
   167  			Credentials:    application.MetricCredentials(),
   168  			SLACredentials: slaCreds,
   169  		},
   170  	}
   171  	if err := metric.validate(); err != nil {
   172  		return nil, err
   173  	}
   174  	buildTxn := func(attempt int) ([]txn.Op, error) {
   175  		if attempt > 0 {
   176  			notDead, err := isNotDead(st, unitsC, batch.Unit.Id())
   177  			if err != nil || !notDead {
   178  				return nil, errors.NotFoundf(batch.Unit.Id())
   179  			}
   180  			exists, err := st.MetricBatch(batch.UUID)
   181  			if exists != nil && err == nil {
   182  				return nil, errors.AlreadyExistsf("metrics batch UUID %q", batch.UUID)
   183  			}
   184  			if !errors.IsNotFound(err) {
   185  				return nil, errors.Trace(err)
   186  			}
   187  		}
   188  		ops := []txn.Op{{
   189  			C:      unitsC,
   190  			Id:     st.docID(batch.Unit.Id()),
   191  			Assert: notDeadDoc,
   192  		}, {
   193  			C:      metricsC,
   194  			Id:     metric.UUID(),
   195  			Assert: txn.DocMissing,
   196  			Insert: &metric.doc,
   197  		}}
   198  		return ops, nil
   199  	}
   200  	err = st.db().Run(buildTxn)
   201  	if err != nil {
   202  		return nil, errors.Trace(err)
   203  	}
   204  
   205  	return metric, nil
   206  }
   207  
   208  // AddModelMetrics adds a new model-centric batch of metrics to the database.
   209  func (st *State) AddModelMetrics(batch ModelBatchParam) (*MetricBatch, error) {
   210  	if len(batch.Metrics) == 0 {
   211  		return nil, errors.New("cannot add a batch of 0 metrics")
   212  	}
   213  	slaCreds, err := st.SLACredential()
   214  	if err != nil {
   215  		return nil, errors.Trace(err)
   216  	}
   217  	metric := &MetricBatch{
   218  		st: st,
   219  		doc: metricBatchDoc{
   220  			UUID:           batch.UUID,
   221  			ModelUUID:      st.ModelUUID(),
   222  			Sent:           false,
   223  			Created:        batch.Created,
   224  			Metrics:        batch.Metrics,
   225  			SLACredentials: slaCreds,
   226  		},
   227  	}
   228  	buildTxn := func(attempt int) ([]txn.Op, error) {
   229  		if attempt > 0 {
   230  			exists, err := st.MetricBatch(batch.UUID)
   231  			if exists != nil && err == nil {
   232  				return nil, errors.AlreadyExistsf("metrics batch UUID %q", batch.UUID)
   233  			}
   234  			if !errors.IsNotFound(err) {
   235  				return nil, errors.Trace(err)
   236  			}
   237  		}
   238  		ops := []txn.Op{{
   239  			C:      metricsC,
   240  			Id:     metric.UUID(),
   241  			Assert: txn.DocMissing,
   242  			Insert: &metric.doc,
   243  		}}
   244  		return ops, nil
   245  	}
   246  	err = st.db().Run(buildTxn)
   247  	if err != nil {
   248  		return nil, errors.Trace(err)
   249  	}
   250  
   251  	return metric, nil
   252  }
   253  
   254  // AllMetricBatches returns all metric batches currently stored in state.
   255  // TODO (tasdomas): this method is currently only used in the uniter worker test -
   256  //
   257  //	it needs to be modified to restrict the scope of the values it
   258  //	returns if it is to be used outside of tests.
   259  func (st *State) AllMetricBatches() ([]MetricBatch, error) {
   260  	c, closer := st.db().GetCollection(metricsC)
   261  	defer closer()
   262  	docs := []metricBatchDoc{}
   263  	err := c.Find(nil).All(&docs)
   264  	if err != nil {
   265  		return nil, errors.Trace(err)
   266  	}
   267  	results := make([]MetricBatch, len(docs))
   268  	for i, doc := range docs {
   269  		results[i] = MetricBatch{st: st, doc: doc}
   270  	}
   271  	return results, nil
   272  }
   273  
   274  func (st *State) queryMetricBatches(query bson.M) ([]MetricBatch, error) {
   275  	c, closer := st.db().GetCollection(metricsC)
   276  	defer closer()
   277  	docs := []metricBatchDoc{}
   278  	err := c.Find(query).Sort("created").All(&docs)
   279  	if err != nil {
   280  		return nil, errors.Trace(err)
   281  	}
   282  	results := make([]MetricBatch, len(docs))
   283  	for i, doc := range docs {
   284  		results[i] = MetricBatch{st: st, doc: doc}
   285  	}
   286  	return results, nil
   287  }
   288  
   289  // MetricBatchesForUnit returns metric batches for the given unit.
   290  func (st *State) MetricBatchesForUnit(unit string) ([]MetricBatch, error) {
   291  	_, err := st.Unit(unit)
   292  	if err != nil {
   293  		return nil, errors.Trace(err)
   294  	}
   295  	return st.queryMetricBatches(bson.M{"unit": unit})
   296  }
   297  
   298  // MetricBatchesForModel returns metric batches for all the units in the model.
   299  func (st *State) MetricBatchesForModel() ([]MetricBatch, error) {
   300  	return st.queryMetricBatches(bson.M{"model-uuid": st.ModelUUID()})
   301  }
   302  
   303  // MetricBatchesForApplication returns metric batches for the given application.
   304  func (st *State) MetricBatchesForApplication(application string) ([]MetricBatch, error) {
   305  	app, err := st.Application(application)
   306  	if err != nil {
   307  		return nil, errors.Trace(err)
   308  	}
   309  	units, err := app.AllUnits()
   310  	if err != nil {
   311  		return nil, errors.Trace(err)
   312  	}
   313  	unitNames := make([]bson.M, len(units))
   314  	for i, u := range units {
   315  		unitNames[i] = bson.M{"unit": u.Name()}
   316  	}
   317  	return st.queryMetricBatches(bson.M{"$or": unitNames})
   318  }
   319  
   320  // MetricBatch returns the metric batch with the given id.
   321  func (st *State) MetricBatch(id string) (*MetricBatch, error) {
   322  	c, closer := st.db().GetCollection(metricsC)
   323  	defer closer()
   324  	doc := metricBatchDoc{}
   325  	err := c.Find(bson.M{"_id": id}).One(&doc)
   326  	if err == mgo.ErrNotFound {
   327  		return nil, errors.NotFoundf("metric %v", id)
   328  	}
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  	return &MetricBatch{st: st, doc: doc}, nil
   333  }
   334  
   335  // CleanupOldMetrics looks for metrics that are 24 hours old (or older)
   336  // and have been sent. Any metrics it finds are deleted.
   337  func (st *State) CleanupOldMetrics() error {
   338  	now := st.clock().Now()
   339  	metrics, closer := st.db().GetCollection(metricsC)
   340  	defer closer()
   341  	// Nothing else in the system will interact with sent metrics, and nothing needs
   342  	// to watch them either; so in this instance it's safe to do an end run around the
   343  	// mgo/txn package. See State.cleanupRelationSettings for a similar situation.
   344  	metricsW := metrics.Writeable()
   345  	// TODO (mattyw) iter over this.
   346  	info, err := metricsW.RemoveAll(bson.M{
   347  		"model-uuid":  st.ModelUUID(),
   348  		"sent":        true,
   349  		"delete-time": bson.M{"$lte": now},
   350  	})
   351  	if err == nil {
   352  		metricsLogger.Tracef("cleanup removed %d metrics", info.Removed)
   353  	}
   354  	return errors.Trace(err)
   355  }
   356  
   357  // MetricsToSend returns batchSize metrics that need to be sent
   358  // to the collector
   359  func (st *State) MetricsToSend(batchSize int) ([]*MetricBatch, error) {
   360  	var docs []metricBatchDoc
   361  	c, closer := st.db().GetCollection(metricsC)
   362  	defer closer()
   363  
   364  	q := bson.M{
   365  		"model-uuid": st.ModelUUID(),
   366  		"sent":       false,
   367  	}
   368  	err := c.Find(q).Limit(batchSize).All(&docs)
   369  	if err != nil {
   370  		return nil, errors.Trace(err)
   371  	}
   372  
   373  	batch := make([]*MetricBatch, len(docs))
   374  	for i, doc := range docs {
   375  		batch[i] = &MetricBatch{st: st, doc: doc}
   376  
   377  	}
   378  
   379  	return batch, nil
   380  }
   381  
   382  // CountOfUnsentMetrics returns the number of metrics that
   383  // haven't been sent to the collection service.
   384  func (st *State) CountOfUnsentMetrics() (int, error) {
   385  	c, closer := st.db().GetCollection(metricsC)
   386  	defer closer()
   387  	return c.Find(bson.M{
   388  		"model-uuid": st.ModelUUID(),
   389  		"sent":       false,
   390  	}).Count()
   391  }
   392  
   393  // CountOfSentMetrics returns the number of metrics that
   394  // have been sent to the collection service and have not
   395  // been removed by the cleanup worker.
   396  func (st *State) CountOfSentMetrics() (int, error) {
   397  	c, closer := st.db().GetCollection(metricsC)
   398  	defer closer()
   399  	return c.Find(bson.M{
   400  		"model-uuid": st.ModelUUID(),
   401  		"sent":       true,
   402  	}).Count()
   403  }
   404  
// MarshalJSON defines how the MetricBatch type should be
// converted to json: the batch is encoded as its underlying document.
// The document struct only carries bson tags, so the JSON field names are
// the Go field names.
func (m *MetricBatch) MarshalJSON() ([]byte, error) {
	return json.Marshal(m.doc)
}
   410  
// UUID returns the UUID of the metric batch, which is also the id of its
// stored document.
func (m *MetricBatch) UUID() string {
	return m.doc.UUID
}
   415  
// ModelUUID returns the UUID of the model this metric batch applies to.
func (m *MetricBatch) ModelUUID() string {
	return m.doc.ModelUUID
}
   420  
// Unit returns the name of the unit this metric batch was generated in.
// It is empty for batches added with AddModelMetrics, which records no unit.
func (m *MetricBatch) Unit() string {
	return m.doc.Unit
}
   425  
// CharmURL returns the charm URL of the charm this metric batch was
// generated in. It is empty for batches added with AddModelMetrics, which
// records no charm URL.
func (m *MetricBatch) CharmURL() string {
	return m.doc.CharmURL
}
   430  
// Created returns the creation time recorded for this metric batch, as
// supplied by the caller that added it.
func (m *MetricBatch) Created() time.Time {
	return m.doc.Created
}
   435  
// Sent reports whether this metric batch has been marked as sent to the
// metric collection service. The value reflects this in-memory copy: it is
// updated by SetSent on the same object, not by changes made elsewhere.
func (m *MetricBatch) Sent() bool {
	return m.doc.Sent
}
   441  
// Metrics returns the metrics in this batch.
// The returned slice is a fresh copy, so reordering or replacing its
// elements does not affect the batch. The copy is shallow: the Labels maps
// are shared with the batch's own metrics.
func (m *MetricBatch) Metrics() []Metric {
	result := make([]Metric, len(m.doc.Metrics))
	copy(result, m.doc.Metrics)
	return result
}
   448  
   449  // UniqueMetrics returns only the last value for each
   450  // metric key in this batch.
   451  func (m *MetricBatch) UniqueMetrics() []Metric {
   452  	metrics := m.Metrics()
   453  	sort.Sort(byTime(metrics))
   454  	uniq := map[string]Metric{}
   455  	for _, m := range metrics {
   456  		uniq[fmt.Sprintf("%s-%s", m.Key, labelsKey(m.Labels))] = m
   457  	}
   458  	results := make([]Metric, len(uniq))
   459  	i := 0
   460  	for _, m := range uniq {
   461  		results[i] = m
   462  		i++
   463  	}
   464  	sort.Sort(byKey(results))
   465  	return results
   466  }
   467  
   468  // SetSent marks the metric has having been sent at
   469  // the specified time.
   470  func (m *MetricBatch) SetSent(t time.Time) error {
   471  	deleteTime := t.UTC().Add(CleanupAge)
   472  	ops := setSentOps([]string{m.UUID()}, deleteTime)
   473  	if err := m.st.db().RunTransaction(ops); err != nil {
   474  		return errors.Annotatef(err, "cannot set metric sent for metric %q", m.UUID())
   475  	}
   476  
   477  	m.doc.Sent = true
   478  	m.doc.DeleteTime = deleteTime
   479  	return nil
   480  }
   481  
// Credentials returns any credentials associated with the metric batch.
// The batch's own byte slice is returned, not a copy.
func (m *MetricBatch) Credentials() []byte {
	return m.doc.Credentials
}
   486  
// SLACredentials returns any sla credentials associated with the metric batch.
// The batch's own byte slice is returned, not a copy.
func (m *MetricBatch) SLACredentials() []byte {
	return m.doc.SLACredentials
}
   491  
   492  func setSentOps(batchUUIDs []string, deleteTime time.Time) []txn.Op {
   493  	ops := make([]txn.Op, len(batchUUIDs))
   494  	for i, u := range batchUUIDs {
   495  		ops[i] = txn.Op{
   496  			C:      metricsC,
   497  			Id:     u,
   498  			Assert: txn.DocExists,
   499  			Update: bson.M{"$set": bson.M{"sent": true, "delete-time": deleteTime}},
   500  		}
   501  	}
   502  	return ops
   503  }
   504  
   505  // SetMetricBatchesSent sets sent on each MetricBatch corresponding to the uuids provided.
   506  func (st *State) SetMetricBatchesSent(batchUUIDs []string) error {
   507  	deleteTime := st.clock().Now().UTC().Add(CleanupAge)
   508  	ops := setSentOps(batchUUIDs, deleteTime)
   509  	if err := st.db().RunTransaction(ops); err != nil {
   510  		return errors.Annotatef(err, "cannot set metric sent in bulk call")
   511  	}
   512  	return nil
   513  }