bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/database/state_data.go (about)

     1  package database
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"strconv"
     7  	"time"
     8  
     9  	"strings"
    10  
    11  	"bosun.org/models"
    12  	"bosun.org/slog"
    13  	"github.com/garyburd/redigo/redis"
    14  )
    15  
    16  /*
    17  incidentById:{id} - json encoded state. Authoritative source.
    18  
    19  renderedTemplatesById:{id} - json encoded RenderedTemplates by Incident Id
    20  
    21  lastTouched:{alert} - ZSET of alert key to last touched time stamp
    22  unknown:{alert} - Set of unknown alert keys for alert
    23  uneval:{alert} - Set of unevaluated alert keys for alert
    24  
    25  openIncidents - Hash of open incident Ids. Alert Key -> incident id
    26  incidents:{ak} - List of incidents for alert key
    27  
    28  allIncidents - List of all incidents ever. Value is "incidentId:timestamp:ak"
    29  */
    30  
    31  const (
    32  	statesOpenIncidentsKey = "openIncidents"
    33  )
    34  
    35  func statesLastTouchedKey(alert string) string {
    36  	return fmt.Sprintf("lastTouched:%s", alert)
    37  }
    38  func statesUnknownKey(alert string) string {
    39  	return fmt.Sprintf("unknown:%s", alert)
    40  }
    41  func statesUnevalKey(alert string) string {
    42  	return fmt.Sprintf("uneval:%s", alert)
    43  }
    44  func incidentStateKey(id int64) string {
    45  	return fmt.Sprintf("incidentById:%d", id)
    46  }
    47  func renderedTemplatesKey(id int64) string {
    48  	return fmt.Sprintf("renderedTemplatesById:%d", id)
    49  }
    50  func incidentsForAlertKeyKey(ak models.AlertKey) string {
    51  	return fmt.Sprintf("incidents:%s", ak)
    52  }
    53  
    54  type StateDataAccess interface {
    55  	TouchAlertKey(ak models.AlertKey, t time.Time) error
    56  	GetUntouchedSince(alert string, time int64) ([]models.AlertKey, error)
    57  
    58  	GetOpenIncident(ak models.AlertKey) (*models.IncidentState, error)
    59  	GetLatestIncident(ak models.AlertKey) (*models.IncidentState, error)
    60  	GetAllOpenIncidents() ([]*models.IncidentState, error)
    61  	GetIncidentState(incidentId int64) (*models.IncidentState, error)
    62  
    63  	GetAllIncidentsByAlertKey(ak models.AlertKey) ([]*models.IncidentState, error)
    64  	GetAllIncidentIdsByAlertKey(ak models.AlertKey) ([]int64, error)
    65  
    66  	UpdateIncidentState(s *models.IncidentState) (int64, error)
    67  	ImportIncidentState(s *models.IncidentState) error
    68  
    69  	// SetIncidentNext gets the incident for previousIncidentId, and sets its NextId field to be nextIncidentId and then saves the incident
    70  	SetIncidentNext(incidentId, nextIncidentId int64) error
    71  
    72  	SetRenderedTemplates(incidentId int64, rt *models.RenderedTemplates) error
    73  	GetRenderedTemplates(incidentId int64) (*models.RenderedTemplates, error)
    74  	GetRenderedTemplateKeys() ([]string, error)
    75  	CleanupOldRenderedTemplates(olderThan time.Duration)
    76  	DeleteRenderedTemplates(incidentIds []int64) error
    77  
    78  	Forget(ak models.AlertKey) error
    79  	SetUnevaluated(ak models.AlertKey, uneval bool) error
    80  	GetUnknownAndUnevalAlertKeys(alert string) ([]models.AlertKey, []models.AlertKey, error)
    81  }
    82  
    83  func (d *dataAccess) SetRenderedTemplates(incidentId int64, rt *models.RenderedTemplates) error {
    84  	conn := d.Get()
    85  	defer conn.Close()
    86  
    87  	data, err := json.Marshal(rt)
    88  	if err != nil {
    89  		return slog.Wrap(err)
    90  	}
    91  	_, err = conn.Do("SET", renderedTemplatesKey(incidentId), data)
    92  	if err != nil {
    93  		return slog.Wrap(err)
    94  	}
    95  	return nil
    96  }
    97  
    98  func (d *dataAccess) GetRenderedTemplates(incidentId int64) (*models.RenderedTemplates, error) {
    99  	conn := d.Get()
   100  	defer conn.Close()
   101  
   102  	b, err := redis.Bytes(conn.Do("GET", renderedTemplatesKey(incidentId)))
   103  	renderedT := &models.RenderedTemplates{}
   104  	if err != nil {
   105  		if err == redis.ErrNil {
   106  			return renderedT, nil
   107  		}
   108  		return nil, slog.Wrap(err)
   109  	}
   110  	if err = json.Unmarshal(b, renderedT); err != nil {
   111  		return nil, slog.Wrap(err)
   112  	}
   113  	return renderedT, nil
   114  }
   115  
   116  func (d *dataAccess) scanMatchCmd(pattern string) (string, []interface{}, int) {
   117  	//ledis uses XSCAN cursor "KV" MATCH foo
   118  	//redis uses SCAN cursor MATCH foo
   119  	if d.isRedis {
   120  		return "SCAN", []interface{}{"0", "MATCH", pattern}, 0
   121  	}
   122  	return "XSCAN", []interface{}{"KV", "0", "MATCH", pattern}, 1
   123  }
   124  
   125  func (d *dataAccess) GetRenderedTemplateKeys() ([]string, error) {
   126  	conn := d.Get()
   127  	defer conn.Close()
   128  
   129  	cmd, args, cursorIdx := d.scanMatchCmd("renderedTemplatesById:*")
   130  	found := []string{}
   131  	for {
   132  		vals, err := redis.Values(conn.Do(cmd, args...))
   133  		if err != nil {
   134  			return nil, slog.Wrap(err)
   135  		}
   136  		cursor, err := redis.String(vals[0], nil)
   137  		if err != nil {
   138  			return nil, slog.Wrap(err)
   139  		}
   140  		args[cursorIdx] = cursor
   141  		keys, err := redis.Strings(vals[1], nil)
   142  		if err != nil {
   143  			return nil, slog.Wrap(err)
   144  		}
   145  		found = append(found, keys...)
   146  		if cursor == "" || cursor == "0" {
   147  			break
   148  		}
   149  	}
   150  	return found, nil
   151  }
   152  
   153  func (d *dataAccess) DeleteRenderedTemplates(incidentIds []int64) error {
   154  	conn := d.Get()
   155  	defer conn.Close()
   156  	const batchSize = 1000
   157  	args := make([]interface{}, 0, batchSize)
   158  	for len(incidentIds) > 0 {
   159  		size := len(incidentIds)
   160  		if size > batchSize {
   161  			size = batchSize
   162  		}
   163  		thisBatch := incidentIds[:size]
   164  		incidentIds = incidentIds[size:]
   165  		args = args[:0]
   166  		for _, id := range thisBatch {
   167  			args = append(args, renderedTemplatesKey(id))
   168  		}
   169  		_, err := conn.Do("DEL", args...)
   170  		if err != nil {
   171  			return slog.Wrap(err)
   172  		}
   173  	}
   174  	return nil
   175  }
   176  
   177  func (d *dataAccess) State() StateDataAccess {
   178  	return d
   179  }
   180  
   181  func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error {
   182  	conn := d.Get()
   183  	defer conn.Close()
   184  
   185  	_, err := conn.Do("ZADD", statesLastTouchedKey(ak.Name()), t.UTC().Unix(), string(ak))
   186  	return slog.Wrap(err)
   187  }
   188  
   189  func (d *dataAccess) GetUntouchedSince(alert string, time int64) ([]models.AlertKey, error) {
   190  	conn := d.Get()
   191  	defer conn.Close()
   192  
   193  	results, err := redis.Strings(conn.Do("ZRANGEBYSCORE", statesLastTouchedKey(alert), "-inf", time))
   194  	if err != nil {
   195  		return nil, slog.Wrap(err)
   196  	}
   197  	aks := make([]models.AlertKey, len(results))
   198  	for i := range results {
   199  		aks[i] = models.AlertKey(results[i])
   200  	}
   201  	return aks, nil
   202  }
   203  
   204  func (d *dataAccess) GetOpenIncident(ak models.AlertKey) (*models.IncidentState, error) {
   205  	conn := d.Get()
   206  	defer conn.Close()
   207  
   208  	inc, err := d.getLatestIncident(ak, conn)
   209  	if err != nil {
   210  		return nil, slog.Wrap(err)
   211  	}
   212  	if inc == nil {
   213  		return nil, nil
   214  	}
   215  	if inc.Open {
   216  		return inc, nil
   217  	}
   218  	return nil, nil
   219  }
   220  
   221  func (d *dataAccess) getLatestIncident(ak models.AlertKey, conn redis.Conn) (*models.IncidentState, error) {
   222  	id, err := redis.Int64(conn.Do("LINDEX", incidentsForAlertKeyKey(ak), 0))
   223  	if err != nil {
   224  		if err == redis.ErrNil {
   225  			return nil, nil
   226  		}
   227  		return nil, slog.Wrap(err)
   228  	}
   229  	inc, err := d.getIncident(id, conn)
   230  	if err != nil {
   231  		return nil, slog.Wrap(err)
   232  	}
   233  	return inc, nil
   234  }
   235  
   236  func (d *dataAccess) GetLatestIncident(ak models.AlertKey) (*models.IncidentState, error) {
   237  	conn := d.Get()
   238  	defer conn.Close()
   239  
   240  	return d.getLatestIncident(ak, conn)
   241  }
   242  
   243  func (d *dataAccess) GetAllOpenIncidents() ([]*models.IncidentState, error) {
   244  	conn := d.Get()
   245  	defer conn.Close()
   246  
   247  	// get open ids
   248  	ids, err := int64s(conn.Do("HVALS", statesOpenIncidentsKey))
   249  	if err != nil {
   250  		return nil, slog.Wrap(err)
   251  	}
   252  	return d.incidentMultiGet(conn, ids)
   253  }
   254  
   255  func (d *dataAccess) GetAllIncidentsByAlertKey(ak models.AlertKey) ([]*models.IncidentState, error) {
   256  	conn := d.Get()
   257  	defer conn.Close()
   258  
   259  	ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1))
   260  	if err != nil {
   261  		return nil, slog.Wrap(err)
   262  	}
   263  	return d.incidentMultiGet(conn, ids)
   264  }
   265  
   266  func (d *dataAccess) GetAllIncidentIdsByAlertKey(ak models.AlertKey) ([]int64, error) {
   267  	conn := d.Get()
   268  	defer conn.Close()
   269  
   270  	ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1))
   271  	if err != nil {
   272  		return nil, slog.Wrap(err)
   273  	}
   274  	return ids, nil
   275  }
   276  
   277  // In general one should not use the redis KEYS command. So this is only used
   278  // in migration. If we want to use a proper index of all incidents
   279  // then issues with allIncidents must be fixed. Currently it is planned
   280  // to remove allIncidents in a future commit
   281  func (d *dataAccess) getAllIncidentIdsByKeys() ([]int64, error) {
   282  	conn := d.Get()
   283  	defer conn.Close()
   284  
   285  	summaries, err := redis.Strings(conn.Do("KEYS", "incidentById:*"))
   286  	if err != nil {
   287  		return nil, slog.Wrap(err)
   288  	}
   289  	ids := make([]int64, len(summaries))
   290  	for i, sum := range summaries {
   291  		var err error
   292  		ids[i], err = strconv.ParseInt(strings.Split(sum, ":")[1], 0, 64)
   293  		if err != nil {
   294  			return nil, slog.Wrap(err)
   295  		}
   296  	}
   297  	return ids, nil
   298  }
   299  
   300  func (d *dataAccess) incidentMultiGet(conn redis.Conn, ids []int64) ([]*models.IncidentState, error) {
   301  	if len(ids) == 0 {
   302  		return nil, nil
   303  	}
   304  	// get all incident json keys
   305  	args := make([]interface{}, 0, len(ids))
   306  	for _, id := range ids {
   307  		args = append(args, incidentStateKey(id))
   308  	}
   309  	jsons, err := redis.Strings(conn.Do("MGET", args...))
   310  	if err != nil {
   311  		return nil, slog.Wrap(err)
   312  	}
   313  	results := make([]*models.IncidentState, 0, len(jsons))
   314  	for _, j := range jsons {
   315  		state := &models.IncidentState{}
   316  		if err = json.Unmarshal([]byte(j), state); err != nil {
   317  			return nil, slog.Wrap(err)
   318  		}
   319  		results = append(results, state)
   320  	}
   321  	return results, nil
   322  }
   323  
   324  func (d *dataAccess) getIncident(incidentId int64, conn redis.Conn) (*models.IncidentState, error) {
   325  	b, err := redis.Bytes(conn.Do("GET", incidentStateKey(incidentId)))
   326  	if err != nil {
   327  		return nil, slog.Wrap(err)
   328  	}
   329  	state := &models.IncidentState{}
   330  	if err = json.Unmarshal(b, state); err != nil {
   331  		return nil, slog.Wrap(err)
   332  	}
   333  	return state, nil
   334  }
   335  
   336  // setIncident directly sets the incident as is to the datastore
   337  func (d *dataAccess) setIncident(incident *models.IncidentState, conn redis.Conn) error {
   338  	data, err := json.Marshal(incident)
   339  	if err != nil {
   340  		return slog.Wrap(err)
   341  	}
   342  	if _, err = conn.Do("SET", incidentStateKey(incident.Id), data); err != nil {
   343  		return err
   344  	}
   345  	return nil
   346  }
   347  
   348  func (d *dataAccess) GetIncidentState(incidentId int64) (*models.IncidentState, error) {
   349  	conn := d.Get()
   350  	defer conn.Close()
   351  	return d.getIncident(incidentId, conn)
   352  }
   353  
   354  // SetIncidentNext gets the incident for previousIncidentId, and sets its NextId field
   355  // to be nextIncidentId and then saves the incident
   356  func (d *dataAccess) SetIncidentNext(previousIncidentId, nextIncidentId int64) error {
   357  	conn := d.Get()
   358  	defer conn.Close()
   359  	previousIncident, err := d.getIncident(previousIncidentId, conn)
   360  	if err != nil {
   361  		return err
   362  	}
   363  	previousIncident.NextId = nextIncidentId
   364  	err = d.setIncident(previousIncident, conn)
   365  	if err != nil {
   366  		return err
   367  	}
   368  	return nil
   369  }
   370  
   371  func (d *dataAccess) UpdateIncidentState(s *models.IncidentState) (int64, error) {
   372  	return d.save(s, false)
   373  }
   374  
   375  func (d *dataAccess) ImportIncidentState(s *models.IncidentState) error {
   376  	_, err := d.save(s, true)
   377  	return err
   378  }
   379  
   380  func (d *dataAccess) save(s *models.IncidentState, isImport bool) (int64, error) {
   381  	conn := d.Get()
   382  	defer conn.Close()
   383  
   384  	isNew := false
   385  	//if id is still zero, assign new id.
   386  	if s.Id == 0 {
   387  		id, err := redis.Int64(conn.Do("INCR", "maxIncidentId"))
   388  		if err != nil {
   389  			return s.Id, slog.Wrap(err)
   390  		}
   391  		s.Id = id
   392  		isNew = true
   393  	} else if isImport {
   394  		max, err := redis.Int64(conn.Do("GET", "maxIncidentId"))
   395  		if err != nil {
   396  			max = 0
   397  		}
   398  		if max < s.Id {
   399  			if _, err = conn.Do("SET", "maxIncidentId", s.Id); err != nil {
   400  				return s.Id, slog.Wrap(err)
   401  			}
   402  		}
   403  		isNew = true
   404  	}
   405  	return s.Id, d.transact(conn, func() error {
   406  		if isNew {
   407  			// add to list for alert key
   408  			if _, err := conn.Do("LPUSH", incidentsForAlertKeyKey(s.AlertKey), s.Id); err != nil {
   409  				return slog.Wrap(err)
   410  			}
   411  			dat := fmt.Sprintf("%d:%d:%s", s.Id, s.Start.UTC().Unix(), s.AlertKey)
   412  			if _, err := conn.Do("LPUSH", "allIncidents", dat); err != nil {
   413  				return slog.Wrap(err)
   414  			}
   415  		}
   416  
   417  		// store the incident json
   418  		data, err := json.Marshal(s)
   419  		if err != nil {
   420  			return slog.Wrap(err)
   421  		}
   422  		_, err = conn.Do("SET", incidentStateKey(s.Id), data)
   423  
   424  		addRem := func(b bool) string {
   425  			if b {
   426  				return "SADD"
   427  			}
   428  			return "SREM"
   429  		}
   430  		// appropriately add or remove it from the "open" set
   431  		if s.Open {
   432  			if _, err = conn.Do("HSET", statesOpenIncidentsKey, s.AlertKey, s.Id); err != nil {
   433  				return slog.Wrap(err)
   434  			}
   435  		} else {
   436  			if _, err = conn.Do("HDEL", statesOpenIncidentsKey, s.AlertKey); err != nil {
   437  				return slog.Wrap(err)
   438  			}
   439  		}
   440  
   441  		//appropriately add or remove from unknown and uneval sets
   442  		if _, err = conn.Do(addRem(s.CurrentStatus == models.StUnknown), statesUnknownKey(s.Alert), s.AlertKey); err != nil {
   443  			return slog.Wrap(err)
   444  		}
   445  		if _, err = conn.Do(addRem(s.Unevaluated), statesUnevalKey(s.Alert), s.AlertKey); err != nil {
   446  			return slog.Wrap(err)
   447  		}
   448  		return nil
   449  	})
   450  }
   451  
   452  func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error {
   453  	conn := d.Get()
   454  	defer conn.Close()
   455  
   456  	op := "SREM"
   457  	if uneval {
   458  		op = "SADD"
   459  	}
   460  	_, err := conn.Do(op, statesUnevalKey(ak.Name()), ak)
   461  	return slog.Wrap(err)
   462  }
   463  
   464  // The nucular option. Delete all we know about this alert key
   465  func (d *dataAccess) Forget(ak models.AlertKey) error {
   466  	conn := d.Get()
   467  	defer conn.Close()
   468  
   469  	ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1))
   470  	if err != nil {
   471  		return slog.Wrap(err)
   472  	}
   473  	alert := ak.Name()
   474  	return d.transact(conn, func() error {
   475  		// last touched.
   476  		if _, err := conn.Do("ZREM", statesLastTouchedKey(alert), ak); err != nil {
   477  			return slog.Wrap(err)
   478  		}
   479  		// unknown/uneval sets
   480  		if _, err := conn.Do("SREM", statesUnknownKey(alert), ak); err != nil {
   481  			return slog.Wrap(err)
   482  		}
   483  		if _, err := conn.Do("SREM", statesUnevalKey(alert), ak); err != nil {
   484  			return slog.Wrap(err)
   485  		}
   486  		//open set
   487  		if _, err := conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil {
   488  			return slog.Wrap(err)
   489  		}
   490  		if _, err = conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil {
   491  			return slog.Wrap(err)
   492  		}
   493  		for _, id := range ids {
   494  			if _, err = conn.Do("DEL", incidentStateKey(id)); err != nil {
   495  				return slog.Wrap(err)
   496  			}
   497  			if _, err = conn.Do("DEL", renderedTemplatesKey(id)); err != nil {
   498  				return slog.Wrap(err)
   499  			}
   500  		}
   501  		if _, err := conn.Do(d.LCLEAR(), incidentsForAlertKeyKey(ak)); err != nil {
   502  			return slog.Wrap(err)
   503  		}
   504  		return nil
   505  	})
   506  }
   507  
   508  func (d *dataAccess) GetUnknownAndUnevalAlertKeys(alert string) ([]models.AlertKey, []models.AlertKey, error) {
   509  	conn := d.Get()
   510  	defer conn.Close()
   511  
   512  	unknownS, err := redis.Strings(conn.Do("SMEMBERS", statesUnknownKey(alert)))
   513  	if err != nil {
   514  		return nil, nil, slog.Wrap(err)
   515  	}
   516  	unknown := make([]models.AlertKey, len(unknownS))
   517  	for i, u := range unknownS {
   518  		unknown[i] = models.AlertKey(u)
   519  	}
   520  
   521  	unEvals, err := redis.Strings(conn.Do("SMEMBERS", statesUnevalKey(alert)))
   522  	if err != nil {
   523  		return nil, nil, slog.Wrap(err)
   524  	}
   525  	unevals := make([]models.AlertKey, len(unEvals))
   526  	for i, u := range unEvals {
   527  		unevals[i] = models.AlertKey(u)
   528  	}
   529  
   530  	return unknown, unevals, nil
   531  }
   532  
   533  func int64s(reply interface{}, err error) ([]int64, error) {
   534  	if err != nil {
   535  		return nil, slog.Wrap(err)
   536  	}
   537  	ints := []int64{}
   538  	values, err := redis.Values(reply, err)
   539  	if err != nil {
   540  		return ints, slog.Wrap(err)
   541  	}
   542  	if err := redis.ScanSlice(values, &ints); err != nil {
   543  		return ints, slog.Wrap(err)
   544  	}
   545  	return ints, nil
   546  }
   547  
   548  func (d *dataAccess) transact(conn redis.Conn, f func() error) error {
   549  	if !d.isRedis {
   550  		return f()
   551  	}
   552  	if _, err := conn.Do("MULTI"); err != nil {
   553  		return slog.Wrap(err)
   554  	}
   555  	if err := f(); err != nil {
   556  		return slog.Wrap(err)
   557  	}
   558  	if _, err := conn.Do("EXEC"); err != nil {
   559  		return slog.Wrap(err)
   560  	}
   561  	return nil
   562  }
   563  
   564  // CleanupCleanupOldRenderedTemplates will in a loop purge any old rendered templates
   565  func (d *dataAccess) CleanupOldRenderedTemplates(olderThan time.Duration) {
   566  	// run after 5 minutes (to let bosun stabilize)
   567  	// and then every hour
   568  	time.Sleep(time.Minute * 5)
   569  	for {
   570  		conn := d.Get()
   571  		slog.Infof("Cleaning out old rendered templates")
   572  		earliestOk := time.Now().UTC().Add(-1 * olderThan)
   573  		func() {
   574  			toPurge := []int64{}
   575  			keys, err := d.GetRenderedTemplateKeys()
   576  			if err != nil {
   577  				slog.Error(err)
   578  				return
   579  			}
   580  			for _, key := range keys {
   581  				parts := strings.Split(key, ":")
   582  				if len(parts) != 2 {
   583  					slog.Errorf("Invalid rendered template redis key found: %s", key)
   584  					continue
   585  				}
   586  				id, err := strconv.ParseInt(parts[1], 10, 64)
   587  				if err != nil {
   588  					slog.Error(err)
   589  					continue
   590  				}
   591  				state, err := d.getIncident(id, conn)
   592  				if err != nil {
   593  					if IsRedisNil(err) {
   594  						toPurge = append(toPurge, id)
   595  						continue
   596  					}
   597  					slog.Error(err)
   598  					continue
   599  				}
   600  				if state.End != nil && (*state.End).Before(earliestOk) {
   601  					toPurge = append(toPurge, id)
   602  				}
   603  			}
   604  			if len(toPurge) == 0 {
   605  				return
   606  			}
   607  			slog.Infof("Deleting %d old rendered templates", len(toPurge))
   608  			if err = d.DeleteRenderedTemplates(toPurge); err != nil {
   609  				slog.Error(err)
   610  				return
   611  			}
   612  		}()
   613  		conn.Close()
   614  		slog.Info("Done cleaning rendered templates")
   615  		time.Sleep(time.Hour)
   616  	}
   617  }
   618  
   619  func IsRedisNil(err error) bool {
   620  	if err != nil && strings.Contains(err.Error(), "nil returned") {
   621  		return true
   622  	}
   623  	return false
   624  }