bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/database/state_data.go (about) 1 package database 2 3 import ( 4 "encoding/json" 5 "fmt" 6 "strconv" 7 "time" 8 9 "strings" 10 11 "bosun.org/models" 12 "bosun.org/slog" 13 "github.com/garyburd/redigo/redis" 14 ) 15 16 /* 17 incidentById:{id} - json encoded state. Authoritative source. 18 19 renderedTemplatesById:{id} - json encoded RenderedTemplates by Incident Id 20 21 lastTouched:{alert} - ZSET of alert key to last touched time stamp 22 unknown:{alert} - Set of unknown alert keys for alert 23 unevel:{alert} - Set of unevaluated alert keys for alert 24 25 openIncidents - Hash of open incident Ids. Alert Key -> incident id 26 incidents:{ak} - List of incidents for alert key 27 28 allIncidents - List of all incidents ever. Value is "incidentId:timestamp:ak" 29 */ 30 31 const ( 32 statesOpenIncidentsKey = "openIncidents" 33 ) 34 35 func statesLastTouchedKey(alert string) string { 36 return fmt.Sprintf("lastTouched:%s", alert) 37 } 38 func statesUnknownKey(alert string) string { 39 return fmt.Sprintf("unknown:%s", alert) 40 } 41 func statesUnevalKey(alert string) string { 42 return fmt.Sprintf("uneval:%s", alert) 43 } 44 func incidentStateKey(id int64) string { 45 return fmt.Sprintf("incidentById:%d", id) 46 } 47 func renderedTemplatesKey(id int64) string { 48 return fmt.Sprintf("renderedTemplatesById:%d", id) 49 } 50 func incidentsForAlertKeyKey(ak models.AlertKey) string { 51 return fmt.Sprintf("incidents:%s", ak) 52 } 53 54 type StateDataAccess interface { 55 TouchAlertKey(ak models.AlertKey, t time.Time) error 56 GetUntouchedSince(alert string, time int64) ([]models.AlertKey, error) 57 58 GetOpenIncident(ak models.AlertKey) (*models.IncidentState, error) 59 GetLatestIncident(ak models.AlertKey) (*models.IncidentState, error) 60 GetAllOpenIncidents() ([]*models.IncidentState, error) 61 GetIncidentState(incidentId int64) (*models.IncidentState, error) 62 63 GetAllIncidentsByAlertKey(ak models.AlertKey) ([]*models.IncidentState, error) 64 GetAllIncidentIdsByAlertKey(ak models.AlertKey) ([]int64, error) 65 66 UpdateIncidentState(s *models.IncidentState) (int64, error) 67 ImportIncidentState(s *models.IncidentState) error 68 69 // SetIncidentNext gets the incident for previousIncidentId, and sets its NextId field to be nextIncidentId and then saves the incident 70 SetIncidentNext(incidentId, nextIncidentId int64) error 71 72 SetRenderedTemplates(incidentId int64, rt *models.RenderedTemplates) error 73 GetRenderedTemplates(incidentId int64) (*models.RenderedTemplates, error) 74 GetRenderedTemplateKeys() ([]string, error) 75 CleanupOldRenderedTemplates(olderThan time.Duration) 76 DeleteRenderedTemplates(incidentIds []int64) error 77 78 Forget(ak models.AlertKey) error 79 SetUnevaluated(ak models.AlertKey, uneval bool) error 80 GetUnknownAndUnevalAlertKeys(alert string) ([]models.AlertKey, []models.AlertKey, error) 81 } 82 83 func (d *dataAccess) SetRenderedTemplates(incidentId int64, rt *models.RenderedTemplates) error { 84 conn := d.Get() 85 defer conn.Close() 86 87 data, err := json.Marshal(rt) 88 if err != nil { 89 return slog.Wrap(err) 90 } 91 _, err = conn.Do("SET", renderedTemplatesKey(incidentId), data) 92 if err != nil { 93 return slog.Wrap(err) 94 } 95 return nil 96 } 97 98 func (d *dataAccess) GetRenderedTemplates(incidentId int64) (*models.RenderedTemplates, error) { 99 conn := d.Get() 100 defer conn.Close() 101 102 b, err := redis.Bytes(conn.Do("GET", renderedTemplatesKey(incidentId))) 103 renderedT := &models.RenderedTemplates{} 104 if err != nil { 105 if err == redis.ErrNil { 106 return renderedT, nil 107 } 108 return nil, slog.Wrap(err) 109 } 110 if err = json.Unmarshal(b, renderedT); err != nil { 111 return nil, slog.Wrap(err) 112 } 113 return renderedT, nil 114 } 115 116 func (d *dataAccess) scanMatchCmd(pattern string) (string, []interface{}, int) { 117 //ledis uses XSCAN cursor "KV" MATCH foo 118 //redis uses SCAN cursor MATCH foo 119 if d.isRedis { 120 return "SCAN", []interface{}{"0", "MATCH", pattern}, 0 121 } 122 return "XSCAN", []interface{}{"KV", "0", "MATCH", pattern}, 1 123 } 124 125 func (d *dataAccess) GetRenderedTemplateKeys() ([]string, error) { 126 conn := d.Get() 127 defer conn.Close() 128 129 cmd, args, cursorIdx := d.scanMatchCmd("renderedTemplatesById:*") 130 found := []string{} 131 for { 132 vals, err := redis.Values(conn.Do(cmd, args...)) 133 if err != nil { 134 return nil, slog.Wrap(err) 135 } 136 cursor, err := redis.String(vals[0], nil) 137 if err != nil { 138 return nil, slog.Wrap(err) 139 } 140 args[cursorIdx] = cursor 141 keys, err := redis.Strings(vals[1], nil) 142 if err != nil { 143 return nil, slog.Wrap(err) 144 } 145 found = append(found, keys...) 146 if cursor == "" || cursor == "0" { 147 break 148 } 149 } 150 return found, nil 151 } 152 153 func (d *dataAccess) DeleteRenderedTemplates(incidentIds []int64) error { 154 conn := d.Get() 155 defer conn.Close() 156 const batchSize = 1000 157 args := make([]interface{}, 0, batchSize) 158 for len(incidentIds) > 0 { 159 size := len(incidentIds) 160 if size > batchSize { 161 size = batchSize 162 } 163 thisBatch := incidentIds[:size] 164 incidentIds = incidentIds[size:] 165 args = args[:0] 166 for _, id := range thisBatch { 167 args = append(args, renderedTemplatesKey(id)) 168 } 169 _, err := conn.Do("DEL", args...) 170 if err != nil { 171 return slog.Wrap(err) 172 } 173 } 174 return nil 175 } 176 177 func (d *dataAccess) State() StateDataAccess { 178 return d 179 } 180 181 func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error { 182 conn := d.Get() 183 defer conn.Close() 184 185 _, err := conn.Do("ZADD", statesLastTouchedKey(ak.Name()), t.UTC().Unix(), string(ak)) 186 return slog.Wrap(err) 187 } 188 189 func (d *dataAccess) GetUntouchedSince(alert string, time int64) ([]models.AlertKey, error) { 190 conn := d.Get() 191 defer conn.Close() 192 193 results, err := redis.Strings(conn.Do("ZRANGEBYSCORE", statesLastTouchedKey(alert), "-inf", time)) 194 if err != nil { 195 return nil, slog.Wrap(err) 196 } 197 aks := make([]models.AlertKey, len(results)) 198 for i := range results { 199 aks[i] = models.AlertKey(results[i]) 200 } 201 return aks, nil 202 } 203 204 func (d *dataAccess) GetOpenIncident(ak models.AlertKey) (*models.IncidentState, error) { 205 conn := d.Get() 206 defer conn.Close() 207 208 inc, err := d.getLatestIncident(ak, conn) 209 if err != nil { 210 return nil, slog.Wrap(err) 211 } 212 if inc == nil { 213 return nil, nil 214 } 215 if inc.Open { 216 return inc, nil 217 } 218 return nil, nil 219 } 220 221 func (d *dataAccess) getLatestIncident(ak models.AlertKey, conn redis.Conn) (*models.IncidentState, error) { 222 id, err := redis.Int64(conn.Do("LINDEX", incidentsForAlertKeyKey(ak), 0)) 223 if err != nil { 224 if err == redis.ErrNil { 225 return nil, nil 226 } 227 return nil, slog.Wrap(err) 228 } 229 inc, err := d.getIncident(id, conn) 230 if err != nil { 231 return nil, slog.Wrap(err) 232 } 233 return inc, nil 234 } 235 236 func (d *dataAccess) GetLatestIncident(ak models.AlertKey) (*models.IncidentState, error) { 237 conn := d.Get() 238 defer conn.Close() 239 240 return d.getLatestIncident(ak, conn) 241 } 242 243 func (d *dataAccess) GetAllOpenIncidents() ([]*models.IncidentState, error) { 244 conn := d.Get() 245 defer conn.Close() 246 247 // get open ids 248 ids, err := int64s(conn.Do("HVALS", statesOpenIncidentsKey)) 249 if err != nil { 250 return nil, slog.Wrap(err) 251 } 252 return d.incidentMultiGet(conn, ids) 253 } 254 255 func (d *dataAccess) GetAllIncidentsByAlertKey(ak models.AlertKey) ([]*models.IncidentState, error) { 256 conn := d.Get() 257 defer conn.Close() 258 259 ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1)) 260 if err != nil { 261 return nil, slog.Wrap(err) 262 } 263 return d.incidentMultiGet(conn, ids) 264 } 265 266 func (d *dataAccess) GetAllIncidentIdsByAlertKey(ak models.AlertKey) ([]int64, error) { 267 conn := d.Get() 268 defer conn.Close() 269 270 ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1)) 271 if err != nil { 272 return nil, slog.Wrap(err) 273 } 274 return ids, nil 275 } 276 277 // In general one should not use the redis KEYS command. So this is only used 278 // in migration. If we want to use a proper index of all incidents 279 // then issues with allIncidents must be fixed. Currently it is planned 280 // to remove allIncidents in a future commit 281 func (d *dataAccess) getAllIncidentIdsByKeys() ([]int64, error) { 282 conn := d.Get() 283 defer conn.Close() 284 285 summaries, err := redis.Strings(conn.Do("KEYS", "incidentById:*")) 286 if err != nil { 287 return nil, slog.Wrap(err) 288 } 289 ids := make([]int64, len(summaries)) 290 for i, sum := range summaries { 291 var err error 292 ids[i], err = strconv.ParseInt(strings.Split(sum, ":")[1], 0, 64) 293 if err != nil { 294 return nil, slog.Wrap(err) 295 } 296 } 297 return ids, nil 298 } 299 300 func (d *dataAccess) incidentMultiGet(conn redis.Conn, ids []int64) ([]*models.IncidentState, error) { 301 if len(ids) == 0 { 302 return nil, nil 303 } 304 // get all incident json keys 305 args := make([]interface{}, 0, len(ids)) 306 for _, id := range ids { 307 args = append(args, incidentStateKey(id)) 308 } 309 jsons, err := redis.Strings(conn.Do("MGET", args...)) 310 if err != nil { 311 return nil, slog.Wrap(err) 312 } 313 results := make([]*models.IncidentState, 0, len(jsons)) 314 for _, j := range jsons { 315 state := &models.IncidentState{} 316 if err = json.Unmarshal([]byte(j), state); err != nil { 317 return nil, slog.Wrap(err) 318 } 319 results = append(results, state) 320 } 321 return results, nil 322 } 323 324 func (d *dataAccess) getIncident(incidentId int64, conn redis.Conn) (*models.IncidentState, error) { 325 b, err := redis.Bytes(conn.Do("GET", incidentStateKey(incidentId))) 326 if err != nil { 327 return nil, slog.Wrap(err) 328 } 329 state := &models.IncidentState{} 330 if err = json.Unmarshal(b, state); err != nil { 331 return nil, slog.Wrap(err) 332 } 333 return state, nil 334 } 335 336 // setIncident directly sets the incident as is to the datastore 337 func (d *dataAccess) setIncident(incident *models.IncidentState, conn redis.Conn) error { 338 data, err := json.Marshal(incident) 339 if err != nil { 340 return slog.Wrap(err) 341 } 342 if _, err = conn.Do("SET", incidentStateKey(incident.Id), data); err != nil { 343 return err 344 } 345 return nil 346 } 347 348 func (d *dataAccess) GetIncidentState(incidentId int64) (*models.IncidentState, error) { 349 conn := d.Get() 350 defer conn.Close() 351 return d.getIncident(incidentId, conn) 352 } 353 354 // SetIncidentNext gets the incident for previousIncidentId, and sets its NextId field 355 // to be nextIncidentId and then saves the incident 356 func (d *dataAccess) SetIncidentNext(previousIncidentId, nextIncidentId int64) error { 357 conn := d.Get() 358 defer conn.Close() 359 previousIncident, err := d.getIncident(previousIncidentId, conn) 360 if err != nil { 361 return err 362 } 363 previousIncident.NextId = nextIncidentId 364 err = d.setIncident(previousIncident, conn) 365 if err != nil { 366 return err 367 } 368 return nil 369 } 370 371 func (d *dataAccess) UpdateIncidentState(s *models.IncidentState) (int64, error) { 372 return d.save(s, false) 373 } 374 375 func (d *dataAccess) ImportIncidentState(s *models.IncidentState) error { 376 _, err := d.save(s, true) 377 return err 378 } 379 380 func (d *dataAccess) save(s *models.IncidentState, isImport bool) (int64, error) { 381 conn := d.Get() 382 defer conn.Close() 383 384 isNew := false 385 //if id is still zero, assign new id. 386 if s.Id == 0 { 387 id, err := redis.Int64(conn.Do("INCR", "maxIncidentId")) 388 if err != nil { 389 return s.Id, slog.Wrap(err) 390 } 391 s.Id = id 392 isNew = true 393 } else if isImport { 394 max, err := redis.Int64(conn.Do("GET", "maxIncidentId")) 395 if err != nil { 396 max = 0 397 } 398 if max < s.Id { 399 if _, err = conn.Do("SET", "maxIncidentId", s.Id); err != nil { 400 return s.Id, slog.Wrap(err) 401 } 402 } 403 isNew = true 404 } 405 return s.Id, d.transact(conn, func() error { 406 if isNew { 407 // add to list for alert key 408 if _, err := conn.Do("LPUSH", incidentsForAlertKeyKey(s.AlertKey), s.Id); err != nil { 409 return slog.Wrap(err) 410 } 411 dat := fmt.Sprintf("%d:%d:%s", s.Id, s.Start.UTC().Unix(), s.AlertKey) 412 if _, err := conn.Do("LPUSH", "allIncidents", dat); err != nil { 413 return slog.Wrap(err) 414 } 415 } 416 417 // store the incident json 418 data, err := json.Marshal(s) 419 if err != nil { 420 return slog.Wrap(err) 421 } 422 _, err = conn.Do("SET", incidentStateKey(s.Id), data) 423 424 addRem := func(b bool) string { 425 if b { 426 return "SADD" 427 } 428 return "SREM" 429 } 430 // appropriately add or remove it from the "open" set 431 if s.Open { 432 if _, err = conn.Do("HSET", statesOpenIncidentsKey, s.AlertKey, s.Id); err != nil { 433 return slog.Wrap(err) 434 } 435 } else { 436 if _, err = conn.Do("HDEL", statesOpenIncidentsKey, s.AlertKey); err != nil { 437 return slog.Wrap(err) 438 } 439 } 440 441 //appropriately add or remove from unknown and uneval sets 442 if _, err = conn.Do(addRem(s.CurrentStatus == models.StUnknown), statesUnknownKey(s.Alert), s.AlertKey); err != nil { 443 return slog.Wrap(err) 444 } 445 if _, err = conn.Do(addRem(s.Unevaluated), statesUnevalKey(s.Alert), s.AlertKey); err != nil { 446 return slog.Wrap(err) 447 } 448 return nil 449 }) 450 } 451 452 func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error { 453 conn := d.Get() 454 defer conn.Close() 455 456 op := "SREM" 457 if uneval { 458 op = "SADD" 459 } 460 _, err := conn.Do(op, statesUnevalKey(ak.Name()), ak) 461 return slog.Wrap(err) 462 } 463 464 // The nucular option. Delete all we know about this alert key 465 func (d *dataAccess) Forget(ak models.AlertKey) error { 466 conn := d.Get() 467 defer conn.Close() 468 469 ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1)) 470 if err != nil { 471 return slog.Wrap(err) 472 } 473 alert := ak.Name() 474 return d.transact(conn, func() error { 475 // last touched. 476 if _, err := conn.Do("ZREM", statesLastTouchedKey(alert), ak); err != nil { 477 return slog.Wrap(err) 478 } 479 // unknown/uneval sets 480 if _, err := conn.Do("SREM", statesUnknownKey(alert), ak); err != nil { 481 return slog.Wrap(err) 482 } 483 if _, err := conn.Do("SREM", statesUnevalKey(alert), ak); err != nil { 484 return slog.Wrap(err) 485 } 486 //open set 487 if _, err := conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil { 488 return slog.Wrap(err) 489 } 490 if _, err = conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil { 491 return slog.Wrap(err) 492 } 493 for _, id := range ids { 494 if _, err = conn.Do("DEL", incidentStateKey(id)); err != nil { 495 return slog.Wrap(err) 496 } 497 if _, err = conn.Do("DEL", renderedTemplatesKey(id)); err != nil { 498 return slog.Wrap(err) 499 } 500 } 501 if _, err := conn.Do(d.LCLEAR(), incidentsForAlertKeyKey(ak)); err != nil { 502 return slog.Wrap(err) 503 } 504 return nil 505 }) 506 } 507 508 func (d *dataAccess) GetUnknownAndUnevalAlertKeys(alert string) ([]models.AlertKey, []models.AlertKey, error) { 509 conn := d.Get() 510 defer conn.Close() 511 512 unknownS, err := redis.Strings(conn.Do("SMEMBERS", statesUnknownKey(alert))) 513 if err != nil { 514 return nil, nil, slog.Wrap(err) 515 } 516 unknown := make([]models.AlertKey, len(unknownS)) 517 for i, u := range unknownS { 518 unknown[i] = models.AlertKey(u) 519 } 520 521 unEvals, err := redis.Strings(conn.Do("SMEMBERS", statesUnevalKey(alert))) 522 if err != nil { 523 return nil, nil, slog.Wrap(err) 524 } 525 unevals := make([]models.AlertKey, len(unEvals)) 526 for i, u := range unEvals { 527 unevals[i] = models.AlertKey(u) 528 } 529 530 return unknown, unevals, nil 531 } 532 533 func int64s(reply interface{}, err error) ([]int64, error) { 534 if err != nil { 535 return nil, slog.Wrap(err) 536 } 537 ints := []int64{} 538 values, err := redis.Values(reply, err) 539 if err != nil { 540 return ints, slog.Wrap(err) 541 } 542 if err := redis.ScanSlice(values, &ints); err != nil { 543 return ints, slog.Wrap(err) 544 } 545 return ints, nil 546 } 547 548 func (d *dataAccess) transact(conn redis.Conn, f func() error) error { 549 if !d.isRedis { 550 return f() 551 } 552 if _, err := conn.Do("MULTI"); err != nil { 553 return slog.Wrap(err) 554 } 555 if err := f(); err != nil { 556 return slog.Wrap(err) 557 } 558 if _, err := conn.Do("EXEC"); err != nil { 559 return slog.Wrap(err) 560 } 561 return nil 562 } 563 564 // CleanupCleanupOldRenderedTemplates will in a loop purge any old rendered templates 565 func (d *dataAccess) CleanupOldRenderedTemplates(olderThan time.Duration) { 566 // run after 5 minutes (to let bosun stabilize) 567 // and then every hour 568 time.Sleep(time.Minute * 5) 569 for { 570 conn := d.Get() 571 slog.Infof("Cleaning out old rendered templates") 572 earliestOk := time.Now().UTC().Add(-1 * olderThan) 573 func() { 574 toPurge := []int64{} 575 keys, err := d.GetRenderedTemplateKeys() 576 if err != nil { 577 slog.Error(err) 578 return 579 } 580 for _, key := range keys { 581 parts := strings.Split(key, ":") 582 if len(parts) != 2 { 583 slog.Errorf("Invalid rendered template redis key found: %s", key) 584 continue 585 } 586 id, err := strconv.ParseInt(parts[1], 10, 64) 587 if err != nil { 588 slog.Error(err) 589 continue 590 } 591 state, err := d.getIncident(id, conn) 592 if err != nil { 593 if IsRedisNil(err) { 594 toPurge = append(toPurge, id) 595 continue 596 } 597 slog.Error(err) 598 continue 599 } 600 if state.End != nil && (*state.End).Before(earliestOk) { 601 toPurge = append(toPurge, id) 602 } 603 } 604 if len(toPurge) == 0 { 605 return 606 } 607 slog.Infof("Deleting %d old rendered templates", len(toPurge)) 608 if err = d.DeleteRenderedTemplates(toPurge); err != nil { 609 slog.Error(err) 610 return 611 } 612 }() 613 conn.Close() 614 slog.Info("Done cleaning rendered templates") 615 time.Sleep(time.Hour) 616 } 617 } 618 619 func IsRedisNil(err error) bool { 620 if err != nil && strings.Contains(err.Error(), "nil returned") { 621 return true 622 } 623 return false 624 }