github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/state/logs.go (about) 1 // Copyright 2015 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 // Low-level functionality for interacting with the logs collection 5 // and tailing logs from the replication oplog. 6 7 package state 8 9 import ( 10 "regexp" 11 "strings" 12 "time" 13 14 "github.com/dustin/go-humanize" 15 "github.com/juju/errors" 16 "github.com/juju/loggo" 17 "github.com/juju/names" 18 "github.com/juju/utils/deque" 19 "github.com/juju/utils/set" 20 "gopkg.in/mgo.v2" 21 "gopkg.in/mgo.v2/bson" 22 "launchpad.net/tomb" 23 24 "github.com/juju/juju/mongo" 25 ) 26 27 const logsDB = "logs" 28 const logsC = "logs" 29 30 // LoggingState describes the methods on State required for logging to 31 // the database. 32 type LoggingState interface { 33 EnvironUUID() string 34 MongoSession() *mgo.Session 35 } 36 37 // InitDbLogs sets up the indexes for the logs collection. It should 38 // be called as state is opened. It is idempotent. 39 func InitDbLogs(session *mgo.Session) error { 40 logsColl := session.DB(logsDB).C(logsC) 41 for _, key := range [][]string{{"e", "t"}, {"e", "n"}} { 42 err := logsColl.EnsureIndex(mgo.Index{Key: key}) 43 if err != nil { 44 return errors.Annotate(err, "cannot create index for logs collection") 45 } 46 } 47 return nil 48 } 49 50 // logDoc describes log messages stored in MongoDB. 51 // 52 // Single character field names are used for serialisation to save 53 // space. These documents will be inserted 1000's of times and each 54 // document includes the field names. 55 type logDoc struct { 56 Id bson.ObjectId `bson:"_id"` 57 Time time.Time `bson:"t"` 58 EnvUUID string `bson:"e"` 59 Entity string `bson:"n"` // e.g. "machine-0" 60 Module string `bson:"m"` // e.g. "juju.worker.firewaller" 61 Location string `bson:"l"` // "filename:lineno" 62 Level loggo.Level `bson:"v"` 63 Message string `bson:"x"` 64 } 65 66 type DbLogger struct { 67 logsColl *mgo.Collection 68 envUUID string 69 entity string 70 } 71 72 // NewDbLogger returns a DbLogger instance which is used to write logs 73 // to the database. 74 func NewDbLogger(st LoggingState, entity names.Tag) *DbLogger { 75 _, logsColl := initLogsSession(st) 76 return &DbLogger{ 77 logsColl: logsColl, 78 envUUID: st.EnvironUUID(), 79 entity: entity.String(), 80 } 81 } 82 83 // Log writes a log message to the database. 84 func (logger *DbLogger) Log(t time.Time, module string, location string, level loggo.Level, msg string) error { 85 return logger.logsColl.Insert(&logDoc{ 86 Id: bson.NewObjectId(), 87 Time: t, 88 EnvUUID: logger.envUUID, 89 Entity: logger.entity, 90 Module: module, 91 Location: location, 92 Level: level, 93 Message: msg, 94 }) 95 } 96 97 // Close cleans up resources used by the DbLogger instance. 98 func (logger *DbLogger) Close() { 99 if logger.logsColl != nil { 100 logger.logsColl.Database.Session.Close() 101 } 102 } 103 104 // LogTailer allows for retrieval of Juju's logs from MongoDB. It 105 // first returns any matching already recorded logs and then waits for 106 // additional matching logs as they appear. 107 type LogTailer interface { 108 // Logs returns the channel through which the LogTailer returns 109 // Juju logs. It will be closed when the tailer stops. 110 Logs() <-chan *LogRecord 111 112 // Dying returns a channel which will be closed as the LogTailer 113 // stops. 114 Dying() <-chan struct{} 115 116 // Stop is used to request that the LogTailer stops. It blocks 117 // unil the LogTailer has stopped. 118 Stop() error 119 120 // Err returns the error that caused the LogTailer to stopped. If 121 // it hasn't stopped or stopped without error nil will be 122 // returned. 123 Err() error 124 } 125 126 // LogRecord defines a single Juju log message as returned by 127 // LogTailer. 128 type LogRecord struct { 129 Time time.Time 130 Entity string 131 Module string 132 Location string 133 Level loggo.Level 134 Message string 135 } 136 137 // LogTailerParams specifies the filtering a LogTailer should apply to 138 // logs in order to decide which to return. 139 type LogTailerParams struct { 140 StartTime time.Time 141 MinLevel loggo.Level 142 InitialLines int 143 IncludeEntity []string 144 ExcludeEntity []string 145 IncludeModule []string 146 ExcludeModule []string 147 Oplog *mgo.Collection // For testing only 148 } 149 150 // oplogOverlap is used to decide on the initial oplog timestamp to 151 // use when the LogTailer transitions from querying the logs 152 // collection to tailing the oplog. Oplog records with a timestamp >= 153 // tolastTsFromLogsCollection - oplogOverlap will be considered. This 154 // is to allow for delayed log writes, clock skew between the Juju 155 // cluster hosts and log writes that occur during the transition 156 // period. 157 const oplogOverlap = time.Minute 158 159 // This is the maximum number of log document ids that will be tracked 160 // to avoid re-reporting logs when transitioning between querying the 161 // logs collection and tailing the oplog. 162 // 163 // The value was calculated by looking at the per-minute peak log 164 // output of large broken environments with logging at DEBUG. 165 var maxRecentLogIds = int(oplogOverlap.Minutes() * 150000) 166 167 // NewLogTailer returns a LogTailer which filters according to the 168 // parameters given. 169 func NewLogTailer(st LoggingState, params *LogTailerParams) LogTailer { 170 session := st.MongoSession().Copy() 171 t := &logTailer{ 172 envUUID: st.EnvironUUID(), 173 session: session, 174 logsColl: session.DB(logsDB).C(logsC).With(session), 175 params: params, 176 logCh: make(chan *LogRecord), 177 recentIds: newRecentIdTracker(maxRecentLogIds), 178 } 179 go func() { 180 err := t.loop() 181 t.tomb.Kill(errors.Cause(err)) 182 close(t.logCh) 183 session.Close() 184 t.tomb.Done() 185 }() 186 return t 187 } 188 189 type logTailer struct { 190 tomb tomb.Tomb 191 envUUID string 192 session *mgo.Session 193 logsColl *mgo.Collection 194 params *LogTailerParams 195 logCh chan *LogRecord 196 lastTime time.Time 197 recentIds *recentIdTracker 198 } 199 200 // Logs implements the LogTailer interface. 201 func (t *logTailer) Logs() <-chan *LogRecord { 202 return t.logCh 203 } 204 205 // Dying implements the LogTailer interface. 206 func (t *logTailer) Dying() <-chan struct{} { 207 return t.tomb.Dying() 208 } 209 210 // Stop implements the LogTailer interface. 211 func (t *logTailer) Stop() error { 212 t.tomb.Kill(nil) 213 return t.tomb.Wait() 214 } 215 216 // Err implements the LogTailer interface. 217 func (t *logTailer) Err() error { 218 return t.tomb.Err() 219 } 220 221 func (t *logTailer) loop() error { 222 err := t.processCollection() 223 if err != nil { 224 return errors.Trace(err) 225 } 226 227 err = t.tailOplog() 228 return errors.Trace(err) 229 } 230 231 func (t *logTailer) processCollection() error { 232 // Create a selector from the params. 233 sel := t.paramsToSelector(t.params, "") 234 query := t.logsColl.Find(sel) 235 236 if t.params.InitialLines > 0 { 237 // This is a little racy but it's good enough. 238 count, err := query.Count() 239 if err != nil { 240 return errors.Annotate(err, "query count failed") 241 } 242 if skipOver := count - t.params.InitialLines; skipOver > 0 { 243 query = query.Skip(skipOver) 244 } 245 } 246 247 iter := query.Sort("t", "_id").Iter() 248 doc := new(logDoc) 249 for iter.Next(doc) { 250 select { 251 case <-t.tomb.Dying(): 252 return errors.Trace(tomb.ErrDying) 253 case t.logCh <- logDocToRecord(doc): 254 t.lastTime = doc.Time 255 t.recentIds.Add(doc.Id) 256 } 257 } 258 return errors.Trace(iter.Close()) 259 } 260 261 func (t *logTailer) tailOplog() error { 262 recentIds := t.recentIds.AsSet() 263 264 newParams := t.params 265 newParams.StartTime = t.lastTime 266 oplogSel := append(t.paramsToSelector(newParams, "o."), 267 bson.DocElem{"ns", logsDB + "." + logsC}, 268 ) 269 270 oplog := t.params.Oplog 271 if oplog == nil { 272 oplog = mongo.GetOplog(t.session) 273 } 274 275 minOplogTs := t.lastTime.Add(-oplogOverlap) 276 oplogTailer := mongo.NewOplogTailer(oplog, oplogSel, minOplogTs) 277 defer oplogTailer.Stop() 278 279 logger.Tracef("LogTailer starting oplog tailing: recent id count=%d, lastTime=%s, minOplogTs=%s", 280 recentIds.Length(), t.lastTime, minOplogTs) 281 282 skipCount := 0 283 for { 284 select { 285 case <-t.tomb.Dying(): 286 return errors.Trace(tomb.ErrDying) 287 case oplogDoc, ok := <-oplogTailer.Out(): 288 if !ok { 289 return errors.Annotate(oplogTailer.Err(), "oplog tailer died") 290 } 291 292 doc := new(logDoc) 293 err := oplogDoc.UnmarshalObject(doc) 294 if err != nil { 295 return errors.Annotate(err, "oplog unmarshalling failed") 296 } 297 298 if recentIds.Contains(doc.Id) { 299 // This document has already been reported. 300 skipCount++ 301 if skipCount%1000 == 0 { 302 logger.Tracef("LogTailer duplicates skipped: %d", skipCount) 303 } 304 continue 305 } 306 307 select { 308 case <-t.tomb.Dying(): 309 return errors.Trace(tomb.ErrDying) 310 case t.logCh <- logDocToRecord(doc): 311 } 312 } 313 } 314 } 315 316 func (t *logTailer) paramsToSelector(params *LogTailerParams, prefix string) bson.D { 317 sel := bson.D{ 318 {"e", t.envUUID}, 319 {"t", bson.M{"$gte": params.StartTime}}, 320 } 321 if params.MinLevel > loggo.UNSPECIFIED { 322 sel = append(sel, bson.DocElem{"v", bson.M{"$gte": params.MinLevel}}) 323 } 324 if len(params.IncludeEntity) > 0 { 325 sel = append(sel, 326 bson.DocElem{"n", bson.RegEx{Pattern: makeEntityPattern(params.IncludeEntity)}}) 327 } 328 if len(params.ExcludeEntity) > 0 { 329 sel = append(sel, 330 bson.DocElem{"n", bson.M{"$not": bson.RegEx{Pattern: makeEntityPattern(params.ExcludeEntity)}}}) 331 } 332 if len(params.IncludeModule) > 0 { 333 sel = append(sel, 334 bson.DocElem{"m", bson.RegEx{Pattern: makeModulePattern(params.IncludeModule)}}) 335 } 336 if len(params.ExcludeModule) > 0 { 337 sel = append(sel, 338 bson.DocElem{"m", bson.M{"$not": bson.RegEx{Pattern: makeModulePattern(params.ExcludeModule)}}}) 339 } 340 341 if prefix != "" { 342 for i, elem := range sel { 343 sel[i].Name = prefix + elem.Name 344 } 345 } 346 return sel 347 } 348 349 func makeEntityPattern(entities []string) string { 350 var patterns []string 351 for _, entity := range entities { 352 // Convert * wildcard to the regex equivalent. This is safe 353 // because * never appears in entity names. 354 patterns = append(patterns, strings.Replace(entity, "*", ".*", -1)) 355 } 356 return `^(` + strings.Join(patterns, "|") + `)$` 357 } 358 359 func makeModulePattern(modules []string) string { 360 var patterns []string 361 for _, module := range modules { 362 patterns = append(patterns, regexp.QuoteMeta(module)) 363 } 364 return `^(` + strings.Join(patterns, "|") + `)(\..+)?$` 365 } 366 367 func newRecentIdTracker(maxLen int) *recentIdTracker { 368 return &recentIdTracker{ 369 ids: deque.NewWithMaxLen(maxLen), 370 } 371 } 372 373 type recentIdTracker struct { 374 ids *deque.Deque 375 } 376 377 func (t *recentIdTracker) Add(id bson.ObjectId) { 378 t.ids.PushBack(id) 379 } 380 381 func (t *recentIdTracker) AsSet() *objectIdSet { 382 out := newObjectIdSet() 383 for { 384 id, ok := t.ids.PopFront() 385 if !ok { 386 break 387 } 388 out.Add(id.(bson.ObjectId)) 389 } 390 return out 391 } 392 393 func newObjectIdSet() *objectIdSet { 394 return &objectIdSet{ 395 ids: set.NewStrings(), 396 } 397 } 398 399 type objectIdSet struct { 400 ids set.Strings 401 } 402 403 func (s *objectIdSet) Add(id bson.ObjectId) { 404 s.ids.Add(string(id)) 405 } 406 407 func (s *objectIdSet) Contains(id bson.ObjectId) bool { 408 return s.ids.Contains(string(id)) 409 } 410 411 func (s *objectIdSet) Length() int { 412 return len(s.ids) 413 } 414 415 func logDocToRecord(doc *logDoc) *LogRecord { 416 return &LogRecord{ 417 Time: doc.Time, 418 Entity: doc.Entity, 419 Module: doc.Module, 420 Location: doc.Location, 421 Level: doc.Level, 422 Message: doc.Message, 423 } 424 } 425 426 // PruneLogs removes old log documents in order to control the size of 427 // logs collection. All logs older than minLogTime are 428 // removed. Further removal is also performed if the logs collection 429 // size is greater than maxLogsMB. 430 func PruneLogs(st LoggingState, minLogTime time.Time, maxLogsMB int) error { 431 session, logsColl := initLogsSession(st) 432 defer session.Close() 433 434 envUUIDs, err := getEnvsInLogs(logsColl) 435 if err != nil { 436 return errors.Annotate(err, "failed to get log counts") 437 } 438 439 pruneCounts := make(map[string]int) 440 441 // Remove old log entries (per environment UUID to take advantage 442 // of indexes on the logs collection). 443 for _, envUUID := range envUUIDs { 444 removeInfo, err := logsColl.RemoveAll(bson.M{ 445 "e": envUUID, 446 "t": bson.M{"$lt": minLogTime}, 447 }) 448 if err != nil { 449 return errors.Annotate(err, "failed to prune logs by time") 450 } 451 pruneCounts[envUUID] = removeInfo.Removed 452 } 453 454 // Do further pruning if the logs collection is over the maximum size. 455 for { 456 collMB, err := getCollectionMB(logsColl) 457 if err != nil { 458 return errors.Annotate(err, "failed to retrieve log counts") 459 } 460 if collMB <= maxLogsMB { 461 break 462 } 463 464 envUUID, count, err := findEnvWithMostLogs(logsColl, envUUIDs) 465 if err != nil { 466 return errors.Annotate(err, "log count query failed") 467 } 468 if count < 5000 { 469 break // Pruning is not worthwhile 470 } 471 472 // Remove the oldest 1% of log records for the environment. 473 toRemove := int(float64(count) * 0.01) 474 475 // Find the threshold timestammp to start removing from. 476 // NOTE: this assumes that there are no more logs being added 477 // for the time range being pruned (which should be true for 478 // any realistic minimum log collection size). 479 tsQuery := logsColl.Find(bson.M{"e": envUUID}).Sort("t") 480 tsQuery = tsQuery.Skip(toRemove) 481 tsQuery = tsQuery.Select(bson.M{"t": 1}) 482 var doc bson.M 483 err = tsQuery.One(&doc) 484 if err != nil { 485 return errors.Annotate(err, "log pruning timestamp query failed") 486 } 487 thresholdTs := doc["t"].(time.Time) 488 489 // Remove old records. 490 removeInfo, err := logsColl.RemoveAll(bson.M{ 491 "e": envUUID, 492 "t": bson.M{"$lt": thresholdTs}, 493 }) 494 if err != nil { 495 return errors.Annotate(err, "log pruning failed") 496 } 497 pruneCounts[envUUID] += removeInfo.Removed 498 } 499 500 for envUUID, count := range pruneCounts { 501 if count > 0 { 502 logger.Debugf("pruned %d logs for environment %s", count, envUUID) 503 } 504 } 505 return nil 506 } 507 508 // initLogsSession creates a new session suitable for logging updates, 509 // returning the session and a logs mgo.Collection connected to that 510 // session. 511 func initLogsSession(st LoggingState) (*mgo.Session, *mgo.Collection) { 512 // To improve throughput, only wait for the logs to be written to 513 // the primary. For some reason, this makes a huge difference even 514 // when the replicaset only has one member (i.e. a single primary). 515 session := st.MongoSession().Copy() 516 session.SetSafe(&mgo.Safe{ 517 W: 1, 518 }) 519 db := session.DB(logsDB) 520 return session, db.C(logsC).With(session) 521 } 522 523 // getCollectionMB returns the size of a MongoDB collection (in 524 // bytes), excluding space used by indexes. 525 func getCollectionMB(coll *mgo.Collection) (int, error) { 526 var result bson.M 527 err := coll.Database.Run(bson.D{ 528 {"collStats", coll.Name}, 529 {"scale", humanize.MiByte}, 530 }, &result) 531 if err != nil { 532 return 0, errors.Trace(err) 533 } 534 return result["size"].(int), nil 535 } 536 537 // getEnvsInLogs returns the unique envrionment UUIDs that exist in 538 // the logs collection. This uses the one of the indexes on the 539 // collection and should be fast. 540 func getEnvsInLogs(coll *mgo.Collection) ([]string, error) { 541 var envUUIDs []string 542 err := coll.Find(nil).Distinct("e", &envUUIDs) 543 if err != nil { 544 return nil, errors.Trace(err) 545 } 546 return envUUIDs, nil 547 } 548 549 // findEnvWithMostLogs returns the envUUID and log count for the 550 // environment with the most logs in the logs collection. 551 func findEnvWithMostLogs(logsColl *mgo.Collection, envUUIDs []string) (string, int, error) { 552 var maxEnvUUID string 553 var maxCount int 554 for _, envUUID := range envUUIDs { 555 count, err := getLogCountForEnv(logsColl, envUUID) 556 if err != nil { 557 return "", -1, errors.Trace(err) 558 } 559 if count > maxCount { 560 maxEnvUUID = envUUID 561 maxCount = count 562 } 563 } 564 return maxEnvUUID, maxCount, nil 565 } 566 567 // getLogCountForEnv returns the number of log records stored for a 568 // given environment. 569 func getLogCountForEnv(coll *mgo.Collection, envUUID string) (int, error) { 570 count, err := coll.Find(bson.M{"e": envUUID}).Count() 571 if err != nil { 572 return -1, errors.Annotate(err, "failed to get log count") 573 } 574 return count, nil 575 }