github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/state/logs.go

github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/state/logs.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  // Low-level functionality for interacting with the logs collection
     5  // and tailing logs from the replication oplog.
     6  
     7  package state
     8  
     9  import (
    10  	"regexp"
    11  	"strings"
    12  	"time"
    13  
    14  	"github.com/dustin/go-humanize"
    15  	"github.com/juju/errors"
    16  	"github.com/juju/loggo"
    17  	"github.com/juju/names"
    18  	"github.com/juju/utils/deque"
    19  	"github.com/juju/utils/set"
    20  	"gopkg.in/mgo.v2"
    21  	"gopkg.in/mgo.v2/bson"
    22  	"launchpad.net/tomb"
    23  
    24  	"github.com/juju/juju/mongo"
    25  )
    26  
    27  const logsDB = "logs"
    28  const logsC = "logs"
    29  
    30  // LoggingState describes the methods on State required for logging to
    31  // the database.
    32  type LoggingState interface {
    33  	EnvironUUID() string
    34  	MongoSession() *mgo.Session
    35  }
    36  
    37  // InitDbLogs sets up the indexes for the logs collection. It should
    38  // be called as state is opened. It is idempotent.
    39  func InitDbLogs(session *mgo.Session) error {
    40  	logsColl := session.DB(logsDB).C(logsC)
    41  	for _, key := range [][]string{{"e", "t"}, {"e", "n"}} {
    42  		err := logsColl.EnsureIndex(mgo.Index{Key: key})
    43  		if err != nil {
    44  			return errors.Annotate(err, "cannot create index for logs collection")
    45  		}
    46  	}
    47  	return nil
    48  }
    49  
    50  // logDoc describes log messages stored in MongoDB.
    51  //
    52  // Single character field names are used for serialisation to save
    53  // space. These documents will be inserted 1000's of times and each
    54  // document includes the field names.
    55  type logDoc struct {
    56  	Id       bson.ObjectId `bson:"_id"`
    57  	Time     time.Time     `bson:"t"`
    58  	EnvUUID  string        `bson:"e"`
    59  	Entity   string        `bson:"n"` // e.g. "machine-0"
    60  	Module   string        `bson:"m"` // e.g. "juju.worker.firewaller"
    61  	Location string        `bson:"l"` // "filename:lineno"
    62  	Level    loggo.Level   `bson:"v"`
    63  	Message  string        `bson:"x"`
    64  }
    65  
    66  type DbLogger struct {
    67  	logsColl *mgo.Collection
    68  	envUUID  string
    69  	entity   string
    70  }
    71  
    72  // NewDbLogger returns a DbLogger instance which is used to write logs
    73  // to the database.
    74  func NewDbLogger(st LoggingState, entity names.Tag) *DbLogger {
    75  	_, logsColl := initLogsSession(st)
    76  	return &DbLogger{
    77  		logsColl: logsColl,
    78  		envUUID:  st.EnvironUUID(),
    79  		entity:   entity.String(),
    80  	}
    81  }
    82  
    83  // Log writes a log message to the database.
    84  func (logger *DbLogger) Log(t time.Time, module string, location string, level loggo.Level, msg string) error {
    85  	return logger.logsColl.Insert(&logDoc{
    86  		Id:       bson.NewObjectId(),
    87  		Time:     t,
    88  		EnvUUID:  logger.envUUID,
    89  		Entity:   logger.entity,
    90  		Module:   module,
    91  		Location: location,
    92  		Level:    level,
    93  		Message:  msg,
    94  	})
    95  }
    96  
    97  // Close cleans up resources used by the DbLogger instance.
    98  func (logger *DbLogger) Close() {
    99  	if logger.logsColl != nil {
   100  		logger.logsColl.Database.Session.Close()
   101  	}
   102  }
   103  
   104  // LogTailer allows for retrieval of Juju's logs from MongoDB. It
   105  // first returns any matching already recorded logs and then waits for
   106  // additional matching logs as they appear.
   107  type LogTailer interface {
   108  	// Logs returns the channel through which the LogTailer returns
   109  	// Juju logs. It will be closed when the tailer stops.
   110  	Logs() <-chan *LogRecord
   111  
   112  	// Dying returns a channel which will be closed as the LogTailer
   113  	// stops.
   114  	Dying() <-chan struct{}
   115  
   116  	// Stop is used to request that the LogTailer stops. It blocks
   117  	// unil the LogTailer has stopped.
   118  	Stop() error
   119  
   120  	// Err returns the error that caused the LogTailer to stopped. If
   121  	// it hasn't stopped or stopped without error nil will be
   122  	// returned.
   123  	Err() error
   124  }
   125  
   126  // LogRecord defines a single Juju log message as returned by
   127  // LogTailer.
   128  type LogRecord struct {
   129  	Time     time.Time
   130  	Entity   string
   131  	Module   string
   132  	Location string
   133  	Level    loggo.Level
   134  	Message  string
   135  }
   136  
   137  // LogTailerParams specifies the filtering a LogTailer should apply to
   138  // logs in order to decide which to return.
   139  type LogTailerParams struct {
   140  	StartTime     time.Time
   141  	MinLevel      loggo.Level
   142  	InitialLines  int
   143  	IncludeEntity []string
   144  	ExcludeEntity []string
   145  	IncludeModule []string
   146  	ExcludeModule []string
   147  	Oplog         *mgo.Collection // For testing only
   148  }
   149  
   150  // oplogOverlap is used to decide on the initial oplog timestamp to
   151  // use when the LogTailer transitions from querying the logs
   152  // collection to tailing the oplog. Oplog records with a timestamp >=
   153  // tolastTsFromLogsCollection - oplogOverlap will be considered. This
   154  // is to allow for delayed log writes, clock skew between the Juju
   155  // cluster hosts and log writes that occur during the transition
   156  // period.
   157  const oplogOverlap = time.Minute
   158  
   159  // This is the maximum number of log document ids that will be tracked
   160  // to avoid re-reporting logs when transitioning between querying the
   161  // logs collection and tailing the oplog.
   162  //
   163  // The value was calculated by looking at the per-minute peak log
   164  // output of large broken environments with logging at DEBUG.
   165  var maxRecentLogIds = int(oplogOverlap.Minutes() * 150000)
   166  
   167  // NewLogTailer returns a LogTailer which filters according to the
   168  // parameters given.
   169  func NewLogTailer(st LoggingState, params *LogTailerParams) LogTailer {
   170  	session := st.MongoSession().Copy()
   171  	t := &logTailer{
   172  		envUUID:   st.EnvironUUID(),
   173  		session:   session,
   174  		logsColl:  session.DB(logsDB).C(logsC).With(session),
   175  		params:    params,
   176  		logCh:     make(chan *LogRecord),
   177  		recentIds: newRecentIdTracker(maxRecentLogIds),
   178  	}
   179  	go func() {
   180  		err := t.loop()
   181  		t.tomb.Kill(errors.Cause(err))
   182  		close(t.logCh)
   183  		session.Close()
   184  		t.tomb.Done()
   185  	}()
   186  	return t
   187  }
   188  
   189  type logTailer struct {
   190  	tomb      tomb.Tomb
   191  	envUUID   string
   192  	session   *mgo.Session
   193  	logsColl  *mgo.Collection
   194  	params    *LogTailerParams
   195  	logCh     chan *LogRecord
   196  	lastTime  time.Time
   197  	recentIds *recentIdTracker
   198  }
   199  
   200  // Logs implements the LogTailer interface.
   201  func (t *logTailer) Logs() <-chan *LogRecord {
   202  	return t.logCh
   203  }
   204  
   205  // Dying implements the LogTailer interface.
   206  func (t *logTailer) Dying() <-chan struct{} {
   207  	return t.tomb.Dying()
   208  }
   209  
   210  // Stop implements the LogTailer interface.
   211  func (t *logTailer) Stop() error {
   212  	t.tomb.Kill(nil)
   213  	return t.tomb.Wait()
   214  }
   215  
   216  // Err implements the LogTailer interface.
   217  func (t *logTailer) Err() error {
   218  	return t.tomb.Err()
   219  }
   220  
   221  func (t *logTailer) loop() error {
   222  	err := t.processCollection()
   223  	if err != nil {
   224  		return errors.Trace(err)
   225  	}
   226  
   227  	err = t.tailOplog()
   228  	return errors.Trace(err)
   229  }
   230  
   231  func (t *logTailer) processCollection() error {
   232  	// Create a selector from the params.
   233  	sel := t.paramsToSelector(t.params, "")
   234  	query := t.logsColl.Find(sel)
   235  
   236  	if t.params.InitialLines > 0 {
   237  		// This is a little racy but it's good enough.
   238  		count, err := query.Count()
   239  		if err != nil {
   240  			return errors.Annotate(err, "query count failed")
   241  		}
   242  		if skipOver := count - t.params.InitialLines; skipOver > 0 {
   243  			query = query.Skip(skipOver)
   244  		}
   245  	}
   246  
   247  	iter := query.Sort("t", "_id").Iter()
   248  	doc := new(logDoc)
   249  	for iter.Next(doc) {
   250  		select {
   251  		case <-t.tomb.Dying():
   252  			return errors.Trace(tomb.ErrDying)
   253  		case t.logCh <- logDocToRecord(doc):
   254  			t.lastTime = doc.Time
   255  			t.recentIds.Add(doc.Id)
   256  		}
   257  	}
   258  	return errors.Trace(iter.Close())
   259  }
   260  
   261  func (t *logTailer) tailOplog() error {
   262  	recentIds := t.recentIds.AsSet()
   263  
   264  	newParams := t.params
   265  	newParams.StartTime = t.lastTime
   266  	oplogSel := append(t.paramsToSelector(newParams, "o."),
   267  		bson.DocElem{"ns", logsDB + "." + logsC},
   268  	)
   269  
   270  	oplog := t.params.Oplog
   271  	if oplog == nil {
   272  		oplog = mongo.GetOplog(t.session)
   273  	}
   274  
   275  	minOplogTs := t.lastTime.Add(-oplogOverlap)
   276  	oplogTailer := mongo.NewOplogTailer(oplog, oplogSel, minOplogTs)
   277  	defer oplogTailer.Stop()
   278  
   279  	logger.Tracef("LogTailer starting oplog tailing: recent id count=%d, lastTime=%s, minOplogTs=%s",
   280  		recentIds.Length(), t.lastTime, minOplogTs)
   281  
   282  	skipCount := 0
   283  	for {
   284  		select {
   285  		case <-t.tomb.Dying():
   286  			return errors.Trace(tomb.ErrDying)
   287  		case oplogDoc, ok := <-oplogTailer.Out():
   288  			if !ok {
   289  				return errors.Annotate(oplogTailer.Err(), "oplog tailer died")
   290  			}
   291  
   292  			doc := new(logDoc)
   293  			err := oplogDoc.UnmarshalObject(doc)
   294  			if err != nil {
   295  				return errors.Annotate(err, "oplog unmarshalling failed")
   296  			}
   297  
   298  			if recentIds.Contains(doc.Id) {
   299  				// This document has already been reported.
   300  				skipCount++
   301  				if skipCount%1000 == 0 {
   302  					logger.Tracef("LogTailer duplicates skipped: %d", skipCount)
   303  				}
   304  				continue
   305  			}
   306  
   307  			select {
   308  			case <-t.tomb.Dying():
   309  				return errors.Trace(tomb.ErrDying)
   310  			case t.logCh <- logDocToRecord(doc):
   311  			}
   312  		}
   313  	}
   314  }
   315  
   316  func (t *logTailer) paramsToSelector(params *LogTailerParams, prefix string) bson.D {
   317  	sel := bson.D{
   318  		{"e", t.envUUID},
   319  		{"t", bson.M{"$gte": params.StartTime}},
   320  	}
   321  	if params.MinLevel > loggo.UNSPECIFIED {
   322  		sel = append(sel, bson.DocElem{"v", bson.M{"$gte": params.MinLevel}})
   323  	}
   324  	if len(params.IncludeEntity) > 0 {
   325  		sel = append(sel,
   326  			bson.DocElem{"n", bson.RegEx{Pattern: makeEntityPattern(params.IncludeEntity)}})
   327  	}
   328  	if len(params.ExcludeEntity) > 0 {
   329  		sel = append(sel,
   330  			bson.DocElem{"n", bson.M{"$not": bson.RegEx{Pattern: makeEntityPattern(params.ExcludeEntity)}}})
   331  	}
   332  	if len(params.IncludeModule) > 0 {
   333  		sel = append(sel,
   334  			bson.DocElem{"m", bson.RegEx{Pattern: makeModulePattern(params.IncludeModule)}})
   335  	}
   336  	if len(params.ExcludeModule) > 0 {
   337  		sel = append(sel,
   338  			bson.DocElem{"m", bson.M{"$not": bson.RegEx{Pattern: makeModulePattern(params.ExcludeModule)}}})
   339  	}
   340  
   341  	if prefix != "" {
   342  		for i, elem := range sel {
   343  			sel[i].Name = prefix + elem.Name
   344  		}
   345  	}
   346  	return sel
   347  }
   348  
   349  func makeEntityPattern(entities []string) string {
   350  	var patterns []string
   351  	for _, entity := range entities {
   352  		// Convert * wildcard to the regex equivalent. This is safe
   353  		// because * never appears in entity names.
   354  		patterns = append(patterns, strings.Replace(entity, "*", ".*", -1))
   355  	}
   356  	return `^(` + strings.Join(patterns, "|") + `)$`
   357  }
   358  
   359  func makeModulePattern(modules []string) string {
   360  	var patterns []string
   361  	for _, module := range modules {
   362  		patterns = append(patterns, regexp.QuoteMeta(module))
   363  	}
   364  	return `^(` + strings.Join(patterns, "|") + `)(\..+)?$`
   365  }
   366  
   367  func newRecentIdTracker(maxLen int) *recentIdTracker {
   368  	return &recentIdTracker{
   369  		ids: deque.NewWithMaxLen(maxLen),
   370  	}
   371  }
   372  
   373  type recentIdTracker struct {
   374  	ids *deque.Deque
   375  }
   376  
   377  func (t *recentIdTracker) Add(id bson.ObjectId) {
   378  	t.ids.PushBack(id)
   379  }
   380  
   381  func (t *recentIdTracker) AsSet() *objectIdSet {
   382  	out := newObjectIdSet()
   383  	for {
   384  		id, ok := t.ids.PopFront()
   385  		if !ok {
   386  			break
   387  		}
   388  		out.Add(id.(bson.ObjectId))
   389  	}
   390  	return out
   391  }
   392  
   393  func newObjectIdSet() *objectIdSet {
   394  	return &objectIdSet{
   395  		ids: set.NewStrings(),
   396  	}
   397  }
   398  
   399  type objectIdSet struct {
   400  	ids set.Strings
   401  }
   402  
   403  func (s *objectIdSet) Add(id bson.ObjectId) {
   404  	s.ids.Add(string(id))
   405  }
   406  
   407  func (s *objectIdSet) Contains(id bson.ObjectId) bool {
   408  	return s.ids.Contains(string(id))
   409  }
   410  
   411  func (s *objectIdSet) Length() int {
   412  	return len(s.ids)
   413  }
   414  
   415  func logDocToRecord(doc *logDoc) *LogRecord {
   416  	return &LogRecord{
   417  		Time:     doc.Time,
   418  		Entity:   doc.Entity,
   419  		Module:   doc.Module,
   420  		Location: doc.Location,
   421  		Level:    doc.Level,
   422  		Message:  doc.Message,
   423  	}
   424  }
   425  
   426  // PruneLogs removes old log documents in order to control the size of
   427  // logs collection. All logs older than minLogTime are
   428  // removed. Further removal is also performed if the logs collection
   429  // size is greater than maxLogsMB.
   430  func PruneLogs(st LoggingState, minLogTime time.Time, maxLogsMB int) error {
   431  	session, logsColl := initLogsSession(st)
   432  	defer session.Close()
   433  
   434  	envUUIDs, err := getEnvsInLogs(logsColl)
   435  	if err != nil {
   436  		return errors.Annotate(err, "failed to get log counts")
   437  	}
   438  
   439  	pruneCounts := make(map[string]int)
   440  
   441  	// Remove old log entries (per environment UUID to take advantage
   442  	// of indexes on the logs collection).
   443  	for _, envUUID := range envUUIDs {
   444  		removeInfo, err := logsColl.RemoveAll(bson.M{
   445  			"e": envUUID,
   446  			"t": bson.M{"$lt": minLogTime},
   447  		})
   448  		if err != nil {
   449  			return errors.Annotate(err, "failed to prune logs by time")
   450  		}
   451  		pruneCounts[envUUID] = removeInfo.Removed
   452  	}
   453  
   454  	// Do further pruning if the logs collection is over the maximum size.
   455  	for {
   456  		collMB, err := getCollectionMB(logsColl)
   457  		if err != nil {
   458  			return errors.Annotate(err, "failed to retrieve log counts")
   459  		}
   460  		if collMB <= maxLogsMB {
   461  			break
   462  		}
   463  
   464  		envUUID, count, err := findEnvWithMostLogs(logsColl, envUUIDs)
   465  		if err != nil {
   466  			return errors.Annotate(err, "log count query failed")
   467  		}
   468  		if count < 5000 {
   469  			break // Pruning is not worthwhile
   470  		}
   471  
   472  		// Remove the oldest 1% of log records for the environment.
   473  		toRemove := int(float64(count) * 0.01)
   474  
   475  		// Find the threshold timestammp to start removing from.
   476  		// NOTE: this assumes that there are no more logs being added
   477  		// for the time range being pruned (which should be true for
   478  		// any realistic minimum log collection size).
   479  		tsQuery := logsColl.Find(bson.M{"e": envUUID}).Sort("t")
   480  		tsQuery = tsQuery.Skip(toRemove)
   481  		tsQuery = tsQuery.Select(bson.M{"t": 1})
   482  		var doc bson.M
   483  		err = tsQuery.One(&doc)
   484  		if err != nil {
   485  			return errors.Annotate(err, "log pruning timestamp query failed")
   486  		}
   487  		thresholdTs := doc["t"].(time.Time)
   488  
   489  		// Remove old records.
   490  		removeInfo, err := logsColl.RemoveAll(bson.M{
   491  			"e": envUUID,
   492  			"t": bson.M{"$lt": thresholdTs},
   493  		})
   494  		if err != nil {
   495  			return errors.Annotate(err, "log pruning failed")
   496  		}
   497  		pruneCounts[envUUID] += removeInfo.Removed
   498  	}
   499  
   500  	for envUUID, count := range pruneCounts {
   501  		if count > 0 {
   502  			logger.Debugf("pruned %d logs for environment %s", count, envUUID)
   503  		}
   504  	}
   505  	return nil
   506  }
   507  
   508  // initLogsSession creates a new session suitable for logging updates,
   509  // returning the session and a logs mgo.Collection connected to that
   510  // session.
   511  func initLogsSession(st LoggingState) (*mgo.Session, *mgo.Collection) {
   512  	// To improve throughput, only wait for the logs to be written to
   513  	// the primary. For some reason, this makes a huge difference even
   514  	// when the replicaset only has one member (i.e. a single primary).
   515  	session := st.MongoSession().Copy()
   516  	session.SetSafe(&mgo.Safe{
   517  		W: 1,
   518  	})
   519  	db := session.DB(logsDB)
   520  	return session, db.C(logsC).With(session)
   521  }
   522  
   523  // getCollectionMB returns the size of a MongoDB collection (in
   524  // bytes), excluding space used by indexes.
   525  func getCollectionMB(coll *mgo.Collection) (int, error) {
   526  	var result bson.M
   527  	err := coll.Database.Run(bson.D{
   528  		{"collStats", coll.Name},
   529  		{"scale", humanize.MiByte},
   530  	}, &result)
   531  	if err != nil {
   532  		return 0, errors.Trace(err)
   533  	}
   534  	return result["size"].(int), nil
   535  }
   536  
   537  // getEnvsInLogs returns the unique envrionment UUIDs that exist in
   538  // the logs collection. This uses the one of the indexes on the
   539  // collection and should be fast.
   540  func getEnvsInLogs(coll *mgo.Collection) ([]string, error) {
   541  	var envUUIDs []string
   542  	err := coll.Find(nil).Distinct("e", &envUUIDs)
   543  	if err != nil {
   544  		return nil, errors.Trace(err)
   545  	}
   546  	return envUUIDs, nil
   547  }
   548  
   549  // findEnvWithMostLogs returns the envUUID and log count for the
   550  // environment with the most logs in the logs collection.
   551  func findEnvWithMostLogs(logsColl *mgo.Collection, envUUIDs []string) (string, int, error) {
   552  	var maxEnvUUID string
   553  	var maxCount int
   554  	for _, envUUID := range envUUIDs {
   555  		count, err := getLogCountForEnv(logsColl, envUUID)
   556  		if err != nil {
   557  			return "", -1, errors.Trace(err)
   558  		}
   559  		if count > maxCount {
   560  			maxEnvUUID = envUUID
   561  			maxCount = count
   562  		}
   563  	}
   564  	return maxEnvUUID, maxCount, nil
   565  }
   566  
   567  // getLogCountForEnv returns the number of log records stored for a
   568  // given environment.
   569  func getLogCountForEnv(coll *mgo.Collection, envUUID string) (int, error) {
   570  	count, err := coll.Find(bson.M{"e": envUUID}).Count()
   571  	if err != nil {
   572  		return -1, errors.Annotate(err, "failed to get log count")
   573  	}
   574  	return count, nil
   575  }