github.com/moby/docker@v26.1.3+incompatible/daemon/logger/journald/read.go (about)

     1  //go:build linux && cgo && !static_build && journald
     2  
     3  package journald // import "github.com/docker/docker/daemon/logger/journald"
     4  
     5  import (
     6  	"context"
     7  	"runtime"
     8  	"strconv"
     9  	"sync/atomic"
    10  	"time"
    11  
    12  	"github.com/containerd/log"
    13  	"github.com/coreos/go-systemd/v22/journal"
    14  	"github.com/docker/docker/api/types/backend"
    15  	"github.com/docker/docker/daemon/logger"
    16  	"github.com/docker/docker/daemon/logger/journald/internal/sdjournal"
    17  )
    18  
    19  const (
    20  	closedDrainTimeout = 5 * time.Second
    21  	waitInterval       = 250 * time.Millisecond
    22  )
    23  
    24  // Fields which we know are not user-provided attribute fields.
    25  var wellKnownFields = map[string]bool{
    26  	"MESSAGE":             true,
    27  	"MESSAGE_ID":          true,
    28  	"PRIORITY":            true,
    29  	"CODE_FILE":           true,
    30  	"CODE_LINE":           true,
    31  	"CODE_FUNC":           true,
    32  	"ERRNO":               true,
    33  	"SYSLOG_FACILITY":     true,
    34  	fieldSyslogIdentifier: true,
    35  	"SYSLOG_PID":          true,
    36  	fieldSyslogTimestamp:  true,
    37  	fieldContainerName:    true,
    38  	fieldContainerID:      true,
    39  	fieldContainerIDFull:  true,
    40  	fieldContainerTag:     true,
    41  	fieldImageName:        true,
    42  	fieldPLogID:           true,
    43  	fieldPLogOrdinal:      true,
    44  	fieldPLogLast:         true,
    45  	fieldPartialMessage:   true,
    46  	fieldLogEpoch:         true,
    47  	fieldLogOrdinal:       true,
    48  }
    49  
    50  type reader struct {
    51  	s             *journald
    52  	j             *sdjournal.Journal
    53  	logWatcher    *logger.LogWatcher
    54  	config        logger.ReadConfig
    55  	maxOrdinal    uint64
    56  	ready         chan struct{}
    57  	drainDeadline time.Time
    58  }
    59  
    60  func getMessage(d map[string]string) (line []byte, ok bool) {
    61  	m, ok := d["MESSAGE"]
    62  	if ok {
    63  		line = []byte(m)
    64  		if d[fieldPartialMessage] != "true" {
    65  			line = append(line, "\n"...)
    66  		}
    67  	}
    68  	return line, ok
    69  }
    70  
    71  func getPriority(d map[string]string) (journal.Priority, bool) {
    72  	if pri, ok := d["PRIORITY"]; ok {
    73  		i, err := strconv.Atoi(pri)
    74  		return journal.Priority(i), err == nil
    75  	}
    76  	return -1, false
    77  }
    78  
    79  // getSource recovers the stream name from the entry data by mapping from the
    80  // journal priority field back to the stream that we would have assigned that
    81  // value.
    82  func getSource(d map[string]string) string {
    83  	source := ""
    84  	if priority, ok := getPriority(d); ok {
    85  		if priority == journal.PriErr {
    86  			source = "stderr"
    87  		} else if priority == journal.PriInfo {
    88  			source = "stdout"
    89  		}
    90  	}
    91  	return source
    92  }
    93  
    94  func getAttrs(d map[string]string) []backend.LogAttr {
    95  	var attrs []backend.LogAttr
    96  	for k, v := range d {
    97  		if k[0] != '_' && !wellKnownFields[k] {
    98  			attrs = append(attrs, backend.LogAttr{Key: k, Value: v})
    99  		}
   100  	}
   101  	return attrs
   102  }
   103  
   104  // The SeekXYZ() methods all move the journal read pointer to a "conceptual"
   105  // position which does not correspond to any journal entry. A subsequent call to
   106  // Next(), Previous() or similar is necessary to resolve the read pointer to a
   107  // discrete entry.
   108  // https://github.com/systemd/systemd/pull/5930#issuecomment-300878104
   109  // But that's not all! If there is no discrete entry to resolve the position to,
   110  // the call to Next() or Previous() will just leave the read pointer in a
   111  // conceptual position, or do something even more bizarre.
   112  // https://github.com/systemd/systemd/issues/9934
   113  
   114  // initialSeekHead positions the journal read pointer at the earliest journal
   115  // entry with a timestamp of at least r.config.Since. It returns true if there
   116  // is an entry to read at the read pointer.
   117  func (r *reader) initialSeekHead() (bool, error) {
   118  	var err error
   119  	if r.config.Since.IsZero() {
   120  		err = r.j.SeekHead()
   121  	} else {
   122  		err = r.j.SeekRealtime(r.config.Since)
   123  	}
   124  	if err != nil {
   125  		return false, err
   126  	}
   127  	return r.j.Next()
   128  }
   129  
   130  // initialSeekTail positions the journal read pointer at a journal entry
   131  // relative to the tail of the journal at the time of the call based on the
   132  // specification in r.config. It returns true if there is an entry to read at
   133  // the read pointer. Otherwise the read pointer is set to a conceptual position
   134  // which will be resolved to the desired entry (once written) by advancing
   135  // forward with r.j.Next() or similar.
   136  func (r *reader) initialSeekTail() (bool, error) {
   137  	var err error
   138  	if r.config.Until.IsZero() {
   139  		err = r.j.SeekTail()
   140  	} else {
   141  		err = r.j.SeekRealtime(r.config.Until)
   142  	}
   143  	if err != nil {
   144  		return false, err
   145  	}
   146  
   147  	var ok bool
   148  	if r.config.Tail == 0 {
   149  		ok, err = r.j.Previous()
   150  	} else {
   151  		var n int
   152  		n, err = r.j.PreviousSkip(uint(r.config.Tail))
   153  		ok = n > 0
   154  	}
   155  	if err != nil {
   156  		return ok, err
   157  	}
   158  	if !ok {
   159  		// The (filtered) journal has no entries. The tail is the head: all new
   160  		// entries which get written into the journal from this point forward
   161  		// should be read from the journal. However the read pointer is
   162  		// positioned at a conceptual position which is not condusive to reading
   163  		// those entries. The tail of the journal is resolved to the last entry
   164  		// in the journal _at the time of the first successful Previous() call_,
   165  		// which means that an arbitrary number of journal entries added in the
   166  		// interim may be skipped: race condition. While the realtime conceptual
   167  		// position is not so racy, it is also unhelpful: it is the timestamp
   168  		// past where reading should stop, so all logs that should be followed
   169  		// would be skipped over.
   170  		// Reset the read pointer position to avoid these problems.
   171  		return r.initialSeekHead()
   172  	} else if r.config.Tail == 0 {
   173  		// The journal read pointer is positioned at the discrete position of
   174  		// the journal entry _before_ the entry to send.
   175  		return r.j.Next()
   176  	}
   177  
   178  	// Check if the PreviousSkip went too far back.
   179  	timestamp, err := r.j.Realtime()
   180  	if err != nil {
   181  		return false, err
   182  	}
   183  	if timestamp.Before(r.config.Since) {
   184  		if err := r.j.SeekRealtime(r.config.Since); err != nil {
   185  			return false, err
   186  		}
   187  		return r.j.Next()
   188  	}
   189  	return true, nil
   190  }
   191  
   192  // wait blocks until the journal has new data to read, the reader's drain
   193  // deadline is exceeded, or the log reading consumer is gone.
   194  func (r *reader) wait() (bool, error) {
   195  	for {
   196  		dur := waitInterval
   197  		if !r.drainDeadline.IsZero() {
   198  			dur = time.Until(r.drainDeadline)
   199  			if dur < 0 {
   200  				// Container is gone but we haven't found the end of the
   201  				// logs before the deadline. Maybe it was dropped by
   202  				// journald, e.g. due to rate-limiting.
   203  				return false, nil
   204  			} else if dur > waitInterval {
   205  				dur = waitInterval
   206  			}
   207  		}
   208  		status, err := r.j.Wait(dur)
   209  		if err != nil {
   210  			return false, err
   211  		} else if status != sdjournal.StatusNOP {
   212  			return true, nil
   213  		}
   214  		select {
   215  		case <-r.logWatcher.WatchConsumerGone():
   216  			return false, nil
   217  		case <-r.s.closed:
   218  			// Container is gone; don't wait indefinitely for journal entries that will never arrive.
   219  			if r.maxOrdinal >= atomic.LoadUint64(&r.s.ordinal) {
   220  				return false, nil
   221  			}
   222  			if r.drainDeadline.IsZero() {
   223  				r.drainDeadline = time.Now().Add(closedDrainTimeout)
   224  			}
   225  		default:
   226  		}
   227  	}
   228  }
   229  
   230  // nextWait blocks until there is a new journal entry to read, and advances the
   231  // journal read pointer to it.
   232  func (r *reader) nextWait() (bool, error) {
   233  	for {
   234  		if ok, err := r.j.Next(); err != nil || ok {
   235  			return ok, err
   236  		}
   237  		if ok, err := r.wait(); err != nil || !ok {
   238  			return false, err
   239  		}
   240  	}
   241  }
   242  
   243  // drainJournal reads and sends log messages from the journal, starting from the
   244  // current read pointer, until the end of the journal or a terminal stopping
   245  // condition is reached.
   246  //
   247  // It returns false when a terminal stopping condition has been reached:
   248  //   - the watch consumer is gone, or
   249  //   - (if until is nonzero) a log entry is read which has a timestamp after
   250  //     until
   251  func (r *reader) drainJournal() (bool, error) {
   252  	for i := 0; ; i++ {
   253  		// Read the entry's timestamp.
   254  		timestamp, err := r.j.Realtime()
   255  		if err != nil {
   256  			return true, err
   257  		}
   258  		if !r.config.Until.IsZero() && r.config.Until.Before(timestamp) {
   259  			return false, nil
   260  		}
   261  
   262  		// Read and send the logged message, if there is one to read.
   263  		data, err := r.j.Data()
   264  		if err != nil {
   265  			return true, err
   266  		}
   267  
   268  		if data[fieldLogEpoch] == r.s.epoch {
   269  			seq, err := strconv.ParseUint(data[fieldLogOrdinal], 10, 64)
   270  			if err == nil && seq > r.maxOrdinal {
   271  				r.maxOrdinal = seq
   272  			}
   273  		}
   274  
   275  		if line, ok := getMessage(data); ok {
   276  			// Send the log message, unless the consumer is gone
   277  			msg := &logger.Message{
   278  				Line:      line,
   279  				Source:    getSource(data),
   280  				Timestamp: timestamp.In(time.UTC),
   281  				Attrs:     getAttrs(data),
   282  			}
   283  			// The daemon timestamp will differ from the "trusted"
   284  			// timestamp of when the event was received by journald.
   285  			// We can efficiently seek around the journal by the
   286  			// event timestamp, and the event timestamp is what
   287  			// journalctl displays. The daemon timestamp is just an
   288  			// application-supplied field with no special
   289  			// significance; libsystemd won't help us seek to the
   290  			// entry with the closest timestamp.
   291  			/*
   292  				if sts := data["SYSLOG_TIMESTAMP"]; sts != "" {
   293  					if tv, err := time.Parse(time.RFC3339Nano, sts); err == nil {
   294  						msg.Timestamp = tv
   295  					}
   296  				}
   297  			*/
   298  			select {
   299  			case <-r.logWatcher.WatchConsumerGone():
   300  				return false, nil
   301  			case r.logWatcher.Msg <- msg:
   302  			}
   303  		}
   304  
   305  		// Call sd_journal_process() periodically during the processing loop
   306  		// to close any opened file descriptors for rotated (deleted) journal files.
   307  		if i != 0 && i%1024 == 0 {
   308  			if _, err := r.j.Process(); err != nil {
   309  				// log a warning but ignore it for now
   310  				log.G(context.TODO()).WithField("container", r.s.vars[fieldContainerIDFull]).
   311  					WithField("error", err).
   312  					Warn("journald: error processing journal")
   313  			}
   314  		}
   315  
   316  		if ok, err := r.j.Next(); err != nil || !ok {
   317  			return true, err
   318  		}
   319  	}
   320  }
   321  
   322  func (r *reader) readJournal() error {
   323  	caughtUp := atomic.LoadUint64(&r.s.ordinal)
   324  	if more, err := r.drainJournal(); err != nil || !more {
   325  		return err
   326  	}
   327  
   328  	if !r.config.Follow {
   329  		if r.s.readSyncTimeout == 0 {
   330  			return nil
   331  		}
   332  		r.drainDeadline = time.Now().Add(r.s.readSyncTimeout)
   333  	}
   334  
   335  	for {
   336  		select {
   337  		case <-r.s.closed:
   338  			// container is gone, drain journal
   339  			lastSeq := atomic.LoadUint64(&r.s.ordinal)
   340  			if r.maxOrdinal >= lastSeq {
   341  				// All caught up with the logger!
   342  				return nil
   343  			}
   344  		default:
   345  		}
   346  
   347  		if more, err := r.nextWait(); err != nil || !more {
   348  			return err
   349  		}
   350  		if more, err := r.drainJournal(); err != nil || !more {
   351  			return err
   352  		}
   353  		if !r.config.Follow && r.s.readSyncTimeout > 0 && r.maxOrdinal >= caughtUp {
   354  			return nil
   355  		}
   356  	}
   357  }
   358  
   359  func (r *reader) readLogs() {
   360  	defer close(r.logWatcher.Msg)
   361  
   362  	// Make sure the ready channel is closed in the event of an early
   363  	// return.
   364  	defer r.signalReady()
   365  
   366  	// Quoting https://www.freedesktop.org/software/systemd/man/sd-journal.html:
   367  	//     Functions that operate on sd_journal objects are thread
   368  	//     agnostic — given sd_journal pointer may only be used from one
   369  	//     specific thread at all times (and it has to be the very same one
   370  	//     during the entire lifetime of the object), but multiple,
   371  	//     independent threads may use multiple, independent objects safely.
   372  	//
   373  	// sdjournal.Open returns a value which wraps an sd_journal pointer so
   374  	// we need to abide by those rules.
   375  	runtime.LockOSThread()
   376  	defer runtime.UnlockOSThread()
   377  
   378  	// Get a handle to the journal.
   379  	var err error
   380  	if r.s.journalReadDir != "" {
   381  		r.j, err = sdjournal.OpenDir(r.s.journalReadDir, 0)
   382  	} else {
   383  		r.j, err = sdjournal.Open(0)
   384  	}
   385  	if err != nil {
   386  		r.logWatcher.Err <- err
   387  		return
   388  	}
   389  	defer r.j.Close()
   390  
   391  	if r.config.Follow {
   392  		// Initialize library inotify watches early
   393  		if err := r.j.InitializeInotify(); err != nil {
   394  			r.logWatcher.Err <- err
   395  			return
   396  		}
   397  	}
   398  
   399  	// Remove limits on the size of data items that we'll retrieve.
   400  	if err := r.j.SetDataThreshold(0); err != nil {
   401  		r.logWatcher.Err <- err
   402  		return
   403  	}
   404  	// Add a match to have the library do the searching for us.
   405  	if err := r.j.AddMatch(fieldContainerIDFull, r.s.vars[fieldContainerIDFull]); err != nil {
   406  		r.logWatcher.Err <- err
   407  		return
   408  	}
   409  
   410  	var ok bool
   411  	if r.config.Tail >= 0 {
   412  		ok, err = r.initialSeekTail()
   413  	} else {
   414  		ok, err = r.initialSeekHead()
   415  	}
   416  	if err != nil {
   417  		r.logWatcher.Err <- err
   418  		return
   419  	}
   420  	r.signalReady()
   421  	if !ok {
   422  		if !r.config.Follow {
   423  			return
   424  		}
   425  		// Either the read pointer is positioned at a discrete journal entry, in
   426  		// which case the position will be unaffected by subsequent logging, or
   427  		// the read pointer is in the conceptual position corresponding to the
   428  		// first journal entry to send once it is logged in the future.
   429  		if more, err := r.nextWait(); err != nil || !more {
   430  			if err != nil {
   431  				r.logWatcher.Err <- err
   432  			}
   433  			return
   434  		}
   435  	}
   436  
   437  	if err := r.readJournal(); err != nil {
   438  		r.logWatcher.Err <- err
   439  		return
   440  	}
   441  }
   442  
   443  func (r *reader) signalReady() {
   444  	select {
   445  	case <-r.ready:
   446  	default:
   447  		close(r.ready)
   448  	}
   449  }
   450  
   451  func (s *journald) ReadLogs(config logger.ReadConfig) *logger.LogWatcher {
   452  	r := &reader{
   453  		s:          s,
   454  		logWatcher: logger.NewLogWatcher(),
   455  		config:     config,
   456  		ready:      make(chan struct{}),
   457  	}
   458  	go r.readLogs()
   459  	// Block until the reader is in position to read from the current config
   460  	// location to prevent race conditions in tests.
   461  	<-r.ready
   462  	return r.logWatcher
   463  }
   464  
   465  func waitUntilFlushedImpl(s *journald) error {
   466  	if s.readSyncTimeout == 0 {
   467  		return nil
   468  	}
   469  
   470  	ordinal := atomic.LoadUint64(&s.ordinal)
   471  	if ordinal == 0 {
   472  		return nil // No logs were sent; nothing to wait for.
   473  	}
   474  
   475  	flushed := make(chan error)
   476  	go func() {
   477  		defer close(flushed)
   478  		runtime.LockOSThread()
   479  		defer runtime.UnlockOSThread()
   480  
   481  		var (
   482  			j   *sdjournal.Journal
   483  			err error
   484  		)
   485  		if s.journalReadDir != "" {
   486  			j, err = sdjournal.OpenDir(s.journalReadDir, 0)
   487  		} else {
   488  			j, err = sdjournal.Open(0)
   489  		}
   490  		if err != nil {
   491  			flushed <- err
   492  			return
   493  		}
   494  		defer j.Close()
   495  
   496  		if err := j.AddMatch(fieldContainerIDFull, s.vars[fieldContainerIDFull]); err != nil {
   497  			flushed <- err
   498  			return
   499  		}
   500  		if err := j.AddMatch(fieldLogEpoch, s.epoch); err != nil {
   501  			flushed <- err
   502  			return
   503  		}
   504  		if err := j.AddMatch(fieldLogOrdinal, strconv.FormatUint(ordinal, 10)); err != nil {
   505  			flushed <- err
   506  			return
   507  		}
   508  
   509  		deadline := time.Now().Add(s.readSyncTimeout)
   510  		for time.Now().Before(deadline) {
   511  			if ok, err := j.Next(); ok {
   512  				// Found it!
   513  				return
   514  			} else if err != nil {
   515  				flushed <- err
   516  				return
   517  			}
   518  			if _, err := j.Wait(100 * time.Millisecond); err != nil {
   519  				flushed <- err
   520  				return
   521  			}
   522  		}
   523  		log.G(context.TODO()).WithField("container", s.vars[fieldContainerIDFull]).
   524  			Warn("journald: deadline exceeded waiting for logs to be committed to journal")
   525  	}()
   526  	return <-flushed
   527  }
   528  
   529  func init() {
   530  	waitUntilFlushed = waitUntilFlushedImpl
   531  }