github.com/fafucoder/cilium@v1.6.11/pkg/monitor/agent/monitor.go

// Copyright 2017-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package agent

import (
	"context"
	"io/ioutil"
	"net"
	"os"
	"path"
	"syscall"
	"time"

	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/pkg/bpf"
	"github.com/cilium/cilium/pkg/defaults"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/monitor/agent/listener"
	"github.com/cilium/cilium/pkg/monitor/payload"
	"github.com/cilium/cilium/pkg/option"
	"github.com/sirupsen/logrus"
)

const (
	// pollTimeout bounds each Poll call on the perf ring buffer so the
	// reader loop can periodically notice that its context was cancelled.
	pollTimeout = 5000
)

// isCtxDone is a utility function that returns true when the context's Done()
// channel is closed. It is intended to simplify goroutines that need to check
// this multiple times in their loops.
func isCtxDone(ctx context.Context) bool {
	select {
	case <-ctx.Done():
		return true
	default:
		return false
	}
}
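
// The sketch below is illustrative only and not part of the original file: it
// shows the polling-loop pattern isCtxDone is intended for, where a goroutine
// repeatedly checks for cancellation between units of work instead of
// blocking on ctx.Done(). The work callback is a hypothetical placeholder.
func exampleCtxDoneLoop(ctx context.Context, work func() error) {
	for !isCtxDone(ctx) {
		if err := work(); err != nil {
			log.WithError(err).Warn("example: work step failed")
		}
	}
}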

func getPerfConfig(nPages int) *bpf.PerfEventConfig {
	// configure BPF perf buffer reader
	c := bpf.DefaultPerfEventConfig()
	c.NumPages = nPages
	return c
}

// Monitor is the structure that centralizes the responsibilities of the main
// events reader.
// There is some raciness around perfReaderCancel since it is replaced on every
// perf reader start. If a MonitorListener from a previous generation calls its
// cleanup after the new perf reader has started, we might call the new, and
// incorrect, cancel function. We guard against this by checking the number of
// listeners during the cleanup call: the running perf reader must have at
// least one MonitorListener (since it only starts when one connects), so no
// cancel is called in that case. If no listeners remain, cancelling is the
// correct behavior (the older generation's cancel must have been called for
// us to get this far anyway).
type Monitor struct {
	lock.Mutex

	ctx              context.Context
	perfReaderCancel context.CancelFunc
	listeners        map[listener.MonitorListener]struct{}
	nPages           int
	monitorEvents    *bpf.PerCpuEvents
}

// NewMonitor creates a Monitor and starts the client connection handler.
// Note that the perf buffer reader is started only when listeners are
// connected.
func NewMonitor(ctx context.Context, nPages int, server1_2 net.Listener) (m *Monitor, err error) {
	m = &Monitor{
		ctx:              ctx,
		listeners:        make(map[listener.MonitorListener]struct{}),
		nPages:           nPages,
		perfReaderCancel: func() {}, // no-op to avoid nil checks everywhere
	}

	// Check that the perf event map the monitor reads from actually exists
	c := getPerfConfig(nPages)
	mapPath := c.MapName
	if !path.IsAbs(mapPath) {
		mapPath = bpf.MapPath(mapPath)
	}
	if _, err := os.Stat(mapPath); os.IsNotExist(err) {
		return nil, err
	}

	// start new MonitorListener handler
	go m.connectionHandler1_2(ctx, server1_2)

	return m, nil
}
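
// The sketch below is hypothetical and not part of the original file: it shows
// roughly how a caller might wire up the monitor. The socket path and the page
// count are illustrative assumptions; the real daemon derives both from its
// configuration.
func exampleNewMonitorSetup(ctx context.Context) (*Monitor, error) {
	// Listen on a unix socket for monitor clients (path chosen for the example).
	server1_2, err := net.Listen("unix", "/var/run/cilium/monitor1_2.sock")
	if err != nil {
		return nil, err
	}
	// 64 perf ring buffer pages is an arbitrary example value.
	return NewMonitor(ctx, 64, server1_2)
}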

// registerNewListener adds the new MonitorListener to the global list. It also spawns
// a singleton goroutine to read and distribute the events. It passes a
// cancelable context to this goroutine and the cancelFunc is assigned to
// perfReaderCancel. Note that cancelling parentCtx (e.g. on program shutdown)
// will also cancel the derived context.
func (m *Monitor) registerNewListener(parentCtx context.Context, conn net.Conn, version listener.Version) {
	m.Lock()
	defer m.Unlock()

	// If this is the first listener, start the perf reader
	if len(m.listeners) == 0 {
		m.perfReaderCancel() // don't leak any old readers, just in case.
		perfEventReaderCtx, cancel := context.WithCancel(parentCtx)
		m.perfReaderCancel = cancel
		go m.perfEventReader(perfEventReaderCtx, m.nPages)
	}

	switch version {
	case listener.Version1_2:
		newListener := newListenerv1_2(conn, option.Config.MonitorQueueSize, m.removeListener)
		m.listeners[newListener] = struct{}{}

	default:
		conn.Close()
		log.WithField("version", version).Error("Closing new connection from unsupported monitor client version")
	}

	log.WithFields(logrus.Fields{
		"count.listener": len(m.listeners),
		"version":        version,
	}).Debug("New listener connected")
}

// removeListener deletes the MonitorListener from the list and stops the perf
// reader if this was the last MonitorListener.
func (m *Monitor) removeListener(ml listener.MonitorListener) {
	m.Lock()
	defer m.Unlock()

	delete(m.listeners, ml)
	log.WithFields(logrus.Fields{
		"count.listener": len(m.listeners),
		"version":        ml.Version(),
	}).Debug("Removed listener")

	// If this was the final listener, shut down the perf reader and unmap our
	// ring buffer readers. This tells the kernel to not emit this data.
	// Note: it is critical to hold the lock and check the number of listeners.
	// This guards against an older generation listener calling the
	// current generation perfReaderCancel.
	if len(m.listeners) == 0 {
		m.perfReaderCancel()
	}
}

// perfEventReader is a goroutine that reads events from the perf buffer. It
// exits when stopCtx is done. Note, however, that it blocks in the Poll call;
// it assumes enough events are generated that these blocking periods stay
// short.
func (m *Monitor) perfEventReader(stopCtx context.Context, nPages int) {
	scopedLog := log.WithField(logfields.StartTime, time.Now())
	scopedLog.Info("Beginning to read perf buffer")
	defer scopedLog.Info("Stopped reading perf buffer")

	c := getPerfConfig(nPages)
	monitorEvents, err := bpf.NewPerCpuEvents(c)
	if err != nil {
		scopedLog.WithError(err).Fatal("Cannot initialise BPF perf ring buffer sockets")
	}
	defer monitorEvents.CloseAll()

	// Update the monitor's monitorEvents (read back by Status()). Also grab
	// the callbacks we need to avoid locking again; these methods never change.
	m.Lock()
	m.monitorEvents = monitorEvents
	receiveEvent := m.receiveEvent
	lostEvent := m.lostEvent
	errorEvent := m.errorEvent
	m.Unlock()

	for !isCtxDone(stopCtx) {
		todo, err := monitorEvents.Poll(pollTimeout)
		switch {
		case isCtxDone(stopCtx):
			return

		case err == syscall.EBADF:
			return

		case err != nil:
			scopedLog.WithError(err).Error("Error in Poll")
			continue
		}

		if todo > 0 {
			if err := monitorEvents.ReadAll(receiveEvent, lostEvent, errorEvent); err != nil {
				scopedLog.WithError(err).Warn("Error received while reading from perf buffer")
			}
		}
	}
}

// Status returns the current status of the monitor
func (m *Monitor) Status() *models.MonitorStatus {
	m.Lock()
	defer m.Unlock()

	if m.monitorEvents == nil {
		return nil
	}

	lost, _, unknown := m.monitorEvents.Stats()
	status := models.MonitorStatus{
		Cpus:     int64(m.monitorEvents.Cpus),
		Lost:     int64(lost),
		Npages:   int64(m.monitorEvents.Npages),
		Pagesize: int64(m.monitorEvents.Pagesize),
		Unknown:  int64(unknown),
	}

	return &status
}
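
// The sketch below is hypothetical and not part of the original file: it
// polls Status() periodically and illustrates that the result is nil until
// the perf reader has been started by the first connecting listener.
func exampleStatusLogger(ctx context.Context, m *Monitor, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if status := m.Status(); status != nil {
				log.WithField("lostEvents", status.Lost).Debug("example: monitor status")
			}
		}
	}
}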

// connectionHandler1_2 handles all the incoming connections and sets up the
// listener objects. It will block on Accept, but expects the caller to close
// server, inducing a return.
func (m *Monitor) connectionHandler1_2(parentCtx context.Context, server net.Listener) {
	for !isCtxDone(parentCtx) {
		conn, err := server.Accept()
		switch {
		case isCtxDone(parentCtx) && conn != nil:
			conn.Close()
			fallthrough

		case isCtxDone(parentCtx) && conn == nil:
			return

		case err != nil:
			log.WithError(err).Warn("Error accepting connection")
			continue
		}

		m.registerNewListener(parentCtx, conn, listener.Version1_2)
	}
}
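
// The sketch below is hypothetical and not part of the original file: it
// spells out the shutdown sequence the comment above assumes. Cancelling the
// parent context stops the perf reader and lets the accept loop observe
// cancellation, while closing the server listener unblocks the pending Accept
// call so connectionHandler1_2 can return.
func exampleShutdown(cancel context.CancelFunc, server net.Listener) {
	cancel()
	server.Close()
}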

// send enqueues the payload to all listeners.
func (m *Monitor) send(pl *payload.Payload) {
	m.Lock()
	defer m.Unlock()
	for ml := range m.listeners {
		ml.Enqueue(pl)
	}
}

// receiveEvent forwards a perf event sample from CPU c to all listeners as an
// EventSample payload.
func (m *Monitor) receiveEvent(es *bpf.PerfEventSample, c int) {
	pl := payload.Payload{Data: es.DataCopy(), CPU: c, Lost: 0, Type: payload.EventSample}
	m.send(&pl)
}

// lostEvent notifies all listeners that the kernel dropped el.Lost samples on
// CPU c.
func (m *Monitor) lostEvent(el *bpf.PerfEventLost, c int) {
	pl := payload.Payload{Data: []byte{}, CPU: c, Lost: el.Lost, Type: payload.RecordLost}
	m.send(&pl)
}

// errorEvent logs a perf ring buffer read timeout and dumps the ring buffer
// state to a file to aid debugging.
func (m *Monitor) errorEvent(el *bpf.PerfEvent) {
	log.Errorf("BUG: Timeout while reading perf ring buffer: %s", el.Debug())
	dumpFile := path.Join(defaults.RuntimePath, defaults.StateDir, "ring-buffer-crash.dump")
	if err := ioutil.WriteFile(dumpFile, []byte(el.DebugDump()), 0644); err != nil {
		log.WithError(err).Errorf("Unable to dump ring buffer state to %s", dumpFile)
	}
}
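
// The sketch below is hypothetical and not part of the original file: it
// shows how a consumer on the receiving side of a MonitorListener might
// distinguish the payload types produced by receiveEvent and lostEvent above.
func examplePayloadSwitch(pl *payload.Payload) {
	switch pl.Type {
	case payload.EventSample:
		log.WithField("cpu", pl.CPU).Debugf("example: received %d bytes of event data", len(pl.Data))
	case payload.RecordLost:
		log.WithField("cpu", pl.CPU).Debugf("example: kernel reported %d lost records", pl.Lost)
	}
}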