github.com/fafucoder/cilium@v1.6.11/pkg/monitor/agent/monitor.go

// Copyright 2017-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package agent

import (
	"context"
	"io/ioutil"
	"net"
	"os"
	"path"
	"syscall"
	"time"

	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/pkg/bpf"
	"github.com/cilium/cilium/pkg/defaults"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/monitor/agent/listener"
	"github.com/cilium/cilium/pkg/monitor/payload"
	"github.com/cilium/cilium/pkg/option"
	"github.com/sirupsen/logrus"
)

const (
	pollTimeout = 5000
)

// isCtxDone is a utility function that returns true when the context's Done()
// channel is closed. It is intended to simplify goroutines that need to check
// this multiple times in their loop.
func isCtxDone(ctx context.Context) bool {
	select {
	case <-ctx.Done():
		return true
	default:
		return false
	}
}

func getPerfConfig(nPages int) *bpf.PerfEventConfig {
	// configure BPF perf buffer reader
	c := bpf.DefaultPerfEventConfig()
	c.NumPages = nPages
	return c
}

// Monitor structure for centralizing the responsibilities of the main events
// reader.
// There is some raciness around perfReaderCancel since it is replaced on every
// perf reader start. In the event that a MonitorListener from a previous
// generation calls its cleanup after the start of the new perf reader, we
// might call the new, and incorrect, cancel function. We guard against this by
// checking the number of listeners during the cleanup call: the running perf
// reader must have at least one MonitorListener (since it started), so in that
// case no cancel is called. If there are no listeners, calling cancel is the
// correct behavior (the older-generation cancel must already have been called
// for us to get this far anyway).
type Monitor struct {
	lock.Mutex

	ctx              context.Context
	perfReaderCancel context.CancelFunc
	listeners        map[listener.MonitorListener]struct{}
	nPages           int
	monitorEvents    *bpf.PerCpuEvents
}

// NewMonitor creates a Monitor, and starts client connection handling and agent event
// handling.
// Note that the perf buffer reader is started only when listeners are
// connected.
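// The server1_2 listener carries incoming v1.2 monitor client connections
// (handled by connectionHandler1_2); the caller retains ownership of it and
// closes it to stop accepting new clients, as noted on connectionHandler1_2.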
func NewMonitor(ctx context.Context, nPages int, server1_2 net.Listener) (m *Monitor, err error) {
	m = &Monitor{
		ctx:              ctx,
		listeners:        make(map[listener.MonitorListener]struct{}),
		nPages:           nPages,
		perfReaderCancel: func() {}, // no-op to avoid doing nil checks everywhere
	}

	// assert that we can actually connect the monitor
	c := getPerfConfig(nPages)
	mapPath := c.MapName
	if !path.IsAbs(mapPath) {
		mapPath = bpf.MapPath(mapPath)
	}
	if _, err := os.Stat(mapPath); os.IsNotExist(err) {
		return nil, err
	}

	// start new MonitorListener handler
	go m.connectionHandler1_2(ctx, server1_2)

	return m, nil
}

// registerNewListener adds the new MonitorListener to the global list. It also spawns
// a singleton goroutine to read and distribute the events. It passes a
// cancelable context to this goroutine and the cancelFunc is assigned to
// perfReaderCancel. Note that cancelling parentCtx (e.g. on program shutdown)
// will also cancel the derived context.
func (m *Monitor) registerNewListener(parentCtx context.Context, conn net.Conn, version listener.Version) {
	m.Lock()
	defer m.Unlock()

	// If this is the first listener, start the perf reader
	if len(m.listeners) == 0 {
		m.perfReaderCancel() // don't leak any old readers, just in case.
		perfEventReaderCtx, cancel := context.WithCancel(parentCtx)
		m.perfReaderCancel = cancel
		go m.perfEventReader(perfEventReaderCtx, m.nPages)
	}

	switch version {
	case listener.Version1_2:
		newListener := newListenerv1_2(conn, option.Config.MonitorQueueSize, m.removeListener)
		m.listeners[newListener] = struct{}{}

	default:
		conn.Close()
		log.WithField("version", version).Error("Closing new connection from unsupported monitor client version")
	}

	log.WithFields(logrus.Fields{
		"count.listener": len(m.listeners),
		"version":        version,
	}).Debug("New listener connected")
}

// removeListener deletes the MonitorListener from the list, closes its queue,
// and stops the perf reader if this was the last MonitorListener.
func (m *Monitor) removeListener(ml listener.MonitorListener) {
	m.Lock()
	defer m.Unlock()

	delete(m.listeners, ml)
	log.WithFields(logrus.Fields{
		"count.listener": len(m.listeners),
		"version":        ml.Version(),
	}).Debug("Removed listener")

	// If this was the final listener, shut down the perf reader and unmap our
	// ring buffer readers. This tells the kernel to not emit this data.
	// Note: it is critical to hold the lock and check the number of listeners.
	// This guards against an older-generation listener calling the
	// current-generation perfReaderCancel.
	if len(m.listeners) == 0 {
		m.perfReaderCancel()
	}
}

// perfEventReader is a goroutine that reads events from the perf buffer. It
// will exit when stopCtx is done. Note, however, that it will block in the
// Poll call but assumes enough events are generated that these blocks are
// short.
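// Because each Poll call is bounded by pollTimeout, a cancelled stopCtx is
// noticed within one poll interval even when no events arrive.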
func (m *Monitor) perfEventReader(stopCtx context.Context, nPages int) {
	scopedLog := log.WithField(logfields.StartTime, time.Now())
	scopedLog.Info("Beginning to read perf buffer")
	defer scopedLog.Info("Stopped reading perf buffer")

	c := getPerfConfig(nPages)
	monitorEvents, err := bpf.NewPerCpuEvents(c)
	if err != nil {
		scopedLog.WithError(err).Fatal("Cannot initialise BPF perf ring buffer sockets")
	}
	defer monitorEvents.CloseAll()

	// Update m.monitorEvents; it is read by Status(). Also grab the callbacks
	// we need to avoid locking again. These methods never change.
	m.Lock()
	m.monitorEvents = monitorEvents
	receiveEvent := m.receiveEvent
	lostEvent := m.lostEvent
	errorEvent := m.errorEvent
	m.Unlock()

	for !isCtxDone(stopCtx) {
		todo, err := monitorEvents.Poll(pollTimeout)
		switch {
		case isCtxDone(stopCtx):
			return

		case err == syscall.EBADF:
			return

		case err != nil:
			scopedLog.WithError(err).Error("Error in Poll")
			continue
		}

		if todo > 0 {
			if err := monitorEvents.ReadAll(receiveEvent, lostEvent, errorEvent); err != nil {
				scopedLog.WithError(err).Warn("Error received while reading from perf buffer")
			}
		}
	}
}

// Status returns the current status of the monitor
func (m *Monitor) Status() *models.MonitorStatus {
	m.Lock()
	defer m.Unlock()

	if m.monitorEvents == nil {
		return nil
	}

	lost, _, unknown := m.monitorEvents.Stats()
	status := models.MonitorStatus{
		Cpus:     int64(m.monitorEvents.Cpus),
		Lost:     int64(lost),
		Npages:   int64(m.monitorEvents.Npages),
		Pagesize: int64(m.monitorEvents.Pagesize),
		Unknown:  int64(unknown),
	}

	return &status
}

// connectionHandler1_2 handles all the incoming connections and sets up the
// listener objects. It will block on Accept, but expects the caller to close
// server, inducing a return.
func (m *Monitor) connectionHandler1_2(parentCtx context.Context, server net.Listener) {
	for !isCtxDone(parentCtx) {
		conn, err := server.Accept()
		switch {
		case isCtxDone(parentCtx) && conn != nil:
			conn.Close()
			fallthrough

		case isCtxDone(parentCtx) && conn == nil:
			return

		case err != nil:
			log.WithError(err).Warn("Error accepting connection")
			continue
		}

		m.registerNewListener(parentCtx, conn, listener.Version1_2)
	}
}

// send enqueues the payload to all listeners.
func (m *Monitor) send(pl *payload.Payload) {
	m.Lock()
	defer m.Unlock()
	for ml := range m.listeners {
		ml.Enqueue(pl)
	}
}

func (m *Monitor) receiveEvent(es *bpf.PerfEventSample, c int) {
	pl := payload.Payload{Data: es.DataCopy(), CPU: c, Lost: 0, Type: payload.EventSample}
	m.send(&pl)
}

func (m *Monitor) lostEvent(el *bpf.PerfEventLost, c int) {
	pl := payload.Payload{Data: []byte{}, CPU: c, Lost: el.Lost, Type: payload.RecordLost}
	m.send(&pl)
}

func (m *Monitor) errorEvent(el *bpf.PerfEvent) {
	log.Errorf("BUG: Timeout while reading perf ring buffer: %s", el.Debug())
	dumpFile := path.Join(defaults.RuntimePath, defaults.StateDir, "ring-buffer-crash.dump")
	if err := ioutil.WriteFile(dumpFile, []byte(el.DebugDump()), 0644); err != nil {
		log.WithError(err).Errorf("Unable to dump ring buffer state to %s", dumpFile)
	}
}
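
// Usage sketch (illustrative only, not part of the original file): a caller
// would typically bind a Unix domain socket for v1.2 monitor clients and hand
// it to NewMonitor. The socket path and page count below are placeholders,
// not values taken from this package.
//
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	server, err := net.Listen("unix", "/var/run/cilium/monitor1_2.sock")
//	if err != nil {
//		log.WithError(err).Fatal("Cannot listen on monitor socket")
//	}
//	defer server.Close() // closing the listener stops connectionHandler1_2
//	monitor, err := NewMonitor(ctx, 64, server)
//	if err != nil {
//		log.WithError(err).Fatal("Cannot create monitor")
//	}
//	_ = monitor // monitor.Status() can then feed the agent's status API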