github.com/containerd/Containerd@v1.4.13/metrics/cgroups/v1/oom.go (about) 1 // +build linux 2 3 /* 4 Copyright The containerd Authors. 5 6 Licensed under the Apache License, Version 2.0 (the "License"); 7 you may not use this file except in compliance with the License. 8 You may obtain a copy of the License at 9 10 http://www.apache.org/licenses/LICENSE-2.0 11 12 Unless required by applicable law or agreed to in writing, software 13 distributed under the License is distributed on an "AS IS" BASIS, 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 See the License for the specific language governing permissions and 16 limitations under the License. 17 */ 18 19 package v1 20 21 import ( 22 "sync" 23 "sync/atomic" 24 25 "golang.org/x/sys/unix" 26 27 "github.com/containerd/cgroups" 28 metrics "github.com/docker/go-metrics" 29 "github.com/prometheus/client_golang/prometheus" 30 "github.com/sirupsen/logrus" 31 ) 32 33 func newOOMCollector(ns *metrics.Namespace) (*oomCollector, error) { 34 fd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC) 35 if err != nil { 36 return nil, err 37 } 38 var desc *prometheus.Desc 39 if ns != nil { 40 desc = ns.NewDesc("memory_oom", "The number of times a container has received an oom event", metrics.Total, "container_id", "namespace") 41 } 42 c := &oomCollector{ 43 fd: fd, 44 desc: desc, 45 set: make(map[uintptr]*oom), 46 } 47 if ns != nil { 48 ns.Add(c) 49 } 50 go c.start() 51 return c, nil 52 } 53 54 type oomCollector struct { 55 mu sync.Mutex 56 57 desc *prometheus.Desc 58 fd int 59 set map[uintptr]*oom 60 } 61 62 type oom struct { 63 // count needs to stay the first member of this struct to ensure 64bits 64 // alignment on a 32bits machine (e.g. arm32). This is necessary as we use 65 // the sync/atomic operations on this field. 66 count int64 67 id string 68 namespace string 69 c cgroups.Cgroup 70 triggers []Trigger 71 } 72 73 func (o *oomCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...Trigger) error { 74 o.mu.Lock() 75 defer o.mu.Unlock() 76 fd, err := cg.OOMEventFD() 77 if err != nil { 78 return err 79 } 80 o.set[fd] = &oom{ 81 id: id, 82 c: cg, 83 triggers: triggers, 84 namespace: namespace, 85 } 86 event := unix.EpollEvent{ 87 Fd: int32(fd), 88 Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR, 89 } 90 return unix.EpollCtl(o.fd, unix.EPOLL_CTL_ADD, int(fd), &event) 91 } 92 93 func (o *oomCollector) Describe(ch chan<- *prometheus.Desc) { 94 ch <- o.desc 95 } 96 97 func (o *oomCollector) Collect(ch chan<- prometheus.Metric) { 98 o.mu.Lock() 99 defer o.mu.Unlock() 100 for _, t := range o.set { 101 c := atomic.LoadInt64(&t.count) 102 ch <- prometheus.MustNewConstMetric(o.desc, prometheus.CounterValue, float64(c), t.id, t.namespace) 103 } 104 } 105 106 // Close closes the epoll fd 107 func (o *oomCollector) Close() error { 108 return unix.Close(o.fd) 109 } 110 111 func (o *oomCollector) start() { 112 var events [128]unix.EpollEvent 113 for { 114 n, err := unix.EpollWait(o.fd, events[:], -1) 115 if err != nil { 116 if err == unix.EINTR { 117 continue 118 } 119 logrus.WithError(err).Error("cgroups: epoll wait failed, OOM notifications disabled") 120 return 121 } 122 for i := 0; i < n; i++ { 123 o.process(uintptr(events[i].Fd), events[i].Events) 124 } 125 } 126 } 127 128 func (o *oomCollector) process(fd uintptr, event uint32) { 129 // make sure to always flush the fd 130 flush(fd) 131 132 o.mu.Lock() 133 info, ok := o.set[fd] 134 if !ok { 135 o.mu.Unlock() 136 return 137 } 138 o.mu.Unlock() 139 // if we received an event but it was caused by the cgroup being deleted and the fd 140 // being closed make sure we close our copy and remove the container from the set 141 if info.c.State() == cgroups.Deleted { 142 o.mu.Lock() 143 delete(o.set, fd) 144 o.mu.Unlock() 145 unix.Close(int(fd)) 146 return 147 } 148 atomic.AddInt64(&info.count, 1) 149 for _, t := range info.triggers { 150 t(info.id, info.namespace, info.c) 151 } 152 } 153 154 func flush(fd uintptr) error { 155 var buf [8]byte 156 _, err := unix.Read(int(fd), buf[:]) 157 return err 158 }