github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/lib/cgutil/group_killer.go (about) 1 //go:build linux 2 3 package cgutil 4 5 import ( 6 "errors" 7 "fmt" 8 "os" 9 "strconv" 10 "time" 11 12 "github.com/hashicorp/go-hclog" 13 "github.com/opencontainers/runc/libcontainer/cgroups" 14 "github.com/opencontainers/runc/libcontainer/cgroups/fs" 15 "github.com/opencontainers/runc/libcontainer/configs" 16 ) 17 18 // freezer is the name of the cgroup subsystem used for stopping / starting 19 // a group of processes 20 const freezer = "freezer" 21 22 // thawed and frozen are the two states we put a cgroup in when trying to remove it 23 var ( 24 thawed = &configs.Resources{Freezer: configs.Thawed} 25 frozen = &configs.Resources{Freezer: configs.Frozen} 26 ) 27 28 // GroupKiller is used for SIGKILL-ing the process tree[s] of a cgroup by leveraging 29 // the freezer cgroup subsystem. 30 type GroupKiller interface { 31 KillGroup(cgroup *configs.Cgroup) error 32 } 33 34 // NewGroupKiller creates a GroupKiller with executor PID pid. 35 func NewGroupKiller(logger hclog.Logger, pid int) GroupKiller { 36 return &killer{ 37 logger: logger.Named("group_killer"), 38 pid: pid, 39 } 40 } 41 42 type killer struct { 43 logger hclog.Logger 44 pid int 45 } 46 47 // KillGroup will SIGKILL the process tree present in cgroup, using the freezer 48 // subsystem to prevent further forking, etc. 49 func (d *killer) KillGroup(cgroup *configs.Cgroup) error { 50 if UseV2 { 51 return d.v2(cgroup) 52 } 53 return d.v1(cgroup) 54 } 55 56 func (d *killer) v1(cgroup *configs.Cgroup) error { 57 if cgroup == nil { 58 return errors.New("missing cgroup") 59 } 60 61 // the actual path to our tasks freezer cgroup 62 path := cgroup.Path 63 64 d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v1", "executor_pid", d.pid) 65 66 // move executor PID into the init freezer cgroup so we can kill the task 67 // pids without killing the executor (which is the process running this code, 68 // doing the killing) 69 initPath, err := cgroups.GetInitCgroupPath(freezer) 70 if err != nil { 71 return fmt.Errorf("failed to find init cgroup: %w", err) 72 } 73 m := map[string]string{freezer: initPath} 74 if err = cgroups.EnterPid(m, d.pid); err != nil { 75 return fmt.Errorf("failed to add executor pid to init cgroup: %w", err) 76 } 77 78 // ability to freeze the cgroup 79 freeze := func() { 80 _ = new(fs.FreezerGroup).Set(path, frozen) 81 } 82 83 // ability to thaw the cgroup 84 thaw := func() { 85 _ = new(fs.FreezerGroup).Set(path, thawed) 86 } 87 88 // do the common kill logic 89 if err = d.kill(path, freeze, thaw); err != nil { 90 return err 91 } 92 93 // remove the cgroup from disk 94 return cgroups.RemovePath(path) 95 } 96 97 func (d *killer) v2(cgroup *configs.Cgroup) error { 98 if cgroup == nil || cgroup.Path == "" { 99 return errors.New("missing cgroup") 100 } 101 102 // move executor (d.PID) into init.scope 103 editSelf := &editor{"init.scope"} 104 if err := editSelf.write("cgroup.procs", strconv.Itoa(d.pid)); err != nil { 105 return err 106 } 107 108 // write "1" to cgroup.kill 109 editTask := &editor{cgroup.Path} 110 if err := editTask.write("cgroup.kill", "1"); err != nil { 111 return err 112 } 113 114 // note: do NOT remove the cgroup from disk; leave that to the Client, at 115 // least until #14375 is implemented. 116 return nil 117 } 118 119 // kill is used to SIGKILL all processes in cgroup 120 // 121 // The order of operations is 122 // 0. before calling this method, the executor pid has been moved outside of cgroup 123 // 1. freeze cgroup (so processes cannot fork further) 124 // 2. scan the cgroup to collect all pids 125 // 3. issue SIGKILL to each pid found 126 // 4. thaw the cgroup so processes can go die 127 // 5. wait on each processes until it is confirmed dead 128 func (d *killer) kill(cgroup string, freeze func(), thaw func()) error { 129 // freeze the cgroup stopping further forking 130 freeze() 131 132 d.logger.Trace("search for pids in", "cgroup", cgroup) 133 134 // find all the pids we intend to kill 135 pids, err := cgroups.GetPids(cgroup) 136 if err != nil { 137 // if we fail to get pids, re-thaw before bailing so there is at least 138 // a chance the processes can go die out of band 139 thaw() 140 return fmt.Errorf("failed to find pids: %w", err) 141 } 142 143 d.logger.Trace("send sigkill to frozen processes", "cgroup", cgroup, "pids", pids) 144 145 var processes []*os.Process 146 147 // kill the processes in cgroup 148 for _, pid := range pids { 149 p, findErr := os.FindProcess(pid) 150 if findErr != nil { 151 d.logger.Trace("failed to find process of pid to kill", "pid", pid, "error", findErr) 152 continue 153 } 154 processes = append(processes, p) 155 if killErr := p.Kill(); killErr != nil { 156 d.logger.Trace("failed to kill process", "pid", pid, "error", killErr) 157 continue 158 } 159 } 160 161 // thawed the cgroup so we can wait on each process 162 thaw() 163 164 // wait on each process 165 for _, p := range processes { 166 // do not capture error; errors are normal here 167 pState, _ := p.Wait() 168 d.logger.Trace("return from wait on process", "pid", p.Pid, "state", pState) 169 } 170 171 // cgroups are not atomic, the OS takes a moment to un-mark the cgroup as in-use; 172 // a tiny sleep here goes a long way for not creating noisy (but functionally benign) 173 // errors about removing busy cgroup 174 // 175 // alternatively we could do the removal in a loop and silence the interim errors, but meh 176 time.Sleep(50 * time.Millisecond) 177 178 return nil 179 }