github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/fs/freezer.go (about) 1 package fs 2 3 import ( 4 "errors" 5 "fmt" 6 "os" 7 "strings" 8 "time" 9 10 "github.com/opencontainers/runc/libcontainer/cgroups" 11 "github.com/opencontainers/runc/libcontainer/configs" 12 "github.com/sirupsen/logrus" 13 "golang.org/x/sys/unix" 14 ) 15 16 type FreezerGroup struct{} 17 18 func (s *FreezerGroup) Name() string { 19 return "freezer" 20 } 21 22 func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error { 23 return apply(path, pid) 24 } 25 26 func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { 27 switch r.Freezer { 28 case configs.Frozen: 29 defer func() { 30 if Err != nil { 31 // Freezing failed, and it is bad and dangerous 32 // to leave the cgroup in FROZEN or FREEZING 33 // state, so (try to) thaw it back. 34 _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) 35 } 36 }() 37 38 // As per older kernel docs (freezer-subsystem.txt before 39 // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, 40 // userspace should either retry or thaw. While current 41 // kernel cgroup v1 docs no longer mention a need to retry, 42 // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably 43 // freeze a cgroup v1 while new processes keep appearing in it 44 // (either via fork/clone or by writing new PIDs to 45 // cgroup.procs). 46 // 47 // The numbers below are empirically chosen to have a decent 48 // chance to succeed in various scenarios ("runc pause/unpause 49 // with parallel runc exec" and "bare freeze/unfreeze on a very 50 // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. 51 // 52 // Adding any amount of sleep in between retries did not 53 // increase the chances of successful freeze in "pause/unpause 54 // with parallel exec" reproducer. OTOH, adding an occasional 55 // sleep helped for the case where the system is extremely slow 56 // (CentOS 7 VM on GHA CI). 57 // 58 // Alas, this is still a game of chances, since the real fix 59 // belong to the kernel (cgroup v2 do not have this bug). 60 61 for i := 0; i < 1000; i++ { 62 if i%50 == 49 { 63 // Occasional thaw and sleep improves 64 // the chances to succeed in freezing 65 // in case new processes keep appearing 66 // in the cgroup. 67 _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) 68 time.Sleep(10 * time.Millisecond) 69 } 70 71 if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { 72 return err 73 } 74 75 if i%25 == 24 { 76 // Occasional short sleep before reading 77 // the state back also improves the chances to 78 // succeed in freezing in case of a very slow 79 // system. 80 time.Sleep(10 * time.Microsecond) 81 } 82 state, err := cgroups.ReadFile(path, "freezer.state") 83 if err != nil { 84 return err 85 } 86 state = strings.TrimSpace(state) 87 switch state { 88 case "FREEZING": 89 continue 90 case string(configs.Frozen): 91 if i > 1 { 92 logrus.Debugf("frozen after %d retries", i) 93 } 94 return nil 95 default: 96 // should never happen 97 return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) 98 } 99 } 100 // Despite our best efforts, it got stuck in FREEZING. 101 return errors.New("unable to freeze") 102 case configs.Thawed: 103 return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) 104 case configs.Undefined: 105 return nil 106 default: 107 return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) 108 } 109 } 110 111 func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { 112 return nil 113 } 114 115 func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { 116 for { 117 state, err := cgroups.ReadFile(path, "freezer.state") 118 if err != nil { 119 // If the kernel is too old, then we just treat the freezer as 120 // being in an "undefined" state. 121 if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { 122 err = nil 123 } 124 return configs.Undefined, err 125 } 126 switch strings.TrimSpace(state) { 127 case "THAWED": 128 return configs.Thawed, nil 129 case "FROZEN": 130 // Find out whether the cgroup is frozen directly, 131 // or indirectly via an ancestor. 132 self, err := cgroups.ReadFile(path, "freezer.self_freezing") 133 if err != nil { 134 // If the kernel is too old, then we just treat 135 // it as being frozen. 136 if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { 137 err = nil 138 } 139 return configs.Frozen, err 140 } 141 switch self { 142 case "0\n": 143 return configs.Thawed, nil 144 case "1\n": 145 return configs.Frozen, nil 146 default: 147 return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) 148 } 149 case "FREEZING": 150 // Make sure we get a stable freezer state, so retry if the cgroup 151 // is still undergoing freezing. This should be a temporary delay. 152 time.Sleep(1 * time.Millisecond) 153 continue 154 default: 155 return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) 156 } 157 } 158 }