github.com/mirantis/virtlet@v1.5.2-0.20191204181327-1659b8a48e9b/pkg/tools/validate.go (about) 1 /* 2 Copyright 2019 Mirantis 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package tools 18 19 import ( 20 "bufio" 21 "bytes" 22 "errors" 23 "fmt" 24 "io" 25 "strconv" 26 "strings" 27 "sync" 28 "time" 29 30 "github.com/spf13/cobra" 31 v1 "k8s.io/api/core/v1" 32 meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 ) 34 35 const ( 36 expectedCRIProxySocketPath = "/run/criproxy.sock" 37 sysCheckNamespace = "kube-system" 38 ) 39 40 type validateCommand struct { 41 client KubeClient 42 out io.Writer 43 } 44 45 // NewValidateCommand returns a cobra.Command that validates a cluster readines 46 // for Virtlet deploy 47 func NewValidateCommand(client KubeClient, out io.Writer) *cobra.Command { 48 v := &validateCommand{client: client, out: out} 49 cmd := &cobra.Command{ 50 Use: "validate", 51 Short: "Make sure the cluster is ready for Virtlet deployment", 52 Long: "Check configuration of the cluster nodes to make sure they're ready for Virtlet deployment", 53 RunE: func(cmd *cobra.Command, args []string) error { 54 if len(args) != 0 { 55 return errors.New("This command does not accept arguments") 56 } 57 return v.Run() 58 }, 59 } 60 return cmd 61 } 62 63 func (v *validateCommand) Run() error { 64 nodeNames, err := v.client.GetNamesOfNodesMarkedForVirtlet() 65 if err != nil { 66 return err 67 } 68 69 if len(nodeNames) == 0 { 70 return errors.New("there are no nodes with Virtlet") 71 } 72 73 v.info("Nodes with Virtlet: %s", strings.Join(nodeNames, ", ")) 74 75 pods, errs := v.prepareSysCheckPods(nodeNames) 76 defer v.deleteSysCheckPods(pods) 77 for _, errstr := range errs { 78 v.info(errstr) 79 } 80 81 if len(pods) == 0 { 82 return errors.New("couldn't create system check pods on any Virtlet nodes") 83 } 84 85 errCount := v.checkCNI(pods) 86 errCount += v.checkCRIProxy(pods) 87 errCount += v.checkKubeletArgs(pods) 88 89 if errCount != 0 { 90 return fmt.Errorf("found %d problems", errCount) 91 } 92 v.info("Validation successful.") 93 94 return nil 95 } 96 97 func (v *validateCommand) prepareSysCheckPods(nodes []string) (pods []*v1.Pod, errs []string) { 98 // TODO: add timeouts 99 // TODO: create the pods in parallel 100 hostPathType := v1.HostPathDirectory 101 var definedPods []*v1.Pod 102 for _, name := range nodes { 103 v.info("Creating syscheck pod on the node %q", name) 104 pod, err := v.client.CreatePod(&v1.Pod{ 105 ObjectMeta: meta_v1.ObjectMeta{ 106 Name: "virtletsyscheck-" + name, 107 Namespace: sysCheckNamespace, 108 }, 109 Spec: v1.PodSpec{ 110 Volumes: []v1.Volume{ 111 { 112 Name: "hostfs", 113 VolumeSource: v1.VolumeSource{ 114 HostPath: &v1.HostPathVolumeSource{ 115 Path: "/", 116 Type: &hostPathType, 117 }, 118 }, 119 }, 120 }, 121 Containers: []v1.Container{ 122 { 123 Name: "syscheck", 124 Image: "busybox", 125 Command: []string{"/bin/sh", "-c", "--"}, 126 Args: []string{"trap : TERM INT; (while true; do sleep 1000; done) & wait"}, 127 VolumeMounts: []v1.VolumeMount{ 128 { 129 Name: "hostfs", 130 MountPath: "/mnt", 131 ReadOnly: true, 132 }, 133 }, 134 }, 135 }, 136 NodeSelector: map[string]string{"kubernetes.io/hostname": name}, 137 HostPID: true, 138 }, 139 }) 140 if err != nil { 141 errs = append(errs, fmt.Sprintf("SysCheck pod creation failed on the node %q: %v", name, err)) 142 } else { 143 definedPods = append(definedPods, pod) 144 } 145 } 146 147 var wg sync.WaitGroup 148 wg.Add(len(definedPods)) 149 for _, def := range definedPods { 150 go func(podDef *v1.Pod) { 151 for { 152 // TODO: add a check for container start failure, e.g. when 153 // downloading a container image fails 154 if pod, err := v.client.GetPod(podDef.Name, sysCheckNamespace); err != nil { 155 errs = append(errs, fmt.Sprintf("Status check for SysCheck pod %q failed: %v", podDef.Name, err)) 156 break 157 } else if pod.Status.Phase == v1.PodRunning { 158 pods = append(pods, pod) 159 break 160 } 161 time.Sleep(250 * time.Millisecond) 162 } 163 wg.Done() 164 }(def) 165 } 166 wg.Wait() 167 v.info("SysCheck pods on all the Virtlet nodes are running") 168 169 return 170 } 171 172 func (v *validateCommand) info(fmtstring string, a ...interface{}) { 173 fmt.Fprintf(v.out, fmtstring+"\n", a...) 174 } 175 176 func (v *validateCommand) deleteSysCheckPods(pods []*v1.Pod) { 177 for _, pod := range pods { 178 if err := v.client.DeletePod(pod.Name, sysCheckNamespace); err != nil { 179 v.info("Error during removal of SysCheck pod %q/%q: %v", sysCheckNamespace, pod.Name, err) 180 } 181 } 182 } 183 184 func doInAllPods(pods []*v1.Pod, check func(*v1.Pod) int) int { 185 // TODO: add timeouts 186 var wg sync.WaitGroup 187 wg.Add(len(pods)) 188 189 errCount := 0 190 for _, pod := range pods { 191 go func(pod_ *v1.Pod) { 192 errCount += check(pod_) 193 wg.Done() 194 }(pod) 195 } 196 197 wg.Wait() 198 return errCount 199 } 200 201 func (v *validateCommand) runCheckOnAllNodes(pods []*v1.Pod, description, command string, check func(nodeName, out string) int) int { 202 return doInAllPods(pods, func(pod *v1.Pod) int { 203 errCount := 0 204 var out bytes.Buffer 205 _, err := v.client.ExecInContainer( 206 pod.Name, "syscheck", pod.Namespace, nil, bufio.NewWriter(&out), nil, 207 []string{ 208 "/bin/sh", "-c", 209 command, 210 }, 211 ) 212 if err != nil { 213 v.info("ERROR: %s verification failed on the node %q: %v", description, pod.Spec.NodeName, err) 214 errCount++ 215 } 216 217 return errCount + check(pod.Spec.NodeName, strings.TrimRight(out.String(), "\r\n")) 218 }) 219 } 220 221 func (v *validateCommand) checkCNI(pods []*v1.Pod) int { 222 // TODO: try to do a CNI setup in a network namespace 223 return v.runCheckOnAllNodes( 224 pods, "CNI configuration", 225 "find /mnt/etc/cni/net.d -name \"*.conf\" -o -name \"*.conflist\" -o -name \"*.json\" | wc -l", 226 func(nodeName, out string) int { 227 errCount := 0 228 if i, err := strconv.Atoi(out); err != nil { 229 v.info("ERROR: internal error during conunting CNI configuration files on %q: %v", nodeName, err) 230 errCount++ 231 } else if i == 0 { 232 v.info("ERROR: node %q does not have any CNI configuration in /etc/cni/net.d", nodeName) 233 errCount++ 234 } 235 return errCount 236 }, 237 ) 238 } 239 240 func (v *validateCommand) checkCRIProxy(pods []*v1.Pod) int { 241 // TODO: handle custom CRI proxy socket paths 242 return v.runCheckOnAllNodes( 243 pods, "CRI Proxy", 244 "pgrep criproxy | while read pid ; do cat /proc/$pid/cmdline ; done", 245 func(nodeName, out string) int { 246 errCount := 0 247 if len(out) == 0 { 248 v.info("ERROR: node %q doesn't have CRI Proxy running", nodeName) 249 errCount++ 250 } else if !strings.Contains(out, expectedCRIProxySocketPath) { 251 v.info("ERROR: CRI Proxy doesn't have %q as its socket path on the node %q", expectedCRIProxySocketPath, nodeName) 252 errCount++ 253 } 254 return errCount 255 }, 256 ) 257 } 258 259 func (v *validateCommand) checkKubeletArgs(pods []*v1.Pod) int { 260 // TODO: handle custom CRI proxy socket paths 261 return v.runCheckOnAllNodes( 262 pods, "kubelet configuration", 263 "( pgrep kubelet ; pgrep hyperkube ) | while read pid ; do cat /proc/$pid/cmdline ; done", 264 func(nodeName, out string) int { 265 errCount := 0 266 if len(out) == 0 { 267 // FIXME: this may happen if kubelet process has different name 268 v.info("ERROR: kubelet process not found on node %q", nodeName) 269 errCount++ 270 } else { 271 for _, arg := range []string{ 272 "--container-runtime=remote", 273 "--container-runtime-endpoint=unix:///run/criproxy.sock", 274 "--image-service-endpoint=unix:///run/criproxy.sock", 275 "--enable-controller-attach-detach=false", 276 } { 277 if !strings.Contains(out, arg) { 278 v.info("kubelet on node %q is missing %q option", nodeName, arg) 279 errCount++ 280 } 281 } 282 } 283 return errCount 284 }, 285 ) 286 }