github.com/Mirantis/virtlet@v1.5.2-0.20191204181327-1659b8a48e9b/pkg/tools/validate.go

/*
Copyright 2019 Mirantis

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tools

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/spf13/cobra"
	v1 "k8s.io/api/core/v1"
	meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
	expectedCRIProxySocketPath = "/run/criproxy.sock"
	sysCheckNamespace          = "kube-system"
)

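// validateCommand holds the Kubernetes client and the output stream used by
// the "validate" subcommand.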
type validateCommand struct {
	client KubeClient
	out    io.Writer
}

// NewValidateCommand returns a cobra.Command that validates cluster readiness
// for Virtlet deployment.
func NewValidateCommand(client KubeClient, out io.Writer) *cobra.Command {
	v := &validateCommand{client: client, out: out}
	cmd := &cobra.Command{
		Use:   "validate",
		Short: "Make sure the cluster is ready for Virtlet deployment",
		Long:  "Check configuration of the cluster nodes to make sure they're ready for Virtlet deployment",
		RunE: func(cmd *cobra.Command, args []string) error {
			if len(args) != 0 {
				return errors.New("this command does not accept arguments")
			}
			return v.Run()
		},
	}
	return cmd
}

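// Run finds the nodes marked for Virtlet, starts a SysCheck pod on each of
// them and reports any configuration problems found by the individual checks.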
func (v *validateCommand) Run() error {
	nodeNames, err := v.client.GetNamesOfNodesMarkedForVirtlet()
	if err != nil {
		return err
	}

	if len(nodeNames) == 0 {
		return errors.New("there are no nodes with Virtlet")
	}

	v.info("Nodes with Virtlet: %s", strings.Join(nodeNames, ", "))

	pods, errs := v.prepareSysCheckPods(nodeNames)
	defer v.deleteSysCheckPods(pods)
	for _, errstr := range errs {
		// the messages may contain '%', so don't use them as format strings
		v.info("%s", errstr)
	}

	if len(pods) == 0 {
		return errors.New("couldn't create system check pods on any Virtlet nodes")
	}

	errCount := v.checkCNI(pods)
	errCount += v.checkCRIProxy(pods)
	errCount += v.checkKubeletArgs(pods)

	if errCount != 0 {
		return fmt.Errorf("found %d problems", errCount)
	}
	v.info("Validation successful.")

	return nil
}

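// prepareSysCheckPods starts a busybox-based SysCheck pod on every Virtlet
// node, with the host filesystem mounted read-only under /mnt and HostPID
// enabled, and waits for the pods to reach the Running phase. It returns the
// running pods together with error messages for the nodes where a pod could
// not be created or its status could not be checked.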
func (v *validateCommand) prepareSysCheckPods(nodes []string) (pods []*v1.Pod, errs []string) {
	// TODO: add timeouts
	// TODO: create the pods in parallel
	hostPathType := v1.HostPathDirectory
	var definedPods []*v1.Pod
	for _, name := range nodes {
		v.info("Creating syscheck pod on the node %q", name)
		pod, err := v.client.CreatePod(&v1.Pod{
			ObjectMeta: meta_v1.ObjectMeta{
				Name:      "virtletsyscheck-" + name,
				Namespace: sysCheckNamespace,
			},
			Spec: v1.PodSpec{
				Volumes: []v1.Volume{
					{
						Name: "hostfs",
						VolumeSource: v1.VolumeSource{
							HostPath: &v1.HostPathVolumeSource{
								Path: "/",
								Type: &hostPathType,
							},
						},
					},
				},
				Containers: []v1.Container{
					{
						Name:    "syscheck",
						Image:   "busybox",
						Command: []string{"/bin/sh", "-c", "--"},
						Args:    []string{"trap : TERM INT; (while true; do sleep 1000; done) & wait"},
						VolumeMounts: []v1.VolumeMount{
							{
								Name:      "hostfs",
								MountPath: "/mnt",
								ReadOnly:  true,
							},
						},
					},
				},
				NodeSelector: map[string]string{"kubernetes.io/hostname": name},
				HostPID:      true,
			},
		})
		if err != nil {
			errs = append(errs, fmt.Sprintf("SysCheck pod creation failed on the node %q: %v", name, err))
		} else {
			definedPods = append(definedPods, pod)
		}
	}

	// pods and errs are appended to from multiple goroutines below,
	// so protect them with a mutex
	var mu sync.Mutex
	var wg sync.WaitGroup
	wg.Add(len(definedPods))
	for _, def := range definedPods {
		go func(podDef *v1.Pod) {
			defer wg.Done()
			for {
				// TODO: add a check for container start failure, e.g. when
				// downloading a container image fails
				if pod, err := v.client.GetPod(podDef.Name, sysCheckNamespace); err != nil {
					mu.Lock()
					errs = append(errs, fmt.Sprintf("Status check for SysCheck pod %q failed: %v", podDef.Name, err))
					mu.Unlock()
					break
				} else if pod.Status.Phase == v1.PodRunning {
					mu.Lock()
					pods = append(pods, pod)
					mu.Unlock()
					break
				}
				time.Sleep(250 * time.Millisecond)
			}
		}(def)
	}
	wg.Wait()
	v.info("SysCheck pods on all the Virtlet nodes are running")

	return
}

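// info writes a formatted message followed by a newline to the command's
// output stream.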
func (v *validateCommand) info(fmtstring string, a ...interface{}) {
	fmt.Fprintf(v.out, fmtstring+"\n", a...)
}

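// deleteSysCheckPods removes the SysCheck pods that were created by
// prepareSysCheckPods, logging any errors instead of failing.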
func (v *validateCommand) deleteSysCheckPods(pods []*v1.Pod) {
	for _, pod := range pods {
		if err := v.client.DeletePod(pod.Name, sysCheckNamespace); err != nil {
			v.info("Error during removal of SysCheck pod %q/%q: %v", sysCheckNamespace, pod.Name, err)
		}
	}
}

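// doInAllPods runs the check function concurrently against every pod and
// returns the total number of problems found.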
func doInAllPods(pods []*v1.Pod, check func(*v1.Pod) int) int {
	// TODO: add timeouts
	var wg sync.WaitGroup
	wg.Add(len(pods))

	// errCount is updated from multiple goroutines, so protect it with a mutex
	var mu sync.Mutex
	errCount := 0
	for _, pod := range pods {
		go func(pod *v1.Pod) {
			defer wg.Done()
			n := check(pod)
			mu.Lock()
			errCount += n
			mu.Unlock()
		}(pod)
	}

	wg.Wait()
	return errCount
}

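// runCheckOnAllNodes executes a shell command in the SysCheck pod on every
// node and passes the node name and the trimmed command output to the check
// function, returning the total number of problems found.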
func (v *validateCommand) runCheckOnAllNodes(pods []*v1.Pod, description, command string, check func(nodeName, out string) int) int {
	return doInAllPods(pods, func(pod *v1.Pod) int {
		errCount := 0
		var out bytes.Buffer
		_, err := v.client.ExecInContainer(
			pod.Name, "syscheck", pod.Namespace, nil, &out, nil,
			[]string{
				"/bin/sh", "-c",
				command,
			},
		)
		if err != nil {
			v.info("ERROR: %s verification failed on the node %q: %v", description, pod.Spec.NodeName, err)
			errCount++
		}

		return errCount + check(pod.Spec.NodeName, strings.TrimRight(out.String(), "\r\n"))
	})
}

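// checkCNI verifies that each Virtlet node has at least one CNI configuration
// file under /etc/cni/net.d.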
func (v *validateCommand) checkCNI(pods []*v1.Pod) int {
	// TODO: try to do a CNI setup in a network namespace
	return v.runCheckOnAllNodes(
		pods, "CNI configuration",
		"find /mnt/etc/cni/net.d -name \"*.conf\" -o -name \"*.conflist\" -o -name \"*.json\" | wc -l",
		func(nodeName, out string) int {
			errCount := 0
			if i, err := strconv.Atoi(out); err != nil {
				v.info("ERROR: internal error while counting CNI configuration files on %q: %v", nodeName, err)
				errCount++
			} else if i == 0 {
				v.info("ERROR: node %q does not have any CNI configuration in /etc/cni/net.d", nodeName)
				errCount++
			}
			return errCount
		},
	)
}

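// checkCRIProxy verifies that CRI Proxy is running on each Virtlet node and
// that it uses the expected socket path.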
func (v *validateCommand) checkCRIProxy(pods []*v1.Pod) int {
	// TODO: handle custom CRI proxy socket paths
	return v.runCheckOnAllNodes(
		pods, "CRI Proxy",
		"pgrep criproxy | while read pid ; do cat /proc/$pid/cmdline ; done",
		func(nodeName, out string) int {
			errCount := 0
			if len(out) == 0 {
				v.info("ERROR: node %q doesn't have CRI Proxy running", nodeName)
				errCount++
			} else if !strings.Contains(out, expectedCRIProxySocketPath) {
				v.info("ERROR: CRI Proxy doesn't have %q as its socket path on the node %q", expectedCRIProxySocketPath, nodeName)
				errCount++
			}
			return errCount
		},
	)
}

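// checkKubeletArgs verifies that kubelet on each Virtlet node is started with
// the command line options required for CRI Proxy to work.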
func (v *validateCommand) checkKubeletArgs(pods []*v1.Pod) int {
	// TODO: handle custom CRI proxy socket paths
	return v.runCheckOnAllNodes(
		pods, "kubelet configuration",
		"( pgrep kubelet ; pgrep hyperkube ) | while read pid ; do cat /proc/$pid/cmdline ; done",
		func(nodeName, out string) int {
			errCount := 0
			if len(out) == 0 {
				// FIXME: this may happen if the kubelet process has a different name
				v.info("ERROR: kubelet process not found on node %q", nodeName)
				errCount++
			} else {
				for _, arg := range []string{
					"--container-runtime=remote",
					"--container-runtime-endpoint=unix:///run/criproxy.sock",
					"--image-service-endpoint=unix:///run/criproxy.sock",
					"--enable-controller-attach-detach=false",
				} {
					if !strings.Contains(out, arg) {
						v.info("ERROR: kubelet on node %q is missing the %q option", nodeName, arg)
						errCount++
					}
				}
			}
			return errCount
		},
	)
}