sigs.k8s.io/cluster-api-provider-azure@v1.17.0/test/e2e/azure_logcollector.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2020 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package e2e
    21  
    22  import (
    23  	"context"
    24  	"io"
    25  	"net/http"
    26  	"os"
    27  	"path/filepath"
    28  	"strings"
    29  	"time"
    30  
    31  	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
    32  	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
    33  	"github.com/pkg/errors"
    34  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    35  	infrav1alpha "sigs.k8s.io/cluster-api-provider-azure/api/v1alpha1"
    36  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    37  	"sigs.k8s.io/cluster-api-provider-azure/azure"
    38  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    39  	azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
    40  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    41  	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
    42  	"sigs.k8s.io/cluster-api/test/framework"
    43  	"sigs.k8s.io/cluster-api/util"
    44  	"sigs.k8s.io/controller-runtime/pkg/client"
    45  	kinderrors "sigs.k8s.io/kind/pkg/errors"
    46  )
    47  
    48  // AzureLogCollector collects logs from a CAPZ workload cluster.
    49  type AzureLogCollector struct{}
    50  
    51  const (
    52  	collectLogInterval = 3 * time.Second
    53  	collectLogTimeout  = 1 * time.Minute
    54  )
    55  
    56  var _ framework.ClusterLogCollector = &AzureLogCollector{}
    57  
    58  // CollectMachineLog collects logs from a machine.
    59  func (k AzureLogCollector) CollectMachineLog(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine, outputPath string) error {
    60  	var errs []error
    61  
    62  	am, err := getAzureMachine(ctx, managementClusterClient, m)
    63  	if err != nil {
    64  		return err
    65  	}
    66  
    67  	cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, m.ObjectMeta)
    68  	if err != nil {
    69  		return err
    70  	}
    71  
    72  	hostname := getHostname(m, isAzureMachineWindows(am))
    73  
    74  	if err := collectLogsFromNode(cluster, hostname, isAzureMachineWindows(am), outputPath); err != nil {
    75  		errs = append(errs, err)
    76  	}
    77  
    78  	if err := collectVMBootLog(ctx, am, outputPath); err != nil {
    79  		errs = append(errs, errors.Wrap(err, "Unable to collect VM Boot Diagnostic logs"))
    80  	}
    81  
    82  	return kinderrors.NewAggregate(errs)
    83  }
    84  
    85  // CollectMachinePoolLog collects logs from a machine pool.
    86  func (k AzureLogCollector) CollectMachinePoolLog(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool, outputPath string) error {
    87  	var errs []error
    88  	var isWindows bool
    89  
    90  	am, err := getAzureMachinePool(ctx, managementClusterClient, mp)
    91  	if err != nil {
    92  		if !apierrors.IsNotFound(err) {
    93  			return err
    94  		}
    95  		// Machine pool can be an AzureManagedMachinePool for AKS clusters.
    96  		_, err = getAzureManagedMachinePool(ctx, managementClusterClient, mp)
    97  		if err != nil {
    98  			if !apierrors.IsNotFound(err) {
    99  				return err
   100  			}
   101  			_, err = getAzureASOManagedMachinePool(ctx, managementClusterClient, mp)
   102  			return err
   103  		}
   104  	} else {
   105  		isWindows = isAzureMachinePoolWindows(am)
   106  	}
   107  
   108  	cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, mp.ObjectMeta)
   109  	if err != nil {
   110  		return err
   111  	}
   112  
   113  	for i, instance := range mp.Spec.ProviderIDList {
   114  		if mp.Status.NodeRefs != nil && len(mp.Status.NodeRefs) >= (i+1) {
   115  			hostname := mp.Status.NodeRefs[i].Name
   116  
   117  			if err := collectLogsFromNode(cluster, hostname, isWindows, filepath.Join(outputPath, hostname)); err != nil {
   118  				errs = append(errs, err)
   119  			}
   120  
   121  			if err := collectVMSSBootLog(ctx, instance, filepath.Join(outputPath, hostname)); err != nil {
   122  				errs = append(errs, errors.Wrap(err, "Unable to collect VMSS Boot Diagnostic logs"))
   123  			}
   124  		} else {
   125  			Logf("MachinePool instance %s does not have a corresponding NodeRef", instance)
   126  			Logf("Skipping log collection for MachinePool instance %s", instance)
   127  		}
   128  	}
   129  
   130  	return kinderrors.NewAggregate(errs)
   131  }
   132  
   133  // CollectInfrastructureLogs collects log from the infrastructure.
   134  // This is currently a no-op implementation to satisfy the LogCollector interface.
   135  func (k AzureLogCollector) CollectInfrastructureLogs(ctx context.Context, managementClusterClient client.Client, c *clusterv1.Cluster, outputPath string) error {
   136  	return nil
   137  }
   138  
   139  // collectLogsFromNode collects logs from various sources by ssh'ing into the node
   140  func collectLogsFromNode(cluster *clusterv1.Cluster, hostname string, isWindows bool, outputPath string) error {
   141  	nodeOSType := azure.LinuxOS
   142  	if isWindows {
   143  		nodeOSType = azure.WindowsOS
   144  	}
   145  	Logf("Collecting logs for %s node %s in cluster %s in namespace %s\n", nodeOSType, hostname, cluster.Name, cluster.Namespace)
   146  
   147  	controlPlaneEndpoint := cluster.Spec.ControlPlaneEndpoint.Host
   148  
   149  	execToPathFn := func(outputFileName, command string, args ...string) func() error {
   150  		return func() error {
   151  			return retryWithTimeout(collectLogInterval, collectLogTimeout, func() error {
   152  				f, err := fileOnHost(filepath.Join(outputPath, outputFileName))
   153  				if err != nil {
   154  					return err
   155  				}
   156  				defer f.Close()
   157  				return execOnHost(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, f, command, args...)
   158  			})
   159  		}
   160  	}
   161  
   162  	if isWindows {
   163  		// if we initiate to many ssh connections they get dropped (default is 10) so split it up
   164  		var errors []error
   165  		errors = append(errors, kinderrors.AggregateConcurrent(windowsInfo(execToPathFn)))
   166  		errors = append(errors, kinderrors.AggregateConcurrent(windowsK8sLogs(execToPathFn)))
   167  		errors = append(errors, kinderrors.AggregateConcurrent(windowsNetworkLogs(execToPathFn)))
   168  		errors = append(errors, kinderrors.AggregateConcurrent(windowsCrashDumpLogs(execToPathFn)))
   169  		errors = append(errors, sftpCopyFile(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, "/c:/crashdumps.tar", filepath.Join(outputPath, "crashdumps.tar")))
   170  
   171  		return kinderrors.NewAggregate(errors)
   172  	}
   173  
   174  	return kinderrors.AggregateConcurrent(linuxLogs(execToPathFn))
   175  }
   176  
   177  func getHostname(m *clusterv1.Machine, isWindows bool) string {
   178  	hostname := m.Spec.InfrastructureRef.Name
   179  	if isWindows {
   180  		// Windows host name ends up being different than the infra machine name
   181  		// due to Windows name limitations in Azure so use ip address instead.
   182  		if len(m.Status.Addresses) > 0 {
   183  			hostname = m.Status.Addresses[0].Address
   184  		} else {
   185  			Logf("Unable to collect logs as node doesn't have addresses")
   186  		}
   187  	}
   188  	return hostname
   189  }
   190  
   191  func getAzureCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureCluster, error) {
   192  	key := client.ObjectKey{
   193  		Namespace: namespace,
   194  		Name:      name,
   195  	}
   196  
   197  	azCluster := &infrav1.AzureCluster{}
   198  	err := managementClusterClient.Get(ctx, key, azCluster)
   199  	return azCluster, err
   200  }
   201  
   202  func getAzureManagedControlPlane(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureManagedControlPlane, error) {
   203  	key := client.ObjectKey{
   204  		Namespace: namespace,
   205  		Name:      name,
   206  	}
   207  
   208  	azManagedControlPlane := &infrav1.AzureManagedControlPlane{}
   209  	err := managementClusterClient.Get(ctx, key, azManagedControlPlane)
   210  	return azManagedControlPlane, err
   211  }
   212  
   213  func getAzureASOManagedCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1alpha.AzureASOManagedCluster, error) {
   214  	key := client.ObjectKey{
   215  		Namespace: namespace,
   216  		Name:      name,
   217  	}
   218  
   219  	azManagedCluster := &infrav1alpha.AzureASOManagedCluster{}
   220  	err := managementClusterClient.Get(ctx, key, azManagedCluster)
   221  	return azManagedCluster, err
   222  }
   223  
   224  func getAzureMachine(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine) (*infrav1.AzureMachine, error) {
   225  	key := client.ObjectKey{
   226  		Namespace: m.Spec.InfrastructureRef.Namespace,
   227  		Name:      m.Spec.InfrastructureRef.Name,
   228  	}
   229  
   230  	azMachine := &infrav1.AzureMachine{}
   231  	err := managementClusterClient.Get(ctx, key, azMachine)
   232  	return azMachine, err
   233  }
   234  
   235  func getAzureMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1exp.AzureMachinePool, error) {
   236  	key := client.ObjectKey{
   237  		Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
   238  		Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
   239  	}
   240  
   241  	azMachinePool := &infrav1exp.AzureMachinePool{}
   242  	err := managementClusterClient.Get(ctx, key, azMachinePool)
   243  	return azMachinePool, err
   244  }
   245  
   246  func getAzureManagedMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1.AzureManagedMachinePool, error) {
   247  	key := client.ObjectKey{
   248  		Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
   249  		Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
   250  	}
   251  
   252  	azManagedMachinePool := &infrav1.AzureManagedMachinePool{}
   253  	err := managementClusterClient.Get(ctx, key, azManagedMachinePool)
   254  	return azManagedMachinePool, err
   255  }
   256  
   257  func getAzureASOManagedMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1alpha.AzureASOManagedMachinePool, error) {
   258  	key := client.ObjectKey{
   259  		Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
   260  		Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
   261  	}
   262  
   263  	azManagedMachinePool := &infrav1alpha.AzureASOManagedMachinePool{}
   264  	err := managementClusterClient.Get(ctx, key, azManagedMachinePool)
   265  	return azManagedMachinePool, err
   266  }
   267  
   268  func linuxLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   269  	return []func() error{
   270  		execToPathFn(
   271  			"journal.log",
   272  			"sudo", "journalctl", "--no-pager", "--output=short-precise",
   273  		),
   274  		execToPathFn(
   275  			"kern.log",
   276  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-k",
   277  		),
   278  		execToPathFn(
   279  			"kubelet-version.txt",
   280  			"PATH=/opt/bin:${PATH}", "kubelet", "--version",
   281  		),
   282  		execToPathFn(
   283  			"kubelet.log",
   284  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "kubelet.service",
   285  		),
   286  		execToPathFn(
   287  			"containerd.log",
   288  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "containerd.service",
   289  		),
   290  		execToPathFn(
   291  			"ignition.log",
   292  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-at", "ignition",
   293  		),
   294  		execToPathFn(
   295  			"cloud-init.log",
   296  			"cat", "/var/log/cloud-init.log",
   297  		),
   298  		execToPathFn(
   299  			"cloud-init-output.log",
   300  			"cat", "/var/log/cloud-init-output.log",
   301  		),
   302  		execToPathFn(
   303  			"sentinel-file-dir.txt",
   304  			"ls", "/run/cluster-api/",
   305  		),
   306  		execToPathFn(
   307  			"cni.log",
   308  			"cat", "/var/log/calico/cni/cni.log",
   309  		),
   310  	}
   311  }
   312  
   313  func windowsK8sLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   314  	return []func() error{
   315  		execToPathFn(
   316  			"hyperv-operation.log",
   317  			"Get-WinEvent", "-LogName Microsoft-Windows-Hyper-V-Compute-Operational | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Sort-Object TimeCreated | Format-Table -Wrap -Autosize",
   318  		),
   319  		execToPathFn(
   320  			"containerd-containers.log",
   321  			"ctr.exe", "-n k8s.io containers list",
   322  		),
   323  		execToPathFn(
   324  			"containerd-tasks.log",
   325  			"ctr.exe", "-n k8s.io tasks list",
   326  		),
   327  		execToPathFn(
   328  			"containers-hcs.log",
   329  			"hcsdiag", "list",
   330  		),
   331  		execToPathFn(
   332  			"kubelet.log",
   333  			`Get-ChildItem "C:\\var\\log\\kubelet\\"  | ForEach-Object { if ($_ -match 'log.INFO|err.*.log') { write-output "$_";cat "c:\\var\\log\\kubelet\\$_" } }`,
   334  		),
   335  		execToPathFn(
   336  			"cni.log",
   337  			`Get-Content "C:\\cni.log"`,
   338  		),
   339  	}
   340  }
   341  
   342  func windowsInfo(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   343  	return []func() error{
   344  		execToPathFn(
   345  			"reboots.log",
   346  			"Get-WinEvent", `-ErrorAction Ignore -FilterHashtable @{logname = 'System'; id = 1074, 1076, 2004, 6005, 6006, 6008 } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`,
   347  		),
   348  		execToPathFn(
   349  			"scm.log",
   350  			"Get-WinEvent", `-FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`,
   351  		),
   352  		execToPathFn(
   353  			"pagefile.log",
   354  			"Get-CimInstance", "win32_pagefileusage | Format-List *",
   355  		),
   356  		execToPathFn(
   357  			"cloudbase-init-unattend.log",
   358  			"get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init-unattend.log'",
   359  		),
   360  		execToPathFn(
   361  			"cloudbase-init.log",
   362  			"get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init.log'",
   363  		),
   364  		execToPathFn(
   365  			"services.log",
   366  			"get-service",
   367  		),
   368  	}
   369  }
   370  
   371  func windowsNetworkLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   372  	return []func() error{
   373  		execToPathFn(
   374  			"network.log",
   375  			"Get-HnsNetwork | Select Name, Type, Id, AddressPrefix | Format-Table -Wrap -Autosize",
   376  		),
   377  		execToPathFn(
   378  			"network-detailed.log",
   379  			"Get-hnsnetwork | Convertto-json -Depth 20",
   380  		),
   381  		execToPathFn(
   382  			"network-individual-detailed.log",
   383  			"Get-hnsnetwork | % { Get-HnsNetwork -Id $_.ID -Detailed } | Convertto-json -Depth 20",
   384  		),
   385  		execToPathFn(
   386  			"hnsendpoints.log",
   387  			"Get-HnsEndpoint | Select IpAddress, MacAddress, IsRemoteEndpoint, State",
   388  		),
   389  		execToPathFn(
   390  			"hnsendpolicy-detailed.log",
   391  			"Get-hnspolicylist | Convertto-json -Depth 20",
   392  		),
   393  		execToPathFn(
   394  			"ipconfig.log",
   395  			"ipconfig /allcompartments /all",
   396  		),
   397  		execToPathFn(
   398  			"ips.log",
   399  			"Get-NetIPAddress -IncludeAllCompartments",
   400  		),
   401  		execToPathFn(
   402  			"interfaces.log",
   403  			"Get-NetIPInterface -IncludeAllCompartments",
   404  		),
   405  		execToPathFn(
   406  			"hnsdiag.txt",
   407  			"hnsdiag list all -d",
   408  		),
   409  	}
   410  }
   411  
   412  func windowsCrashDumpLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   413  	return []func() error{
   414  		execToPathFn(
   415  			"dir-localdumps.log",
   416  			// note: the powershell 'ls' alias will not have any output if the target directory is empty.
   417  			// we're logging the contents of the c:\localdumps directory because the command that invokes tar.exe below is
   418  			// not providing output when run in powershell over ssh for some reason.
   419  			"ls 'c:\\localdumps' -Recurse",
   420  		),
   421  		execToPathFn(
   422  			// capture any crashdump files created by windows into a .tar to be collected via sftp
   423  			"tar-crashdumps.log",
   424  			"$p = 'c:\\localdumps' ; if (Test-Path $p) { tar.exe -cvzf c:\\crashdumps.tar $p *>&1 | %{ Write-Output \"$_\"} } else { Write-Host \"No crash dumps found at $p\" }",
   425  		),
   426  	}
   427  }
   428  
   429  // collectVMBootLog collects boot logs of the vm by using azure boot diagnostics.
   430  func collectVMBootLog(ctx context.Context, am *infrav1.AzureMachine, outputPath string) error {
   431  	if am == nil {
   432  		return errors.New("AzureMachine is nil")
   433  	}
   434  	Logf("Collecting boot logs for AzureMachine %s\n", am.GetName())
   435  
   436  	if am.Spec.ProviderID == nil {
   437  		return errors.New("AzureMachine provider ID is nil")
   438  	}
   439  
   440  	resource, err := azureutil.ParseResourceID(*am.Spec.ProviderID)
   441  	if err != nil {
   442  		return errors.Wrap(err, "failed to parse resource id")
   443  	}
   444  
   445  	subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID")
   446  	if subscriptionID == "" {
   447  		return errors.New("AZURE_SUBSCRIPTION_ID is not set")
   448  	}
   449  
   450  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   451  	if err != nil {
   452  		return errors.Wrap(err, "failed to get default azure credential")
   453  	}
   454  
   455  	vmClient, err := armcompute.NewVirtualMachinesClient(subscriptionID, cred, nil)
   456  	if err != nil {
   457  		return errors.Wrap(err, "failed to create virtual machines client")
   458  	}
   459  
   460  	bootDiagnostics, err := vmClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, nil)
   461  	if err != nil {
   462  		return errors.Wrap(err, "failed to get boot diagnostics data")
   463  	}
   464  
   465  	return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath)
   466  }
   467  
   468  // collectVMSSBootLog collects boot logs of the scale set by using azure boot diagnostics.
   469  func collectVMSSBootLog(ctx context.Context, providerID string, outputPath string) error {
   470  	resourceID := strings.TrimPrefix(providerID, azureutil.ProviderIDPrefix)
   471  	v := strings.Split(resourceID, "/")
   472  	instanceID := v[len(v)-1]
   473  	resourceID = strings.TrimSuffix(resourceID, "/virtualMachines/"+instanceID)
   474  	resource, err := azureutil.ParseResourceID(resourceID)
   475  	if err != nil {
   476  		return errors.Wrap(err, "failed to parse resource id")
   477  	}
   478  
   479  	Logf("Collecting boot logs for VMSS instance %s of scale set %s\n", instanceID, resource.Name)
   480  
   481  	subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID")
   482  	if subscriptionID == "" {
   483  		return errors.New("AZURE_SUBSCRIPTION_ID is not set")
   484  	}
   485  
   486  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   487  	if err != nil {
   488  		return errors.Wrap(err, "failed to get default azure credential")
   489  	}
   490  	vmssClient, err := armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, cred, nil)
   491  	if err != nil {
   492  		return errors.Wrap(err, "failed to create virtual machine scale set VMs client")
   493  	}
   494  
   495  	bootDiagnostics, err := vmssClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, instanceID, nil)
   496  	if err != nil {
   497  		return errors.Wrap(err, "failed to get boot diagnostics data")
   498  	}
   499  
   500  	return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath)
   501  }
   502  
   503  func writeBootLog(bootDiagnostics armcompute.RetrieveBootDiagnosticsDataResult, outputPath string) error {
   504  	var err error
   505  	req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, *bootDiagnostics.SerialConsoleLogBlobURI, http.NoBody)
   506  	if err != nil {
   507  		return errors.Wrap(err, "failed to create HTTP request")
   508  	}
   509  	resp, err := http.DefaultClient.Do(req)
   510  	if err != nil || resp.StatusCode != 200 {
   511  		return errors.Wrap(err, "failed to get logs from serial console uri")
   512  	}
   513  	defer resp.Body.Close()
   514  
   515  	content, err := io.ReadAll(resp.Body)
   516  	if err != nil {
   517  		return errors.Wrap(err, "failed to read response body")
   518  	}
   519  
   520  	if err := os.WriteFile(filepath.Join(outputPath, "boot.log"), content, 0o600); err != nil {
   521  		return errors.Wrap(err, "failed to write response to file")
   522  	}
   523  
   524  	return nil
   525  }