sigs.k8s.io/cluster-api-provider-azure@v1.14.3/test/e2e/azure_logcollector.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2020 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package e2e
    21  
    22  import (
    23  	"context"
    24  	"io"
    25  	"net/http"
    26  	"os"
    27  	"path/filepath"
    28  	"strings"
    29  	"time"
    30  
    31  	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
    32  	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
    33  	"github.com/pkg/errors"
    34  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    35  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    36  	"sigs.k8s.io/cluster-api-provider-azure/azure"
    37  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    38  	azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
    39  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    40  	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
    41  	"sigs.k8s.io/cluster-api/test/framework"
    42  	"sigs.k8s.io/cluster-api/util"
    43  	"sigs.k8s.io/controller-runtime/pkg/client"
    44  	kinderrors "sigs.k8s.io/kind/pkg/errors"
    45  )
    46  
// AzureLogCollector collects logs from a CAPZ workload cluster.
// The type has no fields; its methods work on the zero value.
type AzureLogCollector struct{}

const (
	// collectLogInterval is the first argument handed to retryWithTimeout for
	// every log collection command (presumably the pause between attempts —
	// confirm against retryWithTimeout's definition).
	collectLogInterval = 3 * time.Second
	// collectLogTimeout is handed to retryWithTimeout alongside
	// collectLogInterval, and is also passed to execOnHost and sftpCopyFile
	// as their timeout argument.
	collectLogTimeout  = 1 * time.Minute
)

// Compile-time check that *AzureLogCollector satisfies framework.ClusterLogCollector.
var _ framework.ClusterLogCollector = &AzureLogCollector{}
    56  
    57  // CollectMachineLog collects logs from a machine.
    58  func (k AzureLogCollector) CollectMachineLog(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine, outputPath string) error {
    59  	var errs []error
    60  
    61  	am, err := getAzureMachine(ctx, managementClusterClient, m)
    62  	if err != nil {
    63  		return err
    64  	}
    65  
    66  	cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, m.ObjectMeta)
    67  	if err != nil {
    68  		return err
    69  	}
    70  
    71  	hostname := getHostname(m, isAzureMachineWindows(am))
    72  
    73  	if err := collectLogsFromNode(cluster, hostname, isAzureMachineWindows(am), outputPath); err != nil {
    74  		errs = append(errs, err)
    75  	}
    76  
    77  	if err := collectVMBootLog(ctx, am, outputPath); err != nil {
    78  		errs = append(errs, errors.Wrap(err, "Unable to collect VM Boot Diagnostic logs"))
    79  	}
    80  
    81  	return kinderrors.NewAggregate(errs)
    82  }
    83  
    84  // CollectMachinePoolLog collects logs from a machine pool.
    85  func (k AzureLogCollector) CollectMachinePoolLog(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool, outputPath string) error {
    86  	var errs []error
    87  	var isWindows bool
    88  
    89  	am, err := getAzureMachinePool(ctx, managementClusterClient, mp)
    90  	if err != nil {
    91  		if !apierrors.IsNotFound(err) {
    92  			return err
    93  		}
    94  		// Machine pool can be an AzureManagedMachinePool for AKS clusters.
    95  		_, err = getAzureManagedMachinePool(ctx, managementClusterClient, mp)
    96  		if err != nil {
    97  			return err
    98  		}
    99  	} else {
   100  		isWindows = isAzureMachinePoolWindows(am)
   101  	}
   102  
   103  	cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, mp.ObjectMeta)
   104  	if err != nil {
   105  		return err
   106  	}
   107  
   108  	for i, instance := range mp.Spec.ProviderIDList {
   109  		if mp.Status.NodeRefs != nil && len(mp.Status.NodeRefs) >= (i+1) {
   110  			hostname := mp.Status.NodeRefs[i].Name
   111  
   112  			if err := collectLogsFromNode(cluster, hostname, isWindows, filepath.Join(outputPath, hostname)); err != nil {
   113  				errs = append(errs, err)
   114  			}
   115  
   116  			if err := collectVMSSBootLog(ctx, instance, filepath.Join(outputPath, hostname)); err != nil {
   117  				errs = append(errs, errors.Wrap(err, "Unable to collect VMSS Boot Diagnostic logs"))
   118  			}
   119  		} else {
   120  			Logf("MachinePool instance %s does not have a corresponding NodeRef", instance)
   121  			Logf("Skipping log collection for MachinePool instance %s", instance)
   122  		}
   123  	}
   124  
   125  	return kinderrors.NewAggregate(errs)
   126  }
   127  
// CollectInfrastructureLogs collects log from the infrastructure.
// This is currently a no-op implementation to satisfy the LogCollector interface:
// every argument is ignored and the returned error is always nil.
func (k AzureLogCollector) CollectInfrastructureLogs(ctx context.Context, managementClusterClient client.Client, c *clusterv1.Cluster, outputPath string) error {
	return nil
}
   133  
   134  // collectLogsFromNode collects logs from various sources by ssh'ing into the node
   135  func collectLogsFromNode(cluster *clusterv1.Cluster, hostname string, isWindows bool, outputPath string) error {
   136  	nodeOSType := azure.LinuxOS
   137  	if isWindows {
   138  		nodeOSType = azure.WindowsOS
   139  	}
   140  	Logf("Collecting logs for %s node %s in cluster %s in namespace %s\n", nodeOSType, hostname, cluster.Name, cluster.Namespace)
   141  
   142  	controlPlaneEndpoint := cluster.Spec.ControlPlaneEndpoint.Host
   143  
   144  	execToPathFn := func(outputFileName, command string, args ...string) func() error {
   145  		return func() error {
   146  			return retryWithTimeout(collectLogInterval, collectLogTimeout, func() error {
   147  				f, err := fileOnHost(filepath.Join(outputPath, outputFileName))
   148  				if err != nil {
   149  					return err
   150  				}
   151  				defer f.Close()
   152  				return execOnHost(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, f, command, args...)
   153  			})
   154  		}
   155  	}
   156  
   157  	if isWindows {
   158  		// if we initiate to many ssh connections they get dropped (default is 10) so split it up
   159  		var errors []error
   160  		errors = append(errors, kinderrors.AggregateConcurrent(windowsInfo(execToPathFn)))
   161  		errors = append(errors, kinderrors.AggregateConcurrent(windowsK8sLogs(execToPathFn)))
   162  		errors = append(errors, kinderrors.AggregateConcurrent(windowsNetworkLogs(execToPathFn)))
   163  		errors = append(errors, kinderrors.AggregateConcurrent(windowsCrashDumpLogs(execToPathFn)))
   164  		errors = append(errors, sftpCopyFile(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, "/c:/crashdumps.tar", filepath.Join(outputPath, "crashdumps.tar")))
   165  
   166  		return kinderrors.NewAggregate(errors)
   167  	}
   168  
   169  	return kinderrors.AggregateConcurrent(linuxLogs(execToPathFn))
   170  }
   171  
   172  func getHostname(m *clusterv1.Machine, isWindows bool) string {
   173  	hostname := m.Spec.InfrastructureRef.Name
   174  	if isWindows {
   175  		// Windows host name ends up being different than the infra machine name
   176  		// due to Windows name limitations in Azure so use ip address instead.
   177  		if len(m.Status.Addresses) > 0 {
   178  			hostname = m.Status.Addresses[0].Address
   179  		} else {
   180  			Logf("Unable to collect logs as node doesn't have addresses")
   181  		}
   182  	}
   183  	return hostname
   184  }
   185  
   186  func getAzureCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureCluster, error) {
   187  	key := client.ObjectKey{
   188  		Namespace: namespace,
   189  		Name:      name,
   190  	}
   191  
   192  	azCluster := &infrav1.AzureCluster{}
   193  	err := managementClusterClient.Get(ctx, key, azCluster)
   194  	return azCluster, err
   195  }
   196  
   197  func getAzureManagedControlPlane(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureManagedControlPlane, error) {
   198  	key := client.ObjectKey{
   199  		Namespace: namespace,
   200  		Name:      name,
   201  	}
   202  
   203  	azManagedControlPlane := &infrav1.AzureManagedControlPlane{}
   204  	err := managementClusterClient.Get(ctx, key, azManagedControlPlane)
   205  	return azManagedControlPlane, err
   206  }
   207  
   208  func getAzureMachine(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine) (*infrav1.AzureMachine, error) {
   209  	key := client.ObjectKey{
   210  		Namespace: m.Spec.InfrastructureRef.Namespace,
   211  		Name:      m.Spec.InfrastructureRef.Name,
   212  	}
   213  
   214  	azMachine := &infrav1.AzureMachine{}
   215  	err := managementClusterClient.Get(ctx, key, azMachine)
   216  	return azMachine, err
   217  }
   218  
   219  func getAzureMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1exp.AzureMachinePool, error) {
   220  	key := client.ObjectKey{
   221  		Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
   222  		Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
   223  	}
   224  
   225  	azMachinePool := &infrav1exp.AzureMachinePool{}
   226  	err := managementClusterClient.Get(ctx, key, azMachinePool)
   227  	return azMachinePool, err
   228  }
   229  
   230  func getAzureManagedMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1.AzureManagedMachinePool, error) {
   231  	key := client.ObjectKey{
   232  		Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
   233  		Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
   234  	}
   235  
   236  	azManagedMachinePool := &infrav1.AzureManagedMachinePool{}
   237  	err := managementClusterClient.Get(ctx, key, azManagedMachinePool)
   238  	return azManagedMachinePool, err
   239  }
   240  
   241  func linuxLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   242  	return []func() error{
   243  		execToPathFn(
   244  			"journal.log",
   245  			"sudo", "journalctl", "--no-pager", "--output=short-precise",
   246  		),
   247  		execToPathFn(
   248  			"kern.log",
   249  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-k",
   250  		),
   251  		execToPathFn(
   252  			"kubelet-version.txt",
   253  			"PATH=/opt/bin:${PATH}", "kubelet", "--version",
   254  		),
   255  		execToPathFn(
   256  			"kubelet.log",
   257  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "kubelet.service",
   258  		),
   259  		execToPathFn(
   260  			"containerd.log",
   261  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "containerd.service",
   262  		),
   263  		execToPathFn(
   264  			"ignition.log",
   265  			"sudo", "journalctl", "--no-pager", "--output=short-precise", "-at", "ignition",
   266  		),
   267  		execToPathFn(
   268  			"cloud-init.log",
   269  			"cat", "/var/log/cloud-init.log",
   270  		),
   271  		execToPathFn(
   272  			"cloud-init-output.log",
   273  			"cat", "/var/log/cloud-init-output.log",
   274  		),
   275  		execToPathFn(
   276  			"sentinel-file-dir.txt",
   277  			"ls", "/run/cluster-api/",
   278  		),
   279  		execToPathFn(
   280  			"cni.log",
   281  			"cat", "/var/log/calico/cni/cni.log",
   282  		),
   283  	}
   284  }
   285  
   286  func windowsK8sLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   287  	return []func() error{
   288  		execToPathFn(
   289  			"hyperv-operation.log",
   290  			"Get-WinEvent", "-LogName Microsoft-Windows-Hyper-V-Compute-Operational | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Sort-Object TimeCreated | Format-Table -Wrap -Autosize",
   291  		),
   292  		execToPathFn(
   293  			"containerd-containers.log",
   294  			"ctr.exe", "-n k8s.io containers list",
   295  		),
   296  		execToPathFn(
   297  			"containerd-tasks.log",
   298  			"ctr.exe", "-n k8s.io tasks list",
   299  		),
   300  		execToPathFn(
   301  			"containers-hcs.log",
   302  			"hcsdiag", "list",
   303  		),
   304  		execToPathFn(
   305  			"kubelet.log",
   306  			`Get-ChildItem "C:\\var\\log\\kubelet\\"  | ForEach-Object { if ($_ -match 'log.INFO|err.*.log') { write-output "$_";cat "c:\\var\\log\\kubelet\\$_" } }`,
   307  		),
   308  		execToPathFn(
   309  			"cni.log",
   310  			`Get-Content "C:\\cni.log"`,
   311  		),
   312  	}
   313  }
   314  
   315  func windowsInfo(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   316  	return []func() error{
   317  		execToPathFn(
   318  			"reboots.log",
   319  			"Get-WinEvent", `-ErrorAction Ignore -FilterHashtable @{logname = 'System'; id = 1074, 1076, 2004, 6005, 6006, 6008 } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`,
   320  		),
   321  		execToPathFn(
   322  			"scm.log",
   323  			"Get-WinEvent", `-FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`,
   324  		),
   325  		execToPathFn(
   326  			"pagefile.log",
   327  			"Get-CimInstance", "win32_pagefileusage | Format-List *",
   328  		),
   329  		execToPathFn(
   330  			"cloudbase-init-unattend.log",
   331  			"get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init-unattend.log'",
   332  		),
   333  		execToPathFn(
   334  			"cloudbase-init.log",
   335  			"get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init.log'",
   336  		),
   337  		execToPathFn(
   338  			"services.log",
   339  			"get-service",
   340  		),
   341  	}
   342  }
   343  
   344  func windowsNetworkLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   345  	return []func() error{
   346  		execToPathFn(
   347  			"network.log",
   348  			"Get-HnsNetwork | Select Name, Type, Id, AddressPrefix | Format-Table -Wrap -Autosize",
   349  		),
   350  		execToPathFn(
   351  			"network-detailed.log",
   352  			"Get-hnsnetwork | Convertto-json -Depth 20",
   353  		),
   354  		execToPathFn(
   355  			"network-individual-detailed.log",
   356  			"Get-hnsnetwork | % { Get-HnsNetwork -Id $_.ID -Detailed } | Convertto-json -Depth 20",
   357  		),
   358  		execToPathFn(
   359  			"hnsendpoints.log",
   360  			"Get-HnsEndpoint | Select IpAddress, MacAddress, IsRemoteEndpoint, State",
   361  		),
   362  		execToPathFn(
   363  			"hnsendpolicy-detailed.log",
   364  			"Get-hnspolicylist | Convertto-json -Depth 20",
   365  		),
   366  		execToPathFn(
   367  			"ipconfig.log",
   368  			"ipconfig /allcompartments /all",
   369  		),
   370  		execToPathFn(
   371  			"ips.log",
   372  			"Get-NetIPAddress -IncludeAllCompartments",
   373  		),
   374  		execToPathFn(
   375  			"interfaces.log",
   376  			"Get-NetIPInterface -IncludeAllCompartments",
   377  		),
   378  		execToPathFn(
   379  			"hnsdiag.txt",
   380  			"hnsdiag list all -d",
   381  		),
   382  	}
   383  }
   384  
   385  func windowsCrashDumpLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error {
   386  	return []func() error{
   387  		execToPathFn(
   388  			"dir-localdumps.log",
   389  			// note: the powershell 'ls' alias will not have any output if the target directory is empty.
   390  			// we're logging the contents of the c:\localdumps directory because the command that invokes tar.exe below is
   391  			// not providing output when run in powershell over ssh for some reason.
   392  			"ls 'c:\\localdumps' -Recurse",
   393  		),
   394  		execToPathFn(
   395  			// capture any crashdump files created by windows into a .tar to be collected via sftp
   396  			"tar-crashdumps.log",
   397  			"$p = 'c:\\localdumps' ; if (Test-Path $p) { tar.exe -cvzf c:\\crashdumps.tar $p *>&1 | %{ Write-Output \"$_\"} } else { Write-Host \"No crash dumps found at $p\" }",
   398  		),
   399  	}
   400  }
   401  
   402  // collectVMBootLog collects boot logs of the vm by using azure boot diagnostics.
   403  func collectVMBootLog(ctx context.Context, am *infrav1.AzureMachine, outputPath string) error {
   404  	if am == nil {
   405  		return errors.New("AzureMachine is nil")
   406  	}
   407  	Logf("Collecting boot logs for AzureMachine %s\n", am.GetName())
   408  
   409  	if am.Spec.ProviderID == nil {
   410  		return errors.New("AzureMachine provider ID is nil")
   411  	}
   412  
   413  	resource, err := azureutil.ParseResourceID(*am.Spec.ProviderID)
   414  	if err != nil {
   415  		return errors.Wrap(err, "failed to parse resource id")
   416  	}
   417  
   418  	subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID")
   419  	if subscriptionID == "" {
   420  		return errors.New("AZURE_SUBSCRIPTION_ID is not set")
   421  	}
   422  
   423  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   424  	if err != nil {
   425  		return errors.Wrap(err, "failed to get default azure credential")
   426  	}
   427  
   428  	vmClient, err := armcompute.NewVirtualMachinesClient(subscriptionID, cred, nil)
   429  	if err != nil {
   430  		return errors.Wrap(err, "failed to create virtual machines client")
   431  	}
   432  
   433  	bootDiagnostics, err := vmClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, nil)
   434  	if err != nil {
   435  		return errors.Wrap(err, "failed to get boot diagnostics data")
   436  	}
   437  
   438  	return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath)
   439  }
   440  
   441  // collectVMSSBootLog collects boot logs of the scale set by using azure boot diagnostics.
   442  func collectVMSSBootLog(ctx context.Context, providerID string, outputPath string) error {
   443  	resourceID := strings.TrimPrefix(providerID, azureutil.ProviderIDPrefix)
   444  	v := strings.Split(resourceID, "/")
   445  	instanceID := v[len(v)-1]
   446  	resourceID = strings.TrimSuffix(resourceID, "/virtualMachines/"+instanceID)
   447  	resource, err := azureutil.ParseResourceID(resourceID)
   448  	if err != nil {
   449  		return errors.Wrap(err, "failed to parse resource id")
   450  	}
   451  
   452  	Logf("Collecting boot logs for VMSS instance %s of scale set %s\n", instanceID, resource.Name)
   453  
   454  	subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID")
   455  	if subscriptionID == "" {
   456  		return errors.New("AZURE_SUBSCRIPTION_ID is not set")
   457  	}
   458  
   459  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   460  	if err != nil {
   461  		return errors.Wrap(err, "failed to get default azure credential")
   462  	}
   463  	vmssClient, err := armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, cred, nil)
   464  	if err != nil {
   465  		return errors.Wrap(err, "failed to create virtual machine scale set VMs client")
   466  	}
   467  
   468  	bootDiagnostics, err := vmssClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, instanceID, nil)
   469  	if err != nil {
   470  		return errors.Wrap(err, "failed to get boot diagnostics data")
   471  	}
   472  
   473  	return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath)
   474  }
   475  
   476  func writeBootLog(bootDiagnostics armcompute.RetrieveBootDiagnosticsDataResult, outputPath string) error {
   477  	var err error
   478  	req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, *bootDiagnostics.SerialConsoleLogBlobURI, http.NoBody)
   479  	if err != nil {
   480  		return errors.Wrap(err, "failed to create HTTP request")
   481  	}
   482  	resp, err := http.DefaultClient.Do(req)
   483  	if err != nil || resp.StatusCode != 200 {
   484  		return errors.Wrap(err, "failed to get logs from serial console uri")
   485  	}
   486  	defer resp.Body.Close()
   487  
   488  	content, err := io.ReadAll(resp.Body)
   489  	if err != nil {
   490  		return errors.Wrap(err, "failed to read response body")
   491  	}
   492  
   493  	if err := os.WriteFile(filepath.Join(outputPath, "boot.log"), content, 0o600); err != nil {
   494  		return errors.Wrap(err, "failed to write response to file")
   495  	}
   496  
   497  	return nil
   498  }