sigs.k8s.io/cluster-api-provider-azure@v1.14.3/test/e2e/azure_logcollector.go (about) 1 //go:build e2e 2 // +build e2e 3 4 /* 5 Copyright 2020 The Kubernetes Authors. 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 */ 19 20 package e2e 21 22 import ( 23 "context" 24 "io" 25 "net/http" 26 "os" 27 "path/filepath" 28 "strings" 29 "time" 30 31 "github.com/Azure/azure-sdk-for-go/sdk/azidentity" 32 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" 33 "github.com/pkg/errors" 34 apierrors "k8s.io/apimachinery/pkg/api/errors" 35 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 36 "sigs.k8s.io/cluster-api-provider-azure/azure" 37 infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" 38 azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure" 39 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 40 expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" 41 "sigs.k8s.io/cluster-api/test/framework" 42 "sigs.k8s.io/cluster-api/util" 43 "sigs.k8s.io/controller-runtime/pkg/client" 44 kinderrors "sigs.k8s.io/kind/pkg/errors" 45 ) 46 47 // AzureLogCollector collects logs from a CAPZ workload cluster. 48 type AzureLogCollector struct{} 49 50 const ( 51 collectLogInterval = 3 * time.Second 52 collectLogTimeout = 1 * time.Minute 53 ) 54 55 var _ framework.ClusterLogCollector = &AzureLogCollector{} 56 57 // CollectMachineLog collects logs from a machine. 58 func (k AzureLogCollector) CollectMachineLog(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine, outputPath string) error { 59 var errs []error 60 61 am, err := getAzureMachine(ctx, managementClusterClient, m) 62 if err != nil { 63 return err 64 } 65 66 cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, m.ObjectMeta) 67 if err != nil { 68 return err 69 } 70 71 hostname := getHostname(m, isAzureMachineWindows(am)) 72 73 if err := collectLogsFromNode(cluster, hostname, isAzureMachineWindows(am), outputPath); err != nil { 74 errs = append(errs, err) 75 } 76 77 if err := collectVMBootLog(ctx, am, outputPath); err != nil { 78 errs = append(errs, errors.Wrap(err, "Unable to collect VM Boot Diagnostic logs")) 79 } 80 81 return kinderrors.NewAggregate(errs) 82 } 83 84 // CollectMachinePoolLog collects logs from a machine pool. 85 func (k AzureLogCollector) CollectMachinePoolLog(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool, outputPath string) error { 86 var errs []error 87 var isWindows bool 88 89 am, err := getAzureMachinePool(ctx, managementClusterClient, mp) 90 if err != nil { 91 if !apierrors.IsNotFound(err) { 92 return err 93 } 94 // Machine pool can be an AzureManagedMachinePool for AKS clusters. 95 _, err = getAzureManagedMachinePool(ctx, managementClusterClient, mp) 96 if err != nil { 97 return err 98 } 99 } else { 100 isWindows = isAzureMachinePoolWindows(am) 101 } 102 103 cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, mp.ObjectMeta) 104 if err != nil { 105 return err 106 } 107 108 for i, instance := range mp.Spec.ProviderIDList { 109 if mp.Status.NodeRefs != nil && len(mp.Status.NodeRefs) >= (i+1) { 110 hostname := mp.Status.NodeRefs[i].Name 111 112 if err := collectLogsFromNode(cluster, hostname, isWindows, filepath.Join(outputPath, hostname)); err != nil { 113 errs = append(errs, err) 114 } 115 116 if err := collectVMSSBootLog(ctx, instance, filepath.Join(outputPath, hostname)); err != nil { 117 errs = append(errs, errors.Wrap(err, "Unable to collect VMSS Boot Diagnostic logs")) 118 } 119 } else { 120 Logf("MachinePool instance %s does not have a corresponding NodeRef", instance) 121 Logf("Skipping log collection for MachinePool instance %s", instance) 122 } 123 } 124 125 return kinderrors.NewAggregate(errs) 126 } 127 128 // CollectInfrastructureLogs collects log from the infrastructure. 129 // This is currently a no-op implementation to satisfy the LogCollector interface. 130 func (k AzureLogCollector) CollectInfrastructureLogs(ctx context.Context, managementClusterClient client.Client, c *clusterv1.Cluster, outputPath string) error { 131 return nil 132 } 133 134 // collectLogsFromNode collects logs from various sources by ssh'ing into the node 135 func collectLogsFromNode(cluster *clusterv1.Cluster, hostname string, isWindows bool, outputPath string) error { 136 nodeOSType := azure.LinuxOS 137 if isWindows { 138 nodeOSType = azure.WindowsOS 139 } 140 Logf("Collecting logs for %s node %s in cluster %s in namespace %s\n", nodeOSType, hostname, cluster.Name, cluster.Namespace) 141 142 controlPlaneEndpoint := cluster.Spec.ControlPlaneEndpoint.Host 143 144 execToPathFn := func(outputFileName, command string, args ...string) func() error { 145 return func() error { 146 return retryWithTimeout(collectLogInterval, collectLogTimeout, func() error { 147 f, err := fileOnHost(filepath.Join(outputPath, outputFileName)) 148 if err != nil { 149 return err 150 } 151 defer f.Close() 152 return execOnHost(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, f, command, args...) 153 }) 154 } 155 } 156 157 if isWindows { 158 // if we initiate to many ssh connections they get dropped (default is 10) so split it up 159 var errors []error 160 errors = append(errors, kinderrors.AggregateConcurrent(windowsInfo(execToPathFn))) 161 errors = append(errors, kinderrors.AggregateConcurrent(windowsK8sLogs(execToPathFn))) 162 errors = append(errors, kinderrors.AggregateConcurrent(windowsNetworkLogs(execToPathFn))) 163 errors = append(errors, kinderrors.AggregateConcurrent(windowsCrashDumpLogs(execToPathFn))) 164 errors = append(errors, sftpCopyFile(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, "/c:/crashdumps.tar", filepath.Join(outputPath, "crashdumps.tar"))) 165 166 return kinderrors.NewAggregate(errors) 167 } 168 169 return kinderrors.AggregateConcurrent(linuxLogs(execToPathFn)) 170 } 171 172 func getHostname(m *clusterv1.Machine, isWindows bool) string { 173 hostname := m.Spec.InfrastructureRef.Name 174 if isWindows { 175 // Windows host name ends up being different than the infra machine name 176 // due to Windows name limitations in Azure so use ip address instead. 177 if len(m.Status.Addresses) > 0 { 178 hostname = m.Status.Addresses[0].Address 179 } else { 180 Logf("Unable to collect logs as node doesn't have addresses") 181 } 182 } 183 return hostname 184 } 185 186 func getAzureCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureCluster, error) { 187 key := client.ObjectKey{ 188 Namespace: namespace, 189 Name: name, 190 } 191 192 azCluster := &infrav1.AzureCluster{} 193 err := managementClusterClient.Get(ctx, key, azCluster) 194 return azCluster, err 195 } 196 197 func getAzureManagedControlPlane(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureManagedControlPlane, error) { 198 key := client.ObjectKey{ 199 Namespace: namespace, 200 Name: name, 201 } 202 203 azManagedControlPlane := &infrav1.AzureManagedControlPlane{} 204 err := managementClusterClient.Get(ctx, key, azManagedControlPlane) 205 return azManagedControlPlane, err 206 } 207 208 func getAzureMachine(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine) (*infrav1.AzureMachine, error) { 209 key := client.ObjectKey{ 210 Namespace: m.Spec.InfrastructureRef.Namespace, 211 Name: m.Spec.InfrastructureRef.Name, 212 } 213 214 azMachine := &infrav1.AzureMachine{} 215 err := managementClusterClient.Get(ctx, key, azMachine) 216 return azMachine, err 217 } 218 219 func getAzureMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1exp.AzureMachinePool, error) { 220 key := client.ObjectKey{ 221 Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace, 222 Name: mp.Spec.Template.Spec.InfrastructureRef.Name, 223 } 224 225 azMachinePool := &infrav1exp.AzureMachinePool{} 226 err := managementClusterClient.Get(ctx, key, azMachinePool) 227 return azMachinePool, err 228 } 229 230 func getAzureManagedMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1.AzureManagedMachinePool, error) { 231 key := client.ObjectKey{ 232 Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace, 233 Name: mp.Spec.Template.Spec.InfrastructureRef.Name, 234 } 235 236 azManagedMachinePool := &infrav1.AzureManagedMachinePool{} 237 err := managementClusterClient.Get(ctx, key, azManagedMachinePool) 238 return azManagedMachinePool, err 239 } 240 241 func linuxLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 242 return []func() error{ 243 execToPathFn( 244 "journal.log", 245 "sudo", "journalctl", "--no-pager", "--output=short-precise", 246 ), 247 execToPathFn( 248 "kern.log", 249 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-k", 250 ), 251 execToPathFn( 252 "kubelet-version.txt", 253 "PATH=/opt/bin:${PATH}", "kubelet", "--version", 254 ), 255 execToPathFn( 256 "kubelet.log", 257 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "kubelet.service", 258 ), 259 execToPathFn( 260 "containerd.log", 261 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "containerd.service", 262 ), 263 execToPathFn( 264 "ignition.log", 265 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-at", "ignition", 266 ), 267 execToPathFn( 268 "cloud-init.log", 269 "cat", "/var/log/cloud-init.log", 270 ), 271 execToPathFn( 272 "cloud-init-output.log", 273 "cat", "/var/log/cloud-init-output.log", 274 ), 275 execToPathFn( 276 "sentinel-file-dir.txt", 277 "ls", "/run/cluster-api/", 278 ), 279 execToPathFn( 280 "cni.log", 281 "cat", "/var/log/calico/cni/cni.log", 282 ), 283 } 284 } 285 286 func windowsK8sLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 287 return []func() error{ 288 execToPathFn( 289 "hyperv-operation.log", 290 "Get-WinEvent", "-LogName Microsoft-Windows-Hyper-V-Compute-Operational | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Sort-Object TimeCreated | Format-Table -Wrap -Autosize", 291 ), 292 execToPathFn( 293 "containerd-containers.log", 294 "ctr.exe", "-n k8s.io containers list", 295 ), 296 execToPathFn( 297 "containerd-tasks.log", 298 "ctr.exe", "-n k8s.io tasks list", 299 ), 300 execToPathFn( 301 "containers-hcs.log", 302 "hcsdiag", "list", 303 ), 304 execToPathFn( 305 "kubelet.log", 306 `Get-ChildItem "C:\\var\\log\\kubelet\\" | ForEach-Object { if ($_ -match 'log.INFO|err.*.log') { write-output "$_";cat "c:\\var\\log\\kubelet\\$_" } }`, 307 ), 308 execToPathFn( 309 "cni.log", 310 `Get-Content "C:\\cni.log"`, 311 ), 312 } 313 } 314 315 func windowsInfo(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 316 return []func() error{ 317 execToPathFn( 318 "reboots.log", 319 "Get-WinEvent", `-ErrorAction Ignore -FilterHashtable @{logname = 'System'; id = 1074, 1076, 2004, 6005, 6006, 6008 } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`, 320 ), 321 execToPathFn( 322 "scm.log", 323 "Get-WinEvent", `-FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`, 324 ), 325 execToPathFn( 326 "pagefile.log", 327 "Get-CimInstance", "win32_pagefileusage | Format-List *", 328 ), 329 execToPathFn( 330 "cloudbase-init-unattend.log", 331 "get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init-unattend.log'", 332 ), 333 execToPathFn( 334 "cloudbase-init.log", 335 "get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init.log'", 336 ), 337 execToPathFn( 338 "services.log", 339 "get-service", 340 ), 341 } 342 } 343 344 func windowsNetworkLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 345 return []func() error{ 346 execToPathFn( 347 "network.log", 348 "Get-HnsNetwork | Select Name, Type, Id, AddressPrefix | Format-Table -Wrap -Autosize", 349 ), 350 execToPathFn( 351 "network-detailed.log", 352 "Get-hnsnetwork | Convertto-json -Depth 20", 353 ), 354 execToPathFn( 355 "network-individual-detailed.log", 356 "Get-hnsnetwork | % { Get-HnsNetwork -Id $_.ID -Detailed } | Convertto-json -Depth 20", 357 ), 358 execToPathFn( 359 "hnsendpoints.log", 360 "Get-HnsEndpoint | Select IpAddress, MacAddress, IsRemoteEndpoint, State", 361 ), 362 execToPathFn( 363 "hnsendpolicy-detailed.log", 364 "Get-hnspolicylist | Convertto-json -Depth 20", 365 ), 366 execToPathFn( 367 "ipconfig.log", 368 "ipconfig /allcompartments /all", 369 ), 370 execToPathFn( 371 "ips.log", 372 "Get-NetIPAddress -IncludeAllCompartments", 373 ), 374 execToPathFn( 375 "interfaces.log", 376 "Get-NetIPInterface -IncludeAllCompartments", 377 ), 378 execToPathFn( 379 "hnsdiag.txt", 380 "hnsdiag list all -d", 381 ), 382 } 383 } 384 385 func windowsCrashDumpLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 386 return []func() error{ 387 execToPathFn( 388 "dir-localdumps.log", 389 // note: the powershell 'ls' alias will not have any output if the target directory is empty. 390 // we're logging the contents of the c:\localdumps directory because the command that invokes tar.exe below is 391 // not providing output when run in powershell over ssh for some reason. 392 "ls 'c:\\localdumps' -Recurse", 393 ), 394 execToPathFn( 395 // capture any crashdump files created by windows into a .tar to be collected via sftp 396 "tar-crashdumps.log", 397 "$p = 'c:\\localdumps' ; if (Test-Path $p) { tar.exe -cvzf c:\\crashdumps.tar $p *>&1 | %{ Write-Output \"$_\"} } else { Write-Host \"No crash dumps found at $p\" }", 398 ), 399 } 400 } 401 402 // collectVMBootLog collects boot logs of the vm by using azure boot diagnostics. 403 func collectVMBootLog(ctx context.Context, am *infrav1.AzureMachine, outputPath string) error { 404 if am == nil { 405 return errors.New("AzureMachine is nil") 406 } 407 Logf("Collecting boot logs for AzureMachine %s\n", am.GetName()) 408 409 if am.Spec.ProviderID == nil { 410 return errors.New("AzureMachine provider ID is nil") 411 } 412 413 resource, err := azureutil.ParseResourceID(*am.Spec.ProviderID) 414 if err != nil { 415 return errors.Wrap(err, "failed to parse resource id") 416 } 417 418 subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID") 419 if subscriptionID == "" { 420 return errors.New("AZURE_SUBSCRIPTION_ID is not set") 421 } 422 423 cred, err := azidentity.NewDefaultAzureCredential(nil) 424 if err != nil { 425 return errors.Wrap(err, "failed to get default azure credential") 426 } 427 428 vmClient, err := armcompute.NewVirtualMachinesClient(subscriptionID, cred, nil) 429 if err != nil { 430 return errors.Wrap(err, "failed to create virtual machines client") 431 } 432 433 bootDiagnostics, err := vmClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, nil) 434 if err != nil { 435 return errors.Wrap(err, "failed to get boot diagnostics data") 436 } 437 438 return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath) 439 } 440 441 // collectVMSSBootLog collects boot logs of the scale set by using azure boot diagnostics. 442 func collectVMSSBootLog(ctx context.Context, providerID string, outputPath string) error { 443 resourceID := strings.TrimPrefix(providerID, azureutil.ProviderIDPrefix) 444 v := strings.Split(resourceID, "/") 445 instanceID := v[len(v)-1] 446 resourceID = strings.TrimSuffix(resourceID, "/virtualMachines/"+instanceID) 447 resource, err := azureutil.ParseResourceID(resourceID) 448 if err != nil { 449 return errors.Wrap(err, "failed to parse resource id") 450 } 451 452 Logf("Collecting boot logs for VMSS instance %s of scale set %s\n", instanceID, resource.Name) 453 454 subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID") 455 if subscriptionID == "" { 456 return errors.New("AZURE_SUBSCRIPTION_ID is not set") 457 } 458 459 cred, err := azidentity.NewDefaultAzureCredential(nil) 460 if err != nil { 461 return errors.Wrap(err, "failed to get default azure credential") 462 } 463 vmssClient, err := armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, cred, nil) 464 if err != nil { 465 return errors.Wrap(err, "failed to create virtual machine scale set VMs client") 466 } 467 468 bootDiagnostics, err := vmssClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, instanceID, nil) 469 if err != nil { 470 return errors.Wrap(err, "failed to get boot diagnostics data") 471 } 472 473 return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath) 474 } 475 476 func writeBootLog(bootDiagnostics armcompute.RetrieveBootDiagnosticsDataResult, outputPath string) error { 477 var err error 478 req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, *bootDiagnostics.SerialConsoleLogBlobURI, http.NoBody) 479 if err != nil { 480 return errors.Wrap(err, "failed to create HTTP request") 481 } 482 resp, err := http.DefaultClient.Do(req) 483 if err != nil || resp.StatusCode != 200 { 484 return errors.Wrap(err, "failed to get logs from serial console uri") 485 } 486 defer resp.Body.Close() 487 488 content, err := io.ReadAll(resp.Body) 489 if err != nil { 490 return errors.Wrap(err, "failed to read response body") 491 } 492 493 if err := os.WriteFile(filepath.Join(outputPath, "boot.log"), content, 0o600); err != nil { 494 return errors.Wrap(err, "failed to write response to file") 495 } 496 497 return nil 498 }