sigs.k8s.io/cluster-api-provider-azure@v1.17.0/test/e2e/azure_logcollector.go (about) 1 //go:build e2e 2 // +build e2e 3 4 /* 5 Copyright 2020 The Kubernetes Authors. 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 */ 19 20 package e2e 21 22 import ( 23 "context" 24 "io" 25 "net/http" 26 "os" 27 "path/filepath" 28 "strings" 29 "time" 30 31 "github.com/Azure/azure-sdk-for-go/sdk/azidentity" 32 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" 33 "github.com/pkg/errors" 34 apierrors "k8s.io/apimachinery/pkg/api/errors" 35 infrav1alpha "sigs.k8s.io/cluster-api-provider-azure/api/v1alpha1" 36 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 37 "sigs.k8s.io/cluster-api-provider-azure/azure" 38 infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" 39 azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure" 40 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 41 expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" 42 "sigs.k8s.io/cluster-api/test/framework" 43 "sigs.k8s.io/cluster-api/util" 44 "sigs.k8s.io/controller-runtime/pkg/client" 45 kinderrors "sigs.k8s.io/kind/pkg/errors" 46 ) 47 48 // AzureLogCollector collects logs from a CAPZ workload cluster. 49 type AzureLogCollector struct{} 50 51 const ( 52 collectLogInterval = 3 * time.Second 53 collectLogTimeout = 1 * time.Minute 54 ) 55 56 var _ framework.ClusterLogCollector = &AzureLogCollector{} 57 58 // CollectMachineLog collects logs from a machine. 59 func (k AzureLogCollector) CollectMachineLog(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine, outputPath string) error { 60 var errs []error 61 62 am, err := getAzureMachine(ctx, managementClusterClient, m) 63 if err != nil { 64 return err 65 } 66 67 cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, m.ObjectMeta) 68 if err != nil { 69 return err 70 } 71 72 hostname := getHostname(m, isAzureMachineWindows(am)) 73 74 if err := collectLogsFromNode(cluster, hostname, isAzureMachineWindows(am), outputPath); err != nil { 75 errs = append(errs, err) 76 } 77 78 if err := collectVMBootLog(ctx, am, outputPath); err != nil { 79 errs = append(errs, errors.Wrap(err, "Unable to collect VM Boot Diagnostic logs")) 80 } 81 82 return kinderrors.NewAggregate(errs) 83 } 84 85 // CollectMachinePoolLog collects logs from a machine pool. 86 func (k AzureLogCollector) CollectMachinePoolLog(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool, outputPath string) error { 87 var errs []error 88 var isWindows bool 89 90 am, err := getAzureMachinePool(ctx, managementClusterClient, mp) 91 if err != nil { 92 if !apierrors.IsNotFound(err) { 93 return err 94 } 95 // Machine pool can be an AzureManagedMachinePool for AKS clusters. 96 _, err = getAzureManagedMachinePool(ctx, managementClusterClient, mp) 97 if err != nil { 98 if !apierrors.IsNotFound(err) { 99 return err 100 } 101 _, err = getAzureASOManagedMachinePool(ctx, managementClusterClient, mp) 102 return err 103 } 104 } else { 105 isWindows = isAzureMachinePoolWindows(am) 106 } 107 108 cluster, err := util.GetClusterFromMetadata(ctx, managementClusterClient, mp.ObjectMeta) 109 if err != nil { 110 return err 111 } 112 113 for i, instance := range mp.Spec.ProviderIDList { 114 if mp.Status.NodeRefs != nil && len(mp.Status.NodeRefs) >= (i+1) { 115 hostname := mp.Status.NodeRefs[i].Name 116 117 if err := collectLogsFromNode(cluster, hostname, isWindows, filepath.Join(outputPath, hostname)); err != nil { 118 errs = append(errs, err) 119 } 120 121 if err := collectVMSSBootLog(ctx, instance, filepath.Join(outputPath, hostname)); err != nil { 122 errs = append(errs, errors.Wrap(err, "Unable to collect VMSS Boot Diagnostic logs")) 123 } 124 } else { 125 Logf("MachinePool instance %s does not have a corresponding NodeRef", instance) 126 Logf("Skipping log collection for MachinePool instance %s", instance) 127 } 128 } 129 130 return kinderrors.NewAggregate(errs) 131 } 132 133 // CollectInfrastructureLogs collects log from the infrastructure. 134 // This is currently a no-op implementation to satisfy the LogCollector interface. 135 func (k AzureLogCollector) CollectInfrastructureLogs(ctx context.Context, managementClusterClient client.Client, c *clusterv1.Cluster, outputPath string) error { 136 return nil 137 } 138 139 // collectLogsFromNode collects logs from various sources by ssh'ing into the node 140 func collectLogsFromNode(cluster *clusterv1.Cluster, hostname string, isWindows bool, outputPath string) error { 141 nodeOSType := azure.LinuxOS 142 if isWindows { 143 nodeOSType = azure.WindowsOS 144 } 145 Logf("Collecting logs for %s node %s in cluster %s in namespace %s\n", nodeOSType, hostname, cluster.Name, cluster.Namespace) 146 147 controlPlaneEndpoint := cluster.Spec.ControlPlaneEndpoint.Host 148 149 execToPathFn := func(outputFileName, command string, args ...string) func() error { 150 return func() error { 151 return retryWithTimeout(collectLogInterval, collectLogTimeout, func() error { 152 f, err := fileOnHost(filepath.Join(outputPath, outputFileName)) 153 if err != nil { 154 return err 155 } 156 defer f.Close() 157 return execOnHost(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, f, command, args...) 158 }) 159 } 160 } 161 162 if isWindows { 163 // if we initiate to many ssh connections they get dropped (default is 10) so split it up 164 var errors []error 165 errors = append(errors, kinderrors.AggregateConcurrent(windowsInfo(execToPathFn))) 166 errors = append(errors, kinderrors.AggregateConcurrent(windowsK8sLogs(execToPathFn))) 167 errors = append(errors, kinderrors.AggregateConcurrent(windowsNetworkLogs(execToPathFn))) 168 errors = append(errors, kinderrors.AggregateConcurrent(windowsCrashDumpLogs(execToPathFn))) 169 errors = append(errors, sftpCopyFile(controlPlaneEndpoint, hostname, sshPort, collectLogTimeout, "/c:/crashdumps.tar", filepath.Join(outputPath, "crashdumps.tar"))) 170 171 return kinderrors.NewAggregate(errors) 172 } 173 174 return kinderrors.AggregateConcurrent(linuxLogs(execToPathFn)) 175 } 176 177 func getHostname(m *clusterv1.Machine, isWindows bool) string { 178 hostname := m.Spec.InfrastructureRef.Name 179 if isWindows { 180 // Windows host name ends up being different than the infra machine name 181 // due to Windows name limitations in Azure so use ip address instead. 182 if len(m.Status.Addresses) > 0 { 183 hostname = m.Status.Addresses[0].Address 184 } else { 185 Logf("Unable to collect logs as node doesn't have addresses") 186 } 187 } 188 return hostname 189 } 190 191 func getAzureCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureCluster, error) { 192 key := client.ObjectKey{ 193 Namespace: namespace, 194 Name: name, 195 } 196 197 azCluster := &infrav1.AzureCluster{} 198 err := managementClusterClient.Get(ctx, key, azCluster) 199 return azCluster, err 200 } 201 202 func getAzureManagedControlPlane(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1.AzureManagedControlPlane, error) { 203 key := client.ObjectKey{ 204 Namespace: namespace, 205 Name: name, 206 } 207 208 azManagedControlPlane := &infrav1.AzureManagedControlPlane{} 209 err := managementClusterClient.Get(ctx, key, azManagedControlPlane) 210 return azManagedControlPlane, err 211 } 212 213 func getAzureASOManagedCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*infrav1alpha.AzureASOManagedCluster, error) { 214 key := client.ObjectKey{ 215 Namespace: namespace, 216 Name: name, 217 } 218 219 azManagedCluster := &infrav1alpha.AzureASOManagedCluster{} 220 err := managementClusterClient.Get(ctx, key, azManagedCluster) 221 return azManagedCluster, err 222 } 223 224 func getAzureMachine(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine) (*infrav1.AzureMachine, error) { 225 key := client.ObjectKey{ 226 Namespace: m.Spec.InfrastructureRef.Namespace, 227 Name: m.Spec.InfrastructureRef.Name, 228 } 229 230 azMachine := &infrav1.AzureMachine{} 231 err := managementClusterClient.Get(ctx, key, azMachine) 232 return azMachine, err 233 } 234 235 func getAzureMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1exp.AzureMachinePool, error) { 236 key := client.ObjectKey{ 237 Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace, 238 Name: mp.Spec.Template.Spec.InfrastructureRef.Name, 239 } 240 241 azMachinePool := &infrav1exp.AzureMachinePool{} 242 err := managementClusterClient.Get(ctx, key, azMachinePool) 243 return azMachinePool, err 244 } 245 246 func getAzureManagedMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1.AzureManagedMachinePool, error) { 247 key := client.ObjectKey{ 248 Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace, 249 Name: mp.Spec.Template.Spec.InfrastructureRef.Name, 250 } 251 252 azManagedMachinePool := &infrav1.AzureManagedMachinePool{} 253 err := managementClusterClient.Get(ctx, key, azManagedMachinePool) 254 return azManagedMachinePool, err 255 } 256 257 func getAzureASOManagedMachinePool(ctx context.Context, managementClusterClient client.Client, mp *expv1.MachinePool) (*infrav1alpha.AzureASOManagedMachinePool, error) { 258 key := client.ObjectKey{ 259 Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace, 260 Name: mp.Spec.Template.Spec.InfrastructureRef.Name, 261 } 262 263 azManagedMachinePool := &infrav1alpha.AzureASOManagedMachinePool{} 264 err := managementClusterClient.Get(ctx, key, azManagedMachinePool) 265 return azManagedMachinePool, err 266 } 267 268 func linuxLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 269 return []func() error{ 270 execToPathFn( 271 "journal.log", 272 "sudo", "journalctl", "--no-pager", "--output=short-precise", 273 ), 274 execToPathFn( 275 "kern.log", 276 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-k", 277 ), 278 execToPathFn( 279 "kubelet-version.txt", 280 "PATH=/opt/bin:${PATH}", "kubelet", "--version", 281 ), 282 execToPathFn( 283 "kubelet.log", 284 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "kubelet.service", 285 ), 286 execToPathFn( 287 "containerd.log", 288 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-u", "containerd.service", 289 ), 290 execToPathFn( 291 "ignition.log", 292 "sudo", "journalctl", "--no-pager", "--output=short-precise", "-at", "ignition", 293 ), 294 execToPathFn( 295 "cloud-init.log", 296 "cat", "/var/log/cloud-init.log", 297 ), 298 execToPathFn( 299 "cloud-init-output.log", 300 "cat", "/var/log/cloud-init-output.log", 301 ), 302 execToPathFn( 303 "sentinel-file-dir.txt", 304 "ls", "/run/cluster-api/", 305 ), 306 execToPathFn( 307 "cni.log", 308 "cat", "/var/log/calico/cni/cni.log", 309 ), 310 } 311 } 312 313 func windowsK8sLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 314 return []func() error{ 315 execToPathFn( 316 "hyperv-operation.log", 317 "Get-WinEvent", "-LogName Microsoft-Windows-Hyper-V-Compute-Operational | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Sort-Object TimeCreated | Format-Table -Wrap -Autosize", 318 ), 319 execToPathFn( 320 "containerd-containers.log", 321 "ctr.exe", "-n k8s.io containers list", 322 ), 323 execToPathFn( 324 "containerd-tasks.log", 325 "ctr.exe", "-n k8s.io tasks list", 326 ), 327 execToPathFn( 328 "containers-hcs.log", 329 "hcsdiag", "list", 330 ), 331 execToPathFn( 332 "kubelet.log", 333 `Get-ChildItem "C:\\var\\log\\kubelet\\" | ForEach-Object { if ($_ -match 'log.INFO|err.*.log') { write-output "$_";cat "c:\\var\\log\\kubelet\\$_" } }`, 334 ), 335 execToPathFn( 336 "cni.log", 337 `Get-Content "C:\\cni.log"`, 338 ), 339 } 340 } 341 342 func windowsInfo(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 343 return []func() error{ 344 execToPathFn( 345 "reboots.log", 346 "Get-WinEvent", `-ErrorAction Ignore -FilterHashtable @{logname = 'System'; id = 1074, 1076, 2004, 6005, 6006, 6008 } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`, 347 ), 348 execToPathFn( 349 "scm.log", 350 "Get-WinEvent", `-FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Format-Table -Wrap -Autosize`, 351 ), 352 execToPathFn( 353 "pagefile.log", 354 "Get-CimInstance", "win32_pagefileusage | Format-List *", 355 ), 356 execToPathFn( 357 "cloudbase-init-unattend.log", 358 "get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init-unattend.log'", 359 ), 360 execToPathFn( 361 "cloudbase-init.log", 362 "get-content 'C:\\Program Files\\Cloudbase Solutions\\Cloudbase-Init\\log\\cloudbase-init.log'", 363 ), 364 execToPathFn( 365 "services.log", 366 "get-service", 367 ), 368 } 369 } 370 371 func windowsNetworkLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 372 return []func() error{ 373 execToPathFn( 374 "network.log", 375 "Get-HnsNetwork | Select Name, Type, Id, AddressPrefix | Format-Table -Wrap -Autosize", 376 ), 377 execToPathFn( 378 "network-detailed.log", 379 "Get-hnsnetwork | Convertto-json -Depth 20", 380 ), 381 execToPathFn( 382 "network-individual-detailed.log", 383 "Get-hnsnetwork | % { Get-HnsNetwork -Id $_.ID -Detailed } | Convertto-json -Depth 20", 384 ), 385 execToPathFn( 386 "hnsendpoints.log", 387 "Get-HnsEndpoint | Select IpAddress, MacAddress, IsRemoteEndpoint, State", 388 ), 389 execToPathFn( 390 "hnsendpolicy-detailed.log", 391 "Get-hnspolicylist | Convertto-json -Depth 20", 392 ), 393 execToPathFn( 394 "ipconfig.log", 395 "ipconfig /allcompartments /all", 396 ), 397 execToPathFn( 398 "ips.log", 399 "Get-NetIPAddress -IncludeAllCompartments", 400 ), 401 execToPathFn( 402 "interfaces.log", 403 "Get-NetIPInterface -IncludeAllCompartments", 404 ), 405 execToPathFn( 406 "hnsdiag.txt", 407 "hnsdiag list all -d", 408 ), 409 } 410 } 411 412 func windowsCrashDumpLogs(execToPathFn func(outputFileName string, command string, args ...string) func() error) []func() error { 413 return []func() error{ 414 execToPathFn( 415 "dir-localdumps.log", 416 // note: the powershell 'ls' alias will not have any output if the target directory is empty. 417 // we're logging the contents of the c:\localdumps directory because the command that invokes tar.exe below is 418 // not providing output when run in powershell over ssh for some reason. 419 "ls 'c:\\localdumps' -Recurse", 420 ), 421 execToPathFn( 422 // capture any crashdump files created by windows into a .tar to be collected via sftp 423 "tar-crashdumps.log", 424 "$p = 'c:\\localdumps' ; if (Test-Path $p) { tar.exe -cvzf c:\\crashdumps.tar $p *>&1 | %{ Write-Output \"$_\"} } else { Write-Host \"No crash dumps found at $p\" }", 425 ), 426 } 427 } 428 429 // collectVMBootLog collects boot logs of the vm by using azure boot diagnostics. 430 func collectVMBootLog(ctx context.Context, am *infrav1.AzureMachine, outputPath string) error { 431 if am == nil { 432 return errors.New("AzureMachine is nil") 433 } 434 Logf("Collecting boot logs for AzureMachine %s\n", am.GetName()) 435 436 if am.Spec.ProviderID == nil { 437 return errors.New("AzureMachine provider ID is nil") 438 } 439 440 resource, err := azureutil.ParseResourceID(*am.Spec.ProviderID) 441 if err != nil { 442 return errors.Wrap(err, "failed to parse resource id") 443 } 444 445 subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID") 446 if subscriptionID == "" { 447 return errors.New("AZURE_SUBSCRIPTION_ID is not set") 448 } 449 450 cred, err := azidentity.NewDefaultAzureCredential(nil) 451 if err != nil { 452 return errors.Wrap(err, "failed to get default azure credential") 453 } 454 455 vmClient, err := armcompute.NewVirtualMachinesClient(subscriptionID, cred, nil) 456 if err != nil { 457 return errors.Wrap(err, "failed to create virtual machines client") 458 } 459 460 bootDiagnostics, err := vmClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, nil) 461 if err != nil { 462 return errors.Wrap(err, "failed to get boot diagnostics data") 463 } 464 465 return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath) 466 } 467 468 // collectVMSSBootLog collects boot logs of the scale set by using azure boot diagnostics. 469 func collectVMSSBootLog(ctx context.Context, providerID string, outputPath string) error { 470 resourceID := strings.TrimPrefix(providerID, azureutil.ProviderIDPrefix) 471 v := strings.Split(resourceID, "/") 472 instanceID := v[len(v)-1] 473 resourceID = strings.TrimSuffix(resourceID, "/virtualMachines/"+instanceID) 474 resource, err := azureutil.ParseResourceID(resourceID) 475 if err != nil { 476 return errors.Wrap(err, "failed to parse resource id") 477 } 478 479 Logf("Collecting boot logs for VMSS instance %s of scale set %s\n", instanceID, resource.Name) 480 481 subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID") 482 if subscriptionID == "" { 483 return errors.New("AZURE_SUBSCRIPTION_ID is not set") 484 } 485 486 cred, err := azidentity.NewDefaultAzureCredential(nil) 487 if err != nil { 488 return errors.Wrap(err, "failed to get default azure credential") 489 } 490 vmssClient, err := armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, cred, nil) 491 if err != nil { 492 return errors.Wrap(err, "failed to create virtual machine scale set VMs client") 493 } 494 495 bootDiagnostics, err := vmssClient.RetrieveBootDiagnosticsData(ctx, resource.ResourceGroupName, resource.Name, instanceID, nil) 496 if err != nil { 497 return errors.Wrap(err, "failed to get boot diagnostics data") 498 } 499 500 return writeBootLog(bootDiagnostics.RetrieveBootDiagnosticsDataResult, outputPath) 501 } 502 503 func writeBootLog(bootDiagnostics armcompute.RetrieveBootDiagnosticsDataResult, outputPath string) error { 504 var err error 505 req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, *bootDiagnostics.SerialConsoleLogBlobURI, http.NoBody) 506 if err != nil { 507 return errors.Wrap(err, "failed to create HTTP request") 508 } 509 resp, err := http.DefaultClient.Do(req) 510 if err != nil || resp.StatusCode != 200 { 511 return errors.Wrap(err, "failed to get logs from serial console uri") 512 } 513 defer resp.Body.Close() 514 515 content, err := io.ReadAll(resp.Body) 516 if err != nil { 517 return errors.Wrap(err, "failed to read response body") 518 } 519 520 if err := os.WriteFile(filepath.Join(outputPath, "boot.log"), content, 0o600); err != nil { 521 return errors.Wrap(err, "failed to write response to file") 522 } 523 524 return nil 525 }