sigs.k8s.io/cluster-api-provider-azure@v1.17.0/test/e2e/azure_clusterproxy.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2022 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package e2e
    21  
    22  import (
    23  	"bufio"
    24  	"context"
    25  	"encoding/json"
    26  	"errors"
    27  	"fmt"
    28  	"io"
    29  	"os"
    30  	"path"
    31  	"path/filepath"
    32  	"strings"
    33  	"time"
    34  
    35  	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
    36  	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor"
    37  	asocontainerservicev1 "github.com/Azure/azure-service-operator/v2/api/containerservice/v1api20231001"
    38  	asocontainerservicev1preview "github.com/Azure/azure-service-operator/v2/api/containerservice/v1api20231102preview"
    39  	asoresourcesv1 "github.com/Azure/azure-service-operator/v2/api/resources/v1api20200601"
    40  	. "github.com/onsi/ginkgo/v2"
    41  	. "github.com/onsi/gomega"
    42  	corev1 "k8s.io/api/core/v1"
    43  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    44  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    45  	"k8s.io/apimachinery/pkg/runtime"
    46  	"k8s.io/apimachinery/pkg/runtime/schema"
    47  	"k8s.io/kubectl/pkg/describe"
    48  	"k8s.io/utils/ptr"
    49  	infrav1alpha "sigs.k8s.io/cluster-api-provider-azure/api/v1alpha1"
    50  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    51  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    52  	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
    53  	"sigs.k8s.io/cluster-api/test/framework"
    54  	"sigs.k8s.io/controller-runtime/pkg/client"
    55  )
    56  
type (
	// AzureClusterProxy wraps a framework.ClusterProxy and augments its
	// workload-cluster log collection with Azure-specific artifacts
	// (node descriptions, per-container pod logs, and the Azure activity log).
	AzureClusterProxy struct {
		framework.ClusterProxy
	}
	// myEventData is used to be able to Marshal armmonitor.EventData into JSON
	// see https://github.com/Azure/azure-sdk-for-go/issues/8224#issuecomment-614777550
	myEventData armmonitor.EventData
)
    65  
    66  func NewAzureClusterProxy(name string, kubeconfigPath string, options ...framework.Option) *AzureClusterProxy {
    67  	proxy := framework.NewClusterProxy(name, kubeconfigPath, initScheme(), options...)
    68  	return &AzureClusterProxy{
    69  		ClusterProxy: proxy,
    70  	}
    71  }
    72  
    73  func initScheme() *runtime.Scheme {
    74  	scheme := runtime.NewScheme()
    75  	framework.TryAddDefaultSchemes(scheme)
    76  	Expect(infrav1.AddToScheme(scheme)).To(Succeed())
    77  	Expect(infrav1exp.AddToScheme(scheme)).To(Succeed())
    78  	Expect(infrav1alpha.AddToScheme(scheme)).To(Succeed())
    79  	Expect(expv1.AddToScheme(scheme)).To(Succeed())
    80  	Expect(asoresourcesv1.AddToScheme(scheme)).To(Succeed())
    81  	Expect(asocontainerservicev1.AddToScheme(scheme)).To(Succeed())
    82  	Expect(asocontainerservicev1preview.AddToScheme(scheme)).To(Succeed())
    83  	return scheme
    84  }
    85  
    86  func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) {
    87  	Logf("Dumping workload cluster %s/%s logs", namespace, name)
    88  	acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)
    89  
    90  	aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1)
    91  
    92  	Logf("Dumping workload cluster %s/%s nodes", namespace, name)
    93  	start := time.Now()
    94  	acp.collectNodes(ctx, namespace, name, aboveMachinesPath)
    95  	Logf("Fetching nodes took %s", time.Since(start).String())
    96  
    97  	Logf("Dumping workload cluster %s/%s pod logs", namespace, name)
    98  	start = time.Now()
    99  	acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
   100  	Logf("Fetching pod logs took %s", time.Since(start).String())
   101  
   102  	Logf("Dumping workload cluster %s/%s Azure activity log", namespace, name)
   103  	start = time.Now()
   104  	acp.collectActivityLogs(ctx, namespace, name, aboveMachinesPath)
   105  	Logf("Fetching activity logs took %s", time.Since(start).String())
   106  }
   107  
   108  func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) {
   109  	workload := acp.GetWorkloadCluster(ctx, namespace, name)
   110  	pods := &corev1.PodList{}
   111  
   112  	Expect(workload.GetClient().List(ctx, pods)).To(Succeed())
   113  
   114  	var err error
   115  	var podDescribe string
   116  
   117  	podDescriber, ok := describe.DescriberFor(schema.GroupKind{Group: corev1.GroupName, Kind: "Pod"}, workload.GetRESTConfig())
   118  	if !ok {
   119  		Logf("failed to get pod describer")
   120  	}
   121  
   122  	for _, pod := range pods.Items {
   123  		podNamespace := pod.GetNamespace()
   124  
   125  		// Describe the pod.
   126  		podDescribe, err = podDescriber.Describe(podNamespace, pod.GetName(), describe.DescriberSettings{ShowEvents: true})
   127  		if err != nil {
   128  			Logf("failed to describe pod %s/%s: %v", podNamespace, pod.GetName(), err)
   129  		}
   130  
   131  		// collect the init container logs
   132  		for _, container := range pod.Spec.InitContainers {
   133  			// Watch each init container's logs in a goroutine, so we can stream them all concurrently.
   134  			go collectContainerLogs(ctx, pod, container, aboveMachinesPath, workload)
   135  		}
   136  
   137  		for _, container := range pod.Spec.Containers {
   138  			// Watch each container's logs in a goroutine, so we can stream them all concurrently.
   139  			go collectContainerLogs(ctx, pod, container, aboveMachinesPath, workload)
   140  		}
   141  
   142  		Logf("Describing Pod %s/%s", podNamespace, pod.Name)
   143  		describeFile := path.Join(aboveMachinesPath, podNamespace, pod.Name, "pod-describe.txt")
   144  		writeLogFile(describeFile, podDescribe)
   145  	}
   146  }
   147  
   148  func collectContainerLogs(ctx context.Context, pod corev1.Pod, container corev1.Container, aboveMachinesPath string, workload framework.ClusterProxy) {
   149  	defer GinkgoRecover()
   150  
   151  	podNamespace := pod.GetNamespace()
   152  
   153  	Logf("Creating log watcher for controller %s/%s, container %s", podNamespace, pod.Name, container.Name)
   154  	logFile := path.Join(aboveMachinesPath, podNamespace, pod.Name, container.Name+".log")
   155  	if err := os.MkdirAll(filepath.Dir(logFile), 0o755); err != nil {
   156  		// Failing to mkdir should not cause the test to fail
   157  		Logf("Error mkdir: %v", err)
   158  		return
   159  	}
   160  
   161  	f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
   162  	if err != nil {
   163  		// Failing to fetch logs should not cause the test to fail
   164  		Logf("Error opening file to write pod logs: %v", err)
   165  		return
   166  	}
   167  	defer f.Close()
   168  
   169  	opts := &corev1.PodLogOptions{
   170  		Container: container.Name,
   171  		Follow:    true,
   172  	}
   173  
   174  	podLogs, err := workload.GetClientSet().CoreV1().Pods(podNamespace).GetLogs(pod.Name, opts).Stream(ctx)
   175  	if err != nil {
   176  		// Failing to stream logs should not cause the test to fail
   177  		Logf("Error starting logs stream for pod %s/%s, container %s: %v", podNamespace, pod.Name, container.Name, err)
   178  		return
   179  	}
   180  	defer podLogs.Close()
   181  
   182  	out := bufio.NewWriter(f)
   183  	defer out.Flush()
   184  	_, err = out.ReadFrom(podLogs)
   185  	if errors.Is(err, io.ErrUnexpectedEOF) {
   186  		// Failing to stream logs should not cause the test to fail
   187  		Logf("Got error while streaming logs for pod %s/%s, container %s: %v", podNamespace, pod.Name, container.Name, err)
   188  	}
   189  }
   190  
   191  func (acp *AzureClusterProxy) collectNodes(ctx context.Context, namespace string, name string, aboveMachinesPath string) {
   192  	workload := acp.GetWorkloadCluster(ctx, namespace, name)
   193  	nodes := &corev1.NodeList{}
   194  
   195  	Expect(workload.GetClient().List(ctx, nodes)).To(Succeed())
   196  
   197  	var err error
   198  	var nodeDescribe string
   199  
   200  	nodeDescriber, ok := describe.DescriberFor(schema.GroupKind{Group: corev1.GroupName, Kind: "Node"}, workload.GetRESTConfig())
   201  	if !ok {
   202  		Logf("failed to get node describer")
   203  	}
   204  
   205  	for _, node := range nodes.Items {
   206  		// Describe the node.
   207  		Logf("Describing Node %s", node.GetName())
   208  		nodeDescribe, err = nodeDescriber.Describe(node.GetNamespace(), node.GetName(), describe.DescriberSettings{ShowEvents: true})
   209  		if err != nil {
   210  			Logf("failed to describe node %s: %v", node.GetName(), err)
   211  		}
   212  
   213  		describeFile := path.Join(aboveMachinesPath, nodesDir, node.GetName(), "node-describe.txt")
   214  		writeLogFile(describeFile, nodeDescribe)
   215  	}
   216  }
   217  
// collectActivityLogs fetches the last two hours of the Azure activity log for
// the cluster's resource group and appends it, as indented JSON (one event per
// entry, "Policy"-category events excluded), to
// <aboveMachinesPath>/<activitylog>/<resource-group>.log.
//
// The resource group is resolved by probing, in order: AzureCluster,
// AzureManagedControlPlane, then AzureASOManagedCluster — whichever of the
// three flavors this cluster was provisioned as. Lookup failures are logged
// and abort collection without failing the test.
func (acp *AzureClusterProxy) collectActivityLogs(ctx context.Context, namespace, name, aboveMachinesPath string) {
	// Bound only the management-cluster lookups; paging the activity log below
	// uses the caller's ctx.
	timeoutctx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	cred, err := azidentity.NewDefaultAzureCredential(nil)
	Expect(err).NotTo(HaveOccurred())
	activityLogsClient, err := armmonitor.NewActivityLogsClient(getSubscriptionID(Default), cred, nil)
	Expect(err).NotTo(HaveOccurred())

	var groupName string
	clusterClient := acp.GetClient()
	// Probe 1: self-managed AzureCluster.
	workloadCluster, err := getAzureCluster(timeoutctx, clusterClient, namespace, name)
	if apierrors.IsNotFound(err) {
		// Probe 2: AKS via AzureManagedControlPlane. Note each probe shadows
		// err so NotFound from one level cascades cleanly to the next.
		controlPlane, err := getAzureManagedControlPlane(timeoutctx, clusterClient, namespace, name)
		if apierrors.IsNotFound(err) {
			// Probe 3: ASO-managed cluster.
			asoCluster, err := getAzureASOManagedCluster(timeoutctx, clusterClient, namespace, name)
			if err != nil {
				// Failing to fetch logs should not cause the test to fail
				Logf("Error fetching activity logs for cluster %s in namespace %s.  Not able to find the AzureASOManagedCluster on the management cluster: %v", name, namespace, err)
				return
			}
			// The resource group is embedded as a raw ASO resource in the spec;
			// find the first ResourceGroup entry.
			for _, resource := range asoCluster.Spec.Resources {
				u := &unstructured.Unstructured{}
				Expect(u.UnmarshalJSON(resource.Raw)).To(Succeed())
				if u.GroupVersionKind().Kind != "ResourceGroup" {
					continue
				}
				// AzureName might not be specified in the CAPZ resource. GET the rg to make sure we have it.
				rg := &asoresourcesv1.ResourceGroup{}
				Expect(clusterClient.Get(ctx, client.ObjectKey{Namespace: namespace, Name: u.GetName()}, rg)).To(Succeed())
				groupName = rg.AzureName()
				break
			}
		} else {
			if err != nil {
				// Failing to fetch logs should not cause the test to fail
				Logf("Error fetching activity logs for cluster %s in namespace %s.  Not able to find the AzureManagedControlPlane on the management cluster: %v", name, namespace, err)
				return
			}
			groupName = controlPlane.Spec.ResourceGroupName
		}
	} else {
		if err != nil {
			// Failing to fetch logs should not cause the test to fail
			Logf("Error fetching activity logs for cluster %s in namespace %s.  Not able to find the workload cluster on the management cluster: %v", name, namespace, err)
			return
		}
		groupName = workloadCluster.Spec.ResourceGroup
	}

	// Query a fixed two-hour lookback window ending now, scoped to the group.
	start := time.Now().Add(-2 * time.Hour).UTC().Format(time.RFC3339)
	end := time.Now().UTC().Format(time.RFC3339)

	filter := fmt.Sprintf("eventTimestamp ge '%s' and eventTimestamp le '%s' and resourceGroupName eq '%s'", start, end, groupName)
	pager := activityLogsClient.NewListPager(filter, nil)

	logFile := path.Join(aboveMachinesPath, activitylog, groupName+".log")
	Expect(os.MkdirAll(filepath.Dir(logFile), 0o755)).To(Succeed())

	f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
	if err != nil {
		// Failing to fetch logs should not cause the test to fail
		Logf("Error opening file to write activity logs: %v", err)
		return
	}
	defer f.Close()
	out := bufio.NewWriter(f)
	defer out.Flush()

	for pager.More() {
		page, err := pager.NextPage(ctx)
		if err != nil {
			// Failing to fetch logs should not cause the test to fail
			Logf("Error getting pager for activity logs in resource group %s: %v", groupName, err)
			return
		}
		for _, event := range page.Value {
			// Policy events are noise for debugging; skip them. The
			// myEventData conversion works around EventData's lack of JSON
			// marshalling support (see the type's comment).
			if ptr.Deref(event.Category.Value, "") != "Policy" {
				b, err := json.MarshalIndent(myEventData(*event), "", "    ")
				if err != nil {
					Logf("Got error converting activity logs data to json: %v", err)
				}
				if _, err = out.WriteString(string(b) + "\n"); err != nil {
					Logf("Got error while writing activity logs for resource group %s: %v", groupName, err)
				}
			}
		}
	}
}
   307  
   308  func writeLogFile(logFilepath string, logData string) {
   309  	go func() {
   310  		defer GinkgoRecover()
   311  
   312  		if err := os.MkdirAll(filepath.Dir(logFilepath), 0o755); err != nil {
   313  			// Failing to mkdir should not cause the test to fail
   314  			Logf("Error mkdir: %v", err)
   315  			return
   316  		}
   317  
   318  		f, err := os.OpenFile(logFilepath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
   319  		if err != nil {
   320  			// Failing to open the file should not cause the test to fail
   321  			Logf("Error opening file %s to write logs: %v", logFilepath, err)
   322  			return
   323  		}
   324  		defer f.Close()
   325  
   326  		out := bufio.NewWriter(f)
   327  		defer out.Flush()
   328  		_, err = out.WriteString(logData)
   329  		if err != nil {
   330  			// Failing to write a log file should not cause the test to fail
   331  			Logf("failed to write logFile %s: %v", logFilepath, err)
   332  		}
   333  	}()
   334  }