sigs.k8s.io/cluster-api-provider-azure@v1.14.3/test/e2e/azure_clusterproxy.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2022 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package e2e
    21  
    22  import (
    23  	"bufio"
    24  	"context"
    25  	"encoding/json"
    26  	"errors"
    27  	"fmt"
    28  	"io"
    29  	"os"
    30  	"path"
    31  	"path/filepath"
    32  	"strings"
    33  	"time"
    34  
    35  	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
    36  	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor"
    37  	asocontainerservicev1 "github.com/Azure/azure-service-operator/v2/api/containerservice/v1api20231001"
    38  	. "github.com/onsi/ginkgo/v2"
    39  	. "github.com/onsi/gomega"
    40  	corev1 "k8s.io/api/core/v1"
    41  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    42  	"k8s.io/apimachinery/pkg/runtime"
    43  	"k8s.io/apimachinery/pkg/runtime/schema"
    44  	"k8s.io/kubectl/pkg/describe"
    45  	"k8s.io/utils/ptr"
    46  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    47  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    48  	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
    49  	"sigs.k8s.io/cluster-api/test/framework"
    50  )
    51  
    52  type (
    53  	AzureClusterProxy struct {
    54  		framework.ClusterProxy
    55  	}
    56  	// myEventData is used to be able to Marshal armmonitor.EventData into JSON
    57  	// see https://github.com/Azure/azure-sdk-for-go/issues/8224#issuecomment-614777550
    58  	myEventData armmonitor.EventData
    59  )
    60  
    61  func NewAzureClusterProxy(name string, kubeconfigPath string, options ...framework.Option) *AzureClusterProxy {
    62  	proxy := framework.NewClusterProxy(name, kubeconfigPath, initScheme(), options...)
    63  	return &AzureClusterProxy{
    64  		ClusterProxy: proxy,
    65  	}
    66  }
    67  
    68  func initScheme() *runtime.Scheme {
    69  	scheme := runtime.NewScheme()
    70  	framework.TryAddDefaultSchemes(scheme)
    71  	Expect(infrav1.AddToScheme(scheme)).To(Succeed())
    72  	Expect(infrav1exp.AddToScheme(scheme)).To(Succeed())
    73  	Expect(expv1.AddToScheme(scheme)).To(Succeed())
    74  	Expect(asocontainerservicev1.AddToScheme(scheme)).To(Succeed())
    75  	return scheme
    76  }
    77  
    78  func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) {
    79  	Logf("Dumping workload cluster %s/%s logs", namespace, name)
    80  	acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath)
    81  
    82  	aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1)
    83  
    84  	Logf("Dumping workload cluster %s/%s nodes", namespace, name)
    85  	start := time.Now()
    86  	acp.collectNodes(ctx, namespace, name, aboveMachinesPath)
    87  	Logf("Fetching nodes took %s", time.Since(start).String())
    88  
    89  	Logf("Dumping workload cluster %s/%s pod logs", namespace, name)
    90  	start = time.Now()
    91  	acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath)
    92  	Logf("Fetching pod logs took %s", time.Since(start).String())
    93  
    94  	Logf("Dumping workload cluster %s/%s Azure activity log", namespace, name)
    95  	start = time.Now()
    96  	acp.collectActivityLogs(ctx, namespace, name, aboveMachinesPath)
    97  	Logf("Fetching activity logs took %s", time.Since(start).String())
    98  }
    99  
   100  func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) {
   101  	workload := acp.GetWorkloadCluster(ctx, namespace, name)
   102  	pods := &corev1.PodList{}
   103  
   104  	Expect(workload.GetClient().List(ctx, pods)).To(Succeed())
   105  
   106  	var err error
   107  	var podDescribe string
   108  
   109  	podDescriber, ok := describe.DescriberFor(schema.GroupKind{Group: corev1.GroupName, Kind: "Pod"}, workload.GetRESTConfig())
   110  	if !ok {
   111  		Logf("failed to get pod describer")
   112  	}
   113  
   114  	for _, pod := range pods.Items {
   115  		podNamespace := pod.GetNamespace()
   116  
   117  		// Describe the pod.
   118  		podDescribe, err = podDescriber.Describe(podNamespace, pod.GetName(), describe.DescriberSettings{ShowEvents: true})
   119  		if err != nil {
   120  			Logf("failed to describe pod %s/%s: %v", podNamespace, pod.GetName(), err)
   121  		}
   122  
   123  		// collect the init container logs
   124  		for _, container := range pod.Spec.InitContainers {
   125  			// Watch each init container's logs in a goroutine, so we can stream them all concurrently.
   126  			go collectContainerLogs(ctx, pod, container, aboveMachinesPath, workload)
   127  		}
   128  
   129  		for _, container := range pod.Spec.Containers {
   130  			// Watch each container's logs in a goroutine, so we can stream them all concurrently.
   131  			go collectContainerLogs(ctx, pod, container, aboveMachinesPath, workload)
   132  		}
   133  
   134  		Logf("Describing Pod %s/%s", podNamespace, pod.Name)
   135  		describeFile := path.Join(aboveMachinesPath, podNamespace, pod.Name, "pod-describe.txt")
   136  		writeLogFile(describeFile, podDescribe)
   137  	}
   138  }
   139  
   140  func collectContainerLogs(ctx context.Context, pod corev1.Pod, container corev1.Container, aboveMachinesPath string, workload framework.ClusterProxy) {
   141  	defer GinkgoRecover()
   142  
   143  	podNamespace := pod.GetNamespace()
   144  
   145  	Logf("Creating log watcher for controller %s/%s, container %s", podNamespace, pod.Name, container.Name)
   146  	logFile := path.Join(aboveMachinesPath, podNamespace, pod.Name, container.Name+".log")
   147  	if err := os.MkdirAll(filepath.Dir(logFile), 0o755); err != nil {
   148  		// Failing to mkdir should not cause the test to fail
   149  		Logf("Error mkdir: %v", err)
   150  		return
   151  	}
   152  
   153  	f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
   154  	if err != nil {
   155  		// Failing to fetch logs should not cause the test to fail
   156  		Logf("Error opening file to write pod logs: %v", err)
   157  		return
   158  	}
   159  	defer f.Close()
   160  
   161  	opts := &corev1.PodLogOptions{
   162  		Container: container.Name,
   163  		Follow:    true,
   164  	}
   165  
   166  	podLogs, err := workload.GetClientSet().CoreV1().Pods(podNamespace).GetLogs(pod.Name, opts).Stream(ctx)
   167  	if err != nil {
   168  		// Failing to stream logs should not cause the test to fail
   169  		Logf("Error starting logs stream for pod %s/%s, container %s: %v", podNamespace, pod.Name, container.Name, err)
   170  		return
   171  	}
   172  	defer podLogs.Close()
   173  
   174  	out := bufio.NewWriter(f)
   175  	defer out.Flush()
   176  	_, err = out.ReadFrom(podLogs)
   177  	if errors.Is(err, io.ErrUnexpectedEOF) {
   178  		// Failing to stream logs should not cause the test to fail
   179  		Logf("Got error while streaming logs for pod %s/%s, container %s: %v", podNamespace, pod.Name, container.Name, err)
   180  	}
   181  }
   182  
   183  func (acp *AzureClusterProxy) collectNodes(ctx context.Context, namespace string, name string, aboveMachinesPath string) {
   184  	workload := acp.GetWorkloadCluster(ctx, namespace, name)
   185  	nodes := &corev1.NodeList{}
   186  
   187  	Expect(workload.GetClient().List(ctx, nodes)).To(Succeed())
   188  
   189  	var err error
   190  	var nodeDescribe string
   191  
   192  	nodeDescriber, ok := describe.DescriberFor(schema.GroupKind{Group: corev1.GroupName, Kind: "Node"}, workload.GetRESTConfig())
   193  	if !ok {
   194  		Logf("failed to get node describer")
   195  	}
   196  
   197  	for _, node := range nodes.Items {
   198  		// Describe the node.
   199  		Logf("Describing Node %s", node.GetName())
   200  		nodeDescribe, err = nodeDescriber.Describe(node.GetNamespace(), node.GetName(), describe.DescriberSettings{ShowEvents: true})
   201  		if err != nil {
   202  			Logf("failed to describe node %s: %v", node.GetName(), err)
   203  		}
   204  
   205  		describeFile := path.Join(aboveMachinesPath, nodesDir, node.GetName(), "node-describe.txt")
   206  		writeLogFile(describeFile, nodeDescribe)
   207  	}
   208  }
   209  
   210  func (acp *AzureClusterProxy) collectActivityLogs(ctx context.Context, namespace, name, aboveMachinesPath string) {
   211  	timeoutctx, cancel := context.WithTimeout(ctx, 30*time.Second)
   212  	defer cancel()
   213  
   214  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   215  	Expect(err).NotTo(HaveOccurred())
   216  	activityLogsClient, err := armmonitor.NewActivityLogsClient(getSubscriptionID(Default), cred, nil)
   217  	Expect(err).NotTo(HaveOccurred())
   218  
   219  	var groupName string
   220  	clusterClient := acp.GetClient()
   221  	workloadCluster, err := getAzureCluster(timeoutctx, clusterClient, namespace, name)
   222  	if apierrors.IsNotFound(err) {
   223  		controlPlane, err := getAzureManagedControlPlane(timeoutctx, clusterClient, namespace, name)
   224  		if err != nil {
   225  			// Failing to fetch logs should not cause the test to fail
   226  			Logf("Error fetching activity logs for cluster %s in namespace %s.  Not able to find the AzureManagedControlPlane on the management cluster: %v", name, namespace, err)
   227  			return
   228  		}
   229  		groupName = controlPlane.Spec.ResourceGroupName
   230  	} else {
   231  		if err != nil {
   232  			// Failing to fetch logs should not cause the test to fail
   233  			Logf("Error fetching activity logs for cluster %s in namespace %s.  Not able to find the workload cluster on the management cluster: %v", name, namespace, err)
   234  			return
   235  		}
   236  		groupName = workloadCluster.Spec.ResourceGroup
   237  	}
   238  
   239  	start := time.Now().Add(-2 * time.Hour).UTC().Format(time.RFC3339)
   240  	end := time.Now().UTC().Format(time.RFC3339)
   241  
   242  	filter := fmt.Sprintf("eventTimestamp ge '%s' and eventTimestamp le '%s' and resourceGroupName eq '%s'", start, end, groupName)
   243  	pager := activityLogsClient.NewListPager(filter, nil)
   244  
   245  	logFile := path.Join(aboveMachinesPath, activitylog, groupName+".log")
   246  	Expect(os.MkdirAll(filepath.Dir(logFile), 0o755)).To(Succeed())
   247  
   248  	f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
   249  	if err != nil {
   250  		// Failing to fetch logs should not cause the test to fail
   251  		Logf("Error opening file to write activity logs: %v", err)
   252  		return
   253  	}
   254  	defer f.Close()
   255  	out := bufio.NewWriter(f)
   256  	defer out.Flush()
   257  
   258  	for pager.More() {
   259  		page, err := pager.NextPage(ctx)
   260  		if err != nil {
   261  			// Failing to fetch logs should not cause the test to fail
   262  			Logf("Error getting pager for activity logs in resource group %s: %v", groupName, err)
   263  			return
   264  		}
   265  		for _, event := range page.Value {
   266  			if ptr.Deref(event.Category.Value, "") != "Policy" {
   267  				b, err := json.MarshalIndent(myEventData(*event), "", "    ")
   268  				if err != nil {
   269  					Logf("Got error converting activity logs data to json: %v", err)
   270  				}
   271  				if _, err = out.WriteString(string(b) + "\n"); err != nil {
   272  					Logf("Got error while writing activity logs for resource group %s: %v", groupName, err)
   273  				}
   274  			}
   275  		}
   276  	}
   277  }
   278  
   279  func writeLogFile(logFilepath string, logData string) {
   280  	go func() {
   281  		defer GinkgoRecover()
   282  
   283  		if err := os.MkdirAll(filepath.Dir(logFilepath), 0o755); err != nil {
   284  			// Failing to mkdir should not cause the test to fail
   285  			Logf("Error mkdir: %v", err)
   286  			return
   287  		}
   288  
   289  		f, err := os.OpenFile(logFilepath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
   290  		if err != nil {
   291  			// Failing to open the file should not cause the test to fail
   292  			Logf("Error opening file %s to write logs: %v", logFilepath, err)
   293  			return
   294  		}
   295  		defer f.Close()
   296  
   297  		out := bufio.NewWriter(f)
   298  		defer out.Flush()
   299  		_, err = out.WriteString(logData)
   300  		if err != nil {
   301  			// Failing to write a log file should not cause the test to fail
   302  			Logf("failed to write logFile %s: %v", logFilepath, err)
   303  		}
   304  	}()
   305  }