sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/crier/reporters/gcs/kubernetes/reporter.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubernetes

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"hash/crc32"
	"math"
	"path"
	"strings"
	"time"

	"github.com/sirupsen/logrus"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/kubernetes/scheme"
	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	utilpointer "k8s.io/utils/pointer"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	prowv1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/config"
	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
	"sigs.k8s.io/prow/pkg/crier/reporters/gcs/util"
	"sigs.k8s.io/prow/pkg/io"
	"sigs.k8s.io/prow/pkg/io/providers"
)

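// gcsK8sReporter uploads the test pod and its events for a finished ProwJob
// to the job's object storage directory. It can sample jobs via
// reportFraction and skips uploads entirely in dry-run mode.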
type gcsK8sReporter struct {
	cfg            config.Getter
	dryRun         bool
	opener         io.Opener
	rg             resourceGetter
	reportFraction float32
}

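// PodReport is the structure persisted as podinfo.json: the test pod's
// manifest plus any events recorded for it.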
type PodReport struct {
	Pod    *v1.Pod    `json:"pod,omitempty"`
	Events []v1.Event `json:"events,omitempty"`
}

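// resourceGetter abstracts the pod lookups, event searches, and pod patches
// this reporter performs, so the Kubernetes clients can be substituted (for
// example, with a fake in tests).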
type resourceGetter interface {
	GetPod(ctx context.Context, cluster, namespace, name string) (*v1.Pod, error)
	GetEvents(cluster, namespace string, pod *v1.Pod) ([]v1.Event, error)
	PatchPod(ctx context.Context, cluster, namespace, name string, pt types.PatchType, data []byte) error
}

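// k8sResourceGetter implements resourceGetter on top of per-cluster CoreV1
// clientsets, keyed by build cluster alias.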
type k8sResourceGetter struct {
	podClientSets map[string]corev1.CoreV1Interface
}

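// NewK8sResourceGetter returns a resourceGetter backed by the given map of
// build cluster alias to CoreV1 clientset.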
func NewK8sResourceGetter(podClientSets map[string]corev1.CoreV1Interface) *k8sResourceGetter {
	return &k8sResourceGetter{podClientSets: podClientSets}
}

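// GetPod fetches the named pod from the given build cluster.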
func (rg k8sResourceGetter) GetPod(ctx context.Context, cluster, namespace, name string) (*v1.Pod, error) {
	if _, ok := rg.podClientSets[cluster]; !ok {
		return nil, fmt.Errorf("couldn't find cluster %q", cluster)
	}
	return rg.podClientSets[cluster].Pods(namespace).Get(ctx, name, metav1.GetOptions{})
}

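// PatchPod applies the given patch to the named pod in the given build cluster.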
func (rg k8sResourceGetter) PatchPod(ctx context.Context, cluster, namespace, name string, pt types.PatchType, data []byte) error {
	if _, ok := rg.podClientSets[cluster]; !ok {
		return fmt.Errorf("couldn't find cluster %q", cluster)
	}

	_, err := rg.podClientSets[cluster].Pods(namespace).Patch(ctx, name, pt, data, metav1.PatchOptions{})
	return err
}

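// GetEvents returns the events recorded for the given pod in its build cluster.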
func (rg k8sResourceGetter) GetEvents(cluster, namespace string, pod *v1.Pod) ([]v1.Event, error) {
	if _, ok := rg.podClientSets[cluster]; !ok {
		return nil, fmt.Errorf("couldn't find cluster %q", cluster)
	}
	events, err := rg.podClientSets[cluster].Events(namespace).Search(scheme.Scheme, pod)
	if err != nil {
		return nil, err
	}
	return events.Items, nil
}

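// Report satisfies the crier reporter interface: it reports on the given
// ProwJob and returns it along with an optional requeue result.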
func (gr *gcsK8sReporter) Report(ctx context.Context, log *logrus.Entry, pj *prowv1.ProwJob) ([]*prowv1.ProwJob, *reconcile.Result, error) {
	result, err := gr.report(ctx, log, pj)
	return []*prowv1.ProwJob{pj}, result, err
}

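// report adds a finalizer to the test pod while the job is running and, once
// the job is complete, uploads the pod info. It returns a non-nil
// reconcile.Result to request a requeue for aborted jobs that are not yet
// marked complete.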
func (gr *gcsK8sReporter) report(ctx context.Context, log *logrus.Entry, pj *prowv1.ProwJob) (*reconcile.Result, error) {
	ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
	defer cancel()

	// Check if we have a destination before adding a finalizer so we don't add
	// one that we'll never remove.
	_, _, err := util.GetJobDestination(gr.cfg, pj)
	if err != nil {
		log.WithError(err).Warn("Not uploading because we couldn't find a destination")
		return nil, nil
	}

	if !pj.Complete() && pj.Status.State != prowv1.AbortedState {
		if err := gr.addFinalizer(ctx, pj); err != nil {
			return nil, fmt.Errorf("failed to add finalizer to pod: %w", err)
		}
		return nil, nil
	}

	// An aborted job may not be marked complete yet; requeue until it is.
	if !pj.Complete() {
		log.Debug("Requeuing aborted job that is not complete.")
		return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
	}

	return nil, gr.reportPodInfo(ctx, log, pj)
}

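// addFinalizer ensures the reporter's finalizer is present on the job's pod
// so the pod is not deleted before its info has been uploaded.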
func (gr *gcsK8sReporter) addFinalizer(ctx context.Context, pj *prowv1.ProwJob) error {
	pod, err := gr.rg.GetPod(ctx, pj.Spec.Cluster, gr.cfg().PodNamespace, pj.Name)
	if err != nil {
		return fmt.Errorf("failed to get pod %s: %w", pj.Name, err)
	}

	if pod.DeletionTimestamp != nil {
		return nil
	}

	finalizers := sets.New[string](pod.Finalizers...)
	if finalizers.Has(kubernetesreporterapi.FinalizerName) {
		return nil
	}

	originalPod := pod.DeepCopy()
	pod.Finalizers = sets.List(finalizers.Insert(kubernetesreporterapi.FinalizerName))
	patch := ctrlruntimeclient.MergeFrom(originalPod)
	patchData, err := patch.Data(pod)
	if err != nil {
		return fmt.Errorf("failed to construct patch: %w", err)
	}

	if err := gr.rg.PatchPod(ctx, pj.Spec.Cluster, pod.Namespace, pod.Name, patch.Type(), patchData); err != nil {
		// The pod occasionally gets deleted between our check above and this request
		if strings.Contains(err.Error(), "no new finalizers can be added if the object is being deleted") {
			return nil
		}
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

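// reportPodInfo uploads podinfo.json (the pod manifest and its events) to the
// job's storage directory, then removes the finalizer so the pod can be
// deleted. A missing pod is tolerated: it may already have been garbage
// collected.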
func (gr *gcsK8sReporter) reportPodInfo(ctx context.Context, log *logrus.Entry, pj *prowv1.ProwJob) error {
	// We only report this after a prowjob is complete (and, therefore, pod state is immutable)
	if !pj.Complete() {
		return errors.New("cannot report incomplete jobs")
	}

	pod, err := gr.rg.GetPod(ctx, pj.Spec.Cluster, gr.cfg().PodNamespace, pj.Name)
	if err != nil {
		// If we return an error we will be retried ~indefinitely. Given that permanent errors
		// are expected (pods will be garbage collected), this isn't useful. Instead, just
		// go along with it.
		log.WithError(err).Info("Couldn't fetch pod")
		pod = nil
	}

	var events []v1.Event
	if pod != nil {
		events, err = gr.rg.GetEvents(pj.Spec.Cluster, gr.cfg().PodNamespace, pod)
		if err != nil {
			log.WithError(err).Info("Couldn't fetch events for pod")
		}
	}

	if pod == nil && len(events) == 0 {
		log.Info("Not reporting job because we could fetch neither pod nor events")
		return nil
	}

	report := PodReport{
		Pod:    pod,
		Events: events,
	}

	output, err := json.MarshalIndent(report, "", "\t")
	if err != nil {
		// This should never happen.
		log.WithError(err).Warn("Couldn't marshal pod info")
	}

	bucketName, dir, err := util.GetJobDestination(gr.cfg, pj)
	if err != nil {
		return fmt.Errorf("couldn't get job destination: %w", err)
	}

	if gr.dryRun {
		log.WithFields(logrus.Fields{"bucketName": bucketName, "dir": dir}).Info("Would upload pod info")
		return nil
	}

	overWriteOpts := io.WriterOptions{PreconditionDoesNotExist: utilpointer.Bool(false)}
	podInfoPath, err := providers.StoragePath(bucketName, path.Join(dir, "podinfo.json"))
	if err != nil {
   222  		return fmt.Errorf("failed to resolve podinfo.json path: %v", err)
	}
	if err := io.WriteContent(ctx, log, gr.opener, podInfoPath, output, overWriteOpts); err != nil {
		return fmt.Errorf("failed to upload pod manifest to object storage: %w", err)
	}

	if pod == nil {
		return nil
	}

	if err := gr.removeFinalizer(ctx, pj.Spec.Cluster, pod); err != nil {
		return fmt.Errorf("failed to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
	}

	return nil
}

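// removeFinalizer drops the reporter's finalizer from the pod, if present.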
func (gr *gcsK8sReporter) removeFinalizer(ctx context.Context, cluster string, pod *v1.Pod) error {
	finalizers := sets.New[string](pod.Finalizers...)
	if !finalizers.Has(kubernetesreporterapi.FinalizerName) {
		return nil
	}

	oldPod := pod.DeepCopy()
	pod.Finalizers = sets.List(finalizers.Delete(kubernetesreporterapi.FinalizerName))
	patch := ctrlruntimeclient.MergeFrom(oldPod)
	rawPatch, err := patch.Data(pod)
	if err != nil {
		return fmt.Errorf("failed to construct patch: %w", err)
	}

	if err := gr.rg.PatchPod(ctx, cluster, pod.Namespace, pod.Name, patch.Type(), rawPatch); err != nil {
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

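// GetName returns the registered name of this reporter.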
func (gr *gcsK8sReporter) GetName() string {
	return kubernetesreporterapi.ReporterName
}

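// ShouldReport determines whether this reporter applies to the given ProwJob.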
func (gr *gcsK8sReporter) ShouldReport(_ context.Context, _ *logrus.Entry, pj *prowv1.ProwJob) bool {
	// This reporting only makes sense for the Kubernetes agent (otherwise there
	// is no pod to look up). It is only useful for jobs that have started
	// (i.e. have a pending time) and have a build ID.
	if pj.Spec.Agent != prowv1.KubernetesAgent || pj.Status.PendingTime == nil || pj.Status.BuildID == "" {
		return false
	}

	// For ramp-up purposes, we can report on only a subset of jobs.
	if gr.reportFraction < 1.0 {
		// Assume the names are opaque and take the CRC-32C checksum of the name.
		// (Why CRC-32C? It's sufficiently well distributed and fast.)
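		// For example, with reportFraction 0.25 only jobs whose checksum lands
		// in (roughly, modulo float32 rounding) the lowest quarter of the
		// uint32 range are reported.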
		crc := crc32.Checksum([]byte(pj.Name), crc32.MakeTable(crc32.Castagnoli))
		if crc > uint32(math.MaxUint32*gr.reportFraction) {
			return false
		}
	}

	return true
}

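// New constructs a gcsK8sReporter. A minimal wiring sketch (the clientset,
// opener, and configAgent variables below are illustrative, not part of this
// package):
//
//	rg := NewK8sResourceGetter(map[string]corev1.CoreV1Interface{
//		"default": clientset.CoreV1(), // hypothetical *kubernetes.Clientset
//	})
//	reporter := New(configAgent.Config, opener, rg, 1.0, false)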
func New(cfg config.Getter, opener io.Opener, rg resourceGetter, reportFraction float32, dryRun bool) *gcsK8sReporter {
	return &gcsK8sReporter{
		cfg:            cfg,
		dryRun:         dryRun,
		opener:         opener,
		rg:             rg,
		reportFraction: reportFraction,
	}
}