sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/crier/reporters/gcs/kubernetes/reporter.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubernetes

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"hash/crc32"
	"math"
	"path"
	"strings"
	"time"

	"github.com/sirupsen/logrus"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/kubernetes/scheme"
	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	utilpointer "k8s.io/utils/pointer"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	prowv1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/config"
	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
	"sigs.k8s.io/prow/pkg/crier/reporters/gcs/util"
	"sigs.k8s.io/prow/pkg/io"
	"sigs.k8s.io/prow/pkg/io/providers"
)

type gcsK8sReporter struct {
	cfg            config.Getter
	dryRun         bool
	opener         io.Opener
	rg             resourceGetter
	reportFraction float32
}

type PodReport struct {
	Pod    *v1.Pod    `json:"pod,omitempty"`
	Events []v1.Event `json:"events,omitempty"`
}

type resourceGetter interface {
	GetPod(ctx context.Context, cluster, namespace, name string) (*v1.Pod, error)
	GetEvents(cluster, namespace string, pod *v1.Pod) ([]v1.Event, error)
	PatchPod(ctx context.Context, cluster, namespace, name string, pt types.PatchType, data []byte) error
}

type k8sResourceGetter struct {
	podClientSets map[string]corev1.CoreV1Interface
}

func NewK8sResourceGetter(podClientSets map[string]corev1.CoreV1Interface) *k8sResourceGetter {
	return &k8sResourceGetter{podClientSets: podClientSets}
}

func (rg k8sResourceGetter) GetPod(ctx context.Context, cluster, namespace, name string) (*v1.Pod, error) {
	if _, ok := rg.podClientSets[cluster]; !ok {
		return nil, fmt.Errorf("couldn't find cluster %q", cluster)
	}
	return rg.podClientSets[cluster].Pods(namespace).Get(ctx, name, metav1.GetOptions{})
}

func (rg k8sResourceGetter) PatchPod(ctx context.Context, cluster, namespace, name string, pt types.PatchType, data []byte) error {
	if _, ok := rg.podClientSets[cluster]; !ok {
		return fmt.Errorf("couldn't find cluster %q", cluster)
	}

	_, err := rg.podClientSets[cluster].Pods(namespace).Patch(ctx, name, pt, data, metav1.PatchOptions{})
	return err
}

func (rg k8sResourceGetter) GetEvents(cluster, namespace string, pod *v1.Pod) ([]v1.Event, error) {
	if _, ok := rg.podClientSets[cluster]; !ok {
		return nil, fmt.Errorf("couldn't find cluster %q", cluster)
	}
	events, err := rg.podClientSets[cluster].Events(namespace).Search(scheme.Scheme, pod)
	if err != nil {
		return nil, err
	}
	return events.Items, nil
}

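// Report delegates to report and always returns the ProwJob it was given,
// together with any requeue result; crier calls this as the entry point for
// this reporter. (Descriptive comment added for clarity; not in the original.)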
func (gr *gcsK8sReporter) Report(ctx context.Context, log *logrus.Entry, pj *prowv1.ProwJob) ([]*prowv1.ProwJob, *reconcile.Result, error) {
	result, err := gr.report(ctx, log, pj)
	return []*prowv1.ProwJob{pj}, result, err
}

func (gr *gcsK8sReporter) report(ctx context.Context, log *logrus.Entry, pj *prowv1.ProwJob) (*reconcile.Result, error) {
	ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
	defer cancel()

	// Check if we have a destination before adding a finalizer so we don't add
	// one that we'll never remove.
	_, _, err := util.GetJobDestination(gr.cfg, pj)
	if err != nil {
		log.WithError(err).Warn("Not uploading because we couldn't find a destination")
		return nil, nil
	}

	if !pj.Complete() && pj.Status.State != prowv1.AbortedState {
		if err := gr.addFinalizer(ctx, pj); err != nil {
			return nil, fmt.Errorf("failed to add finalizer to pod: %w", err)
		}
		return nil, nil
	}

	// Aborted jobs are not completed initially
	if !pj.Complete() {
		log.Debug("Requeuing aborted job that is not complete.")
		return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
	}

	return nil, gr.reportPodInfo(ctx, log, pj)
}

func (gr *gcsK8sReporter) addFinalizer(ctx context.Context, pj *prowv1.ProwJob) error {
	pod, err := gr.rg.GetPod(ctx, pj.Spec.Cluster, gr.cfg().PodNamespace, pj.Name)
	if err != nil {
		return fmt.Errorf("failed to get pod %s: %w", pj.Name, err)
	}

	if pod.DeletionTimestamp != nil {
		return nil
	}

	finalizers := sets.New[string](pod.Finalizers...)
	if finalizers.Has(kubernetesreporterapi.FinalizerName) {
		return nil
	}

	originalPod := pod.DeepCopy()
	pod.Finalizers = sets.List(finalizers.Insert(kubernetesreporterapi.FinalizerName))
	patch := ctrlruntimeclient.MergeFrom(originalPod)
	patchData, err := patch.Data(pod)
	if err != nil {
		return fmt.Errorf("failed to construct patch: %w", err)
	}

	if err := gr.rg.PatchPod(ctx, pj.Spec.Cluster, pod.Namespace, pod.Name, patch.Type(), patchData); err != nil {
		// The pod occasionally gets deleted between our check above and this request
		if strings.Contains(err.Error(), "no new finalizers can be added if the object is being deleted") {
			return nil
		}
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

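// reportPodInfo fetches the pod and its events for a completed ProwJob, uploads
// them as podinfo.json to the job's storage destination, and then removes this
// reporter's finalizer from the pod so it can be deleted. (Descriptive comment
// added for clarity; not in the original.)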
func (gr *gcsK8sReporter) reportPodInfo(ctx context.Context, log *logrus.Entry, pj *prowv1.ProwJob) error {
	// We only report this after a prowjob is complete (and, therefore, pod state is immutable)
	if !pj.Complete() {
		return errors.New("cannot report incomplete jobs")
	}

	pod, err := gr.rg.GetPod(ctx, pj.Spec.Cluster, gr.cfg().PodNamespace, pj.Name)
	if err != nil {
		// If we return an error we will be retried ~indefinitely. Given that permanent errors
		// are expected (pods will be garbage collected), this isn't useful. Instead, just
		// go along with it.
		log.WithError(err).Info("Couldn't fetch pod")
		pod = nil
	}

	var events []v1.Event
	if pod != nil {
		events, err = gr.rg.GetEvents(pj.Spec.Cluster, gr.cfg().PodNamespace, pod)
		if err != nil {
			log.WithError(err).Info("Couldn't fetch events for pod")
		}
	}

	if pod == nil && len(events) == 0 {
		log.Info("Not reporting job because we could fetch neither pod nor events")
		return nil
	}

	report := PodReport{
		Pod:    pod,
		Events: events,
	}

	output, err := json.MarshalIndent(report, "", "\t")
	if err != nil {
		// This should never happen.
		log.WithError(err).Warn("Couldn't marshal pod info")
	}

	bucketName, dir, err := util.GetJobDestination(gr.cfg, pj)
	if err != nil {
		return fmt.Errorf("couldn't get job destination: %w", err)
	}

	if gr.dryRun {
		log.WithFields(logrus.Fields{"bucketName": bucketName, "dir": dir}).Info("Would upload pod info")
		return nil
	}

	overWriteOpts := io.WriterOptions{PreconditionDoesNotExist: utilpointer.Bool(false)}
	podInfoPath, err := providers.StoragePath(bucketName, path.Join(dir, "podinfo.json"))
	if err != nil {
		return fmt.Errorf("failed to resolve podinfo.json path: %w", err)
	}
	if err := io.WriteContent(ctx, log, gr.opener, podInfoPath, output, overWriteOpts); err != nil {
		return fmt.Errorf("failed to upload pod manifest to object storage: %w", err)
	}

	if pod == nil {
		return nil
	}

	if err := gr.removeFinalizer(ctx, pj.Spec.Cluster, pod); err != nil {
		return fmt.Errorf("failed to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
	}

	return nil
}

func (gr *gcsK8sReporter) removeFinalizer(ctx context.Context, cluster string, pod *v1.Pod) error {
	finalizers := sets.New[string](pod.Finalizers...)
	if !finalizers.Has(kubernetesreporterapi.FinalizerName) {
		return nil
	}

	oldPod := pod.DeepCopy()
	pod.Finalizers = sets.List(finalizers.Delete(kubernetesreporterapi.FinalizerName))
	patch := ctrlruntimeclient.MergeFrom(oldPod)
	rawPatch, err := patch.Data(pod)
	if err != nil {
		return fmt.Errorf("failed to construct patch: %w", err)
	}

	if err := gr.rg.PatchPod(ctx, cluster, pod.Namespace, pod.Name, patch.Type(), rawPatch); err != nil {
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

func (gr *gcsK8sReporter) GetName() string {
	return kubernetesreporterapi.ReporterName
}

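// ShouldReport restricts reporting to Kubernetes-agent jobs that have a pending
// time and a build ID; when reportFraction is below 1.0 it further selects a
// deterministic subset of jobs based on a checksum of the job name.
// (Descriptive comment added for clarity; not in the original.)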
func (gr *gcsK8sReporter) ShouldReport(_ context.Context, _ *logrus.Entry, pj *prowv1.ProwJob) bool {
	// This reporting only makes sense for the Kubernetes agent (otherwise we don't
	// have a pod to look up). It is only particularly useful for us to look at
	// complete jobs that have a build ID.
	if pj.Spec.Agent != prowv1.KubernetesAgent || pj.Status.PendingTime == nil || pj.Status.BuildID == "" {
		return false
	}

	// For ramp-up purposes, we can report only on a subset of jobs.
	if gr.reportFraction < 1.0 {
		// Assume the names are opaque and take the CRC-32C checksum of the name.
		// (Why CRC-32C? It's sufficiently well distributed and fast)
		crc := crc32.Checksum([]byte(pj.Name), crc32.MakeTable(crc32.Castagnoli))
		if crc > uint32(math.MaxUint32*gr.reportFraction) {
			return false
		}
	}

	return true
}

func New(cfg config.Getter, opener io.Opener, rg resourceGetter, reportFraction float32, dryRun bool) *gcsK8sReporter {
	return &gcsK8sReporter{
		cfg:            cfg,
		dryRun:         dryRun,
		opener:         opener,
		rg:             rg,
		reportFraction: reportFraction,
	}
}
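
// Illustrative usage sketch (not part of this file): a caller could wire the
// reporter up roughly as below. cfg, opener and buildClusterClients are
// assumptions, provided by the surrounding program (e.g. crier's setup code),
// not identifiers defined here.
//
//	rg := NewK8sResourceGetter(buildClusterClients) // map of build cluster name -> CoreV1Interface
//	reporter := New(cfg, opener, rg, 1.0, false)    // report every job, no dry-run
//	_ = reporter                                    // registered with the crier controller elsewhere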