github.com/imran-kn/cilium-fork@v1.6.9/pkg/k8s/cnp.go (about) 1 // Copyright 2016-2019 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package k8s 16 17 import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "time" 23 24 "github.com/cilium/cilium/pkg/backoff" 25 cilium_v2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2" 26 clientset "github.com/cilium/cilium/pkg/k8s/client/clientset/versioned" 27 "github.com/cilium/cilium/pkg/k8s/types" 28 k8sUtils "github.com/cilium/cilium/pkg/k8s/utils" 29 k8sversion "github.com/cilium/cilium/pkg/k8s/version" 30 "github.com/cilium/cilium/pkg/logging/logfields" 31 "github.com/cilium/cilium/pkg/metrics" 32 "github.com/cilium/cilium/pkg/spanstat" 33 34 "github.com/sirupsen/logrus" 35 v1 "k8s.io/api/core/v1" 36 k8sTypes "k8s.io/apimachinery/pkg/types" 37 "k8s.io/client-go/tools/cache" 38 ) 39 40 // ErrParse is an error to describe where policy fails to parse due any invalid 41 // rule. 42 type ErrParse struct { 43 msg string 44 } 45 46 // Error returns the error message for parsing 47 func (e ErrParse) Error() string { 48 return e.msg 49 } 50 51 // IsErrParse returns true if the error is a ErrParse 52 func IsErrParse(e error) bool { 53 _, ok := e.(ErrParse) 54 return ok 55 } 56 57 // CNPStatusUpdateContext is the context required to update the status of a 58 // CNP. It is filled out by the owner of the Kubernetes client before 59 // UpdateStatus() is called. 
type CNPStatusUpdateContext struct {
	// CiliumNPClient is the CiliumNetworkPolicy client. update() uses it
	// to issue the Patch, UpdateStatus or Update call that writes the
	// node status.
	CiliumNPClient clientset.Interface

	// CiliumV2Store is a store containing all CiliumNetworkPolicy. It may
	// be nil, in which case prepareUpdate() works on the CNP passed in by
	// the caller instead of looking up the latest version in the store.
	CiliumV2Store cache.Store

	// NodeName is the name of the node, it is used to separate status
	// field entries per node
	NodeName string

	// NodeManager implements the backoff.NodeManager interface and is used
	// to provide cluster-size dependent backoff
	NodeManager backoff.NodeManager

	// UpdateDuration must be populated using spanstat.Start() to provide
	// the timestamp of when the status update operation was started. It is
	// used to provide the latency in the Prometheus metrics. When nil,
	// UpdateStatus() does not record the completion metric.
	UpdateDuration *spanstat.SpanStat

	// WaitForEndpointsAtPolicyRev must point to a function that will wait
	// for all local endpoints to reach the particular policy revision.
	// UpdateStatus() calls it unconditionally, so it must not be nil.
	WaitForEndpointsAtPolicyRev func(ctx context.Context, rev uint64) error
}

// getUpdatedCNPFromStore gets the most recent version of cnp from the store
// ciliumV2Store, which is updated by the Kubernetes watcher. This reduces
// the possibility of Cilium trying to update cnp in Kubernetes which has
// been updated between the time the watcher in this Cilium instance has
// received cnp, and when this function is called. This still may occur,
// though, and users of the returned CiliumNetworkPolicy may not be able to
// update the cnp because it may become out-of-date. Returns an error if the
// CNP cannot be retrieved from the store, or the object retrieved from the
// store is not of the expected type.
94 func (c *CNPStatusUpdateContext) getUpdatedCNPFromStore(cnp *types.SlimCNP) (*types.SlimCNP, error) { 95 serverRuleStore, exists, err := c.CiliumV2Store.Get(cnp) 96 if err != nil { 97 return nil, fmt.Errorf("unable to find v2.CiliumNetworkPolicy in local cache: %s", err) 98 } 99 if !exists { 100 return nil, errors.New("v2.CiliumNetworkPolicy does not exist in local cache") 101 } 102 103 serverRule, ok := serverRuleStore.(*types.SlimCNP) 104 if !ok { 105 return nil, errors.New("received object of unknown type from API server, expecting v2.CiliumNetworkPolicy") 106 } 107 108 return serverRule, nil 109 } 110 111 func (c *CNPStatusUpdateContext) prepareUpdate(cnp *types.SlimCNP, scopedLog *logrus.Entry) (serverRule *types.SlimCNP, err error) { 112 var localCopy *types.SlimCNP 113 114 if c.CiliumV2Store != nil { 115 localCopy, err = c.getUpdatedCNPFromStore(cnp) 116 if err != nil { 117 scopedLog.WithError(err).Debug("error getting updated CNP from store") 118 return 119 } 120 121 // Make a copy since the rule is a pointer, and any of its fields 122 // which are also pointers could be modified outside of this 123 // function. 124 serverRule = localCopy.DeepCopy() 125 _, err = serverRule.Parse() 126 if err != nil { 127 err = ErrParse{err.Error()} 128 scopedLog.WithError(err).WithField(logfields.Object, logfields.Repr(serverRule)). 129 Warn("Error parsing new CiliumNetworkPolicy rule") 130 } else { 131 scopedLog.WithField("cnpFromStore", serverRule.String()).Debug("copy of CNP retrieved from store which is being updated with status") 132 } 133 134 return 135 } 136 137 serverRule = cnp 138 _, err = cnp.Parse() 139 if err != nil { 140 log.WithError(err).WithField(logfields.Object, logfields.Repr(serverRule)). 
141 Warn("Error parsing new CiliumNetworkPolicy rule") 142 err = ErrParse{err.Error()} 143 } 144 145 return 146 } 147 148 func (c *CNPStatusUpdateContext) updateStatus(cnp *types.SlimCNP, rev uint64, policyImportErr, waitForEPsErr error) (err error) { 149 // Update the status of whether the rule is enforced on this node. If 150 // we are unable to parse the CNP retrieved from the store, or if 151 // endpoints did not reach the desired policy revision after 30 152 // seconds, then mark the rule as not being enforced. 153 if policyImportErr != nil { 154 // OK is false here because the policy wasn't imported into 155 // cilium on this node; since it wasn't imported, it also isn't 156 // enforced. 157 err = c.update(cnp, false, false, policyImportErr, rev, cnp.Annotations) 158 } else { 159 // If the deadline by the above context, then not all endpoints 160 // are enforcing the given policy, and waitForEpsErr will be 161 // non-nil. 162 err = c.update(cnp, waitForEPsErr == nil, true, waitForEPsErr, rev, cnp.Annotations) 163 } 164 165 return 166 } 167 168 // UpdateStatus updates the status section of a CiliumNetworkPolicy. It will 169 // retry as long as required to update the status unless a non-temporary error 170 // occurs in which case it expects a surrounding controller to restart or give 171 // up. 
172 func (c *CNPStatusUpdateContext) UpdateStatus(ctx context.Context, cnp *types.SlimCNP, rev uint64, policyImportErr error) error { 173 var ( 174 err error 175 serverRule *types.SlimCNP 176 177 // The following is an example distribution with jitter applied: 178 // 179 // nodes 4 16 128 512 1024 2048 180 // 1: 2.6s 5.5s 8.1s 9s 9.9s 12.9s 181 // 2: 1.9s 4.2s 6.3s 11.9s 17.6s 26.2s 182 // 3: 4s 10.4s 15.7s 26.7s 20.7s 23.3s 183 // 4: 18s 12.1s 19.7s 40s 1m6.3s 1m46.3s 184 // 5: 16.2s 28.9s 1m58.2s 46.2s 2m0s 2m0s 185 // 6: 54.7s 7.9s 53.3s 2m0s 2m0s 45.8s 186 // 7: 1m55.5s 22.8s 2m0s 2m0s 2m0s 2m0s 187 // 8: 1m45.8s 1m36.7s 2m0s 2m0s 2m0s 2m0s 188 cnpBackoff = backoff.Exponential{ 189 Min: time.Second, 190 NodeManager: c.NodeManager, 191 Jitter: true, 192 } 193 194 scopedLog = log.WithFields(logrus.Fields{ 195 logfields.CiliumNetworkPolicyName: cnp.ObjectMeta.Name, 196 logfields.K8sAPIVersion: cnp.TypeMeta.APIVersion, 197 logfields.K8sNamespace: cnp.ObjectMeta.Namespace, 198 }) 199 ) 200 ctxEndpointWait, cancel := context.WithTimeout(ctx, 30*time.Second) 201 defer cancel() 202 203 waitForEPsErr := c.WaitForEndpointsAtPolicyRev(ctxEndpointWait, rev) 204 205 numAttempts := 0 206 retryLoop: 207 for { 208 numAttempts++ 209 210 select { 211 case <-ctx.Done(): 212 // The owning controller wants us to stop, no error is 213 // returned. This is graceful 214 err = fmt.Errorf("status update cancelled via context: %s", ctx.Err()) 215 break retryLoop 216 default: 217 } 218 219 // Failure to prepare are returned as error immediately to 220 // expose them via the controller status as these errors are 221 // most likely not temporary. 222 // In case of a CNP parse error will update the status in the CNP. 
223 serverRule, err = c.prepareUpdate(cnp, scopedLog) 224 if IsErrParse(err) { 225 statusErr := c.updateStatus(serverRule, rev, err, waitForEPsErr) 226 if statusErr != nil { 227 scopedLog.WithError(statusErr).Debug("CNP status for invalid rule cannot be updated") 228 } 229 } 230 if err != nil { 231 return err 232 } 233 234 err = c.updateStatus(serverRule, rev, policyImportErr, waitForEPsErr) 235 scopedLog.WithError(err).WithField("status", serverRule.Status).Debug("CNP status update result from apiserver") 236 237 switch { 238 case waitForEPsErr != nil: 239 // Waiting for endpoints has failed previously. We made 240 // an attempt to make this error condition visible via 241 // the status field. Regardless of whether this has 242 // succeeded or not, return an error to have the 243 // surrounding controller retry the wait for endpoint 244 // state. 245 err = waitForEPsErr 246 break retryLoop 247 248 case err == nil: 249 // The status update was successful 250 break retryLoop 251 } 252 253 cnpBackoff.Wait(ctx) 254 // error of Wait() can be ignored, if the context is cancelled, 255 // the next iteration of the loop will break out 256 } 257 258 outcome := metrics.LabelValueOutcomeSuccess 259 if err != nil { 260 outcome = metrics.LabelValueOutcomeFail 261 } 262 263 if c.UpdateDuration != nil { 264 latency := c.UpdateDuration.End(err == nil).Total() 265 metrics.KubernetesCNPStatusCompletion.WithLabelValues(fmt.Sprintf("%d", numAttempts), outcome).Observe(latency.Seconds()) 266 } 267 268 return err 269 } 270 271 func (c *CNPStatusUpdateContext) update(cnp *types.SlimCNP, enforcing, ok bool, cnpError error, rev uint64, cnpAnnotations map[string]string) error { 272 var ( 273 cnpns cilium_v2.CiliumNetworkPolicyNodeStatus 274 annotations map[string]string 275 err error 276 ) 277 278 capabilities := k8sversion.Capabilities() 279 280 switch { 281 case cnpAnnotations == nil: 282 // don't bother doing anything if cnpAnnotations is nil. 
283 case capabilities.Patch: 284 // in k8s versions that support JSON Patch we can safely modify the 285 // cnpAnnotations as the CNP, along with these annotations, is not sent to 286 // k8s api-server. 287 annotations = cnpAnnotations 288 lastAppliedConfig, ok := annotations[v1.LastAppliedConfigAnnotation] 289 defer func() { 290 if ok { 291 cnpAnnotations[v1.LastAppliedConfigAnnotation] = lastAppliedConfig 292 } 293 }() 294 default: 295 // for all other k8s versions, sense the CNP is sent with the 296 // annotations we need to make a deepcopy. 297 m := make(map[string]string, len(cnpAnnotations)) 298 for k, v := range cnpAnnotations { 299 m[k] = v 300 } 301 annotations = m 302 } 303 304 // Ignore LastAppliedConfigAnnotation as it can be really costly to upload 305 // this as part of the status. 306 delete(annotations, v1.LastAppliedConfigAnnotation) 307 308 if cnpError != nil { 309 cnpns = cilium_v2.CiliumNetworkPolicyNodeStatus{ 310 Enforcing: enforcing, 311 Error: cnpError.Error(), 312 OK: ok, 313 LastUpdated: cilium_v2.NewTimestamp(), 314 Annotations: annotations, 315 } 316 } else { 317 cnpns = cilium_v2.CiliumNetworkPolicyNodeStatus{ 318 Enforcing: enforcing, 319 Revision: rev, 320 OK: ok, 321 LastUpdated: cilium_v2.NewTimestamp(), 322 Annotations: annotations, 323 } 324 } 325 326 ns := k8sUtils.ExtractNamespace(&cnp.ObjectMeta) 327 328 switch { 329 case capabilities.Patch: 330 // This is a JSON Patch [RFC 6902] used to create the `/status/nodes` 331 // field in the CNP. If we don't create, replacing the status for this 332 // node will fail as the path does not exist. 333 // Worst case scenario is that all nodes try to perform this operation 334 // and only one node will succeed. This can be moved to the 335 // cilium-operator which will create the path for all nodes. 
However 336 // performance tests have shown that performing 2 API calls to 337 // kube-apiserver for 500 nodes versus performing 1 API call, where 338 // one of the nodes would "create" the `/status` path before all other 339 // nodes tried to replace their own status resulted in a gain of 3 %. 340 // This gain is less notable once the number of nodes increases. 341 createStatusAndNodePatch := []JSONPatch{ 342 { 343 OP: "test", 344 Path: "/status", 345 Value: nil, 346 }, 347 { 348 OP: "add", 349 Path: "/status", 350 Value: cilium_v2.CiliumNetworkPolicyStatus{ 351 Nodes: map[string]cilium_v2.CiliumNetworkPolicyNodeStatus{ 352 c.NodeName: cnpns, 353 }, 354 }, 355 }, 356 } 357 358 var createStatusAndNodePatchJSON []byte 359 createStatusAndNodePatchJSON, err = json.Marshal(createStatusAndNodePatch) 360 if err != nil { 361 return err 362 } 363 364 _, err = c.CiliumNPClient.CiliumV2().CiliumNetworkPolicies(ns).Patch(cnp.GetName(), k8sTypes.JSONPatchType, createStatusAndNodePatchJSON, "status") 365 if err != nil { 366 // If it fails it means the test from the previous patch failed 367 // so we can safely replace this node in the CNP status. 368 createStatusAndNodePatch := []JSONPatch{ 369 { 370 OP: "replace", 371 Path: "/status/nodes/" + c.NodeName, 372 Value: cnpns, 373 }, 374 } 375 createStatusAndNodePatchJSON, err = json.Marshal(createStatusAndNodePatch) 376 if err != nil { 377 return err 378 } 379 _, err = c.CiliumNPClient.CiliumV2().CiliumNetworkPolicies(ns).Patch(cnp.GetName(), k8sTypes.JSONPatchType, createStatusAndNodePatchJSON, "status") 380 } 381 case capabilities.UpdateStatus: 382 // k8s < 1.13 as minimal support for JSON patch where kube-apiserver 383 // can print Error messages and even panic in k8s < 1.10. 
384 cnp.SetPolicyStatus(c.NodeName, cnpns) 385 _, err = c.CiliumNPClient.CiliumV2().CiliumNetworkPolicies(ns).UpdateStatus(cnp.CiliumNetworkPolicy) 386 default: 387 // k8s < 1.13 as minimal support for JSON patch where kube-apiserver 388 // can print Error messages and even panic in k8s < 1.10. 389 cnp.SetPolicyStatus(c.NodeName, cnpns) 390 _, err = c.CiliumNPClient.CiliumV2().CiliumNetworkPolicies(ns).Update(cnp.CiliumNetworkPolicy) 391 } 392 return err 393 }