open-cluster-management.io/governance-policy-propagator@v0.13.0/controllers/complianceeventsapi/complianceeventsapi_controller.go

// Copyright Contributors to the Open Cluster Management project

package complianceeventsapi

import (
	"context"
	"database/sql"
	"embed"
	"errors"
	"fmt"
	"net/url"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	"github.com/golang-migrate/migrate/v4"
	// Required to activate the Postgres driver
	_ "github.com/golang-migrate/migrate/v4/database/postgres"
	"github.com/golang-migrate/migrate/v4/source"
	"github.com/golang-migrate/migrate/v4/source/iofs"
	"github.com/lib/pq"
	k8sdepwatches "github.com/stolostron/kubernetes-dependency-watches/client"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	k8sruntime "k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/util/workqueue"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	policyv1 "open-cluster-management.io/governance-policy-propagator/api/v1"
	"open-cluster-management.io/governance-policy-propagator/controllers/common"
)

//go:embed migrations
var migrationsFS embed.FS

const (
	ControllerName       = "compliance-events-api"
	DBSecretName         = "governance-policy-database"
	WatchNamespaceEnvVar = "WATCH_NAMESPACE_COMPLIANCE_EVENTS_STORE"
)

var (
	log                     = ctrl.Log.WithName(ControllerName)
	ErrInvalidDBSecret      = errors.New("the governance-policy-database secret is invalid")
	ErrInvalidConnectionURL = errors.New("the database connection URL is invalid")
	ErrDBConnectionFailed   = errors.New("the compliance events database could not be connected to")
	migrationsSource        source.Driver
	gvkSecret               = schema.GroupVersionKind{Version: "v1", Kind: "Secret"}
	ErrRetryable            = errors.New("")
)

func init() {
	var err error
	migrationsSource, err = iofs.New(migrationsFS, "migrations")

	utilruntime.Must(err)
}

// ComplianceServerCtx acts as a "global" database instance that all required controllers share. The
// ComplianceDBSecretReconciler reconciler is responsible for updating the DB field if the connection info gets added
// or changes. MonitorDatabaseConnection will periodically check the health of the database connection and monitor
// the Queue. See MonitorDatabaseConnection for more information.
type ComplianceServerCtx struct {
	// A write lock is used when the database connection changes and the DB object needs to be replaced.
	// A read lock should be used when the DB is accessed.
	Lock           sync.RWMutex
	DB             *sql.DB
	Queue          workqueue.Interface
	needsMigration bool
	// Required to run a migration after the database connection changed or the feature was enabled.
	connectionURL string
	// These caches get reset after a database migration due to a connection drop and reconnect.
	ParentPolicyToID sync.Map
	PolicyToID       sync.Map
	ClusterID        string
}

// NewComplianceServerCtx returns a ComplianceServerCtx with initialized values. It does not start a connection
// but does validate the connection URL for syntax. If the connection URL is not provided or is invalid,
// ErrInvalidConnectionURL is returned.
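//
// A minimal usage sketch (the connection URL and cluster ID values here are made up for illustration):
//
//	serverCtx, err := NewComplianceServerCtx(
//		"postgresql://user:pass@postgres.example.com:5432/ocm-compliance-history?sslmode=verify-full",
//		clusterID,
//	)
//	if errors.Is(err, ErrInvalidConnectionURL) {
//		// serverCtx is still returned and usable; ComplianceDBSecretReconciler can supply a valid
//		// connection URL later.
//	}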
func NewComplianceServerCtx(dbConnectionURL string, clusterID string) (*ComplianceServerCtx, error) {
	var db *sql.DB
	var err error

	if dbConnectionURL == "" {
		err = ErrInvalidConnectionURL
	} else {
		var openErr error
		// As of the writing of this code, sql.Open doesn't create a connection. db.Ping will though, so this
		// should never fail unless the connection URL is invalid to the Postgres driver.
		db, openErr = sql.Open("postgres", dbConnectionURL)
		if openErr != nil {
			err = fmt.Errorf("%w: %w", ErrInvalidConnectionURL, openErr)
		}
	}

	return &ComplianceServerCtx{
		Lock:          sync.RWMutex{},
		Queue:         workqueue.New(),
		connectionURL: dbConnectionURL,
		DB:            db,
		ClusterID:     clusterID,
	}, err
}

// ComplianceDBSecretReconciler is responsible for managing the compliance events history database migrations and
// keeping the shared database connection up to date.
type ComplianceDBSecretReconciler struct {
	DynamicWatcher k8sdepwatches.DynamicWatcher
	Client         *kubernetes.Clientset
	// TempDir is used for temporary files such as a custom CA to use to verify the Postgres TLS connection. The
	// caller is responsible for cleaning it up after the controller stops.
	TempDir             string
	ConnectionURL       string
	ComplianceServerCtx *ComplianceServerCtx
}

// WARNING: In production, this should be namespaced to the namespace the controller is running in.
//+kubebuilder:rbac:groups=core,resources=secrets,resourceNames=governance-policy-database,verbs=get;list;watch
//+kubebuilder:rbac:groups=core,resources=events,verbs=create
//+kubebuilder:rbac:groups=authorization.k8s.io,resources=subjectaccessreviews,verbs=create

// Reconcile watches the governance-policy-database secret in the controller namespace. On updates it'll trigger
// a database migration and update the shared database connection.
func (r *ComplianceDBSecretReconciler) Reconcile(
	ctx context.Context, watcher k8sdepwatches.ObjectIdentifier,
) (ctrl.Result, error) {
	log := log.WithValues("secretNamespace", watcher.Namespace, "secret", watcher.Name)
	log.Info("Reconciling a Secret")

	// The watch configuration should prevent this from happening, but add this as a precaution.
	if watcher.Name != DBSecretName {
		log.Info("Got a reconciliation request for an unexpected Secret. This should have been filtered out.")

		return reconcile.Result{}, nil
	}

	var parsedConnectionURL string

	dbSecret, err := r.DynamicWatcher.GetFromCache(gvkSecret, watcher.Namespace, watcher.Name)
	if dbSecret == nil || errors.Is(err, k8sdepwatches.ErrNoCacheEntry) {
		parsedConnectionURL = ""
	} else if err != nil {
		return reconcile.Result{}, err
	} else {
		var typedDBSecret corev1.Secret

		err := k8sruntime.DefaultUnstructuredConverter.FromUnstructured(dbSecret.UnstructuredContent(), &typedDBSecret)
		if err != nil {
			log.Error(err, "The cached database secret could not be converted to a typed secret")

			return reconcile.Result{}, nil
		}

		parsedConnectionURL, err = ParseDBSecret(&typedDBSecret, r.TempDir)
		if errors.Is(err, ErrInvalidDBSecret) {
			log.Error(err, "Will retry once the invalid secret is updated")

			parsedConnectionURL = ""
		} else if err != nil {
			log.Error(err, "Will retry in 30 seconds due to the error")

			return reconcile.Result{RequeueAfter: time.Second * 30}, nil
		}
	}

	if r.ConnectionURL != parsedConnectionURL {
		log.Info(
			"The database connection URL has changed. Will handle missed database entries during downtime.",
		)

		r.ConnectionURL = parsedConnectionURL

		r.ComplianceServerCtx.Lock.Lock()
		defer r.ComplianceServerCtx.Lock.Unlock()

		// Need the connection URL for the migration.
		r.ComplianceServerCtx.connectionURL = r.ConnectionURL

		// Clear the database ID caches in case this is a new database or the database was restored
		r.ComplianceServerCtx.ParentPolicyToID = sync.Map{}
		r.ComplianceServerCtx.PolicyToID = sync.Map{}
		clusterKeyCache = sync.Map{}

		if parsedConnectionURL == "" {
			r.ComplianceServerCtx.DB = nil
		} else {
			// As of the writing of this code, sql.Open doesn't create a connection. db.Ping will though, so this
			// should never fail unless the connection URL is invalid to the Postgres driver.
			db, err := sql.Open("postgres", r.ConnectionURL)
			if err != nil {
				log.Error(
					err,
					"The Postgres connection URL could not be parsed by the driver. Try updating the secret.",
				)
			}

			// This may be nil and that is intentional.
			r.ComplianceServerCtx.DB = db
			// Once the connection URL changes, a migration is required in the event this is a new database or the
			// propagator was started when the database was offline and it was not up to date.
			// If the migration fails, let MonitorDatabaseConnection handle it.
			_ = r.ComplianceServerCtx.MigrateDB(ctx, r.Client, watcher.Namespace)
		}
	}

	return reconcile.Result{}, nil
}

// MonitorDatabaseConnection will check the database connection health every 20 seconds. If healthy, it will migrate
// the database if necessary, and send any reconcile requests to the replicated policy controller from
// complianceServerCtx.Queue. To stop MonitorDatabaseConnection, cancel the input context.
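//
// A rough launch sketch, assuming the caller already built the clientset and the channel that feeds the
// replicated policy controller (the variable names are illustrative):
//
//	go MonitorDatabaseConnection(ctx, complianceServerCtx, clientset, controllerNamespace, reconcileRequests)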
func MonitorDatabaseConnection(
	ctx context.Context,
	complianceServerCtx *ComplianceServerCtx,
	client *kubernetes.Clientset,
	controllerNamespace string,
	reconcileRequests chan<- event.GenericEvent,
) {
	for {
		sleep, cancelSleep := context.WithTimeout(context.Background(), time.Second*20)

		log.V(3).Info("Sleeping for 20 seconds until the next database check")

		select {
		case <-ctx.Done():
			complianceServerCtx.Queue.ShutDown()
			cancelSleep()

			return
		case <-sleep.Done():
			// Satisfy the linter, but in reality, this is a noop.
			cancelSleep()
		}

		complianceServerCtx.Lock.RLock()

		if complianceServerCtx.DB == nil {
			complianceServerCtx.Lock.RUnlock()

			continue
		}

		if !complianceServerCtx.needsMigration && complianceServerCtx.Queue.Len() == 0 {
			complianceServerCtx.Lock.RUnlock()

			continue
		}

		if err := complianceServerCtx.DB.PingContext(ctx); err != nil {
			complianceServerCtx.Lock.RUnlock()

			log.Info("The database connection failed: " + err.Error())

			continue
		}

		complianceServerCtx.Lock.RUnlock()

		if complianceServerCtx.needsMigration {
			complianceServerCtx.Lock.Lock()
			err := complianceServerCtx.MigrateDB(ctx, client, controllerNamespace)
			complianceServerCtx.Lock.Unlock()

			if err != nil {
				continue
			}
		}

		log.V(3).Info(
			"The compliance database is up. Checking for queued up reconcile requests.",
			"queueLength", complianceServerCtx.Queue.Len(),
		)

		sendLogMsg := complianceServerCtx.Queue.Len() > 0

		for complianceServerCtx.Queue.Len() > 0 {
			request, shutdown := complianceServerCtx.Queue.Get()

			switch v := request.(type) {
			case types.NamespacedName:
				reconcileRequests <- event.GenericEvent{
					Object: &common.GuttedObject{
						TypeMeta: metav1.TypeMeta{
							APIVersion: policyv1.GroupVersion.String(),
							Kind:       "Policy",
						},
						ObjectMeta: metav1.ObjectMeta{
							Name:      v.Name,
							Namespace: v.Namespace,
						},
					},
				}
			case *EventDetailsQueued:
				complianceEvent := v

				err := RecordLocalClusterComplianceEvent(
					ctx, complianceServerCtx, complianceEvent.EventDetails(),
				)

				requeue := errors.Is(err, ErrRetryable)

				if requeue {
					complianceServerCtx.Queue.Add(request)
				}

				if err != nil {
					log.Info(
						"Failed to record the queued compliance event",
						"requeue", requeue,
						"error", err.Error(),
						"eventMessage", complianceEvent.Message,
						"policyID", complianceEvent.PolicyID,
					)
				} else {
					log.V(2).Info(
						"Recorded the queued compliance event",
						"eventMessage", complianceEvent.Message,
						"policyID", complianceEvent.PolicyID,
					)
				}
			}

			complianceServerCtx.Queue.Done(request)

			// The queue should never get shutdown and still reach here which is why it's an info log. We need to
			// know about it if it happens unexpectedly.
			if shutdown {
				log.Info("The queue was shutdown. Exiting MonitorDatabaseConnection.")

				return
			}
		}

		if sendLogMsg {
			log.V(1).Info(
				"Done sending queued reconcile requests. Sleeping for 20 seconds until the next database check.",
				"queueLength", complianceServerCtx.Queue.Len(),
			)
		}
	}
}

// RecordLocalClusterComplianceEvent will record the input compliance event. It returns ErrRetryable if the compliance
// event should be requeued to record again later.
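//
// Callers typically requeue on ErrRetryable, along the lines of this sketch (queuedEvent is an illustrative
// *EventDetailsQueued value, not a name defined in this package):
//
//	err := RecordLocalClusterComplianceEvent(ctx, complianceServerCtx, queuedEvent.EventDetails())
//	if errors.Is(err, ErrRetryable) {
//		complianceServerCtx.Queue.Add(queuedEvent)
//	}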
func RecordLocalClusterComplianceEvent(
	ctx context.Context, complianceServerCtx *ComplianceServerCtx, complianceEvent *EventDetails,
) error {
	clusterFK, err := GetClusterForeignKey(
		ctx,
		complianceServerCtx.DB,
		Cluster{ClusterID: complianceServerCtx.ClusterID, Name: "local-cluster"},
	)
	if err != nil {
		return fmt.Errorf(
			"%wfailed to get the cluster foreign key to generate a compliance event: %w", ErrRetryable, err,
		)
	}

	complianceEvent.ClusterID = clusterFK

	query, args := complianceEvent.InsertQuery()

	_, err = complianceServerCtx.DB.ExecContext(ctx, query, args...)
	if err != nil {
		// If it's a unique constraint violation, then the event is a duplicate and can be ignored. If it's a foreign
		// key violation, that means the database experienced data loss and the foreign key is invalid, so the
		// compliance event can't be recorded.
		var pqErr *pq.Error

		if errors.As(err, &pqErr) {
			if pqErr.Code == postgresUniqueViolationCode {
				return nil
			}

			if pqErr.Code == postgresForeignKeyViolationCode {
				return fmt.Errorf(
					"failed to record the compliance event because the foreign keys no longer apply: %w", err,
				)
			}
		}

		// If the error was because the database was down, then queue it up for later
		if complianceServerCtx.DB.PingContext(ctx) != nil {
			return errors.Join(ErrRetryable, ErrDBConnectionFailed)
		}

		return fmt.Errorf("failed to record the compliance event: %w", err)
	}

	return nil
}

// MigrateDB will perform a database migration if required and send Kubernetes events if the migration fails.
// ErrDBConnectionFailed will be returned if the database connection failed. Obtain a write lock before calling
// this method if multiple goroutines use this ComplianceServerCtx instance.
func (c *ComplianceServerCtx) MigrateDB(
	ctx context.Context, client *kubernetes.Clientset, controllerNamespace string,
) error {
	c.needsMigration = true

	if c.connectionURL == "" {
		return fmt.Errorf("%w: the connection URL is not set", ErrDBConnectionFailed)
	}

	m, err := migrate.NewWithSourceInstance("iofs", migrationsSource, c.connectionURL)
	if err != nil {
		msg := "Failed to initialize the database migration client"
		log.Error(err, msg)

		_ = sendDBErrorEvent(ctx, client, controllerNamespace, msg)

		return fmt.Errorf("%w: %w", ErrDBConnectionFailed, err)
	}

	defer m.Close()

	err = m.Up()
	if err != nil && err.Error() == "no change" {
		log.Info("The database schema is up to date")
	} else if err != nil {
		msg := "Failed to perform the database migration. The compliance events endpoint will not start until this " +
			"is resolved."

		log.Error(err, msg)

		_ = sendDBErrorEvent(ctx, client, controllerNamespace, msg)

		return fmt.Errorf("%w: %w", ErrDBConnectionFailed, err)
	} else {
		// The errors don't need to be checked because we know the database migration was successful so there is a
		// valid version assigned.
		version, _, _ := m.Version()
		// The cache gets reset after a migration in case the database changed. If the database
		// was restored to an older backup, then the propagator needs to restart to clear the cache.
		c.ParentPolicyToID = sync.Map{}
		c.PolicyToID = sync.Map{}

		msg := fmt.Sprintf("The compliance events database schema was successfully updated to version %d", version)
		log.Info(msg)

		_ = sendDBEvent(ctx, client, controllerNamespace, "Normal", "OCMComplianceEventsDB", msg)
	}

	c.needsMigration = false

	return nil
}

// ParseDBSecret will parse the input database secret and return a connection URL. If the secret contains invalid
// connection information, then ErrInvalidDBSecret is returned.
func ParseDBSecret(dbSecret *corev1.Secret, tempDirPath string) (string, error) {
	var connectionURL *url.URL
	var err error

	if dbSecret.Data["connectionURL"] != nil {
		connectionURL, err = url.Parse(strings.TrimSpace(string(dbSecret.Data["connectionURL"])))
		if err != nil {
			err := fmt.Errorf("%w: failed to parse the connectionURL value: %w", ErrInvalidDBSecret, err)

			return "", err
		}
	} else {
		if dbSecret.Data["user"] == nil {
			return "", fmt.Errorf("%w: no user value was provided", ErrInvalidDBSecret)
		}

		user := string(dbSecret.Data["user"])

		if dbSecret.Data["password"] == nil {
			return "", fmt.Errorf("%w: no password value was provided", ErrInvalidDBSecret)
		}

		password := string(dbSecret.Data["password"])

		if dbSecret.Data["host"] == nil {
			return "", fmt.Errorf("%w: no host value was provided", ErrInvalidDBSecret)
		}

		host := string(dbSecret.Data["host"])

		var port string

		if dbSecret.Data["port"] == nil {
			log.Info("No port value was provided. Using the default 5432.")
			port = "5432"
		} else {
			port = string(dbSecret.Data["port"])
		}

		if dbSecret.Data["dbname"] == nil {
			return "", fmt.Errorf("%w: no dbname value was provided", ErrInvalidDBSecret)
		}

		dbName := string(dbSecret.Data["dbname"])

		var sslMode string

		if dbSecret.Data["sslmode"] == nil {
			log.Info("No sslmode value was provided. Using the default sslmode=verify-full.")
			sslMode = "verify-full"
		} else {
			sslMode = string(dbSecret.Data["sslmode"])
		}

		connectionURL = &url.URL{
			Scheme:   "postgresql",
			User:     url.UserPassword(user, password),
			Host:     fmt.Sprintf("%s:%s", host, port),
			Path:     dbName,
			RawQuery: "sslmode=" + url.QueryEscape(sslMode),
		}
	}

	if !strings.Contains(connectionURL.RawQuery, "connect_timeout=") {
		if connectionURL.RawQuery != "" {
			connectionURL.RawQuery += "&"
		}

		// This is important or else db.Ping() takes too long if the connection is down.
		connectionURL.RawQuery += "connect_timeout=5"
	}

	if dbSecret.Data["ca"] != nil {
		caPath := path.Join(tempDirPath, "db-ca.crt")

		err := os.WriteFile(caPath, dbSecret.Data["ca"], 0o600)
		if err != nil {
			return "", fmt.Errorf("failed to write the custom root CA specified in the secret: %w", err)
		}

		if connectionURL.RawQuery != "" {
			connectionURL.RawQuery += "&"
		}

		connectionURL.RawQuery += "sslrootcert=" + url.QueryEscape(caPath)
	}

	if !strings.Contains(connectionURL.RawQuery, "sslmode=verify-full") {
		log.Info(
			"The configured Postgres connection URL does not specify sslmode=verify-full. Please consider using a " +
				"more secure connection.",
		)
	}

	return connectionURL.String(), nil
}

func sendDBEvent(
	ctx context.Context, client *kubernetes.Clientset, namespace, eventType, reason, msg string,
) error {
	event := &corev1.Event{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("compliance-events-api.%x", time.Now().UnixNano()),
			Namespace: namespace,
		},
		InvolvedObject: corev1.ObjectReference{
			Kind:       "Secret",
			Namespace:  namespace,
			Name:       DBSecretName,
			APIVersion: "v1",
		},
		Type:    eventType,
		Reason:  reason,
		Message: msg,
		Source: corev1.EventSource{
			Component: ControllerName,
		},
		ReportingController: ControllerName,
	}

	_, err := client.CoreV1().Events(namespace).Create(ctx, event, metav1.CreateOptions{})
	if err != nil {
		log.Error(err, "Failed to send a Kubernetes warning event")
	}

	return err
}

func sendDBErrorEvent(ctx context.Context, client *kubernetes.Clientset, namespace, msg string) error {
	fullMsg := msg + " See the governance-policy-propagator logs for more details."

	return sendDBEvent(ctx, client, namespace, "Warning", "OCMComplianceEventsDBError", fullMsg)
}
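// The sketch below is an illustration only (not part of the controller) of the Secret layout that
// ParseDBSecret accepts when the individual fields are used instead of a full connectionURL key. The
// field values are made up. With no port, sslmode, or ca keys, the defaults documented above apply
// (port 5432, sslmode=verify-full, connect_timeout=5):
//
//	secret := &corev1.Secret{
//		Data: map[string][]byte{
//			"user":     []byte("grcuser"),
//			"password": []byte("grcpass"),
//			"host":     []byte("postgres.example.com"),
//			"dbname":   []byte("ocm-compliance-history"),
//		},
//	}
//	connectionURL, err := ParseDBSecret(secret, tempDir)
//	// connectionURL == "postgresql://grcuser:grcpass@postgres.example.com:5432/ocm-compliance-history?sslmode=verify-full&connect_timeout=5"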