github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/verrazzano-backup-hook/opensearch/opensearch.go (about) 1 // Copyright (c) 2022, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 4 package opensearch 5 6 import ( 7 "bytes" 8 "context" 9 "encoding/json" 10 "fmt" 11 "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/constants" 12 "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/types" 13 "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/utilities" 14 "go.uber.org/zap" 15 "io" 16 "io/ioutil" 17 "net/http" 18 "os" 19 "time" 20 ) 21 22 // HTTPHelper supports net/http calls of type GET/POST/DELETE 23 func (o *OpensearchImpl) HTTPHelper(ctx context.Context, method, requestURL string, body io.Reader, data interface{}) error { 24 o.Log.Debugf("Invoking HTTP '%s' request with url '%s'", method, requestURL) 25 var response *http.Response 26 var request *http.Request 27 var err error 28 29 timeoutDuration, err := time.ParseDuration(o.Timeout) 30 if err != nil { 31 o.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 32 os.Exit(1) 33 } 34 ctx, cancel := context.WithTimeout(ctx, timeoutDuration) 35 defer cancel() 36 37 switch method { 38 case "GET": 39 request, err = http.NewRequestWithContext(ctx, http.MethodGet, requestURL, body) 40 case "POST": 41 request, err = http.NewRequestWithContext(ctx, http.MethodPost, requestURL, body) 42 case "DELETE": 43 request, err = http.NewRequestWithContext(ctx, http.MethodDelete, requestURL, body) 44 } 45 if err != nil { 46 o.Log.Error("Error creating request ", zap.Error(err)) 47 return err 48 } 49 50 request.Header.Add("Content-Type", constants.HTTPContentType) 51 response, err = o.Client.Do(request) 52 if err != nil { 53 o.Log.Errorf("HTTP '%s' failure while invoking url '%s' due to '%v'", method, requestURL, zap.Error(err)) 54 return err 55 } 56 defer response.Body.Close() 57 58 bdata, err := ioutil.ReadAll(response.Body) 59 if err != nil { 60 o.Log.Errorf("Unable to read response body ", zap.Error(err)) 61 return err 62 } 63 64 if response.StatusCode != 200 { 65 o.Log.Errorf("Error completing request, response code '%v', response body '%v'", response.StatusCode, string(bdata)) 66 return err 67 } 68 69 err = json.Unmarshal(bdata, &data) 70 if err != nil { 71 o.Log.Errorf("json unmarshalling error %v", err) 72 return err 73 } 74 75 return nil 76 } 77 78 // EnsureOpenSearchIsReachable is used determine whether OpenSearch cluster is reachable 79 func (o *OpensearchImpl) EnsureOpenSearchIsReachable() error { 80 o.Log.Infof("Checking if cluster is reachable") 81 var osinfo types.OpenSearchClusterInfo 82 done := false 83 var timeSeconds float64 84 85 if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString { 86 // if UT flag is set, skip to avoid retry logic 87 return nil 88 } 89 90 timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout) 91 if err != nil { 92 o.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 93 return err 94 } 95 totalSeconds := timeParse.Seconds() 96 97 for !done { 98 err := o.HTTPHelper(context.Background(), "GET", o.BaseURL, nil, &osinfo) 99 if err != nil { 100 if timeSeconds < totalSeconds { 101 message := "Cluster is not reachable" 102 duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log) 103 if err != nil { 104 return err 105 } 106 timeSeconds = timeSeconds + float64(duration) 107 } else { 108 o.Log.Errorf("VeleroTimeout '%s' exceeded. Cluster not reachable", o.SecretData.VeleroTimeout) 109 return err 110 } 111 } else { 112 done = true 113 } 114 } 115 116 o.Log.Infof("Cluster '%s' is reachable", osinfo.ClusterName) 117 118 return nil 119 } 120 121 // EnsureOpenSearchIsHealthy ensures OpenSearch cluster is healthy 122 func (o *OpensearchImpl) EnsureOpenSearchIsHealthy() error { 123 o.Log.Infof("Checking if cluster is healthy") 124 var clusterHealth types.OpenSearchHealthResponse 125 err := o.EnsureOpenSearchIsReachable() 126 if err != nil { 127 return err 128 } 129 130 healthURL := fmt.Sprintf("%s/_cluster/health", o.BaseURL) 131 healthReachable := false 132 var timeSeconds float64 133 134 timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout) 135 if err != nil { 136 o.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 137 return err 138 } 139 totalSeconds := timeParse.Seconds() 140 141 if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString { 142 // if UT flag is set, skip to avoid retry logic 143 return nil 144 } 145 146 for !healthReachable { 147 err = o.HTTPHelper(context.Background(), "GET", healthURL, nil, &clusterHealth) 148 if err != nil { 149 if timeSeconds < totalSeconds { 150 message := "Cluster health endpoint is not reachable" 151 duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log) 152 if err != nil { 153 return err 154 } 155 timeSeconds = timeSeconds + float64(duration) 156 } else { 157 o.Log.Errorf("VeleroTimeout '%s' exceeded. Cluster health endpoint is not reachable", o.SecretData.VeleroTimeout) 158 return err 159 } 160 } else { 161 o.Log.Infof("Cluster health endpoint is reachable now") 162 healthReachable = true 163 } 164 } 165 166 healthGreen := false 167 168 for !healthGreen { 169 err = o.HTTPHelper(context.Background(), "GET", healthURL, nil, &clusterHealth) 170 if err != nil { 171 if timeSeconds < totalSeconds { 172 message := "Json unmarshalling error" 173 duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log) 174 if err != nil { 175 return err 176 } 177 timeSeconds = timeSeconds + float64(duration) 178 continue 179 } else { 180 return fmt.Errorf("VeleroTimeout '%s' exceeded. Json unmarshalling error while checking cluster health %v", o.SecretData.VeleroTimeout, zap.Error(err)) 181 } 182 } 183 184 if clusterHealth.Status != "green" { 185 if timeSeconds < totalSeconds { 186 message := fmt.Sprintf("Cluster health is '%s'", clusterHealth.Status) 187 duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log) 188 if err != nil { 189 return err 190 } 191 timeSeconds = timeSeconds + float64(duration) 192 } else { 193 return fmt.Errorf("VeleroTimeout '%s' exceeded. Cluster health expected 'green' , current state '%s'", o.SecretData.VeleroTimeout, clusterHealth.Status) 194 } 195 } else { 196 healthGreen = true 197 } 198 } 199 200 if healthReachable && healthGreen { 201 o.Log.Infof("Cluster is reachable and healthy with status as '%s'", clusterHealth.Status) 202 return nil 203 } 204 205 return err 206 } 207 208 // ReloadOpensearchSecureSettings used to reload secure settings once object store keys are updated 209 func (o *OpensearchImpl) ReloadOpensearchSecureSettings() error { 210 var secureSettings types.OpenSearchSecureSettingsReloadStatus 211 url := fmt.Sprintf("%s/_nodes/reload_secure_settings", o.BaseURL) 212 213 err := o.HTTPHelper(context.Background(), "POST", url, nil, &secureSettings) 214 if err != nil { 215 return err 216 } 217 218 if secureSettings.ClusterNodes.Failed == 0 && secureSettings.ClusterNodes.Total == 0 && secureSettings.ClusterNodes.Successful == 0 { 219 return fmt.Errorf("Invalid cluster settings detected. Check the connection") 220 } 221 222 if secureSettings.ClusterNodes.Failed == 0 && secureSettings.ClusterNodes.Total == secureSettings.ClusterNodes.Successful { 223 o.Log.Infof("Secure settings reloaded sucessfully across all '%v' nodes of the cluster", secureSettings.ClusterNodes.Total) 224 return nil 225 } 226 return fmt.Errorf("Not all nodes were updated successfully. Total = '%v', Failed = '%v' , Successful = '%v'", secureSettings.ClusterNodes.Total, secureSettings.ClusterNodes.Failed, secureSettings.ClusterNodes.Successful) 227 } 228 229 // RegisterSnapshotRepository registers an object store with OpenSearch using the s3-plugin 230 func (o *OpensearchImpl) RegisterSnapshotRepository() error { 231 o.Log.Infof("Registering s3 backend repository '%s'", constants.OpenSearchSnapShotRepoName) 232 var snapshotPayload types.OpenSearchSnapshotRequestPayload 233 var registerResponse types.OpenSearchOperationResponse 234 snapshotPayload.Type = "s3" 235 snapshotPayload.Settings.Bucket = o.SecretData.BucketName 236 snapshotPayload.Settings.Region = o.SecretData.RegionName 237 snapshotPayload.Settings.Client = "default" 238 snapshotPayload.Settings.Endpoint = o.SecretData.Endpoint 239 snapshotPayload.Settings.PathStyleAccess = true 240 241 postBody, err := json.Marshal(snapshotPayload) 242 if err != nil { 243 return err 244 } 245 246 url := fmt.Sprintf("%s/_snapshot/%s", o.BaseURL, constants.OpenSearchSnapShotRepoName) 247 248 err = o.HTTPHelper(context.Background(), "POST", url, bytes.NewBuffer(postBody), ®isterResponse) 249 if err != nil { 250 return err 251 } 252 253 if registerResponse.Acknowledged { 254 o.Log.Infof("Snapshot registered successfully !") 255 return nil 256 } 257 return fmt.Errorf("Snapshot registration unsuccessful. Response = %v", registerResponse) 258 } 259 260 // TriggerSnapshot this triggers a snapshot/backup of all the data streams/indices 261 func (o *OpensearchImpl) TriggerSnapshot() error { 262 o.Log.Infof("Triggering snapshot with name '%s'", o.SecretData.BackupName) 263 var snapshotResponse types.OpenSearchSnapshotResponse 264 snapShotURL := fmt.Sprintf("%s/_snapshot/%s/%s", o.BaseURL, constants.OpenSearchSnapShotRepoName, o.SecretData.BackupName) 265 266 err := o.HTTPHelper(context.Background(), "POST", snapShotURL, nil, &snapshotResponse) 267 if err != nil { 268 return err 269 } 270 271 if !snapshotResponse.Accepted { 272 return fmt.Errorf("Snapshot trigger failure. Response = %v ", snapshotResponse) 273 } 274 o.Log.Infof("Snapshot triggered successfully !") 275 return nil 276 } 277 278 // CheckSnapshotProgress checks the data backup progress. 279 func (o *OpensearchImpl) CheckSnapshotProgress() error { 280 o.Log.Infof("Checking snapshot progress with name '%s'", o.SecretData.BackupName) 281 snapShotURL := fmt.Sprintf("%s/_snapshot/%s/%s", o.BaseURL, constants.OpenSearchSnapShotRepoName, o.SecretData.BackupName) 282 var snapshotInfo types.OpenSearchSnapshotStatus 283 284 if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString { 285 // if UT flag is set, skip to avoid retry logic 286 return nil 287 } 288 289 var timeSeconds float64 290 timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout) 291 if err != nil { 292 o.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 293 return err 294 } 295 totalSeconds := timeParse.Seconds() 296 297 done := false 298 for !done { 299 err := o.HTTPHelper(context.Background(), "GET", snapShotURL, nil, &snapshotInfo) 300 if err != nil { 301 return err 302 } 303 switch snapshotInfo.Snapshots[0].State { 304 case constants.OpenSearchSnapShotInProgress: 305 if timeSeconds < totalSeconds { 306 message := fmt.Sprintf("Snapshot '%s' is in progress", o.SecretData.BackupName) 307 duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log) 308 if err != nil { 309 return err 310 } 311 timeSeconds = timeSeconds + float64(duration) 312 } else { 313 return fmt.Errorf("VeleroTimeout '%s' exceeded. Snapshot '%s' state is still IN_PROGRESS", o.SecretData.VeleroTimeout, o.SecretData.BackupName) 314 } 315 case constants.OpenSearchSnapShotSuccess: 316 o.Log.Infof("Snapshot '%s' complete", o.SecretData.BackupName) 317 done = true 318 default: 319 return fmt.Errorf("Snapshot '%s' state is invalid '%s'", o.SecretData.BackupName, snapshotInfo.Snapshots[0].State) 320 } 321 } 322 323 o.Log.Infof("Backup in progress. total shards = %v, successfull shards backed up = %v, indices = %v, data streams = %v, ", 324 snapshotInfo.Snapshots[0].Shards.Total, snapshotInfo.Snapshots[0].Shards.Successful, 325 snapshotInfo.Snapshots[0].Indices, snapshotInfo.Snapshots[0].DataStreams) 326 return nil 327 } 328 329 // DeleteData used to delete data streams before restore. 330 func (o *OpensearchImpl) DeleteData() error { 331 o.Log.Infof("Deleting data streams followed by index ..") 332 dataStreamURL := fmt.Sprintf("%s/_data_stream/*", o.BaseURL) 333 dataIndexURL := fmt.Sprintf("%s/*", o.BaseURL) 334 var deleteResponse types.OpenSearchOperationResponse 335 336 err := o.HTTPHelper(context.Background(), "DELETE", dataStreamURL, nil, &deleteResponse) 337 if err != nil { 338 return err 339 } 340 341 if !deleteResponse.Acknowledged { 342 return fmt.Errorf("Data streams deletion failure. Response = %v ", deleteResponse) 343 } 344 345 err = o.HTTPHelper(context.Background(), "DELETE", dataIndexURL, nil, &deleteResponse) 346 if err != nil { 347 return err 348 } 349 350 if !deleteResponse.Acknowledged { 351 return fmt.Errorf("Data index deletion failure. Response = %v ", deleteResponse) 352 } 353 354 o.Log.Infof("Data streams and data indexes deleted successfully !") 355 return nil 356 } 357 358 // TriggerRestore Triggers a restore from a specified snapshot 359 func (o *OpensearchImpl) TriggerRestore() error { 360 o.Log.Infof("Triggering restore with name '%s'", o.SecretData.BackupName) 361 restoreURL := fmt.Sprintf("%s/_snapshot/%s/%s/_restore", o.BaseURL, constants.OpenSearchSnapShotRepoName, o.SecretData.BackupName) 362 var restoreResponse types.OpenSearchSnapshotResponse 363 364 err := o.HTTPHelper(context.Background(), "POST", restoreURL, nil, &restoreResponse) 365 if err != nil { 366 return err 367 } 368 369 if !restoreResponse.Accepted { 370 return fmt.Errorf("Snapshot restore trigger failed. Response = %v ", restoreResponse) 371 } 372 o.Log.Infof("Snapshot restore triggered successfully !") 373 return nil 374 } 375 376 // CheckRestoreProgress checks progress of restore process, by monitoring all the data streams 377 func (o *OpensearchImpl) CheckRestoreProgress() error { 378 o.Log.Infof("Checking restore progress with name '%s'", o.SecretData.BackupName) 379 dsURL := fmt.Sprintf("%s/_data_stream", o.BaseURL) 380 var snapshotInfo types.OpenSearchDataStreams 381 382 if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString { 383 // if UT flag is set, skip to avoid retry logic 384 return nil 385 } 386 387 var timeSeconds float64 388 timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout) 389 if err != nil { 390 o.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 391 return err 392 } 393 totalSeconds := timeParse.Seconds() 394 done := false 395 notGreen := false 396 397 for !done { 398 err := o.HTTPHelper(context.Background(), "GET", dsURL, nil, &snapshotInfo) 399 if err != nil { 400 return err 401 } 402 for _, ds := range snapshotInfo.DataStreams { 403 o.Log.Infof("Data stream '%s' restore status '%s'", ds.Name, ds.Status) 404 switch ds.Status { 405 case constants.DataStreamGreen: 406 o.Log.Infof("Data stream '%s' restore complete", ds.Name) 407 default: 408 notGreen = true 409 } 410 } 411 412 if notGreen { 413 if timeSeconds < totalSeconds { 414 message := "Restore is in progress" 415 duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log) 416 if err != nil { 417 return err 418 } 419 timeSeconds = timeSeconds + float64(duration) 420 notGreen = false 421 } else { 422 return fmt.Errorf("VeleroTimeout '%s' exceeded. Restore '%s' state is still IN_PROGRESS", o.SecretData.VeleroTimeout, o.SecretData.BackupName) 423 } 424 } else { 425 // This section is hit when all data streams are green 426 // exit feedback loop 427 done = true 428 } 429 430 } 431 432 o.Log.Infof("All streams are healthy") 433 return nil 434 } 435 436 // Backup - Toplevel method to invoke OpenSearch backup 437 func (o *OpensearchImpl) Backup() error { 438 o.Log.Info("Start backup steps ....") 439 err := o.RegisterSnapshotRepository() 440 if err != nil { 441 return err 442 } 443 444 err = o.TriggerSnapshot() 445 if err != nil { 446 return err 447 } 448 449 err = o.CheckSnapshotProgress() 450 if err != nil { 451 return err 452 } 453 454 return nil 455 } 456 457 // Restore - Top level method to invoke opensearch restore 458 func (o *OpensearchImpl) Restore() error { 459 o.Log.Info("Start restore steps ....") 460 err := o.RegisterSnapshotRepository() 461 if err != nil { 462 return err 463 } 464 465 err = o.DeleteData() 466 if err != nil { 467 return err 468 } 469 470 err = o.TriggerRestore() 471 if err != nil { 472 return err 473 } 474 475 err = o.CheckRestoreProgress() 476 if err != nil { 477 return err 478 } 479 480 return nil 481 }