github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/verrazzano-backup-hook/opensearch/opensearch.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package opensearch
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/constants"
    12  	"github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/types"
    13  	"github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/utilities"
    14  	"go.uber.org/zap"
    15  	"io"
    16  	"io/ioutil"
    17  	"net/http"
    18  	"os"
    19  	"time"
    20  )
    21  
    22  // HTTPHelper supports net/http calls of type GET/POST/DELETE
    23  func (o *OpensearchImpl) HTTPHelper(ctx context.Context, method, requestURL string, body io.Reader, data interface{}) error {
    24  	o.Log.Debugf("Invoking HTTP '%s' request with url '%s'", method, requestURL)
    25  	var response *http.Response
    26  	var request *http.Request
    27  	var err error
    28  
    29  	timeoutDuration, err := time.ParseDuration(o.Timeout)
    30  	if err != nil {
    31  		o.Log.Errorf("Unable to parse time duration ", zap.Error(err))
    32  		os.Exit(1)
    33  	}
    34  	ctx, cancel := context.WithTimeout(ctx, timeoutDuration)
    35  	defer cancel()
    36  
    37  	switch method {
    38  	case "GET":
    39  		request, err = http.NewRequestWithContext(ctx, http.MethodGet, requestURL, body)
    40  	case "POST":
    41  		request, err = http.NewRequestWithContext(ctx, http.MethodPost, requestURL, body)
    42  	case "DELETE":
    43  		request, err = http.NewRequestWithContext(ctx, http.MethodDelete, requestURL, body)
    44  	}
    45  	if err != nil {
    46  		o.Log.Error("Error creating request ", zap.Error(err))
    47  		return err
    48  	}
    49  
    50  	request.Header.Add("Content-Type", constants.HTTPContentType)
    51  	response, err = o.Client.Do(request)
    52  	if err != nil {
    53  		o.Log.Errorf("HTTP '%s' failure while invoking url '%s' due to '%v'", method, requestURL, zap.Error(err))
    54  		return err
    55  	}
    56  	defer response.Body.Close()
    57  
    58  	bdata, err := ioutil.ReadAll(response.Body)
    59  	if err != nil {
    60  		o.Log.Errorf("Unable to read response body ", zap.Error(err))
    61  		return err
    62  	}
    63  
    64  	if response.StatusCode != 200 {
    65  		o.Log.Errorf("Error completing request, response code '%v', response body '%v'", response.StatusCode, string(bdata))
    66  		return err
    67  	}
    68  
    69  	err = json.Unmarshal(bdata, &data)
    70  	if err != nil {
    71  		o.Log.Errorf("json unmarshalling error %v", err)
    72  		return err
    73  	}
    74  
    75  	return nil
    76  }
    77  
    78  // EnsureOpenSearchIsReachable is used determine whether OpenSearch cluster is reachable
    79  func (o *OpensearchImpl) EnsureOpenSearchIsReachable() error {
    80  	o.Log.Infof("Checking if cluster is reachable")
    81  	var osinfo types.OpenSearchClusterInfo
    82  	done := false
    83  	var timeSeconds float64
    84  
    85  	if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString {
    86  		// if UT flag is set, skip to avoid retry logic
    87  		return nil
    88  	}
    89  
    90  	timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout)
    91  	if err != nil {
    92  		o.Log.Errorf("Unable to parse time duration ", zap.Error(err))
    93  		return err
    94  	}
    95  	totalSeconds := timeParse.Seconds()
    96  
    97  	for !done {
    98  		err := o.HTTPHelper(context.Background(), "GET", o.BaseURL, nil, &osinfo)
    99  		if err != nil {
   100  			if timeSeconds < totalSeconds {
   101  				message := "Cluster is not reachable"
   102  				duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log)
   103  				if err != nil {
   104  					return err
   105  				}
   106  				timeSeconds = timeSeconds + float64(duration)
   107  			} else {
   108  				o.Log.Errorf("VeleroTimeout '%s' exceeded. Cluster not reachable", o.SecretData.VeleroTimeout)
   109  				return err
   110  			}
   111  		} else {
   112  			done = true
   113  		}
   114  	}
   115  
   116  	o.Log.Infof("Cluster '%s' is reachable", osinfo.ClusterName)
   117  
   118  	return nil
   119  }
   120  
   121  // EnsureOpenSearchIsHealthy ensures OpenSearch cluster is healthy
   122  func (o *OpensearchImpl) EnsureOpenSearchIsHealthy() error {
   123  	o.Log.Infof("Checking if cluster is healthy")
   124  	var clusterHealth types.OpenSearchHealthResponse
   125  	err := o.EnsureOpenSearchIsReachable()
   126  	if err != nil {
   127  		return err
   128  	}
   129  
   130  	healthURL := fmt.Sprintf("%s/_cluster/health", o.BaseURL)
   131  	healthReachable := false
   132  	var timeSeconds float64
   133  
   134  	timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout)
   135  	if err != nil {
   136  		o.Log.Errorf("Unable to parse time duration ", zap.Error(err))
   137  		return err
   138  	}
   139  	totalSeconds := timeParse.Seconds()
   140  
   141  	if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString {
   142  		// if UT flag is set, skip to avoid retry logic
   143  		return nil
   144  	}
   145  
   146  	for !healthReachable {
   147  		err = o.HTTPHelper(context.Background(), "GET", healthURL, nil, &clusterHealth)
   148  		if err != nil {
   149  			if timeSeconds < totalSeconds {
   150  				message := "Cluster health endpoint is not reachable"
   151  				duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log)
   152  				if err != nil {
   153  					return err
   154  				}
   155  				timeSeconds = timeSeconds + float64(duration)
   156  			} else {
   157  				o.Log.Errorf("VeleroTimeout '%s' exceeded. Cluster health endpoint is not reachable", o.SecretData.VeleroTimeout)
   158  				return err
   159  			}
   160  		} else {
   161  			o.Log.Infof("Cluster health endpoint is reachable now")
   162  			healthReachable = true
   163  		}
   164  	}
   165  
   166  	healthGreen := false
   167  
   168  	for !healthGreen {
   169  		err = o.HTTPHelper(context.Background(), "GET", healthURL, nil, &clusterHealth)
   170  		if err != nil {
   171  			if timeSeconds < totalSeconds {
   172  				message := "Json unmarshalling error"
   173  				duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log)
   174  				if err != nil {
   175  					return err
   176  				}
   177  				timeSeconds = timeSeconds + float64(duration)
   178  				continue
   179  			} else {
   180  				return fmt.Errorf("VeleroTimeout '%s' exceeded. Json unmarshalling error while checking cluster health %v", o.SecretData.VeleroTimeout, zap.Error(err))
   181  			}
   182  		}
   183  
   184  		if clusterHealth.Status != "green" {
   185  			if timeSeconds < totalSeconds {
   186  				message := fmt.Sprintf("Cluster health is '%s'", clusterHealth.Status)
   187  				duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log)
   188  				if err != nil {
   189  					return err
   190  				}
   191  				timeSeconds = timeSeconds + float64(duration)
   192  			} else {
   193  				return fmt.Errorf("VeleroTimeout '%s' exceeded. Cluster health expected 'green' , current state '%s'", o.SecretData.VeleroTimeout, clusterHealth.Status)
   194  			}
   195  		} else {
   196  			healthGreen = true
   197  		}
   198  	}
   199  
   200  	if healthReachable && healthGreen {
   201  		o.Log.Infof("Cluster is reachable and healthy with status as '%s'", clusterHealth.Status)
   202  		return nil
   203  	}
   204  
   205  	return err
   206  }
   207  
   208  // ReloadOpensearchSecureSettings used to reload secure settings once object store keys are updated
   209  func (o *OpensearchImpl) ReloadOpensearchSecureSettings() error {
   210  	var secureSettings types.OpenSearchSecureSettingsReloadStatus
   211  	url := fmt.Sprintf("%s/_nodes/reload_secure_settings", o.BaseURL)
   212  
   213  	err := o.HTTPHelper(context.Background(), "POST", url, nil, &secureSettings)
   214  	if err != nil {
   215  		return err
   216  	}
   217  
   218  	if secureSettings.ClusterNodes.Failed == 0 && secureSettings.ClusterNodes.Total == 0 && secureSettings.ClusterNodes.Successful == 0 {
   219  		return fmt.Errorf("Invalid cluster settings detected. Check the connection")
   220  	}
   221  
   222  	if secureSettings.ClusterNodes.Failed == 0 && secureSettings.ClusterNodes.Total == secureSettings.ClusterNodes.Successful {
   223  		o.Log.Infof("Secure settings reloaded sucessfully across all '%v' nodes of the cluster", secureSettings.ClusterNodes.Total)
   224  		return nil
   225  	}
   226  	return fmt.Errorf("Not all nodes were updated successfully. Total = '%v', Failed = '%v' , Successful = '%v'", secureSettings.ClusterNodes.Total, secureSettings.ClusterNodes.Failed, secureSettings.ClusterNodes.Successful)
   227  }
   228  
   229  // RegisterSnapshotRepository registers an object store with OpenSearch using the s3-plugin
   230  func (o *OpensearchImpl) RegisterSnapshotRepository() error {
   231  	o.Log.Infof("Registering s3 backend repository '%s'", constants.OpenSearchSnapShotRepoName)
   232  	var snapshotPayload types.OpenSearchSnapshotRequestPayload
   233  	var registerResponse types.OpenSearchOperationResponse
   234  	snapshotPayload.Type = "s3"
   235  	snapshotPayload.Settings.Bucket = o.SecretData.BucketName
   236  	snapshotPayload.Settings.Region = o.SecretData.RegionName
   237  	snapshotPayload.Settings.Client = "default"
   238  	snapshotPayload.Settings.Endpoint = o.SecretData.Endpoint
   239  	snapshotPayload.Settings.PathStyleAccess = true
   240  
   241  	postBody, err := json.Marshal(snapshotPayload)
   242  	if err != nil {
   243  		return err
   244  	}
   245  
   246  	url := fmt.Sprintf("%s/_snapshot/%s", o.BaseURL, constants.OpenSearchSnapShotRepoName)
   247  
   248  	err = o.HTTPHelper(context.Background(), "POST", url, bytes.NewBuffer(postBody), &registerResponse)
   249  	if err != nil {
   250  		return err
   251  	}
   252  
   253  	if registerResponse.Acknowledged {
   254  		o.Log.Infof("Snapshot registered successfully !")
   255  		return nil
   256  	}
   257  	return fmt.Errorf("Snapshot registration unsuccessful. Response = %v", registerResponse)
   258  }
   259  
   260  // TriggerSnapshot this triggers a snapshot/backup of all the data streams/indices
   261  func (o *OpensearchImpl) TriggerSnapshot() error {
   262  	o.Log.Infof("Triggering snapshot with name '%s'", o.SecretData.BackupName)
   263  	var snapshotResponse types.OpenSearchSnapshotResponse
   264  	snapShotURL := fmt.Sprintf("%s/_snapshot/%s/%s", o.BaseURL, constants.OpenSearchSnapShotRepoName, o.SecretData.BackupName)
   265  
   266  	err := o.HTTPHelper(context.Background(), "POST", snapShotURL, nil, &snapshotResponse)
   267  	if err != nil {
   268  		return err
   269  	}
   270  
   271  	if !snapshotResponse.Accepted {
   272  		return fmt.Errorf("Snapshot trigger failure. Response = %v ", snapshotResponse)
   273  	}
   274  	o.Log.Infof("Snapshot triggered successfully !")
   275  	return nil
   276  }
   277  
   278  // CheckSnapshotProgress checks the data backup progress.
   279  func (o *OpensearchImpl) CheckSnapshotProgress() error {
   280  	o.Log.Infof("Checking snapshot progress with name '%s'", o.SecretData.BackupName)
   281  	snapShotURL := fmt.Sprintf("%s/_snapshot/%s/%s", o.BaseURL, constants.OpenSearchSnapShotRepoName, o.SecretData.BackupName)
   282  	var snapshotInfo types.OpenSearchSnapshotStatus
   283  
   284  	if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString {
   285  		// if UT flag is set, skip to avoid retry logic
   286  		return nil
   287  	}
   288  
   289  	var timeSeconds float64
   290  	timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout)
   291  	if err != nil {
   292  		o.Log.Errorf("Unable to parse time duration ", zap.Error(err))
   293  		return err
   294  	}
   295  	totalSeconds := timeParse.Seconds()
   296  
   297  	done := false
   298  	for !done {
   299  		err := o.HTTPHelper(context.Background(), "GET", snapShotURL, nil, &snapshotInfo)
   300  		if err != nil {
   301  			return err
   302  		}
   303  		switch snapshotInfo.Snapshots[0].State {
   304  		case constants.OpenSearchSnapShotInProgress:
   305  			if timeSeconds < totalSeconds {
   306  				message := fmt.Sprintf("Snapshot '%s' is in progress", o.SecretData.BackupName)
   307  				duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log)
   308  				if err != nil {
   309  					return err
   310  				}
   311  				timeSeconds = timeSeconds + float64(duration)
   312  			} else {
   313  				return fmt.Errorf("VeleroTimeout '%s' exceeded. Snapshot '%s' state is still IN_PROGRESS", o.SecretData.VeleroTimeout, o.SecretData.BackupName)
   314  			}
   315  		case constants.OpenSearchSnapShotSuccess:
   316  			o.Log.Infof("Snapshot '%s' complete", o.SecretData.BackupName)
   317  			done = true
   318  		default:
   319  			return fmt.Errorf("Snapshot '%s' state is invalid '%s'", o.SecretData.BackupName, snapshotInfo.Snapshots[0].State)
   320  		}
   321  	}
   322  
   323  	o.Log.Infof("Backup in progress. total shards = %v, successfull shards backed up = %v, indices = %v, data streams = %v, ",
   324  		snapshotInfo.Snapshots[0].Shards.Total, snapshotInfo.Snapshots[0].Shards.Successful,
   325  		snapshotInfo.Snapshots[0].Indices, snapshotInfo.Snapshots[0].DataStreams)
   326  	return nil
   327  }
   328  
   329  // DeleteData used to delete data streams before restore.
   330  func (o *OpensearchImpl) DeleteData() error {
   331  	o.Log.Infof("Deleting data streams followed by index ..")
   332  	dataStreamURL := fmt.Sprintf("%s/_data_stream/*", o.BaseURL)
   333  	dataIndexURL := fmt.Sprintf("%s/*", o.BaseURL)
   334  	var deleteResponse types.OpenSearchOperationResponse
   335  
   336  	err := o.HTTPHelper(context.Background(), "DELETE", dataStreamURL, nil, &deleteResponse)
   337  	if err != nil {
   338  		return err
   339  	}
   340  
   341  	if !deleteResponse.Acknowledged {
   342  		return fmt.Errorf("Data streams deletion failure. Response = %v ", deleteResponse)
   343  	}
   344  
   345  	err = o.HTTPHelper(context.Background(), "DELETE", dataIndexURL, nil, &deleteResponse)
   346  	if err != nil {
   347  		return err
   348  	}
   349  
   350  	if !deleteResponse.Acknowledged {
   351  		return fmt.Errorf("Data index deletion failure. Response = %v ", deleteResponse)
   352  	}
   353  
   354  	o.Log.Infof("Data streams and data indexes deleted successfully !")
   355  	return nil
   356  }
   357  
   358  // TriggerRestore Triggers a restore from a specified snapshot
   359  func (o *OpensearchImpl) TriggerRestore() error {
   360  	o.Log.Infof("Triggering restore with name '%s'", o.SecretData.BackupName)
   361  	restoreURL := fmt.Sprintf("%s/_snapshot/%s/%s/_restore", o.BaseURL, constants.OpenSearchSnapShotRepoName, o.SecretData.BackupName)
   362  	var restoreResponse types.OpenSearchSnapshotResponse
   363  
   364  	err := o.HTTPHelper(context.Background(), "POST", restoreURL, nil, &restoreResponse)
   365  	if err != nil {
   366  		return err
   367  	}
   368  
   369  	if !restoreResponse.Accepted {
   370  		return fmt.Errorf("Snapshot restore trigger failed. Response = %v ", restoreResponse)
   371  	}
   372  	o.Log.Infof("Snapshot restore triggered successfully !")
   373  	return nil
   374  }
   375  
   376  // CheckRestoreProgress checks progress of restore process, by monitoring all the data streams
   377  func (o *OpensearchImpl) CheckRestoreProgress() error {
   378  	o.Log.Infof("Checking restore progress with name '%s'", o.SecretData.BackupName)
   379  	dsURL := fmt.Sprintf("%s/_data_stream", o.BaseURL)
   380  	var snapshotInfo types.OpenSearchDataStreams
   381  
   382  	if utilities.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString {
   383  		// if UT flag is set, skip to avoid retry logic
   384  		return nil
   385  	}
   386  
   387  	var timeSeconds float64
   388  	timeParse, err := time.ParseDuration(o.SecretData.VeleroTimeout)
   389  	if err != nil {
   390  		o.Log.Errorf("Unable to parse time duration ", zap.Error(err))
   391  		return err
   392  	}
   393  	totalSeconds := timeParse.Seconds()
   394  	done := false
   395  	notGreen := false
   396  
   397  	for !done {
   398  		err := o.HTTPHelper(context.Background(), "GET", dsURL, nil, &snapshotInfo)
   399  		if err != nil {
   400  			return err
   401  		}
   402  		for _, ds := range snapshotInfo.DataStreams {
   403  			o.Log.Infof("Data stream '%s' restore status '%s'", ds.Name, ds.Status)
   404  			switch ds.Status {
   405  			case constants.DataStreamGreen:
   406  				o.Log.Infof("Data stream '%s' restore complete", ds.Name)
   407  			default:
   408  				notGreen = true
   409  			}
   410  		}
   411  
   412  		if notGreen {
   413  			if timeSeconds < totalSeconds {
   414  				message := "Restore is in progress"
   415  				duration, err := utilities.WaitRandom(message, o.SecretData.VeleroTimeout, o.Log)
   416  				if err != nil {
   417  					return err
   418  				}
   419  				timeSeconds = timeSeconds + float64(duration)
   420  				notGreen = false
   421  			} else {
   422  				return fmt.Errorf("VeleroTimeout '%s' exceeded. Restore '%s' state is still IN_PROGRESS", o.SecretData.VeleroTimeout, o.SecretData.BackupName)
   423  			}
   424  		} else {
   425  			// This section is hit when all data streams are green
   426  			// exit feedback loop
   427  			done = true
   428  		}
   429  
   430  	}
   431  
   432  	o.Log.Infof("All streams are healthy")
   433  	return nil
   434  }
   435  
   436  // Backup - Toplevel method to invoke OpenSearch backup
   437  func (o *OpensearchImpl) Backup() error {
   438  	o.Log.Info("Start backup steps ....")
   439  	err := o.RegisterSnapshotRepository()
   440  	if err != nil {
   441  		return err
   442  	}
   443  
   444  	err = o.TriggerSnapshot()
   445  	if err != nil {
   446  		return err
   447  	}
   448  
   449  	err = o.CheckSnapshotProgress()
   450  	if err != nil {
   451  		return err
   452  	}
   453  
   454  	return nil
   455  }
   456  
   457  // Restore - Top level method to invoke opensearch restore
   458  func (o *OpensearchImpl) Restore() error {
   459  	o.Log.Info("Start restore steps ....")
   460  	err := o.RegisterSnapshotRepository()
   461  	if err != nil {
   462  		return err
   463  	}
   464  
   465  	err = o.DeleteData()
   466  	if err != nil {
   467  		return err
   468  	}
   469  
   470  	err = o.TriggerRestore()
   471  	if err != nil {
   472  		return err
   473  	}
   474  
   475  	err = o.CheckRestoreProgress()
   476  	if err != nil {
   477  		return err
   478  	}
   479  
   480  	return nil
   481  }