github.com/opentofu/opentofu@v1.7.1/internal/backend/remote-state/oss/client.go (about)

     1  // Copyright (c) The OpenTofu Authors
     2  // SPDX-License-Identifier: MPL-2.0
     3  // Copyright (c) 2023 HashiCorp, Inc.
     4  // SPDX-License-Identifier: MPL-2.0
     5  
     6  package oss
     7  
     8  import (
     9  	"bytes"
    10  	"crypto/md5"
    11  	"encoding/hex"
    12  	"encoding/json"
    13  	"fmt"
    14  	"io"
    15  	"log"
    16  	"time"
    17  
    18  	"github.com/aliyun/aliyun-oss-go-sdk/oss"
    19  	"github.com/aliyun/aliyun-tablestore-go-sdk/tablestore"
    20  	"github.com/hashicorp/go-multierror"
    21  	uuid "github.com/hashicorp/go-uuid"
    22  	"github.com/pkg/errors"
    23  
    24  	"github.com/opentofu/opentofu/internal/states/remote"
    25  	"github.com/opentofu/opentofu/internal/states/statemgr"
    26  )
    27  
const (
	// stateIDSuffix is appended to the lock path to form the primary key of
	// the tablestore row that stores the MD5 digest of the last saved state.
	// That digest is compared against the fetched state for consistency checks.
	stateIDSuffix = "-md5"

	// pkName is the name of the primary key column used for both the lock
	// row and the digest row in the tablestore table.
	pkName = "LockID"
)
    34  
var (
	// consistencyRetryTimeout is added to the current time at the start of
	// Get to form the deadline; while the state does not match the expected
	// checksum, Get starts a new retry only if that deadline has not passed.
	consistencyRetryTimeout = 10 * time.Second

	// consistencyRetryPollInterval is how long Get sleeps between attempts
	// when polling the state.
	consistencyRetryPollInterval = 2 * time.Second
)
    43  
// testChecksumHook is a test hook that Get calls each time the MD5 of the
// fetched state does not match the digest read from tablestore. Get skips the
// call when the hook is nil.
var testChecksumHook func()
    46  
// RemoteClient reads, writes and deletes a single state object in an OSS
// bucket. When otsTable is non-empty it additionally uses a tablestore table
// to hold the state lock and the MD5 digest of the last written state.
type RemoteClient struct {
	// ossClient is used to obtain the bucket handle for object operations.
	ossClient *oss.Client
	// otsClient performs the tablestore row operations (lock and digest rows).
	otsClient *tablestore.TableStoreClient
	// bucketName is the OSS bucket that holds the state object.
	bucketName string
	// stateFile is the object key of the state within the bucket.
	stateFile string
	// lockFile is not referenced by the methods visible in this file.
	lockFile string
	// serverSideEncryption makes Put request AES256 server-side encryption.
	serverSideEncryption bool
	// acl, when non-empty, is applied as the object ACL on Put.
	acl string
	// otsTable is the tablestore table name; when empty, locking and digest
	// tracking are skipped entirely.
	otsTable string
}
    57  
    58  func (c *RemoteClient) Get() (payload *remote.Payload, err error) {
    59  	deadline := time.Now().Add(consistencyRetryTimeout)
    60  
    61  	// If we have a checksum, and the returned payload doesn't match, we retry
    62  	// up until deadline.
    63  	for {
    64  		payload, err = c.getObj()
    65  		if err != nil {
    66  			return nil, err
    67  		}
    68  
    69  		// If the remote state was manually removed the payload will be nil,
    70  		// but if there's still a digest entry for that state we will still try
    71  		// to compare the MD5 below.
    72  		var digest []byte
    73  		if payload != nil {
    74  			digest = payload.MD5
    75  		}
    76  
    77  		// verify that this state is what we expect
    78  		if expected, err := c.getMD5(); err != nil {
    79  			log.Printf("[WARN] failed to fetch state md5: %s", err)
    80  		} else if len(expected) > 0 && !bytes.Equal(expected, digest) {
    81  			log.Printf("[WARN] state md5 mismatch: expected '%x', got '%x'", expected, digest)
    82  
    83  			if testChecksumHook != nil {
    84  				testChecksumHook()
    85  			}
    86  
    87  			if time.Now().Before(deadline) {
    88  				time.Sleep(consistencyRetryPollInterval)
    89  				log.Println("[INFO] retrying OSS RemoteClient.Get...")
    90  				continue
    91  			}
    92  
    93  			return nil, fmt.Errorf(errBadChecksumFmt, digest)
    94  		}
    95  
    96  		break
    97  	}
    98  	return payload, nil
    99  }
   100  
   101  func (c *RemoteClient) Put(data []byte) error {
   102  	bucket, err := c.ossClient.Bucket(c.bucketName)
   103  	if err != nil {
   104  		return fmt.Errorf("error getting bucket: %w", err)
   105  	}
   106  
   107  	body := bytes.NewReader(data)
   108  
   109  	var options []oss.Option
   110  	if c.acl != "" {
   111  		options = append(options, oss.ACL(oss.ACLType(c.acl)))
   112  	}
   113  	options = append(options, oss.ContentType("application/json"))
   114  	if c.serverSideEncryption {
   115  		options = append(options, oss.ServerSideEncryption("AES256"))
   116  	}
   117  	options = append(options, oss.ContentLength(int64(len(data))))
   118  
   119  	if body != nil {
   120  		if err := bucket.PutObject(c.stateFile, body, options...); err != nil {
   121  			return fmt.Errorf("failed to upload state %s: %w", c.stateFile, err)
   122  		}
   123  	}
   124  
   125  	sum := md5.Sum(data)
   126  	if err := c.putMD5(sum[:]); err != nil {
   127  		// if this errors out, we unfortunately have to error out altogether,
   128  		// since the next Get will inevitably fail.
   129  		return fmt.Errorf("failed to store state MD5: %w", err)
   130  	}
   131  	return nil
   132  }
   133  
   134  func (c *RemoteClient) Delete() error {
   135  	bucket, err := c.ossClient.Bucket(c.bucketName)
   136  	if err != nil {
   137  		return fmt.Errorf("error getting bucket %s: %w", c.bucketName, err)
   138  	}
   139  
   140  	log.Printf("[DEBUG] Deleting remote state from OSS: %#v", c.stateFile)
   141  
   142  	if err := bucket.DeleteObject(c.stateFile); err != nil {
   143  		return fmt.Errorf("error deleting state %s: %w", c.stateFile, err)
   144  	}
   145  
   146  	if err := c.deleteMD5(); err != nil {
   147  		log.Printf("[WARN] Error deleting state MD5: %s", err)
   148  	}
   149  	return nil
   150  }
   151  
   152  func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) {
   153  	if c.otsTable == "" {
   154  		return "", nil
   155  	}
   156  
   157  	info.Path = c.lockPath()
   158  
   159  	if info.ID == "" {
   160  		lockID, err := uuid.GenerateUUID()
   161  		if err != nil {
   162  			return "", err
   163  		}
   164  		info.ID = lockID
   165  	}
   166  
   167  	putParams := &tablestore.PutRowChange{
   168  		TableName: c.otsTable,
   169  		PrimaryKey: &tablestore.PrimaryKey{
   170  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   171  				{
   172  					ColumnName: pkName,
   173  					Value:      c.lockPath(),
   174  				},
   175  			},
   176  		},
   177  		Columns: []tablestore.AttributeColumn{
   178  			{
   179  				ColumnName: "Info",
   180  				Value:      string(info.Marshal()),
   181  			},
   182  		},
   183  		Condition: &tablestore.RowCondition{
   184  			RowExistenceExpectation: tablestore.RowExistenceExpectation_EXPECT_NOT_EXIST,
   185  		},
   186  	}
   187  
   188  	log.Printf("[DEBUG] Recording state lock in tablestore: %#v; LOCKID:%s", putParams, c.lockPath())
   189  
   190  	_, err := c.otsClient.PutRow(&tablestore.PutRowRequest{
   191  		PutRowChange: putParams,
   192  	})
   193  	if err != nil {
   194  		err = fmt.Errorf("invoking PutRow got an error: %w", err)
   195  		lockInfo, infoErr := c.getLockInfo()
   196  		if infoErr != nil {
   197  			err = multierror.Append(err, fmt.Errorf("\ngetting lock info got an error: %w", infoErr))
   198  		}
   199  		lockErr := &statemgr.LockError{
   200  			Err:  err,
   201  			Info: lockInfo,
   202  		}
   203  		log.Printf("[ERROR] state lock error: %s", lockErr.Error())
   204  		return "", lockErr
   205  	}
   206  
   207  	return info.ID, nil
   208  }
   209  
   210  func (c *RemoteClient) getMD5() ([]byte, error) {
   211  	if c.otsTable == "" {
   212  		return nil, nil
   213  	}
   214  
   215  	getParams := &tablestore.SingleRowQueryCriteria{
   216  		TableName: c.otsTable,
   217  		PrimaryKey: &tablestore.PrimaryKey{
   218  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   219  				{
   220  					ColumnName: pkName,
   221  					Value:      c.lockPath() + stateIDSuffix,
   222  				},
   223  			},
   224  		},
   225  		ColumnsToGet: []string{pkName, "Digest"},
   226  		MaxVersion:   1,
   227  	}
   228  
   229  	log.Printf("[DEBUG] Retrieving state serial in tablestore: %#v", getParams)
   230  
   231  	object, err := c.otsClient.GetRow(&tablestore.GetRowRequest{
   232  		SingleRowQueryCriteria: getParams,
   233  	})
   234  
   235  	if err != nil {
   236  		return nil, err
   237  	}
   238  
   239  	var val string
   240  	if v, ok := object.GetColumnMap().Columns["Digest"]; ok && len(v) > 0 {
   241  		val = v[0].Value.(string)
   242  	}
   243  
   244  	sum, err := hex.DecodeString(val)
   245  	if err != nil || len(sum) != md5.Size {
   246  		return nil, errors.New("invalid md5")
   247  	}
   248  
   249  	return sum, nil
   250  }
   251  
   252  // store the hash of the state to that clients can check for stale state files.
   253  func (c *RemoteClient) putMD5(sum []byte) error {
   254  	if c.otsTable == "" {
   255  		return nil
   256  	}
   257  
   258  	if len(sum) != md5.Size {
   259  		return errors.New("invalid payload md5")
   260  	}
   261  
   262  	putParams := &tablestore.PutRowChange{
   263  		TableName: c.otsTable,
   264  		PrimaryKey: &tablestore.PrimaryKey{
   265  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   266  				{
   267  					ColumnName: pkName,
   268  					Value:      c.lockPath() + stateIDSuffix,
   269  				},
   270  			},
   271  		},
   272  		Columns: []tablestore.AttributeColumn{
   273  			{
   274  				ColumnName: "Digest",
   275  				Value:      hex.EncodeToString(sum),
   276  			},
   277  		},
   278  		Condition: &tablestore.RowCondition{
   279  			RowExistenceExpectation: tablestore.RowExistenceExpectation_IGNORE,
   280  		},
   281  	}
   282  
   283  	log.Printf("[DEBUG] Recoring state serial in tablestore: %#v", putParams)
   284  
   285  	_, err := c.otsClient.PutRow(&tablestore.PutRowRequest{
   286  		PutRowChange: putParams,
   287  	})
   288  
   289  	if err != nil {
   290  		log.Printf("[WARN] failed to record state serial in tablestore: %s", err)
   291  	}
   292  
   293  	return nil
   294  }
   295  
   296  // remove the hash value for a deleted state
   297  func (c *RemoteClient) deleteMD5() error {
   298  	if c.otsTable == "" {
   299  		return nil
   300  	}
   301  
   302  	params := &tablestore.DeleteRowRequest{
   303  		DeleteRowChange: &tablestore.DeleteRowChange{
   304  			TableName: c.otsTable,
   305  			PrimaryKey: &tablestore.PrimaryKey{
   306  				PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   307  					{
   308  						ColumnName: pkName,
   309  						Value:      c.lockPath() + stateIDSuffix,
   310  					},
   311  				},
   312  			},
   313  			Condition: &tablestore.RowCondition{
   314  				RowExistenceExpectation: tablestore.RowExistenceExpectation_EXPECT_EXIST,
   315  			},
   316  		},
   317  	}
   318  
   319  	log.Printf("[DEBUG] Deleting state serial in tablestore: %#v", params)
   320  
   321  	if _, err := c.otsClient.DeleteRow(params); err != nil {
   322  		return err
   323  	}
   324  
   325  	return nil
   326  }
   327  
   328  func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) {
   329  	getParams := &tablestore.SingleRowQueryCriteria{
   330  		TableName: c.otsTable,
   331  		PrimaryKey: &tablestore.PrimaryKey{
   332  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   333  				{
   334  					ColumnName: pkName,
   335  					Value:      c.lockPath(),
   336  				},
   337  			},
   338  		},
   339  		ColumnsToGet: []string{pkName, "Info"},
   340  		MaxVersion:   1,
   341  	}
   342  
   343  	log.Printf("[DEBUG] Retrieving state lock info from tablestore: %#v", getParams)
   344  
   345  	object, err := c.otsClient.GetRow(&tablestore.GetRowRequest{
   346  		SingleRowQueryCriteria: getParams,
   347  	})
   348  	if err != nil {
   349  		return nil, err
   350  	}
   351  
   352  	var infoData string
   353  	if v, ok := object.GetColumnMap().Columns["Info"]; ok && len(v) > 0 {
   354  		infoData = v[0].Value.(string)
   355  	}
   356  	lockInfo := &statemgr.LockInfo{}
   357  	err = json.Unmarshal([]byte(infoData), lockInfo)
   358  	if err != nil {
   359  		return nil, err
   360  	}
   361  	return lockInfo, nil
   362  }
   363  func (c *RemoteClient) Unlock(id string) error {
   364  	if c.otsTable == "" {
   365  		return nil
   366  	}
   367  
   368  	lockErr := &statemgr.LockError{}
   369  
   370  	lockInfo, err := c.getLockInfo()
   371  	if err != nil {
   372  		lockErr.Err = fmt.Errorf("failed to retrieve lock info: %w", err)
   373  		return lockErr
   374  	}
   375  	lockErr.Info = lockInfo
   376  
   377  	if lockInfo.ID != id {
   378  		lockErr.Err = fmt.Errorf("lock id %q does not match existing lock", id)
   379  		return lockErr
   380  	}
   381  	params := &tablestore.DeleteRowRequest{
   382  		DeleteRowChange: &tablestore.DeleteRowChange{
   383  			TableName: c.otsTable,
   384  			PrimaryKey: &tablestore.PrimaryKey{
   385  				PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   386  					{
   387  						ColumnName: pkName,
   388  						Value:      c.lockPath(),
   389  					},
   390  				},
   391  			},
   392  			Condition: &tablestore.RowCondition{
   393  				RowExistenceExpectation: tablestore.RowExistenceExpectation_IGNORE,
   394  			},
   395  		},
   396  	}
   397  
   398  	_, err = c.otsClient.DeleteRow(params)
   399  
   400  	if err != nil {
   401  		lockErr.Err = err
   402  		return lockErr
   403  	}
   404  
   405  	return nil
   406  }
   407  
   408  func (c *RemoteClient) lockPath() string {
   409  	return fmt.Sprintf("%s/%s", c.bucketName, c.stateFile)
   410  }
   411  
   412  func (c *RemoteClient) getObj() (*remote.Payload, error) {
   413  	bucket, err := c.ossClient.Bucket(c.bucketName)
   414  	if err != nil {
   415  		return nil, fmt.Errorf("error getting bucket %s: %w", c.bucketName, err)
   416  	}
   417  
   418  	if exist, err := bucket.IsObjectExist(c.stateFile); err != nil {
   419  		return nil, fmt.Errorf("estimating object %s is exist got an error: %w", c.stateFile, err)
   420  	} else if !exist {
   421  		return nil, nil
   422  	}
   423  
   424  	var options []oss.Option
   425  	output, err := bucket.GetObject(c.stateFile, options...)
   426  	if err != nil {
   427  		return nil, fmt.Errorf("error getting object: %w", err)
   428  	}
   429  
   430  	buf := bytes.NewBuffer(nil)
   431  	if _, err := io.Copy(buf, output); err != nil {
   432  		return nil, fmt.Errorf("failed to read remote state: %w", err)
   433  	}
   434  	sum := md5.Sum(buf.Bytes())
   435  	payload := &remote.Payload{
   436  		Data: buf.Bytes(),
   437  		MD5:  sum[:],
   438  	}
   439  
   440  	// If there was no data, then return nil
   441  	if len(payload.Data) == 0 {
   442  		return nil, nil
   443  	}
   444  
   445  	return payload, nil
   446  }
   447  
// errBadChecksumFmt is the format of the error Get returns when the MD5 of the
// fetched state still differs from the digest recorded in tablestore after the
// retry deadline has passed. Its single %x verb is filled with the MD5 of the
// state that was actually read.
const errBadChecksumFmt = `state data in OSS does not have the expected content.

This may be caused by unusually long delays in OSS processing a previous state
update.  Please wait for a minute or two and try again. If this problem
persists, and neither OSS nor TableStore are experiencing an outage, you may need
to manually verify the remote state and update the Digest value stored in the
TableStore table to the following value: %x`