github.com/terramate-io/tf@v0.0.0-20230830114523-fce866b4dfcd/backend/remote-state/oss/client.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package oss
     5  
     6  import (
     7  	"bytes"
     8  	"crypto/md5"
     9  	"encoding/hex"
    10  	"encoding/json"
    11  	"fmt"
    12  	"io"
    13  	"log"
    14  	"time"
    15  
    16  	"github.com/aliyun/aliyun-oss-go-sdk/oss"
    17  	"github.com/aliyun/aliyun-tablestore-go-sdk/tablestore"
    18  	"github.com/hashicorp/go-multierror"
    19  	uuid "github.com/hashicorp/go-uuid"
    20  	"github.com/pkg/errors"
    21  
    22  	"github.com/terramate-io/tf/states/remote"
    23  	"github.com/terramate-io/tf/states/statemgr"
    24  )
    25  
const (
	// stateIDSuffix is appended to the lock path to form the tablestore
	// primary key of the row holding the hex-encoded MD5 digest of the last
	// saved state, which is used for consistency checks.
	stateIDSuffix = "-md5"

	// pkName is the name of the primary key column used for every tablestore
	// row this client reads or writes (lock rows and digest rows).
	pkName = "LockID"
)
    32  
var (
	// The amount of time we will retry a state waiting for it to match the
	// expected checksum. Declared as a variable rather than a constant, so it
	// can be reassigned at runtime.
	consistencyRetryTimeout = 10 * time.Second

	// Delay between two consecutive reads while polling the state for a
	// matching checksum.
	consistencyRetryPollInterval = 2 * time.Second
)

// testChecksumHook is a test hook; when non-nil it is called from Get each
// time the fetched state's checksum does not match the recorded digest.
var testChecksumHook func()
    44  
// RemoteClient stores a single state object in an OSS bucket and, when a
// tablestore table is configured, uses that table for state locking and for
// recording the MD5 digest of the last written state.
//
// Fields:
//   - ossClient: OSS client used to obtain the bucket handle.
//   - otsClient: tablestore client used for lock and digest rows.
//   - bucketName: name of the OSS bucket holding the state object.
//   - stateFile: object key of the state within the bucket.
//   - lockFile: not referenced by the methods in this file.
//   - serverSideEncryption: when true, Put requests AES256 server-side
//     encryption for the uploaded object.
//   - acl: when non-empty, applied as the object ACL on Put.
//   - otsTable: tablestore table name; when empty, locking and digest
//     handling are skipped entirely.
type RemoteClient struct {
	ossClient            *oss.Client
	otsClient            *tablestore.TableStoreClient
	bucketName           string
	stateFile            string
	lockFile             string
	serverSideEncryption bool
	acl                  string
	otsTable             string
}
    55  
    56  func (c *RemoteClient) Get() (payload *remote.Payload, err error) {
    57  	deadline := time.Now().Add(consistencyRetryTimeout)
    58  
    59  	// If we have a checksum, and the returned payload doesn't match, we retry
    60  	// up until deadline.
    61  	for {
    62  		payload, err = c.getObj()
    63  		if err != nil {
    64  			return nil, err
    65  		}
    66  
    67  		// If the remote state was manually removed the payload will be nil,
    68  		// but if there's still a digest entry for that state we will still try
    69  		// to compare the MD5 below.
    70  		var digest []byte
    71  		if payload != nil {
    72  			digest = payload.MD5
    73  		}
    74  
    75  		// verify that this state is what we expect
    76  		if expected, err := c.getMD5(); err != nil {
    77  			log.Printf("[WARN] failed to fetch state md5: %s", err)
    78  		} else if len(expected) > 0 && !bytes.Equal(expected, digest) {
    79  			log.Printf("[WARN] state md5 mismatch: expected '%x', got '%x'", expected, digest)
    80  
    81  			if testChecksumHook != nil {
    82  				testChecksumHook()
    83  			}
    84  
    85  			if time.Now().Before(deadline) {
    86  				time.Sleep(consistencyRetryPollInterval)
    87  				log.Println("[INFO] retrying OSS RemoteClient.Get...")
    88  				continue
    89  			}
    90  
    91  			return nil, fmt.Errorf(errBadChecksumFmt, digest)
    92  		}
    93  
    94  		break
    95  	}
    96  	return payload, nil
    97  }
    98  
    99  func (c *RemoteClient) Put(data []byte) error {
   100  	bucket, err := c.ossClient.Bucket(c.bucketName)
   101  	if err != nil {
   102  		return fmt.Errorf("error getting bucket: %#v", err)
   103  	}
   104  
   105  	body := bytes.NewReader(data)
   106  
   107  	var options []oss.Option
   108  	if c.acl != "" {
   109  		options = append(options, oss.ACL(oss.ACLType(c.acl)))
   110  	}
   111  	options = append(options, oss.ContentType("application/json"))
   112  	if c.serverSideEncryption {
   113  		options = append(options, oss.ServerSideEncryption("AES256"))
   114  	}
   115  	options = append(options, oss.ContentLength(int64(len(data))))
   116  
   117  	if body != nil {
   118  		if err := bucket.PutObject(c.stateFile, body, options...); err != nil {
   119  			return fmt.Errorf("failed to upload state %s: %#v", c.stateFile, err)
   120  		}
   121  	}
   122  
   123  	sum := md5.Sum(data)
   124  	if err := c.putMD5(sum[:]); err != nil {
   125  		// if this errors out, we unfortunately have to error out altogether,
   126  		// since the next Get will inevitably fail.
   127  		return fmt.Errorf("failed to store state MD5: %s", err)
   128  	}
   129  	return nil
   130  }
   131  
   132  func (c *RemoteClient) Delete() error {
   133  	bucket, err := c.ossClient.Bucket(c.bucketName)
   134  	if err != nil {
   135  		return fmt.Errorf("error getting bucket %s: %#v", c.bucketName, err)
   136  	}
   137  
   138  	log.Printf("[DEBUG] Deleting remote state from OSS: %#v", c.stateFile)
   139  
   140  	if err := bucket.DeleteObject(c.stateFile); err != nil {
   141  		return fmt.Errorf("error deleting state %s: %#v", c.stateFile, err)
   142  	}
   143  
   144  	if err := c.deleteMD5(); err != nil {
   145  		log.Printf("[WARN] Error deleting state MD5: %s", err)
   146  	}
   147  	return nil
   148  }
   149  
   150  func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) {
   151  	if c.otsTable == "" {
   152  		return "", nil
   153  	}
   154  
   155  	info.Path = c.lockPath()
   156  
   157  	if info.ID == "" {
   158  		lockID, err := uuid.GenerateUUID()
   159  		if err != nil {
   160  			return "", err
   161  		}
   162  		info.ID = lockID
   163  	}
   164  
   165  	putParams := &tablestore.PutRowChange{
   166  		TableName: c.otsTable,
   167  		PrimaryKey: &tablestore.PrimaryKey{
   168  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   169  				{
   170  					ColumnName: pkName,
   171  					Value:      c.lockPath(),
   172  				},
   173  			},
   174  		},
   175  		Columns: []tablestore.AttributeColumn{
   176  			{
   177  				ColumnName: "Info",
   178  				Value:      string(info.Marshal()),
   179  			},
   180  		},
   181  		Condition: &tablestore.RowCondition{
   182  			RowExistenceExpectation: tablestore.RowExistenceExpectation_EXPECT_NOT_EXIST,
   183  		},
   184  	}
   185  
   186  	log.Printf("[DEBUG] Recording state lock in tablestore: %#v; LOCKID:%s", putParams, c.lockPath())
   187  
   188  	_, err := c.otsClient.PutRow(&tablestore.PutRowRequest{
   189  		PutRowChange: putParams,
   190  	})
   191  	if err != nil {
   192  		err = fmt.Errorf("invoking PutRow got an error: %#v", err)
   193  		lockInfo, infoErr := c.getLockInfo()
   194  		if infoErr != nil {
   195  			err = multierror.Append(err, fmt.Errorf("\ngetting lock info got an error: %#v", infoErr))
   196  		}
   197  		lockErr := &statemgr.LockError{
   198  			Err:  err,
   199  			Info: lockInfo,
   200  		}
   201  		log.Printf("[ERROR] state lock error: %s", lockErr.Error())
   202  		return "", lockErr
   203  	}
   204  
   205  	return info.ID, nil
   206  }
   207  
   208  func (c *RemoteClient) getMD5() ([]byte, error) {
   209  	if c.otsTable == "" {
   210  		return nil, nil
   211  	}
   212  
   213  	getParams := &tablestore.SingleRowQueryCriteria{
   214  		TableName: c.otsTable,
   215  		PrimaryKey: &tablestore.PrimaryKey{
   216  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   217  				{
   218  					ColumnName: pkName,
   219  					Value:      c.lockPath() + stateIDSuffix,
   220  				},
   221  			},
   222  		},
   223  		ColumnsToGet: []string{pkName, "Digest"},
   224  		MaxVersion:   1,
   225  	}
   226  
   227  	log.Printf("[DEBUG] Retrieving state serial in tablestore: %#v", getParams)
   228  
   229  	object, err := c.otsClient.GetRow(&tablestore.GetRowRequest{
   230  		SingleRowQueryCriteria: getParams,
   231  	})
   232  
   233  	if err != nil {
   234  		return nil, err
   235  	}
   236  
   237  	var val string
   238  	if v, ok := object.GetColumnMap().Columns["Digest"]; ok && len(v) > 0 {
   239  		val = v[0].Value.(string)
   240  	}
   241  
   242  	sum, err := hex.DecodeString(val)
   243  	if err != nil || len(sum) != md5.Size {
   244  		return nil, errors.New("invalid md5")
   245  	}
   246  
   247  	return sum, nil
   248  }
   249  
   250  // store the hash of the state to that clients can check for stale state files.
   251  func (c *RemoteClient) putMD5(sum []byte) error {
   252  	if c.otsTable == "" {
   253  		return nil
   254  	}
   255  
   256  	if len(sum) != md5.Size {
   257  		return errors.New("invalid payload md5")
   258  	}
   259  
   260  	putParams := &tablestore.PutRowChange{
   261  		TableName: c.otsTable,
   262  		PrimaryKey: &tablestore.PrimaryKey{
   263  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   264  				{
   265  					ColumnName: pkName,
   266  					Value:      c.lockPath() + stateIDSuffix,
   267  				},
   268  			},
   269  		},
   270  		Columns: []tablestore.AttributeColumn{
   271  			{
   272  				ColumnName: "Digest",
   273  				Value:      hex.EncodeToString(sum),
   274  			},
   275  		},
   276  		Condition: &tablestore.RowCondition{
   277  			RowExistenceExpectation: tablestore.RowExistenceExpectation_IGNORE,
   278  		},
   279  	}
   280  
   281  	log.Printf("[DEBUG] Recoring state serial in tablestore: %#v", putParams)
   282  
   283  	_, err := c.otsClient.PutRow(&tablestore.PutRowRequest{
   284  		PutRowChange: putParams,
   285  	})
   286  
   287  	if err != nil {
   288  		log.Printf("[WARN] failed to record state serial in tablestore: %s", err)
   289  	}
   290  
   291  	return nil
   292  }
   293  
   294  // remove the hash value for a deleted state
   295  func (c *RemoteClient) deleteMD5() error {
   296  	if c.otsTable == "" {
   297  		return nil
   298  	}
   299  
   300  	params := &tablestore.DeleteRowRequest{
   301  		DeleteRowChange: &tablestore.DeleteRowChange{
   302  			TableName: c.otsTable,
   303  			PrimaryKey: &tablestore.PrimaryKey{
   304  				PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   305  					{
   306  						ColumnName: pkName,
   307  						Value:      c.lockPath() + stateIDSuffix,
   308  					},
   309  				},
   310  			},
   311  			Condition: &tablestore.RowCondition{
   312  				RowExistenceExpectation: tablestore.RowExistenceExpectation_EXPECT_EXIST,
   313  			},
   314  		},
   315  	}
   316  
   317  	log.Printf("[DEBUG] Deleting state serial in tablestore: %#v", params)
   318  
   319  	if _, err := c.otsClient.DeleteRow(params); err != nil {
   320  		return err
   321  	}
   322  
   323  	return nil
   324  }
   325  
   326  func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) {
   327  	getParams := &tablestore.SingleRowQueryCriteria{
   328  		TableName: c.otsTable,
   329  		PrimaryKey: &tablestore.PrimaryKey{
   330  			PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   331  				{
   332  					ColumnName: pkName,
   333  					Value:      c.lockPath(),
   334  				},
   335  			},
   336  		},
   337  		ColumnsToGet: []string{pkName, "Info"},
   338  		MaxVersion:   1,
   339  	}
   340  
   341  	log.Printf("[DEBUG] Retrieving state lock info from tablestore: %#v", getParams)
   342  
   343  	object, err := c.otsClient.GetRow(&tablestore.GetRowRequest{
   344  		SingleRowQueryCriteria: getParams,
   345  	})
   346  	if err != nil {
   347  		return nil, err
   348  	}
   349  
   350  	var infoData string
   351  	if v, ok := object.GetColumnMap().Columns["Info"]; ok && len(v) > 0 {
   352  		infoData = v[0].Value.(string)
   353  	}
   354  	lockInfo := &statemgr.LockInfo{}
   355  	err = json.Unmarshal([]byte(infoData), lockInfo)
   356  	if err != nil {
   357  		return nil, err
   358  	}
   359  	return lockInfo, nil
   360  }
   361  func (c *RemoteClient) Unlock(id string) error {
   362  	if c.otsTable == "" {
   363  		return nil
   364  	}
   365  
   366  	lockErr := &statemgr.LockError{}
   367  
   368  	lockInfo, err := c.getLockInfo()
   369  	if err != nil {
   370  		lockErr.Err = fmt.Errorf("failed to retrieve lock info: %s", err)
   371  		return lockErr
   372  	}
   373  	lockErr.Info = lockInfo
   374  
   375  	if lockInfo.ID != id {
   376  		lockErr.Err = fmt.Errorf("lock id %q does not match existing lock", id)
   377  		return lockErr
   378  	}
   379  	params := &tablestore.DeleteRowRequest{
   380  		DeleteRowChange: &tablestore.DeleteRowChange{
   381  			TableName: c.otsTable,
   382  			PrimaryKey: &tablestore.PrimaryKey{
   383  				PrimaryKeys: []*tablestore.PrimaryKeyColumn{
   384  					{
   385  						ColumnName: pkName,
   386  						Value:      c.lockPath(),
   387  					},
   388  				},
   389  			},
   390  			Condition: &tablestore.RowCondition{
   391  				RowExistenceExpectation: tablestore.RowExistenceExpectation_IGNORE,
   392  			},
   393  		},
   394  	}
   395  
   396  	_, err = c.otsClient.DeleteRow(params)
   397  
   398  	if err != nil {
   399  		lockErr.Err = err
   400  		return lockErr
   401  	}
   402  
   403  	return nil
   404  }
   405  
   406  func (c *RemoteClient) lockPath() string {
   407  	return fmt.Sprintf("%s/%s", c.bucketName, c.stateFile)
   408  }
   409  
   410  func (c *RemoteClient) getObj() (*remote.Payload, error) {
   411  	bucket, err := c.ossClient.Bucket(c.bucketName)
   412  	if err != nil {
   413  		return nil, fmt.Errorf("error getting bucket %s: %#v", c.bucketName, err)
   414  	}
   415  
   416  	if exist, err := bucket.IsObjectExist(c.stateFile); err != nil {
   417  		return nil, fmt.Errorf("estimating object %s is exist got an error: %#v", c.stateFile, err)
   418  	} else if !exist {
   419  		return nil, nil
   420  	}
   421  
   422  	var options []oss.Option
   423  	output, err := bucket.GetObject(c.stateFile, options...)
   424  	if err != nil {
   425  		return nil, fmt.Errorf("error getting object: %#v", err)
   426  	}
   427  
   428  	buf := bytes.NewBuffer(nil)
   429  	if _, err := io.Copy(buf, output); err != nil {
   430  		return nil, fmt.Errorf("failed to read remote state: %s", err)
   431  	}
   432  	sum := md5.Sum(buf.Bytes())
   433  	payload := &remote.Payload{
   434  		Data: buf.Bytes(),
   435  		MD5:  sum[:],
   436  	}
   437  
   438  	// If there was no data, then return nil
   439  	if len(payload.Data) == 0 {
   440  		return nil, nil
   441  	}
   442  
   443  	return payload, nil
   444  }
   445  
// errBadChecksumFmt is the format string of the error returned by Get when the
// state read from OSS still does not match the digest recorded in TableStore
// once the retry deadline has passed. Its single %x verb receives the MD5
// digest of the state that was actually read.
const errBadChecksumFmt = `state data in OSS does not have the expected content.

This may be caused by unusually long delays in OSS processing a previous state
update.  Please wait for a minute or two and try again. If this problem
persists, and neither OSS nor TableStore are experiencing an outage, you may need
to manually verify the remote state and update the Digest value stored in the
TableStore table to the following value: %x`