github.com/terramate-io/tf@v0.0.0-20230830114523-fce866b4dfcd/backend/remote-state/oss/client.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package oss 5 6 import ( 7 "bytes" 8 "crypto/md5" 9 "encoding/hex" 10 "encoding/json" 11 "fmt" 12 "io" 13 "log" 14 "time" 15 16 "github.com/aliyun/aliyun-oss-go-sdk/oss" 17 "github.com/aliyun/aliyun-tablestore-go-sdk/tablestore" 18 "github.com/hashicorp/go-multierror" 19 uuid "github.com/hashicorp/go-uuid" 20 "github.com/pkg/errors" 21 22 "github.com/terramate-io/tf/states/remote" 23 "github.com/terramate-io/tf/states/statemgr" 24 ) 25 26 const ( 27 // Store the last saved serial in tablestore with this suffix for consistency checks. 28 stateIDSuffix = "-md5" 29 30 pkName = "LockID" 31 ) 32 33 var ( 34 // The amount of time we will retry a state waiting for it to match the 35 // expected checksum. 36 consistencyRetryTimeout = 10 * time.Second 37 38 // delay when polling the state 39 consistencyRetryPollInterval = 2 * time.Second 40 ) 41 42 // test hook called when checksums don't match 43 var testChecksumHook func() 44 45 type RemoteClient struct { 46 ossClient *oss.Client 47 otsClient *tablestore.TableStoreClient 48 bucketName string 49 stateFile string 50 lockFile string 51 serverSideEncryption bool 52 acl string 53 otsTable string 54 } 55 56 func (c *RemoteClient) Get() (payload *remote.Payload, err error) { 57 deadline := time.Now().Add(consistencyRetryTimeout) 58 59 // If we have a checksum, and the returned payload doesn't match, we retry 60 // up until deadline. 61 for { 62 payload, err = c.getObj() 63 if err != nil { 64 return nil, err 65 } 66 67 // If the remote state was manually removed the payload will be nil, 68 // but if there's still a digest entry for that state we will still try 69 // to compare the MD5 below. 70 var digest []byte 71 if payload != nil { 72 digest = payload.MD5 73 } 74 75 // verify that this state is what we expect 76 if expected, err := c.getMD5(); err != nil { 77 log.Printf("[WARN] failed to fetch state md5: %s", err) 78 } else if len(expected) > 0 && !bytes.Equal(expected, digest) { 79 log.Printf("[WARN] state md5 mismatch: expected '%x', got '%x'", expected, digest) 80 81 if testChecksumHook != nil { 82 testChecksumHook() 83 } 84 85 if time.Now().Before(deadline) { 86 time.Sleep(consistencyRetryPollInterval) 87 log.Println("[INFO] retrying OSS RemoteClient.Get...") 88 continue 89 } 90 91 return nil, fmt.Errorf(errBadChecksumFmt, digest) 92 } 93 94 break 95 } 96 return payload, nil 97 } 98 99 func (c *RemoteClient) Put(data []byte) error { 100 bucket, err := c.ossClient.Bucket(c.bucketName) 101 if err != nil { 102 return fmt.Errorf("error getting bucket: %#v", err) 103 } 104 105 body := bytes.NewReader(data) 106 107 var options []oss.Option 108 if c.acl != "" { 109 options = append(options, oss.ACL(oss.ACLType(c.acl))) 110 } 111 options = append(options, oss.ContentType("application/json")) 112 if c.serverSideEncryption { 113 options = append(options, oss.ServerSideEncryption("AES256")) 114 } 115 options = append(options, oss.ContentLength(int64(len(data)))) 116 117 if body != nil { 118 if err := bucket.PutObject(c.stateFile, body, options...); err != nil { 119 return fmt.Errorf("failed to upload state %s: %#v", c.stateFile, err) 120 } 121 } 122 123 sum := md5.Sum(data) 124 if err := c.putMD5(sum[:]); err != nil { 125 // if this errors out, we unfortunately have to error out altogether, 126 // since the next Get will inevitably fail. 127 return fmt.Errorf("failed to store state MD5: %s", err) 128 } 129 return nil 130 } 131 132 func (c *RemoteClient) Delete() error { 133 bucket, err := c.ossClient.Bucket(c.bucketName) 134 if err != nil { 135 return fmt.Errorf("error getting bucket %s: %#v", c.bucketName, err) 136 } 137 138 log.Printf("[DEBUG] Deleting remote state from OSS: %#v", c.stateFile) 139 140 if err := bucket.DeleteObject(c.stateFile); err != nil { 141 return fmt.Errorf("error deleting state %s: %#v", c.stateFile, err) 142 } 143 144 if err := c.deleteMD5(); err != nil { 145 log.Printf("[WARN] Error deleting state MD5: %s", err) 146 } 147 return nil 148 } 149 150 func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) { 151 if c.otsTable == "" { 152 return "", nil 153 } 154 155 info.Path = c.lockPath() 156 157 if info.ID == "" { 158 lockID, err := uuid.GenerateUUID() 159 if err != nil { 160 return "", err 161 } 162 info.ID = lockID 163 } 164 165 putParams := &tablestore.PutRowChange{ 166 TableName: c.otsTable, 167 PrimaryKey: &tablestore.PrimaryKey{ 168 PrimaryKeys: []*tablestore.PrimaryKeyColumn{ 169 { 170 ColumnName: pkName, 171 Value: c.lockPath(), 172 }, 173 }, 174 }, 175 Columns: []tablestore.AttributeColumn{ 176 { 177 ColumnName: "Info", 178 Value: string(info.Marshal()), 179 }, 180 }, 181 Condition: &tablestore.RowCondition{ 182 RowExistenceExpectation: tablestore.RowExistenceExpectation_EXPECT_NOT_EXIST, 183 }, 184 } 185 186 log.Printf("[DEBUG] Recording state lock in tablestore: %#v; LOCKID:%s", putParams, c.lockPath()) 187 188 _, err := c.otsClient.PutRow(&tablestore.PutRowRequest{ 189 PutRowChange: putParams, 190 }) 191 if err != nil { 192 err = fmt.Errorf("invoking PutRow got an error: %#v", err) 193 lockInfo, infoErr := c.getLockInfo() 194 if infoErr != nil { 195 err = multierror.Append(err, fmt.Errorf("\ngetting lock info got an error: %#v", infoErr)) 196 } 197 lockErr := &statemgr.LockError{ 198 Err: err, 199 Info: lockInfo, 200 } 201 log.Printf("[ERROR] state lock error: %s", lockErr.Error()) 202 return "", lockErr 203 } 204 205 return info.ID, nil 206 } 207 208 func (c *RemoteClient) getMD5() ([]byte, error) { 209 if c.otsTable == "" { 210 return nil, nil 211 } 212 213 getParams := &tablestore.SingleRowQueryCriteria{ 214 TableName: c.otsTable, 215 PrimaryKey: &tablestore.PrimaryKey{ 216 PrimaryKeys: []*tablestore.PrimaryKeyColumn{ 217 { 218 ColumnName: pkName, 219 Value: c.lockPath() + stateIDSuffix, 220 }, 221 }, 222 }, 223 ColumnsToGet: []string{pkName, "Digest"}, 224 MaxVersion: 1, 225 } 226 227 log.Printf("[DEBUG] Retrieving state serial in tablestore: %#v", getParams) 228 229 object, err := c.otsClient.GetRow(&tablestore.GetRowRequest{ 230 SingleRowQueryCriteria: getParams, 231 }) 232 233 if err != nil { 234 return nil, err 235 } 236 237 var val string 238 if v, ok := object.GetColumnMap().Columns["Digest"]; ok && len(v) > 0 { 239 val = v[0].Value.(string) 240 } 241 242 sum, err := hex.DecodeString(val) 243 if err != nil || len(sum) != md5.Size { 244 return nil, errors.New("invalid md5") 245 } 246 247 return sum, nil 248 } 249 250 // store the hash of the state to that clients can check for stale state files. 251 func (c *RemoteClient) putMD5(sum []byte) error { 252 if c.otsTable == "" { 253 return nil 254 } 255 256 if len(sum) != md5.Size { 257 return errors.New("invalid payload md5") 258 } 259 260 putParams := &tablestore.PutRowChange{ 261 TableName: c.otsTable, 262 PrimaryKey: &tablestore.PrimaryKey{ 263 PrimaryKeys: []*tablestore.PrimaryKeyColumn{ 264 { 265 ColumnName: pkName, 266 Value: c.lockPath() + stateIDSuffix, 267 }, 268 }, 269 }, 270 Columns: []tablestore.AttributeColumn{ 271 { 272 ColumnName: "Digest", 273 Value: hex.EncodeToString(sum), 274 }, 275 }, 276 Condition: &tablestore.RowCondition{ 277 RowExistenceExpectation: tablestore.RowExistenceExpectation_IGNORE, 278 }, 279 } 280 281 log.Printf("[DEBUG] Recoring state serial in tablestore: %#v", putParams) 282 283 _, err := c.otsClient.PutRow(&tablestore.PutRowRequest{ 284 PutRowChange: putParams, 285 }) 286 287 if err != nil { 288 log.Printf("[WARN] failed to record state serial in tablestore: %s", err) 289 } 290 291 return nil 292 } 293 294 // remove the hash value for a deleted state 295 func (c *RemoteClient) deleteMD5() error { 296 if c.otsTable == "" { 297 return nil 298 } 299 300 params := &tablestore.DeleteRowRequest{ 301 DeleteRowChange: &tablestore.DeleteRowChange{ 302 TableName: c.otsTable, 303 PrimaryKey: &tablestore.PrimaryKey{ 304 PrimaryKeys: []*tablestore.PrimaryKeyColumn{ 305 { 306 ColumnName: pkName, 307 Value: c.lockPath() + stateIDSuffix, 308 }, 309 }, 310 }, 311 Condition: &tablestore.RowCondition{ 312 RowExistenceExpectation: tablestore.RowExistenceExpectation_EXPECT_EXIST, 313 }, 314 }, 315 } 316 317 log.Printf("[DEBUG] Deleting state serial in tablestore: %#v", params) 318 319 if _, err := c.otsClient.DeleteRow(params); err != nil { 320 return err 321 } 322 323 return nil 324 } 325 326 func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) { 327 getParams := &tablestore.SingleRowQueryCriteria{ 328 TableName: c.otsTable, 329 PrimaryKey: &tablestore.PrimaryKey{ 330 PrimaryKeys: []*tablestore.PrimaryKeyColumn{ 331 { 332 ColumnName: pkName, 333 Value: c.lockPath(), 334 }, 335 }, 336 }, 337 ColumnsToGet: []string{pkName, "Info"}, 338 MaxVersion: 1, 339 } 340 341 log.Printf("[DEBUG] Retrieving state lock info from tablestore: %#v", getParams) 342 343 object, err := c.otsClient.GetRow(&tablestore.GetRowRequest{ 344 SingleRowQueryCriteria: getParams, 345 }) 346 if err != nil { 347 return nil, err 348 } 349 350 var infoData string 351 if v, ok := object.GetColumnMap().Columns["Info"]; ok && len(v) > 0 { 352 infoData = v[0].Value.(string) 353 } 354 lockInfo := &statemgr.LockInfo{} 355 err = json.Unmarshal([]byte(infoData), lockInfo) 356 if err != nil { 357 return nil, err 358 } 359 return lockInfo, nil 360 } 361 func (c *RemoteClient) Unlock(id string) error { 362 if c.otsTable == "" { 363 return nil 364 } 365 366 lockErr := &statemgr.LockError{} 367 368 lockInfo, err := c.getLockInfo() 369 if err != nil { 370 lockErr.Err = fmt.Errorf("failed to retrieve lock info: %s", err) 371 return lockErr 372 } 373 lockErr.Info = lockInfo 374 375 if lockInfo.ID != id { 376 lockErr.Err = fmt.Errorf("lock id %q does not match existing lock", id) 377 return lockErr 378 } 379 params := &tablestore.DeleteRowRequest{ 380 DeleteRowChange: &tablestore.DeleteRowChange{ 381 TableName: c.otsTable, 382 PrimaryKey: &tablestore.PrimaryKey{ 383 PrimaryKeys: []*tablestore.PrimaryKeyColumn{ 384 { 385 ColumnName: pkName, 386 Value: c.lockPath(), 387 }, 388 }, 389 }, 390 Condition: &tablestore.RowCondition{ 391 RowExistenceExpectation: tablestore.RowExistenceExpectation_IGNORE, 392 }, 393 }, 394 } 395 396 _, err = c.otsClient.DeleteRow(params) 397 398 if err != nil { 399 lockErr.Err = err 400 return lockErr 401 } 402 403 return nil 404 } 405 406 func (c *RemoteClient) lockPath() string { 407 return fmt.Sprintf("%s/%s", c.bucketName, c.stateFile) 408 } 409 410 func (c *RemoteClient) getObj() (*remote.Payload, error) { 411 bucket, err := c.ossClient.Bucket(c.bucketName) 412 if err != nil { 413 return nil, fmt.Errorf("error getting bucket %s: %#v", c.bucketName, err) 414 } 415 416 if exist, err := bucket.IsObjectExist(c.stateFile); err != nil { 417 return nil, fmt.Errorf("estimating object %s is exist got an error: %#v", c.stateFile, err) 418 } else if !exist { 419 return nil, nil 420 } 421 422 var options []oss.Option 423 output, err := bucket.GetObject(c.stateFile, options...) 424 if err != nil { 425 return nil, fmt.Errorf("error getting object: %#v", err) 426 } 427 428 buf := bytes.NewBuffer(nil) 429 if _, err := io.Copy(buf, output); err != nil { 430 return nil, fmt.Errorf("failed to read remote state: %s", err) 431 } 432 sum := md5.Sum(buf.Bytes()) 433 payload := &remote.Payload{ 434 Data: buf.Bytes(), 435 MD5: sum[:], 436 } 437 438 // If there was no data, then return nil 439 if len(payload.Data) == 0 { 440 return nil, nil 441 } 442 443 return payload, nil 444 } 445 446 const errBadChecksumFmt = `state data in OSS does not have the expected content. 447 448 This may be caused by unusually long delays in OSS processing a previous state 449 update. Please wait for a minute or two and try again. If this problem 450 persists, and neither OSS nor TableStore are experiencing an outage, you may need 451 to manually verify the remote state and update the Digest value stored in the 452 TableStore table to the following value: %x`