github.com/Kevinklinger/open_terraform@v0.11.12-beta1/backend/remote-state/consul/client.go (about) 1 package consul 2 3 import ( 4 "bytes" 5 "compress/gzip" 6 "context" 7 "crypto/md5" 8 "encoding/json" 9 "errors" 10 "fmt" 11 "log" 12 "sync" 13 "time" 14 15 consulapi "github.com/hashicorp/consul/api" 16 multierror "github.com/hashicorp/go-multierror" 17 "github.com/hashicorp/terraform/state" 18 "github.com/hashicorp/terraform/state/remote" 19 ) 20 21 const ( 22 lockSuffix = "/.lock" 23 lockInfoSuffix = "/.lockinfo" 24 25 // The Session TTL associated with this lock. 26 lockSessionTTL = "15s" 27 28 // the delay time from when a session is lost to when the 29 // lock is released by the server 30 lockDelay = 5 * time.Second 31 // interval between attempts to reacquire a lost lock 32 lockReacquireInterval = 2 * time.Second 33 ) 34 35 var lostLockErr = errors.New("consul lock was lost") 36 37 // RemoteClient is a remote client that stores data in Consul. 38 type RemoteClient struct { 39 Client *consulapi.Client 40 Path string 41 GZip bool 42 43 mu sync.Mutex 44 // lockState is true if we're using locks 45 lockState bool 46 47 // The index of the last state we wrote. 48 // If this is > 0, Put will perform a CAS to ensure that the state wasn't 49 // changed during the operation. This is important even with locks, because 50 // if the client loses the lock for some reason, then reacquires it, we 51 // need to make sure that the state was not modified. 52 modifyIndex uint64 53 54 consulLock *consulapi.Lock 55 lockCh <-chan struct{} 56 57 info *state.LockInfo 58 59 // cancel our goroutine which is monitoring the lock to automatically 60 // reacquire it when possible. 61 monitorCancel context.CancelFunc 62 monitorWG sync.WaitGroup 63 64 // sessionCancel cancels the Context use for session.RenewPeriodic, and is 65 // called when unlocking, or before creating a new lock if the lock is 66 // lost. 67 sessionCancel context.CancelFunc 68 } 69 70 func (c *RemoteClient) Get() (*remote.Payload, error) { 71 c.mu.Lock() 72 defer c.mu.Unlock() 73 74 pair, _, err := c.Client.KV().Get(c.Path, nil) 75 if err != nil { 76 return nil, err 77 } 78 if pair == nil { 79 return nil, nil 80 } 81 82 c.modifyIndex = pair.ModifyIndex 83 84 payload := pair.Value 85 // If the payload starts with 0x1f, it's gzip, not json 86 if len(pair.Value) >= 1 && pair.Value[0] == '\x1f' { 87 if data, err := uncompressState(pair.Value); err == nil { 88 payload = data 89 } else { 90 return nil, err 91 } 92 } 93 94 md5 := md5.Sum(pair.Value) 95 return &remote.Payload{ 96 Data: payload, 97 MD5: md5[:], 98 }, nil 99 } 100 101 func (c *RemoteClient) Put(data []byte) error { 102 c.mu.Lock() 103 defer c.mu.Unlock() 104 105 payload := data 106 if c.GZip { 107 if compressedState, err := compressState(data); err == nil { 108 payload = compressedState 109 } else { 110 return err 111 } 112 } 113 114 kv := c.Client.KV() 115 116 // default to doing a CAS 117 verb := consulapi.KVCAS 118 119 // Assume a 0 index doesn't need a CAS for now, since we are either 120 // creating a new state or purposely overwriting one. 121 if c.modifyIndex == 0 { 122 verb = consulapi.KVSet 123 } 124 125 // KV.Put doesn't return the new index, so we use a single operation 126 // transaction to get the new index with a single request. 127 txOps := consulapi.KVTxnOps{ 128 &consulapi.KVTxnOp{ 129 Verb: verb, 130 Key: c.Path, 131 Value: payload, 132 Index: c.modifyIndex, 133 }, 134 } 135 136 ok, resp, _, err := kv.Txn(txOps, nil) 137 if err != nil { 138 return err 139 } 140 141 // transaction was rolled back 142 if !ok { 143 return fmt.Errorf("consul CAS failed with transaction errors: %v", resp.Errors) 144 } 145 146 if len(resp.Results) != 1 { 147 // this probably shouldn't happen 148 return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results)) 149 } 150 151 c.modifyIndex = resp.Results[0].ModifyIndex 152 return nil 153 } 154 155 func (c *RemoteClient) Delete() error { 156 c.mu.Lock() 157 defer c.mu.Unlock() 158 159 kv := c.Client.KV() 160 _, err := kv.Delete(c.Path, nil) 161 return err 162 } 163 164 func (c *RemoteClient) putLockInfo(info *state.LockInfo) error { 165 info.Path = c.Path 166 info.Created = time.Now().UTC() 167 168 kv := c.Client.KV() 169 _, err := kv.Put(&consulapi.KVPair{ 170 Key: c.Path + lockInfoSuffix, 171 Value: info.Marshal(), 172 }, nil) 173 174 return err 175 } 176 177 func (c *RemoteClient) getLockInfo() (*state.LockInfo, error) { 178 path := c.Path + lockInfoSuffix 179 pair, _, err := c.Client.KV().Get(path, nil) 180 if err != nil { 181 return nil, err 182 } 183 if pair == nil { 184 return nil, nil 185 } 186 187 li := &state.LockInfo{} 188 err = json.Unmarshal(pair.Value, li) 189 if err != nil { 190 return nil, fmt.Errorf("error unmarshaling lock info: %s", err) 191 } 192 193 return li, nil 194 } 195 196 func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) { 197 c.mu.Lock() 198 defer c.mu.Unlock() 199 200 if !c.lockState { 201 return "", nil 202 } 203 204 c.info = info 205 206 // These checks only are to ensure we strictly follow the specification. 207 // Terraform shouldn't ever re-lock, so provide errors for the 2 possible 208 // states if this is called. 209 select { 210 case <-c.lockCh: 211 // We had a lock, but lost it. 212 return "", errors.New("lost consul lock, cannot re-lock") 213 default: 214 if c.lockCh != nil { 215 // we have an active lock already 216 return "", fmt.Errorf("state %q already locked", c.Path) 217 } 218 } 219 220 return c.lock() 221 } 222 223 // the lock implementation. 224 // Only to be called while holding Client.mu 225 func (c *RemoteClient) lock() (string, error) { 226 // We create a new session here, so it can be canceled when the lock is 227 // lost or unlocked. 228 lockSession, err := c.createSession() 229 if err != nil { 230 return "", err 231 } 232 233 // store the session ID for correlation with consul logs 234 c.info.Info = "consul session: " + lockSession 235 236 opts := &consulapi.LockOptions{ 237 Key: c.Path + lockSuffix, 238 Session: lockSession, 239 240 // only wait briefly, so terraform has the choice to fail fast or 241 // retry as needed. 242 LockWaitTime: time.Second, 243 LockTryOnce: true, 244 245 // Don't let the lock monitor give up right away, as it's possible the 246 // session is still OK. While the session is refreshed at a rate of 247 // TTL/2, the lock monitor is an idle blocking request and is more 248 // susceptible to being closed by a lower network layer. 249 MonitorRetries: 5, 250 // 251 // The delay between lock monitor retries. 252 // While the session has a 15s TTL plus a 5s wait period on a lost 253 // lock, if we can't get our lock back in 10+ seconds something is 254 // wrong so we're going to drop the session and start over. 255 MonitorRetryTime: 2 * time.Second, 256 } 257 258 c.consulLock, err = c.Client.LockOpts(opts) 259 if err != nil { 260 return "", err 261 } 262 263 lockErr := &state.LockError{} 264 265 lockCh, err := c.consulLock.Lock(make(chan struct{})) 266 if err != nil { 267 lockErr.Err = err 268 return "", lockErr 269 } 270 271 if lockCh == nil { 272 lockInfo, e := c.getLockInfo() 273 if e != nil { 274 lockErr.Err = e 275 return "", lockErr 276 } 277 278 lockErr.Info = lockInfo 279 280 return "", lockErr 281 } 282 283 c.lockCh = lockCh 284 285 err = c.putLockInfo(c.info) 286 if err != nil { 287 if unlockErr := c.unlock(c.info.ID); unlockErr != nil { 288 err = multierror.Append(err, unlockErr) 289 } 290 291 return "", err 292 } 293 294 // Start a goroutine to monitor the lock state. 295 // If we lose the lock to due communication issues with the consul agent, 296 // attempt to immediately reacquire the lock. Put will verify the integrity 297 // of the state by using a CAS operation. 298 ctx, cancel := context.WithCancel(context.Background()) 299 c.monitorCancel = cancel 300 c.monitorWG.Add(1) 301 go func() { 302 defer c.monitorWG.Done() 303 select { 304 case <-c.lockCh: 305 log.Println("[ERROR] lost consul lock") 306 for { 307 c.mu.Lock() 308 // We lost our lock, so we need to cancel the session too. 309 // The CancelFunc is only replaced while holding Client.mu, so 310 // this is safe to call here. This will be replaced by the 311 // lock() call below. 312 c.sessionCancel() 313 314 c.consulLock = nil 315 _, err := c.lock() 316 c.mu.Unlock() 317 318 if err != nil { 319 // We failed to get the lock, keep trying as long as 320 // terraform is running. There may be changes in progress, 321 // so there's no use in aborting. Either we eventually 322 // reacquire the lock, or a Put will fail on a CAS. 323 log.Printf("[ERROR] could not reacquire lock: %s", err) 324 time.Sleep(lockReacquireInterval) 325 326 select { 327 case <-ctx.Done(): 328 return 329 default: 330 } 331 continue 332 } 333 334 // if the error was nil, the new lock started a new copy of 335 // this goroutine. 336 return 337 } 338 339 case <-ctx.Done(): 340 return 341 } 342 }() 343 344 if testLockHook != nil { 345 testLockHook() 346 } 347 348 return c.info.ID, nil 349 } 350 351 // called after a lock is acquired 352 var testLockHook func() 353 354 func (c *RemoteClient) createSession() (string, error) { 355 // create the context first. Even if the session creation fails, we assume 356 // that the CancelFunc is always callable. 357 ctx, cancel := context.WithCancel(context.Background()) 358 c.sessionCancel = cancel 359 360 session := c.Client.Session() 361 se := &consulapi.SessionEntry{ 362 Name: consulapi.DefaultLockSessionName, 363 TTL: lockSessionTTL, 364 LockDelay: lockDelay, 365 } 366 367 id, _, err := session.Create(se, nil) 368 if err != nil { 369 return "", err 370 } 371 372 log.Println("[INFO] created consul lock session", id) 373 374 // keep the session renewed 375 go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done()) 376 377 return id, nil 378 } 379 380 func (c *RemoteClient) Unlock(id string) error { 381 c.mu.Lock() 382 defer c.mu.Unlock() 383 384 if !c.lockState { 385 return nil 386 } 387 388 return c.unlock(id) 389 } 390 391 // the unlock implementation. 392 // Only to be called while holding Client.mu 393 func (c *RemoteClient) unlock(id string) error { 394 // this doesn't use the lock id, because the lock is tied to the consul client. 395 if c.consulLock == nil || c.lockCh == nil { 396 return nil 397 } 398 399 // cancel our monitoring goroutine 400 c.monitorCancel() 401 402 defer func() { 403 c.consulLock = nil 404 405 // The consul session is only used for this single lock, so cancel it 406 // after we unlock. 407 // The session is only created and replaced holding Client.mu, so the 408 // CancelFunc must be non-nil. 409 c.sessionCancel() 410 }() 411 412 select { 413 case <-c.lockCh: 414 return lostLockErr 415 default: 416 } 417 418 kv := c.Client.KV() 419 420 var errs error 421 422 if _, err := kv.Delete(c.Path+lockInfoSuffix, nil); err != nil { 423 errs = multierror.Append(errs, err) 424 } 425 426 if err := c.consulLock.Unlock(); err != nil { 427 errs = multierror.Append(errs, err) 428 } 429 430 // the monitoring goroutine may be in a select on the lockCh, so we need to 431 // wait for it to return before changing the value. 432 c.monitorWG.Wait() 433 c.lockCh = nil 434 435 // This is only cleanup, and will fail if the lock was immediately taken by 436 // another client, so we don't report an error to the user here. 437 c.consulLock.Destroy() 438 439 return errs 440 } 441 442 func compressState(data []byte) ([]byte, error) { 443 b := new(bytes.Buffer) 444 gz := gzip.NewWriter(b) 445 if _, err := gz.Write(data); err != nil { 446 return nil, err 447 } 448 if err := gz.Flush(); err != nil { 449 return nil, err 450 } 451 if err := gz.Close(); err != nil { 452 return nil, err 453 } 454 return b.Bytes(), nil 455 } 456 457 func uncompressState(data []byte) ([]byte, error) { 458 b := new(bytes.Buffer) 459 gz, err := gzip.NewReader(bytes.NewReader(data)) 460 if err != nil { 461 return nil, err 462 } 463 b.ReadFrom(gz) 464 if err := gz.Close(); err != nil { 465 return nil, err 466 } 467 return b.Bytes(), nil 468 }