github.com/pgray/terraform@v0.5.4-0.20170822184730-b6a464c5214d/backend/remote-state/consul/client.go (about) 1 package consul 2 3 import ( 4 "bytes" 5 "compress/gzip" 6 "context" 7 "crypto/md5" 8 "encoding/json" 9 "errors" 10 "fmt" 11 "log" 12 "sync" 13 "time" 14 15 consulapi "github.com/hashicorp/consul/api" 16 multierror "github.com/hashicorp/go-multierror" 17 "github.com/hashicorp/terraform/state" 18 "github.com/hashicorp/terraform/state/remote" 19 ) 20 21 const ( 22 lockSuffix = "/.lock" 23 lockInfoSuffix = "/.lockinfo" 24 25 // The Session TTL associated with this lock. 26 lockSessionTTL = "15s" 27 28 // the delay time from when a session is lost to when the 29 // lock is released by the server 30 lockDelay = 5 * time.Second 31 // interval between attempts to reacquire a lost lock 32 lockReacquireInterval = 2 * time.Second 33 ) 34 35 // RemoteClient is a remote client that stores data in Consul. 36 type RemoteClient struct { 37 Client *consulapi.Client 38 Path string 39 GZip bool 40 41 mu sync.Mutex 42 // lockState is true if we're using locks 43 lockState bool 44 45 // The index of the last state we wrote. 46 // If this is > 0, Put will perform a CAS to ensure that the state wasn't 47 // changed during the operation. This is important even with locks, because 48 // if the client loses the lock for some reason, then reacquires it, we 49 // need to make sure that the state was not modified. 50 modifyIndex uint64 51 52 consulLock *consulapi.Lock 53 lockCh <-chan struct{} 54 55 info *state.LockInfo 56 57 // cancel our goroutine which is monitoring the lock to automatically 58 // reacquire it when possible. 59 monitorCancel context.CancelFunc 60 monitorWG sync.WaitGroup 61 62 // sessionCancel cancels the Context use for session.RenewPeriodic, and is 63 // called when unlocking, or before creating a new lock if the lock is 64 // lost. 65 sessionCancel context.CancelFunc 66 } 67 68 func (c *RemoteClient) Get() (*remote.Payload, error) { 69 c.mu.Lock() 70 defer c.mu.Unlock() 71 72 pair, _, err := c.Client.KV().Get(c.Path, nil) 73 if err != nil { 74 return nil, err 75 } 76 if pair == nil { 77 return nil, nil 78 } 79 80 c.modifyIndex = pair.ModifyIndex 81 82 payload := pair.Value 83 // If the payload starts with 0x1f, it's gzip, not json 84 if len(pair.Value) >= 1 && pair.Value[0] == '\x1f' { 85 if data, err := uncompressState(pair.Value); err == nil { 86 payload = data 87 } else { 88 return nil, err 89 } 90 } 91 92 md5 := md5.Sum(pair.Value) 93 return &remote.Payload{ 94 Data: payload, 95 MD5: md5[:], 96 }, nil 97 } 98 99 func (c *RemoteClient) Put(data []byte) error { 100 c.mu.Lock() 101 defer c.mu.Unlock() 102 103 payload := data 104 if c.GZip { 105 if compressedState, err := compressState(data); err == nil { 106 payload = compressedState 107 } else { 108 return err 109 } 110 } 111 112 kv := c.Client.KV() 113 114 // default to doing a CAS 115 verb := consulapi.KVCAS 116 117 // Assume a 0 index doesn't need a CAS for now, since we are either 118 // creating a new state or purposely overwriting one. 119 if c.modifyIndex == 0 { 120 verb = consulapi.KVSet 121 } 122 123 // KV.Put doesn't return the new index, so we use a single operation 124 // transaction to get the new index with a single request. 125 txOps := consulapi.KVTxnOps{ 126 &consulapi.KVTxnOp{ 127 Verb: verb, 128 Key: c.Path, 129 Value: payload, 130 Index: c.modifyIndex, 131 }, 132 } 133 134 ok, resp, _, err := kv.Txn(txOps, nil) 135 if err != nil { 136 return err 137 } 138 139 // transaction was rolled back 140 if !ok { 141 return fmt.Errorf("consul CAS failed with transaction errors: %v", resp.Errors) 142 } 143 144 if len(resp.Results) != 1 { 145 // this probably shouldn't happen 146 return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results)) 147 } 148 149 c.modifyIndex = resp.Results[0].ModifyIndex 150 return nil 151 } 152 153 func (c *RemoteClient) Delete() error { 154 c.mu.Lock() 155 defer c.mu.Unlock() 156 157 kv := c.Client.KV() 158 _, err := kv.Delete(c.Path, nil) 159 return err 160 } 161 162 func (c *RemoteClient) putLockInfo(info *state.LockInfo) error { 163 info.Path = c.Path 164 info.Created = time.Now().UTC() 165 166 kv := c.Client.KV() 167 _, err := kv.Put(&consulapi.KVPair{ 168 Key: c.Path + lockInfoSuffix, 169 Value: info.Marshal(), 170 }, nil) 171 172 return err 173 } 174 175 func (c *RemoteClient) getLockInfo() (*state.LockInfo, error) { 176 path := c.Path + lockInfoSuffix 177 pair, _, err := c.Client.KV().Get(path, nil) 178 if err != nil { 179 return nil, err 180 } 181 if pair == nil { 182 return nil, nil 183 } 184 185 li := &state.LockInfo{} 186 err = json.Unmarshal(pair.Value, li) 187 if err != nil { 188 return nil, fmt.Errorf("error unmarshaling lock info: %s", err) 189 } 190 191 return li, nil 192 } 193 194 func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) { 195 c.mu.Lock() 196 defer c.mu.Unlock() 197 198 if !c.lockState { 199 return "", nil 200 } 201 202 c.info = info 203 204 // These checks only are to ensure we strictly follow the specification. 205 // Terraform shouldn't ever re-lock, so provide errors for the 2 possible 206 // states if this is called. 207 select { 208 case <-c.lockCh: 209 // We had a lock, but lost it. 210 return "", errors.New("lost consul lock, cannot re-lock") 211 default: 212 if c.lockCh != nil { 213 // we have an active lock already 214 return "", fmt.Errorf("state %q already locked", c.Path) 215 } 216 } 217 218 return c.lock() 219 } 220 221 // the lock implementation. 222 // Only to be called while holding Client.mu 223 func (c *RemoteClient) lock() (string, error) { 224 // We create a new session here, so it can be canceled when the lock is 225 // lost or unlocked. 226 lockSession, err := c.createSession() 227 if err != nil { 228 return "", err 229 } 230 231 opts := &consulapi.LockOptions{ 232 Key: c.Path + lockSuffix, 233 Session: lockSession, 234 235 // only wait briefly, so terraform has the choice to fail fast or 236 // retry as needed. 237 LockWaitTime: time.Second, 238 LockTryOnce: true, 239 240 // Don't let the lock monitor give up right away, as it's possible the 241 // session is still OK. While the session is refreshed at a rate of 242 // TTL/2, the lock monitor is an idle blocking request and is more 243 // susceptible to being closed by a lower network layer. 244 MonitorRetries: 5, 245 // 246 // The delay between lock monitor retries. 247 // While the session has a 15s TTL plus a 5s wait period on a lost 248 // lock, if we can't get our lock back in 10+ seconds something is 249 // wrong so we're going to drop the session and start over. 250 MonitorRetryTime: 2 * time.Second, 251 } 252 253 c.consulLock, err = c.Client.LockOpts(opts) 254 if err != nil { 255 return "", err 256 } 257 258 lockErr := &state.LockError{} 259 260 lockCh, err := c.consulLock.Lock(make(chan struct{})) 261 if err != nil { 262 lockErr.Err = err 263 return "", lockErr 264 } 265 266 if lockCh == nil { 267 lockInfo, e := c.getLockInfo() 268 if e != nil { 269 lockErr.Err = e 270 return "", lockErr 271 } 272 273 lockErr.Info = lockInfo 274 275 return "", lockErr 276 } 277 278 c.lockCh = lockCh 279 280 err = c.putLockInfo(c.info) 281 if err != nil { 282 if unlockErr := c.unlock(c.info.ID); unlockErr != nil { 283 err = multierror.Append(err, unlockErr) 284 } 285 286 return "", err 287 } 288 289 // Start a goroutine to monitor the lock state. 290 // If we lose the lock to due communication issues with the consul agent, 291 // attempt to immediately reacquire the lock. Put will verify the integrity 292 // of the state by using a CAS operation. 293 ctx, cancel := context.WithCancel(context.Background()) 294 c.monitorCancel = cancel 295 c.monitorWG.Add(1) 296 go func() { 297 defer c.monitorWG.Done() 298 select { 299 case <-c.lockCh: 300 log.Println("[ERROR] lost consul lock") 301 for { 302 c.mu.Lock() 303 // We lost our lock, so we need to cancel the session too. 304 // The CancelFunc is only replaced while holding Client.mu, so 305 // this is safe to call here. This will be replaced by the 306 // lock() call below. 307 c.sessionCancel() 308 309 c.consulLock = nil 310 _, err := c.lock() 311 c.mu.Unlock() 312 313 if err != nil { 314 // We failed to get the lock, keep trying as long as 315 // terraform is running. There may be changes in progress, 316 // so there's no use in aborting. Either we eventually 317 // reacquire the lock, or a Put will fail on a CAS. 318 log.Printf("[ERROR] could not reacquire lock: %s", err) 319 time.Sleep(lockReacquireInterval) 320 321 select { 322 case <-ctx.Done(): 323 return 324 default: 325 } 326 continue 327 } 328 329 // if the error was nil, the new lock started a new copy of 330 // this goroutine. 331 return 332 } 333 334 case <-ctx.Done(): 335 return 336 } 337 }() 338 339 if testLockHook != nil { 340 testLockHook() 341 } 342 343 return c.info.ID, nil 344 } 345 346 // called after a lock is acquired 347 var testLockHook func() 348 349 func (c *RemoteClient) createSession() (string, error) { 350 // create the context first. Even if the session creation fails, we assume 351 // that the CancelFunc is always callable. 352 ctx, cancel := context.WithCancel(context.Background()) 353 c.sessionCancel = cancel 354 355 session := c.Client.Session() 356 se := &consulapi.SessionEntry{ 357 Name: consulapi.DefaultLockSessionName, 358 TTL: lockSessionTTL, 359 LockDelay: lockDelay, 360 } 361 362 id, _, err := session.Create(se, nil) 363 if err != nil { 364 return "", err 365 } 366 367 log.Println("[INFO] created consul lock session", id) 368 369 // keep the session renewed 370 go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done()) 371 372 return id, nil 373 } 374 375 func (c *RemoteClient) Unlock(id string) error { 376 c.mu.Lock() 377 defer c.mu.Unlock() 378 379 if !c.lockState { 380 return nil 381 } 382 383 return c.unlock(id) 384 } 385 386 // the unlock implementation. 387 // Only to be called while holding Client.mu 388 func (c *RemoteClient) unlock(id string) error { 389 // this doesn't use the lock id, because the lock is tied to the consul client. 390 if c.consulLock == nil || c.lockCh == nil { 391 return nil 392 } 393 394 // cancel our monitoring goroutine 395 c.monitorCancel() 396 397 defer func() { 398 c.consulLock = nil 399 400 // The consul session is only used for this single lock, so cancel it 401 // after we unlock. 402 // The session is only created and replaced holding Client.mu, so the 403 // CancelFunc must be non-nil. 404 c.sessionCancel() 405 }() 406 407 select { 408 case <-c.lockCh: 409 return errors.New("consul lock was lost") 410 default: 411 } 412 413 kv := c.Client.KV() 414 415 var errs error 416 417 if _, err := kv.Delete(c.Path+lockInfoSuffix, nil); err != nil { 418 errs = multierror.Append(errs, err) 419 } 420 421 if err := c.consulLock.Unlock(); err != nil { 422 errs = multierror.Append(errs, err) 423 } 424 425 // the monitoring goroutine may be in a select on the lockCh, so we need to 426 // wait for it to return before changing the value. 427 c.monitorWG.Wait() 428 c.lockCh = nil 429 430 // This is only cleanup, and will fail if the lock was immediately taken by 431 // another client, so we don't report an error to the user here. 432 c.consulLock.Destroy() 433 434 return errs 435 } 436 437 func compressState(data []byte) ([]byte, error) { 438 b := new(bytes.Buffer) 439 gz := gzip.NewWriter(b) 440 if _, err := gz.Write(data); err != nil { 441 return nil, err 442 } 443 if err := gz.Flush(); err != nil { 444 return nil, err 445 } 446 if err := gz.Close(); err != nil { 447 return nil, err 448 } 449 return b.Bytes(), nil 450 } 451 452 func uncompressState(data []byte) ([]byte, error) { 453 b := new(bytes.Buffer) 454 gz, err := gzip.NewReader(bytes.NewReader(data)) 455 if err != nil { 456 return nil, err 457 } 458 b.ReadFrom(gz) 459 if err := gz.Close(); err != nil { 460 return nil, err 461 } 462 return b.Bytes(), nil 463 }