github.imxd.top/hashicorp/consul@v1.4.5/agent/consul/acl_replication.go

package consul

import (
	"bytes"
	"context"
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/consul/agent/structs"
)

const (
	// aclReplicationMaxRetryBackoff is the max number of seconds to sleep between ACL replication RPC errors
	aclReplicationMaxRetryBackoff = 64
)

func diffACLPolicies(local structs.ACLPolicies, remote structs.ACLPolicyListStubs, lastRemoteIndex uint64) ([]string, []string) {
	local.Sort()
	remote.Sort()

	var deletions []string
	var updates []string
	var localIdx int
	var remoteIdx int
	for localIdx, remoteIdx = 0, 0; localIdx < len(local) && remoteIdx < len(remote); {
		if local[localIdx].ID == remote[remoteIdx].ID {
			// policy is in both the local and remote state - need to check raft indices and the Hash
			if remote[remoteIdx].ModifyIndex > lastRemoteIndex && !bytes.Equal(remote[remoteIdx].Hash, local[localIdx].Hash) {
				updates = append(updates, remote[remoteIdx].ID)
			}
			// increment both indices when equal
			localIdx += 1
			remoteIdx += 1
		} else if local[localIdx].ID < remote[remoteIdx].ID {
			// policy no longer in remote state - needs deleting
			deletions = append(deletions, local[localIdx].ID)

			// increment just the local index
			localIdx += 1
		} else {
			// local state doesn't have this policy - needs updating
			updates = append(updates, remote[remoteIdx].ID)

			// increment just the remote index
			remoteIdx += 1
		}
	}

	for ; localIdx < len(local); localIdx += 1 {
		deletions = append(deletions, local[localIdx].ID)
	}

	for ; remoteIdx < len(remote); remoteIdx += 1 {
		updates = append(updates, remote[remoteIdx].ID)
	}

	return deletions, updates
}

func (s *Server) deleteLocalACLPolicies(deletions []string, ctx context.Context) (bool, error) {
	ticker := time.NewTicker(time.Second / time.Duration(s.config.ACLReplicationApplyLimit))
	defer ticker.Stop()

	for i := 0; i < len(deletions); i += aclBatchDeleteSize {
		req := structs.ACLPolicyBatchDeleteRequest{}

		if i+aclBatchDeleteSize > len(deletions) {
			req.PolicyIDs = deletions[i:]
		} else {
			req.PolicyIDs = deletions[i : i+aclBatchDeleteSize]
		}

		resp, err := s.raftApply(structs.ACLPolicyDeleteRequestType, &req)
		if err != nil {
			return false, fmt.Errorf("Failed to apply policy deletions: %v", err)
		}
		if respErr, ok := resp.(error); ok && respErr != nil {
			return false, fmt.Errorf("Failed to apply policy deletions: %v", respErr)
		}

		if i+aclBatchDeleteSize < len(deletions) {
			select {
			case <-ctx.Done():
				return true, nil
			case <-ticker.C:
				// do nothing - ready for the next batch
			}
		}
	}

	return false, nil
}
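// updateLocalACLPolicies applies the given remote policies to the local state
// via raft, batching them by estimated size (aclBatchUpsertSize) and rate
// limiting batch submission according to ACLReplicationApplyLimit. It returns
// true when the context is cancelled before all batches have been applied.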
func (s *Server) updateLocalACLPolicies(policies structs.ACLPolicies, ctx context.Context) (bool, error) {
	ticker := time.NewTicker(time.Second / time.Duration(s.config.ACLReplicationApplyLimit))
	defer ticker.Stop()

	// outer loop handles submitting a batch
	for batchStart := 0; batchStart < len(policies); {
		// inner loop finds the last element to include in this batch.
		batchSize := 0
		batchEnd := batchStart
		for ; batchEnd < len(policies) && batchSize < aclBatchUpsertSize; batchEnd += 1 {
			batchSize += policies[batchEnd].EstimateSize()
		}

		req := structs.ACLPolicyBatchSetRequest{
			Policies: policies[batchStart:batchEnd],
		}

		resp, err := s.raftApply(structs.ACLPolicySetRequestType, &req)
		if err != nil {
			return false, fmt.Errorf("Failed to apply policy upserts: %v", err)
		}
		if respErr, ok := resp.(error); ok && respErr != nil {
			return false, fmt.Errorf("Failed to apply policy upserts: %v", respErr)
		}
		s.logger.Printf("[DEBUG] acl: policy replication - upserted 1 batch with %d policies of size %d", batchEnd-batchStart, batchSize)

		// policies[batchEnd] wasn't included because slicing excludes the element at the stop index
		batchStart = batchEnd

		// prevent waiting if we are done
		if batchEnd < len(policies) {
			select {
			case <-ctx.Done():
				return true, nil
			case <-ticker.C:
				// nothing to do - just rate limiting
			}
		}
	}
	return false, nil
}

func (s *Server) fetchACLPoliciesBatch(policyIDs []string) (*structs.ACLPolicyBatchResponse, error) {
	req := structs.ACLPolicyBatchGetRequest{
		Datacenter: s.config.ACLDatacenter,
		PolicyIDs:  policyIDs,
		QueryOptions: structs.QueryOptions{
			AllowStale: true,
			Token:      s.tokens.ReplicationToken(),
		},
	}

	var response structs.ACLPolicyBatchResponse
	if err := s.RPC("ACL.PolicyBatchRead", &req, &response); err != nil {
		return nil, err
	}

	return &response, nil
}

func (s *Server) fetchACLPolicies(lastRemoteIndex uint64) (*structs.ACLPolicyListResponse, error) {
	defer metrics.MeasureSince([]string{"leader", "replication", "acl", "policy", "fetch"}, time.Now())

	req := structs.ACLPolicyListRequest{
		Datacenter: s.config.ACLDatacenter,
		QueryOptions: structs.QueryOptions{
			AllowStale:    true,
			MinQueryIndex: lastRemoteIndex,
			Token:         s.tokens.ReplicationToken(),
		},
	}

	var response structs.ACLPolicyListResponse
	if err := s.RPC("ACL.PolicyList", &req, &response); err != nil {
		return nil, err
	}
	return &response, nil
}

type tokenDiffResults struct {
	LocalDeletes  []string
	LocalUpserts  []string
	LocalSkipped  int
	RemoteSkipped int
}
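// diffACLTokens compares the sorted local tokens against the remote token
// stubs by AccessorID and reports which accessor IDs need to be deleted or
// upserted locally. Entries with an empty AccessorID are not replicated and
// are counted in LocalSkipped/RemoteSkipped instead.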
func diffACLTokens(local structs.ACLTokens, remote structs.ACLTokenListStubs, lastRemoteIndex uint64) tokenDiffResults {
	// Note: items with empty AccessorIDs will bubble up to the top.
	local.Sort()
	remote.Sort()

	var res tokenDiffResults
	var localIdx int
	var remoteIdx int
	for localIdx, remoteIdx = 0, 0; localIdx < len(local) && remoteIdx < len(remote); {
		if local[localIdx].AccessorID == "" {
			res.LocalSkipped++
			localIdx += 1
			continue
		}
		if remote[remoteIdx].AccessorID == "" {
			res.RemoteSkipped++
			remoteIdx += 1
			continue
		}
		if local[localIdx].AccessorID == remote[remoteIdx].AccessorID {
			// token is in both the local and remote state - need to check raft indices and Hash
			if remote[remoteIdx].ModifyIndex > lastRemoteIndex && !bytes.Equal(remote[remoteIdx].Hash, local[localIdx].Hash) {
				res.LocalUpserts = append(res.LocalUpserts, remote[remoteIdx].AccessorID)
			}
			// increment both indices when equal
			localIdx += 1
			remoteIdx += 1
		} else if local[localIdx].AccessorID < remote[remoteIdx].AccessorID {
			// token no longer in remote state - needs deleting
			res.LocalDeletes = append(res.LocalDeletes, local[localIdx].AccessorID)

			// increment just the local index
			localIdx += 1
		} else {
			// local state doesn't have this token - needs updating
			res.LocalUpserts = append(res.LocalUpserts, remote[remoteIdx].AccessorID)

			// increment just the remote index
			remoteIdx += 1
		}
	}

	for ; localIdx < len(local); localIdx += 1 {
		if local[localIdx].AccessorID != "" {
			res.LocalDeletes = append(res.LocalDeletes, local[localIdx].AccessorID)
		} else {
			res.LocalSkipped++
		}
	}

	for ; remoteIdx < len(remote); remoteIdx += 1 {
		if remote[remoteIdx].AccessorID != "" {
			res.LocalUpserts = append(res.LocalUpserts, remote[remoteIdx].AccessorID)
		} else {
			res.RemoteSkipped++
		}
	}

	return res
}

func (s *Server) deleteLocalACLTokens(deletions []string, ctx context.Context) (bool, error) {
	ticker := time.NewTicker(time.Second / time.Duration(s.config.ACLReplicationApplyLimit))
	defer ticker.Stop()

	for i := 0; i < len(deletions); i += aclBatchDeleteSize {
		req := structs.ACLTokenBatchDeleteRequest{}

		if i+aclBatchDeleteSize > len(deletions) {
			req.TokenIDs = deletions[i:]
		} else {
			req.TokenIDs = deletions[i : i+aclBatchDeleteSize]
		}

		resp, err := s.raftApply(structs.ACLTokenDeleteRequestType, &req)
		if err != nil {
			return false, fmt.Errorf("Failed to apply token deletions: %v", err)
		}
		if respErr, ok := resp.(error); ok && respErr != nil {
			return false, fmt.Errorf("Failed to apply token deletions: %v", respErr)
		}

		if i+aclBatchDeleteSize < len(deletions) {
			select {
			case <-ctx.Done():
				return true, nil
			case <-ticker.C:
				// do nothing - ready for the next batch
			}
		}
	}

	return false, nil
}
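// updateLocalACLTokens applies the given remote tokens to the local state via
// raft, batching them by estimated size (aclBatchUpsertSize) and rate limiting
// batch submission according to ACLReplicationApplyLimit. It aborts if any
// token secret arrives redacted, and returns true when the context is
// cancelled before all batches have been applied.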
func (s *Server) updateLocalACLTokens(tokens structs.ACLTokens, ctx context.Context) (bool, error) {
	ticker := time.NewTicker(time.Second / time.Duration(s.config.ACLReplicationApplyLimit))
	defer ticker.Stop()

	// outer loop handles submitting a batch
	for batchStart := 0; batchStart < len(tokens); {
		// inner loop finds the last element to include in this batch.
		batchSize := 0
		batchEnd := batchStart
		for ; batchEnd < len(tokens) && batchSize < aclBatchUpsertSize; batchEnd += 1 {
			if tokens[batchEnd].SecretID == redactedToken {
				return false, fmt.Errorf("Detected redacted token secrets: stopping token update round - verify that the replication token in use has acl:write permissions.")
			}
			batchSize += tokens[batchEnd].EstimateSize()
		}

		req := structs.ACLTokenBatchSetRequest{
			Tokens: tokens[batchStart:batchEnd],
			CAS:    false,
		}

		resp, err := s.raftApply(structs.ACLTokenSetRequestType, &req)
		if err != nil {
			return false, fmt.Errorf("Failed to apply token upserts: %v", err)
		}
		if respErr, ok := resp.(error); ok && respErr != nil {
			return false, fmt.Errorf("Failed to apply token upserts: %v", respErr)
		}

		s.logger.Printf("[DEBUG] acl: token replication - upserted 1 batch with %d tokens of size %d", batchEnd-batchStart, batchSize)

		// tokens[batchEnd] wasn't included because slicing excludes the element at the stop index
		batchStart = batchEnd

		// prevent waiting if we are done
		if batchEnd < len(tokens) {
			select {
			case <-ctx.Done():
				return true, nil
			case <-ticker.C:
				// nothing to do - just rate limiting here
			}
		}
	}
	return false, nil
}

func (s *Server) fetchACLTokensBatch(tokenIDs []string) (*structs.ACLTokenBatchResponse, error) {
	req := structs.ACLTokenBatchGetRequest{
		Datacenter:  s.config.ACLDatacenter,
		AccessorIDs: tokenIDs,
		QueryOptions: structs.QueryOptions{
			AllowStale: true,
			Token:      s.tokens.ReplicationToken(),
		},
	}

	var response structs.ACLTokenBatchResponse
	if err := s.RPC("ACL.TokenBatchRead", &req, &response); err != nil {
		return nil, err
	}

	return &response, nil
}

func (s *Server) fetchACLTokens(lastRemoteIndex uint64) (*structs.ACLTokenListResponse, error) {
	defer metrics.MeasureSince([]string{"leader", "replication", "acl", "token", "fetch"}, time.Now())

	req := structs.ACLTokenListRequest{
		Datacenter: s.config.ACLDatacenter,
		QueryOptions: structs.QueryOptions{
			AllowStale:    true,
			MinQueryIndex: lastRemoteIndex,
			Token:         s.tokens.ReplicationToken(),
		},
		IncludeLocal:  false,
		IncludeGlobal: true,
	}

	var response structs.ACLTokenListResponse
	if err := s.RPC("ACL.TokenList", &req, &response); err != nil {
		return nil, err
	}
	return &response, nil
}
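// replicateACLPolicies performs one round of ACL policy replication: it
// fetches the remote policy list (blocking on lastRemoteIndex), diffs it
// against the local state, applies the necessary deletions and upserts, and
// returns the remote index that was synced. The bool result is true when the
// context was cancelled and the caller should exit.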
func (s *Server) replicateACLPolicies(lastRemoteIndex uint64, ctx context.Context) (uint64, bool, error) {
	remote, err := s.fetchACLPolicies(lastRemoteIndex)
	if err != nil {
		return 0, false, fmt.Errorf("failed to retrieve remote ACL policies: %v", err)
	}

	s.logger.Printf("[DEBUG] acl: finished fetching remote policies: %d", len(remote.Policies))

	// Need to check if we should be stopping. This will be common as the fetching process is a blocking
	// RPC which could have been hanging around for a long time and during that time leadership could
	// have been lost.
	select {
	case <-ctx.Done():
		return 0, true, nil
	default:
		// do nothing
	}

	// Measure everything after the remote query, which can block for long
	// periods of time. This metric is a good measure of how expensive the
	// replication process is.
	defer metrics.MeasureSince([]string{"leader", "replication", "acl", "policy", "apply"}, time.Now())

	_, local, err := s.fsm.State().ACLPolicyList(nil)
	if err != nil {
		return 0, false, fmt.Errorf("failed to retrieve local ACL policies: %v", err)
	}

	// If the remote index ever goes backwards, it's a good indication that
	// the remote side was rebuilt and we should do a full sync since we
	// can't make any assumptions about what's going on.
	if remote.QueryMeta.Index < lastRemoteIndex {
		s.logger.Printf("[WARN] consul: ACL policy replication remote index moved backwards (%d to %d), forcing a full ACL policy sync", lastRemoteIndex, remote.QueryMeta.Index)
		lastRemoteIndex = 0
	}

	s.logger.Printf("[DEBUG] acl: policy replication - local: %d, remote: %d", len(local), len(remote.Policies))

	// Calculate the changes required to bring the state into sync and then
	// apply them.
	deletions, updates := diffACLPolicies(local, remote.Policies, lastRemoteIndex)

	s.logger.Printf("[DEBUG] acl: policy replication - deletions: %d, updates: %d", len(deletions), len(updates))

	var policies *structs.ACLPolicyBatchResponse
	if len(updates) > 0 {
		policies, err = s.fetchACLPoliciesBatch(updates)
		if err != nil {
			return 0, false, fmt.Errorf("failed to retrieve ACL policy updates: %v", err)
		}
		s.logger.Printf("[DEBUG] acl: policy replication - downloaded %d policies", len(policies.Policies))
	}

	if len(deletions) > 0 {
		s.logger.Printf("[DEBUG] acl: policy replication - performing deletions")

		exit, err := s.deleteLocalACLPolicies(deletions, ctx)
		if exit {
			return 0, true, nil
		}
		if err != nil {
			return 0, false, fmt.Errorf("failed to delete local ACL policies: %v", err)
		}
		s.logger.Printf("[DEBUG] acl: policy replication - finished deletions")
	}

	if len(updates) > 0 {
		s.logger.Printf("[DEBUG] acl: policy replication - performing updates")
		exit, err := s.updateLocalACLPolicies(policies.Policies, ctx)
		if exit {
			return 0, true, nil
		}
		if err != nil {
			return 0, false, fmt.Errorf("failed to update local ACL policies: %v", err)
		}
		s.logger.Printf("[DEBUG] acl: policy replication - finished updates")
	}

	// Return the index we got back from the remote side, since we've synced
	// up with the remote state as of that index.
	return remote.QueryMeta.Index, false, nil
}
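// replicateACLTokens performs one round of ACL token replication: it fetches
// the list of global tokens from the ACL datacenter (blocking on
// lastRemoteIndex), diffs it against the local state, applies the necessary
// deletions and upserts, and returns the remote index that was synced. The
// bool result is true when the context was cancelled and the caller should
// exit.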
func (s *Server) replicateACLTokens(lastRemoteIndex uint64, ctx context.Context) (uint64, bool, error) {
	remote, err := s.fetchACLTokens(lastRemoteIndex)
	if err != nil {
		return 0, false, fmt.Errorf("failed to retrieve remote ACL tokens: %v", err)
	}

	s.logger.Printf("[DEBUG] acl: finished fetching remote tokens: %d", len(remote.Tokens))

	// Need to check if we should be stopping. This will be common as the fetching process is a blocking
	// RPC which could have been hanging around for a long time and during that time leadership could
	// have been lost.
	select {
	case <-ctx.Done():
		return 0, true, nil
	default:
		// do nothing
	}

	// Measure everything after the remote query, which can block for long
	// periods of time. This metric is a good measure of how expensive the
	// replication process is.
	defer metrics.MeasureSince([]string{"leader", "replication", "acl", "token", "apply"}, time.Now())

	_, local, err := s.fsm.State().ACLTokenList(nil, false, true, "")
	if err != nil {
		return 0, false, fmt.Errorf("failed to retrieve local ACL tokens: %v", err)
	}

	// If the remote index ever goes backwards, it's a good indication that
	// the remote side was rebuilt and we should do a full sync since we
	// can't make any assumptions about what's going on.
	if remote.QueryMeta.Index < lastRemoteIndex {
		s.logger.Printf("[WARN] consul: ACL token replication remote index moved backwards (%d to %d), forcing a full ACL token sync", lastRemoteIndex, remote.QueryMeta.Index)
		lastRemoteIndex = 0
	}

	s.logger.Printf("[DEBUG] acl: token replication - local: %d, remote: %d", len(local), len(remote.Tokens))

	// Calculate the changes required to bring the state into sync and then
	// apply them.
	res := diffACLTokens(local, remote.Tokens, lastRemoteIndex)
	if res.LocalSkipped > 0 || res.RemoteSkipped > 0 {
		s.logger.Printf("[DEBUG] acl: token replication - deletions: %d, updates: %d, skipped: %d, skippedRemote: %d",
			len(res.LocalDeletes), len(res.LocalUpserts), res.LocalSkipped, res.RemoteSkipped)
	} else {
		s.logger.Printf("[DEBUG] acl: token replication - deletions: %d, updates: %d", len(res.LocalDeletes), len(res.LocalUpserts))
	}

	var tokens *structs.ACLTokenBatchResponse
	if len(res.LocalUpserts) > 0 {
		tokens, err = s.fetchACLTokensBatch(res.LocalUpserts)
		if err != nil {
			return 0, false, fmt.Errorf("failed to retrieve ACL token updates: %v", err)
		} else if tokens.Redacted {
			return 0, false, fmt.Errorf("failed to retrieve unredacted tokens - replication token in use does not grant acl:write")
		}

		s.logger.Printf("[DEBUG] acl: token replication - downloaded %d tokens", len(tokens.Tokens))
	}

	if len(res.LocalDeletes) > 0 {
		s.logger.Printf("[DEBUG] acl: token replication - performing deletions")

		exit, err := s.deleteLocalACLTokens(res.LocalDeletes, ctx)
		if exit {
			return 0, true, nil
		}
		if err != nil {
			return 0, false, fmt.Errorf("failed to delete local ACL tokens: %v", err)
		}
		s.logger.Printf("[DEBUG] acl: token replication - finished deletions")
	}

	if len(res.LocalUpserts) > 0 {
		s.logger.Printf("[DEBUG] acl: token replication - performing updates")
		exit, err := s.updateLocalACLTokens(tokens.Tokens, ctx)
		if exit {
			return 0, true, nil
		}
		if err != nil {
			return 0, false, fmt.Errorf("failed to update local ACL tokens: %v", err)
		}
		s.logger.Printf("[DEBUG] acl: token replication - finished updates")
	}

	// Return the index we got back from the remote side, since we've synced
	// up with the remote state as of that index.
	return remote.QueryMeta.Index, false, nil
}
// IsACLReplicationEnabled returns true if ACL replication is enabled.
// DEPRECATED (ACL-Legacy-Compat) - with new ACLs at least policy replication is required
func (s *Server) IsACLReplicationEnabled() bool {
	authDC := s.config.ACLDatacenter
	return len(authDC) > 0 && (authDC != s.config.Datacenter) &&
		s.config.ACLTokenReplication
}

func (s *Server) updateACLReplicationStatusError() {
	s.aclReplicationStatusLock.Lock()
	defer s.aclReplicationStatusLock.Unlock()

	s.aclReplicationStatus.LastError = time.Now().Round(time.Second).UTC()
}

func (s *Server) updateACLReplicationStatusIndex(index uint64) {
	s.aclReplicationStatusLock.Lock()
	defer s.aclReplicationStatusLock.Unlock()

	s.aclReplicationStatus.LastSuccess = time.Now().Round(time.Second).UTC()
	s.aclReplicationStatus.ReplicatedIndex = index
}

func (s *Server) updateACLReplicationStatusTokenIndex(index uint64) {
	s.aclReplicationStatusLock.Lock()
	defer s.aclReplicationStatusLock.Unlock()

	s.aclReplicationStatus.LastSuccess = time.Now().Round(time.Second).UTC()
	s.aclReplicationStatus.ReplicatedTokenIndex = index
}

func (s *Server) initReplicationStatus() {
	s.aclReplicationStatusLock.Lock()
	defer s.aclReplicationStatusLock.Unlock()

	s.aclReplicationStatus.Enabled = true
	s.aclReplicationStatus.Running = true
	s.aclReplicationStatus.SourceDatacenter = s.config.ACLDatacenter
}

func (s *Server) updateACLReplicationStatusStopped() {
	s.aclReplicationStatusLock.Lock()
	defer s.aclReplicationStatusLock.Unlock()

	s.aclReplicationStatus.Running = false
}

func (s *Server) updateACLReplicationStatusRunning(replicationType structs.ACLReplicationType) {
	s.aclReplicationStatusLock.Lock()
	defer s.aclReplicationStatusLock.Unlock()

	s.aclReplicationStatus.Running = true
	s.aclReplicationStatus.ReplicationType = replicationType
}
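// The loops that drive replicateACLPolicies and replicateACLTokens live
// outside this file, in the leader code; they are expected to feed the
// returned remote index back in as lastRemoteIndex on the next round and to
// back off between failed rounds, sleeping at most
// aclReplicationMaxRetryBackoff seconds (see the constant above).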