github.com/sl1pm4t/consul@v1.4.5-0.20190325224627-74c31c540f9c/agent/ae/ae.go

// Package ae provides tools to synchronize state between local and remote Consul servers.
package ae

import (
	"fmt"
	"log"
	"math"
	"sync"
	"time"

	"github.com/hashicorp/consul/lib"
)

// scaleThreshold is the number of nodes after which regular sync runs are
// spread out farther apart. The value should be a power of 2 since the
// scale function uses log2.
//
// When set to 128 nodes the delay between regular runs is doubled once the
// cluster grows beyond 128 nodes. The delay factor then increases by one
// each time the cluster size doubles again: it is 3 above 256 nodes, 4 above
// 512 nodes, and so forth, reaching 8 beyond 8192 nodes.
//
// If you update this, you may need to adjust the tuning of
// CoordinateUpdatePeriod and CoordinateUpdateMaxBatchSize.
const scaleThreshold = 128

// scaleFactor returns a factor by which the next sync run should be delayed to
// avoid saturation of the cluster. The larger the cluster grows the farther
// the sync runs should be spread apart.
//
// The current implementation uses a log2 scale: the delay factor grows by one
// every time the cluster doubles in size beyond the scale threshold.
func scaleFactor(nodes int) int {
	if nodes <= scaleThreshold {
		return 1
	}
	return int(math.Ceil(math.Log2(float64(nodes))-math.Log2(float64(scaleThreshold))) + 1.0)
}

// SyncState describes the interface for the local state that is synchronized
// with the remote Consul servers.
type SyncState interface {
	SyncChanges() error
	SyncFull() error
}

// StateSyncer manages background synchronization of the given state.
//
// The state is synchronized on a regular basis or on demand when either
// the state has changed or a new Consul server has joined the cluster.
//
// The regular state synchronization provides a self-healing mechanism
// for the cluster which is also called anti-entropy.
type StateSyncer struct {
	// State contains the data that needs to be synchronized.
	State SyncState

	// Interval is the time between two full sync runs.
	Interval time.Duration

	// ShutdownCh is closed when the application is shutting down.
	ShutdownCh chan struct{}

	// Logger is the logger.
	Logger *log.Logger

	// ClusterSize returns the number of members in the cluster to
	// allow staggering the sync runs based on cluster size.
	// This needs to be set before Run() is called.
	ClusterSize func() int

	// SyncFull allows triggering an immediate but staggered full sync
	// in a non-blocking way.
	SyncFull *Trigger

	// SyncChanges allows triggering an immediate partial sync
	// in a non-blocking way.
	SyncChanges *Trigger

	// paused stores whether sync runs are temporarily disabled.
	pauseLock sync.Mutex
	paused    int

	// serverUpInterval is the max time after which a full sync is
	// performed when a server has been added to the cluster.
	serverUpInterval time.Duration

	// retryFailInterval is the time after which a failed full sync is retried.
	retryFailInterval time.Duration

	// stagger randomly picks a duration between 0s and the given duration.
	stagger func(time.Duration) time.Duration

	// retrySyncFullEvent generates an event based on multiple conditions
	// when the state machine is trying to retry a full state sync.
	retrySyncFullEvent func() event

	// syncChangesEvent generates an event based on multiple conditions
	// when the state machine is performing partial state syncs.
	syncChangesEvent func() event
}

const (
	// serverUpIntv is the max time to wait before a sync is triggered
	// when a consul server has been added to the cluster.
	serverUpIntv = 3 * time.Second

	// retryFailIntv is the min time to wait before a failed sync is retried.
	retryFailIntv = 15 * time.Second
)

func NewStateSyncer(state SyncState, intv time.Duration, shutdownCh chan struct{}, logger *log.Logger) *StateSyncer {
	s := &StateSyncer{
		State:             state,
		Interval:          intv,
		ShutdownCh:        shutdownCh,
		Logger:            logger,
		SyncFull:          NewTrigger(),
		SyncChanges:       NewTrigger(),
		serverUpInterval:  serverUpIntv,
		retryFailInterval: retryFailIntv,
	}

	// retain these methods as member variables so that
	// we can mock them for testing.
	s.retrySyncFullEvent = s.retrySyncFullEventFn
	s.syncChangesEvent = s.syncChangesEventFn
	s.stagger = s.staggerFn

	return s
}

// fsmState defines states for the state machine.
type fsmState string

const (
	doneState          fsmState = "done"
	fullSyncState      fsmState = "fullSync"
	partialSyncState   fsmState = "partialSync"
	retryFullSyncState fsmState = "retryFullSync"
)

// Run is the long running method to perform state synchronization
// between local and remote servers.
func (s *StateSyncer) Run() {
	if s.ClusterSize == nil {
		panic("ClusterSize not set")
	}
	s.runFSM(fullSyncState, s.nextFSMState)
}

// runFSM runs the state machine.
func (s *StateSyncer) runFSM(fs fsmState, next func(fsmState) fsmState) {
	for {
		if fs = next(fs); fs == doneState {
			return
		}
	}
}

// nextFSMState determines the next state based on the current state.
func (s *StateSyncer) nextFSMState(fs fsmState) fsmState {
	switch fs {
	case fullSyncState:
		if s.Paused() {
			return retryFullSyncState
		}

		err := s.State.SyncFull()
		if err != nil {
			s.Logger.Printf("[ERR] agent: failed to sync remote state: %v", err)
			return retryFullSyncState
		}

		return partialSyncState

	case retryFullSyncState:
		e := s.retrySyncFullEvent()
		switch e {
		case syncFullNotifEvent, syncFullTimerEvent:
			return fullSyncState
		case shutdownEvent:
			return doneState
		default:
			panic(fmt.Sprintf("invalid event: %s", e))
		}

	case partialSyncState:
		e := s.syncChangesEvent()
		switch e {
		case syncFullNotifEvent, syncFullTimerEvent:
			return fullSyncState

		case syncChangesNotifEvent:
			if s.Paused() {
				return partialSyncState
			}

			err := s.State.SyncChanges()
			if err != nil {
				s.Logger.Printf("[ERR] agent: failed to sync changes: %v", err)
			}
			return partialSyncState

		case shutdownEvent:
			return doneState

		default:
			panic(fmt.Sprintf("invalid event: %s", e))
		}

	default:
		panic(fmt.Sprintf("invalid state: %s", fs))
	}
}

// event defines a timing or notification event from multiple timers and
// channels.
type event string

const (
	shutdownEvent         event = "shutdown"
	syncFullNotifEvent    event = "syncFullNotif"
	syncFullTimerEvent    event = "syncFullTimer"
	syncChangesNotifEvent event = "syncChangesNotif"
)

// retrySyncFullEventFn waits for an event which triggers a retry
// of a full sync or a termination signal.
// This function should not be called directly but through
// s.retrySyncFullEvent to allow mocking for testing.
func (s *StateSyncer) retrySyncFullEventFn() event {
	select {
	// trigger a full sync immediately.
	// this is usually called when a consul server was added to the cluster.
	// stagger the delay to avoid a thundering herd.
	case <-s.SyncFull.Notif():
		select {
		case <-time.After(s.stagger(s.serverUpInterval)):
			return syncFullNotifEvent
		case <-s.ShutdownCh:
			return shutdownEvent
		}

	// retry full sync after some time
	// todo(fs): why don't we use s.Interval here?
	case <-time.After(s.retryFailInterval + s.stagger(s.retryFailInterval)):
		return syncFullTimerEvent

	case <-s.ShutdownCh:
		return shutdownEvent
	}
}

// syncChangesEventFn waits for an event which triggers either a full
// or a partial sync, or for a termination signal. This function should not
// be called directly but through s.syncChangesEvent to allow mocking
// for testing.
func (s *StateSyncer) syncChangesEventFn() event {
	select {
	// trigger a full sync immediately
	// this is usually called when a consul server was added to the cluster.
	// stagger the delay to avoid a thundering herd.
	case <-s.SyncFull.Notif():
		select {
		case <-time.After(s.stagger(s.serverUpInterval)):
			return syncFullNotifEvent
		case <-s.ShutdownCh:
			return shutdownEvent
		}

	// time for a full sync again
	case <-time.After(s.Interval + s.stagger(s.Interval)):
		return syncFullTimerEvent

	// do partial syncs on demand
	case <-s.SyncChanges.Notif():
		return syncChangesNotifEvent

	case <-s.ShutdownCh:
		return shutdownEvent
	}
}

// libRandomStagger is stubbed out for testing.
var libRandomStagger = lib.RandomStagger

// staggerFn returns a random duration which depends on the cluster size
// and a random factor, which should spread cluster wide events out over
// time. This function should not be called directly but through s.stagger
// to allow mocking for testing.
func (s *StateSyncer) staggerFn(d time.Duration) time.Duration {
	f := scaleFactor(s.ClusterSize())
	return libRandomStagger(time.Duration(f) * d)
}

// Pause temporarily disables sync runs.
func (s *StateSyncer) Pause() {
	s.pauseLock.Lock()
	s.paused++
	s.pauseLock.Unlock()
}

// Paused returns whether sync runs are temporarily disabled.
func (s *StateSyncer) Paused() bool {
	s.pauseLock.Lock()
	defer s.pauseLock.Unlock()
	return s.paused != 0
}

// Resume re-enables sync runs. It returns true if it was the last pause/resume
// pair on the stack and so actually caused the state syncer to resume.
func (s *StateSyncer) Resume() bool {
	s.pauseLock.Lock()
	s.paused--
	if s.paused < 0 {
		panic("unbalanced pause/resume")
	}
	trigger := s.paused == 0
	s.pauseLock.Unlock()
	if trigger {
		s.SyncChanges.Trigger()
	}
	return trigger
}
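
As a worked illustration of the scaleFactor math documented above, the small standalone program below (not part of the package; the sampled cluster sizes are arbitrary) copies the function and prints the resulting delay factors.

package main

import (
	"fmt"
	"math"
)

// scaleFactor mirrors the function in ae.go: factor 1 up to the 128-node
// threshold, then one more for every further doubling of the cluster size.
func scaleFactor(nodes int) int {
	const scaleThreshold = 128
	if nodes <= scaleThreshold {
		return 1
	}
	return int(math.Ceil(math.Log2(float64(nodes))-math.Log2(float64(scaleThreshold))) + 1.0)
}

func main() {
	// e.g. 129 nodes -> 2, 257 nodes -> 3, 8193 nodes -> 8
	for _, n := range []int{50, 128, 129, 256, 257, 1024, 8193} {
		fmt.Printf("%5d nodes -> delay factor %d\n", n, scaleFactor(n))
	}
}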
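
The pieces above (NewStateSyncer, Run, the SyncFull/SyncChanges triggers, Pause/Resume and ShutdownCh) are wired together by the Consul agent itself. The sketch below is a minimal, hypothetical illustration of that wiring, assuming the import path github.com/hashicorp/consul/agent/ae, an invented exampleState implementation of SyncState, and arbitrary interval and cluster-size values; the real agent configures these differently.

package main

import (
	"log"
	"os"
	"time"

	"github.com/hashicorp/consul/agent/ae"
)

// exampleState is a hypothetical SyncState implementation used only to
// show how the syncer drives SyncFull and SyncChanges.
type exampleState struct{ logger *log.Logger }

func (e *exampleState) SyncFull() error {
	e.logger.Println("full sync")
	return nil
}

func (e *exampleState) SyncChanges() error {
	e.logger.Println("partial sync")
	return nil
}

func main() {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	shutdownCh := make(chan struct{})

	s := ae.NewStateSyncer(&exampleState{logger}, time.Minute, shutdownCh, logger)
	s.ClusterSize = func() int { return 3 } // must be set before Run()

	go s.Run()

	// Pause/Resume bracket local updates so partial syncs do not run against
	// a half-written local state; the final Resume triggers SyncChanges.
	s.Pause()
	// ... apply local changes ...
	s.Resume()

	// A Consul server joining the cluster would normally trigger a staggered
	// full sync like this.
	s.SyncFull.Trigger()

	time.Sleep(5 * time.Second)
	close(shutdownCh) // Run() returns once the shutdown channel is closed
}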