github.com/yasker/longhorn-engine@v0.0.0-20160621014712-6ed6cfca0729/agent/controller/controller.go (about) 1 package controller 2 3 import ( 4 "fmt" 5 "io/ioutil" 6 "net" 7 "net/http" 8 "os" 9 "os/exec" 10 "time" 11 12 "github.com/Sirupsen/logrus" 13 14 "github.com/rancher/go-rancher-metadata/metadata" 15 lclient "github.com/rancher/longhorn/controller/client" 16 "github.com/rancher/longhorn/controller/rest" 17 replicaClient "github.com/rancher/longhorn/replica/client" 18 ) 19 20 const ( 21 defaultVolumeSize = "10737418240" // 10 gb 22 MetadataURL = "http://rancher-metadata/2015-12-19" 23 errorRetryMax = 1 24 ) 25 26 type replica struct { 27 client *replicaClient.ReplicaClient 28 host string 29 port int 30 healthState string 31 size string 32 } 33 34 func ReplicaAddress(host string, port int) string { 35 return fmt.Sprintf("tcp://%s:%d", host, port) 36 } 37 38 type Controller struct { 39 client *lclient.ControllerClient 40 errorRetries map[string]int 41 } 42 43 func New() *Controller { 44 client := lclient.NewControllerClient("http://localhost:9501") 45 return &Controller{ 46 client: client, 47 errorRetries: map[string]int{}, 48 } 49 } 50 51 func (c *Controller) Close() error { 52 logrus.Infof("Shutting down Longhorn.") 53 return nil 54 } 55 56 func (c *Controller) Start() error { 57 logrus.Infof("Starting Longhorn.") 58 59 volume, err := c.client.GetVolume() 60 if err != nil { 61 return fmt.Errorf("Error while getting volume: %v", err) 62 } 63 64 if volume.ReplicaCount == 0 { 65 if err = c.getReplicasAndStart(); err != nil { 66 return err 67 } 68 } else { 69 logrus.Infof("Volume is started with %v replicas.", volume.ReplicaCount) 70 } 71 72 return c.refresh() 73 } 74 75 func (c *Controller) getReplicasAndStart() error { 76 var replicaMetadata map[string]*replica 77 var scale int 78 for { 79 var err error 80 if scale, replicaMetadata, err = c.replicaMetadataAndClient(); err != nil { 81 return err 82 } else if len(replicaMetadata) < scale { 83 logrus.Infof("Waiting for replicas. Current %v, expected: %v", len(replicaMetadata), scale) 84 time.Sleep(1 * time.Second) 85 } else { 86 break 87 } 88 } 89 90 initializingReplicas := map[string]*replica{} 91 closedCleanReplicas := map[string]*replica{} 92 closedDirtyReplicas := map[string]*replica{} 93 openCleanReplicas := map[string]*replica{} 94 openDirtyReplicas := map[string]*replica{} 95 rebuildingClosedReplicas := map[string]*replica{} 96 rebuildingOpenReplicas := map[string]*replica{} 97 otherReplicas := map[string]*replica{} 98 99 for address, replicaMd := range replicaMetadata { 100 replica, err := replicaMd.client.GetReplica() 101 if err != nil { 102 logrus.Errorf("Error getting replica %v. Removing from list of start replcias. Error: %v", address, err) 103 continue 104 } 105 106 if replica.State == "initial" { 107 initializingReplicas[address] = replicaMd 108 109 } else if replica.Rebuilding && replica.State == "closed" { 110 rebuildingClosedReplicas[address] = replicaMd 111 112 } else if replica.Rebuilding { 113 rebuildingOpenReplicas[address] = replicaMd 114 115 } else if replica.State == "closed" && replica.Dirty { 116 closedDirtyReplicas[address] = replicaMd 117 118 } else if replica.State == "closed" { 119 closedCleanReplicas[address] = replicaMd 120 121 } else if replica.State == "open" { 122 openCleanReplicas[address] = replicaMd 123 124 } else if replica.State == "dirty" { 125 openDirtyReplicas[address] = replicaMd 126 127 } else { 128 otherReplicas[address] = replicaMd 129 130 } 131 } 132 logrus.Infof("Initializing replicas: %v", initializingReplicas) 133 logrus.Infof("Closed and clean replicas: %v", closedCleanReplicas) 134 logrus.Infof("Closed and dirty replicas: %v", closedDirtyReplicas) 135 logrus.Infof("Open and dirty replicas: %v", openDirtyReplicas) 136 logrus.Infof("Open and clean replicas: %v", openCleanReplicas) 137 logrus.Infof("Rebuilding and closed replicas: %v", rebuildingClosedReplicas) 138 logrus.Infof("Rebuilding and open replicas: %v", rebuildingOpenReplicas) 139 logrus.Infof("Other replicas (likely in error state)L %v", otherReplicas) 140 141 // Closed and clean. Start with all replicas. 142 attemptedStart, err := c.startWithAll(closedCleanReplicas, false) 143 if attemptedStart { 144 return err 145 } 146 147 // Closed and dirty. Start with one. 148 attemptedStart, err = c.startWithOne(closedDirtyReplicas, false) 149 if attemptedStart { 150 return err 151 } 152 153 // Open and dirty. Close and start with one. 154 attemptedStart, err = c.startWithOne(openDirtyReplicas, true) 155 if attemptedStart { 156 return err 157 } 158 159 // Open and clean. Close and start with one (because they could become dirty before we close). 160 attemptedStart, err = c.startWithOne(openCleanReplicas, true) 161 if attemptedStart { 162 return err 163 } 164 165 // Rebuilding and closed. Start with one. 166 attemptedStart, err = c.startWithOne(rebuildingClosedReplicas, false) 167 if attemptedStart { 168 return err 169 } 170 171 // Rebuilding and open. Close and start with one. 172 attemptedStart, err = c.startWithOne(rebuildingOpenReplicas, true) 173 if attemptedStart { 174 return err 175 } 176 177 // Initial. Start with all 178 attemptedStart, err = c.startWithAll(initializingReplicas, true) 179 if attemptedStart { 180 return err 181 } 182 183 return fmt.Errorf("Couldn't find any valid replicas to start with. Original replicas from metadata: %v", replicaMetadata) 184 } 185 186 func (c *Controller) startWithAll(replicas map[string]*replica, create bool) (bool, error) { 187 addresses := []string{} 188 for address, replica := range replicas { 189 if create { 190 logrus.Infof("Create replica %v", address) 191 if err := replica.client.Create(replica.size); err != nil { 192 logrus.Errorf("Error creating replica %v: %v. It won't be used to start controller.", address, err) 193 continue 194 } 195 } 196 addresses = append(addresses, address) 197 } 198 if len(addresses) > 0 { 199 logrus.Infof("Starting controller with replicas: %v.", addresses) 200 return true, c.client.Start(addresses...) 201 } 202 return false, nil 203 } 204 205 // Start the controller with a single replica from the provided map. If the map is bigger than one, will try with each replica. 206 // Return bool indicates if the controller attempted to start. 207 func (c *Controller) startWithOne(replicas map[string]*replica, close bool) (bool, error) { 208 returnErrors := []error{} 209 for addr, replica := range replicas { 210 if close { 211 logrus.Infof("Closing replica %v", addr) 212 if err := replica.client.Close(); err != nil { 213 logrus.Errorf("Error closing replica %v: %v. It won't be used to start controller.", addr, err) 214 continue 215 } 216 } 217 218 logrus.Infof("Starting controller with replica: %v.", addr) 219 if err := c.client.Start(addr); err != nil { 220 returnErrors = append(returnErrors, fmt.Errorf("%v: %v", addr, err)) 221 } else { 222 return true, nil 223 } 224 } 225 226 var err error 227 if len(returnErrors) > 0 { 228 err = fmt.Errorf("Enountered %v errors trying to start controller. Errors: %v", len(returnErrors), returnErrors) 229 } 230 return err != nil, err 231 } 232 233 func (c *Controller) refresh() error { 234 for { 235 if err := c.syncReplicas(); err != nil { 236 logrus.Errorf("Failed to sync replicas: %v", err) 237 } 238 time.Sleep(5 * time.Second) 239 } 240 } 241 242 func (c *Controller) syncReplicas() (retErr error) { 243 logrus.Debugf("Syncing replicas.") 244 245 // Remove replicas from controller if they aren't in metadata 246 _, fromMetadata, err := c.replicaMetadataAndClient() 247 if err != nil { 248 return fmt.Errorf("Error listing replicas in metadata: %v", err) 249 } 250 if err := c.removeReplicasNotInMetadata(fromMetadata); err != nil { 251 return err 252 } 253 254 // Retry replicas in error state 255 if err := c.retryErroredReplicas(); err != nil { 256 return err 257 } 258 259 // Add new replicas 260 return c.addReplicasInMetadata() 261 } 262 263 func (c *Controller) removeReplicasNotInMetadata(fromMetadata map[string]*replica) error { 264 replicasInController, err := c.client.ListReplicas() 265 if err != nil { 266 return fmt.Errorf("Error listing replicas in controller during remove: %v", err) 267 } 268 fromController := map[string]rest.Replica{} 269 for _, r := range replicasInController { 270 fromController[r.Address] = r 271 } 272 273 if len(fromController) > 1 { 274 for address := range fromController { 275 if _, ok := fromMetadata[address]; !ok { 276 logrus.Infof("Replica %v not in metadata. Removing it.", address) 277 if _, err := c.client.DeleteReplica(address); err != nil { 278 return fmt.Errorf("Error removing replica %v: %v", address, err) 279 } 280 return c.removeReplicasNotInMetadata(fromMetadata) 281 } 282 } 283 } 284 285 return nil 286 } 287 288 func (c *Controller) retryErroredReplicas() error { 289 _, fromMetadata, err := c.replicaMetadataAndClient() 290 if err != nil { 291 return fmt.Errorf("Error listing replicas in metadata during retry: %v", err) 292 } 293 294 replicasInController, err := c.client.ListReplicas() 295 if err != nil { 296 return fmt.Errorf("Error listing replicas in controller during retry: %v", err) 297 } 298 299 for _, r := range replicasInController { 300 if r.Mode != "ERR" { 301 continue 302 } 303 304 if retryCount, ok := c.errorRetries[r.Address]; ok && retryCount >= errorRetryMax { 305 logrus.Infof("Reached max retry count for replica %v. Ignoring it so that replica helthcheck failure destroys it.", r.Address) 306 } else { 307 logrus.Infof("Retrying errored replica %v", r.Address) 308 c.errorRetries[r.Address] = retryCount + 1 309 replicaMD, ok := fromMetadata[r.Address] 310 if !ok { 311 logrus.Warnf("Cannot find errored replica %v in metadata. Won't attempt to re-add it.", r.Actions) 312 } else if err := c.removeAndAdd(r, replicaMD); err != nil { 313 return fmt.Errorf("Error performing remove and add for replica %v: %v", r.Address, err) 314 } else { 315 // remove and add was successful 316 delete(c.errorRetries, r.Address) 317 } 318 } 319 } 320 321 // Cleanup error retires map 322 for address := range c.errorRetries { 323 if _, ok := fromMetadata[address]; !ok { 324 delete(c.errorRetries, address) 325 } 326 } 327 328 return nil 329 } 330 331 func (c *Controller) removeAndAdd(replica rest.Replica, replicaMD *replica) error { 332 logrus.Infof("Removing errored replica %v for re-add.", replica.Address) 333 if _, err := c.client.DeleteReplica(replica.Address); err != nil { 334 return fmt.Errorf("Error removing errored replica %v: %v.", replica.Address, err) 335 } 336 337 freshReplica, err := replicaMD.client.GetReplica() 338 if err != nil { 339 return fmt.Errorf("Error getting replica %v during removeAndAdd: %v.", replica.Address, err) 340 } 341 342 if _, ok := freshReplica.Actions["close"]; ok { 343 err := replicaMD.client.Close() 344 if err != nil { 345 return fmt.Errorf("Error closing replica %v before adding: %v.", replica.Address, err) 346 } 347 } 348 349 return c.addReplica(replicaMD) 350 } 351 352 func (c *Controller) addReplicasInMetadata() error { 353 _, fromMetadata, err := c.replicaMetadataAndClient() 354 if err != nil { 355 return fmt.Errorf("Error listing replicas in metadata during add: %v", err) 356 } 357 358 replicasInController, err := c.client.ListReplicas() 359 if err != nil { 360 return fmt.Errorf("Error listing replicas in controller during add: %v", err) 361 } 362 363 fromController := map[string]rest.Replica{} 364 for _, r := range replicasInController { 365 fromController[r.Address] = r 366 } 367 368 for address, r := range fromMetadata { 369 if _, ok := fromController[address]; !ok { 370 logrus.Infof("Adding replica %v because it isn't in controller.", address) 371 if err := c.addReplica(r); err != nil { 372 return fmt.Errorf("Error adding replica %v: %v", address, err) 373 } 374 } 375 } 376 377 return nil 378 } 379 380 func (c *Controller) testSyncAgent(host string) error { 381 address := fmt.Sprintf("%v:%v", host, 9504) 382 conn, err := net.DialTimeout("tcp", address, time.Second*10) 383 if err != nil { 384 return err 385 } 386 conn.Close() 387 return nil 388 } 389 390 func (c *Controller) addReplica(r *replica) error { 391 replica, err := r.client.GetReplica() 392 if err != nil { 393 return fmt.Errorf("Error getting replica %v before adding: %v", r.host, err) 394 } 395 396 // Ensure sync-agent is up and running 397 if err := c.testSyncAgent(r.host); err != nil { 398 return fmt.Errorf("Error while testing sync agent connection: %v", err) 399 } 400 401 if _, ok := replica.Actions["create"]; ok { 402 err := r.client.Create(r.size) 403 if err != nil { 404 return fmt.Errorf("Error opening replica %v before adding: %v", r.host, err) 405 } 406 } else if _, ok := replica.Actions["close"]; ok { 407 err := r.client.Close() 408 if err != nil { 409 return fmt.Errorf("Error closing replica %v before adding: %v", r.host, err) 410 } 411 } 412 413 address := ReplicaAddress(r.host, r.port) 414 logrus.Infof("Calling longhorn add cli for replica %v.", address) 415 cmd := exec.Command("longhorn", "add", address) 416 cmd.Stderr = os.Stderr 417 cmd.Stdout = os.Stdout 418 419 if err := cmd.Run(); err != nil { 420 logrus.Warnf("longhorn add cli returned error %v while adding replica %v. Attempting to clean up.", err, address) 421 replicas, err2 := c.client.ListReplicas() 422 if err2 != nil { 423 logrus.Errorf("Error listing replicas while trying to clean up after failed add for replica %v: %v", address, err2) 424 } else { 425 for _, replica := range replicas { 426 if replica.Address == address && replica.Mode != "RW" { 427 logrus.Infof("Removing replica %v after having failed to add it. Add failure: %v", address, err) 428 if _, err := c.client.DeleteReplica(address); err != nil { 429 logrus.Errorf("Error while deleting replica as part of cleanup: %v", err) 430 } 431 } 432 } 433 } 434 return fmt.Errorf("Error executing add command %v: %v", cmd, err) 435 } 436 return nil 437 } 438 439 func (c *Controller) replicaMetadataAndClient() (int, map[string]*replica, error) { 440 client, err := metadata.NewClientAndWait(MetadataURL) 441 if err != nil { 442 return 0, nil, err 443 } 444 service, err := client.GetSelfServiceByName("replica") 445 if err != nil { 446 return 0, nil, err 447 } 448 449 // Unmarshalling the metadata as json is forcing it to a bad format 450 resp, err := http.Get(MetadataURL + "/self/service/metadata/volume/volume_config/size") 451 if err != nil { 452 return 0, nil, err 453 } 454 455 size := "" 456 if resp.StatusCode == 200 { 457 body, err := ioutil.ReadAll(resp.Body) 458 if err != nil { 459 return 0, nil, err 460 } 461 size = string(body) 462 } 463 464 if size == "" { 465 size = defaultVolumeSize 466 } 467 468 containers := map[string]metadata.Container{} 469 for _, container := range service.Containers { 470 if c, ok := containers[container.Name]; !ok { 471 containers[container.Name] = container 472 } else if container.CreateIndex > c.CreateIndex { 473 containers[container.Name] = container 474 } 475 } 476 477 result := map[string]*replica{} 478 for _, container := range containers { 479 r := &replica{ 480 healthState: container.HealthState, 481 host: container.PrimaryIp, 482 port: 9502, 483 size: size, 484 } 485 486 address := ReplicaAddress(r.host, r.port) 487 replicaClient, err := replicaClient.NewReplicaClient(address) 488 if err != nil { 489 return 0, nil, fmt.Errorf("Error getting client for replica %v: %v", address, err) 490 } 491 r.client = replicaClient 492 result[address] = r 493 } 494 495 return service.Scale, result, nil 496 }