github.com/yasker/longhorn-engine@v0.0.0-20160621014712-6ed6cfca0729/agent/controller/controller.go (about)

     1  package controller
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"net"
     7  	"net/http"
     8  	"os"
     9  	"os/exec"
    10  	"time"
    11  
    12  	"github.com/Sirupsen/logrus"
    13  
    14  	"github.com/rancher/go-rancher-metadata/metadata"
    15  	lclient "github.com/rancher/longhorn/controller/client"
    16  	"github.com/rancher/longhorn/controller/rest"
    17  	replicaClient "github.com/rancher/longhorn/replica/client"
    18  )
    19  
    20  const (
    21  	defaultVolumeSize = "10737418240" // 10 gb
    22  	MetadataURL       = "http://rancher-metadata/2015-12-19"
    23  	errorRetryMax     = 1
    24  )
    25  
    26  type replica struct {
    27  	client      *replicaClient.ReplicaClient
    28  	host        string
    29  	port        int
    30  	healthState string
    31  	size        string
    32  }
    33  
    34  func ReplicaAddress(host string, port int) string {
    35  	return fmt.Sprintf("tcp://%s:%d", host, port)
    36  }
    37  
    38  type Controller struct {
    39  	client       *lclient.ControllerClient
    40  	errorRetries map[string]int
    41  }
    42  
    43  func New() *Controller {
    44  	client := lclient.NewControllerClient("http://localhost:9501")
    45  	return &Controller{
    46  		client:       client,
    47  		errorRetries: map[string]int{},
    48  	}
    49  }
    50  
    51  func (c *Controller) Close() error {
    52  	logrus.Infof("Shutting down Longhorn.")
    53  	return nil
    54  }
    55  
    56  func (c *Controller) Start() error {
    57  	logrus.Infof("Starting Longhorn.")
    58  
    59  	volume, err := c.client.GetVolume()
    60  	if err != nil {
    61  		return fmt.Errorf("Error while getting volume: %v", err)
    62  	}
    63  
    64  	if volume.ReplicaCount == 0 {
    65  		if err = c.getReplicasAndStart(); err != nil {
    66  			return err
    67  		}
    68  	} else {
    69  		logrus.Infof("Volume is started with %v replicas.", volume.ReplicaCount)
    70  	}
    71  
    72  	return c.refresh()
    73  }
    74  
    75  func (c *Controller) getReplicasAndStart() error {
    76  	var replicaMetadata map[string]*replica
    77  	var scale int
    78  	for {
    79  		var err error
    80  		if scale, replicaMetadata, err = c.replicaMetadataAndClient(); err != nil {
    81  			return err
    82  		} else if len(replicaMetadata) < scale {
    83  			logrus.Infof("Waiting for replicas. Current %v, expected: %v", len(replicaMetadata), scale)
    84  			time.Sleep(1 * time.Second)
    85  		} else {
    86  			break
    87  		}
    88  	}
    89  
    90  	initializingReplicas := map[string]*replica{}
    91  	closedCleanReplicas := map[string]*replica{}
    92  	closedDirtyReplicas := map[string]*replica{}
    93  	openCleanReplicas := map[string]*replica{}
    94  	openDirtyReplicas := map[string]*replica{}
    95  	rebuildingClosedReplicas := map[string]*replica{}
    96  	rebuildingOpenReplicas := map[string]*replica{}
    97  	otherReplicas := map[string]*replica{}
    98  
    99  	for address, replicaMd := range replicaMetadata {
   100  		replica, err := replicaMd.client.GetReplica()
   101  		if err != nil {
   102  			logrus.Errorf("Error getting replica %v. Removing from list of start replcias. Error: %v", address, err)
   103  			continue
   104  		}
   105  
   106  		if replica.State == "initial" {
   107  			initializingReplicas[address] = replicaMd
   108  
   109  		} else if replica.Rebuilding && replica.State == "closed" {
   110  			rebuildingClosedReplicas[address] = replicaMd
   111  
   112  		} else if replica.Rebuilding {
   113  			rebuildingOpenReplicas[address] = replicaMd
   114  
   115  		} else if replica.State == "closed" && replica.Dirty {
   116  			closedDirtyReplicas[address] = replicaMd
   117  
   118  		} else if replica.State == "closed" {
   119  			closedCleanReplicas[address] = replicaMd
   120  
   121  		} else if replica.State == "open" {
   122  			openCleanReplicas[address] = replicaMd
   123  
   124  		} else if replica.State == "dirty" {
   125  			openDirtyReplicas[address] = replicaMd
   126  
   127  		} else {
   128  			otherReplicas[address] = replicaMd
   129  
   130  		}
   131  	}
   132  	logrus.Infof("Initializing replicas: %v", initializingReplicas)
   133  	logrus.Infof("Closed and clean replicas: %v", closedCleanReplicas)
   134  	logrus.Infof("Closed and dirty replicas: %v", closedDirtyReplicas)
   135  	logrus.Infof("Open and dirty replicas: %v", openDirtyReplicas)
   136  	logrus.Infof("Open and clean replicas: %v", openCleanReplicas)
   137  	logrus.Infof("Rebuilding and closed replicas: %v", rebuildingClosedReplicas)
   138  	logrus.Infof("Rebuilding and open replicas: %v", rebuildingOpenReplicas)
   139  	logrus.Infof("Other replicas (likely in error state)L %v", otherReplicas)
   140  
   141  	// Closed and clean. Start with all replicas.
   142  	attemptedStart, err := c.startWithAll(closedCleanReplicas, false)
   143  	if attemptedStart {
   144  		return err
   145  	}
   146  
   147  	// Closed and dirty. Start with one.
   148  	attemptedStart, err = c.startWithOne(closedDirtyReplicas, false)
   149  	if attemptedStart {
   150  		return err
   151  	}
   152  
   153  	// Open and dirty. Close and start with one.
   154  	attemptedStart, err = c.startWithOne(openDirtyReplicas, true)
   155  	if attemptedStart {
   156  		return err
   157  	}
   158  
   159  	// Open and clean. Close and start with one (because they could become dirty before we close).
   160  	attemptedStart, err = c.startWithOne(openCleanReplicas, true)
   161  	if attemptedStart {
   162  		return err
   163  	}
   164  
   165  	// Rebuilding and closed. Start with one.
   166  	attemptedStart, err = c.startWithOne(rebuildingClosedReplicas, false)
   167  	if attemptedStart {
   168  		return err
   169  	}
   170  
   171  	// Rebuilding and open. Close and start with one.
   172  	attemptedStart, err = c.startWithOne(rebuildingOpenReplicas, true)
   173  	if attemptedStart {
   174  		return err
   175  	}
   176  
   177  	// Initial. Start with all
   178  	attemptedStart, err = c.startWithAll(initializingReplicas, true)
   179  	if attemptedStart {
   180  		return err
   181  	}
   182  
   183  	return fmt.Errorf("Couldn't find any valid replicas to start with. Original replicas from metadata: %v", replicaMetadata)
   184  }
   185  
   186  func (c *Controller) startWithAll(replicas map[string]*replica, create bool) (bool, error) {
   187  	addresses := []string{}
   188  	for address, replica := range replicas {
   189  		if create {
   190  			logrus.Infof("Create replica %v", address)
   191  			if err := replica.client.Create(replica.size); err != nil {
   192  				logrus.Errorf("Error creating replica %v: %v. It won't be used to start controller.", address, err)
   193  				continue
   194  			}
   195  		}
   196  		addresses = append(addresses, address)
   197  	}
   198  	if len(addresses) > 0 {
   199  		logrus.Infof("Starting controller with replicas: %v.", addresses)
   200  		return true, c.client.Start(addresses...)
   201  	}
   202  	return false, nil
   203  }
   204  
   205  // Start the controller with a single replica from the provided map. If the map is bigger than one, will try with each replica.
   206  // Return bool indicates if the controller attempted to start.
   207  func (c *Controller) startWithOne(replicas map[string]*replica, close bool) (bool, error) {
   208  	returnErrors := []error{}
   209  	for addr, replica := range replicas {
   210  		if close {
   211  			logrus.Infof("Closing replica %v", addr)
   212  			if err := replica.client.Close(); err != nil {
   213  				logrus.Errorf("Error closing replica %v: %v. It won't be used to start controller.", addr, err)
   214  				continue
   215  			}
   216  		}
   217  
   218  		logrus.Infof("Starting controller with replica: %v.", addr)
   219  		if err := c.client.Start(addr); err != nil {
   220  			returnErrors = append(returnErrors, fmt.Errorf("%v: %v", addr, err))
   221  		} else {
   222  			return true, nil
   223  		}
   224  	}
   225  
   226  	var err error
   227  	if len(returnErrors) > 0 {
   228  		err = fmt.Errorf("Enountered %v errors trying to start controller. Errors: %v", len(returnErrors), returnErrors)
   229  	}
   230  	return err != nil, err
   231  }
   232  
   233  func (c *Controller) refresh() error {
   234  	for {
   235  		if err := c.syncReplicas(); err != nil {
   236  			logrus.Errorf("Failed to sync replicas: %v", err)
   237  		}
   238  		time.Sleep(5 * time.Second)
   239  	}
   240  }
   241  
   242  func (c *Controller) syncReplicas() (retErr error) {
   243  	logrus.Debugf("Syncing replicas.")
   244  
   245  	// Remove replicas from controller if they aren't in metadata
   246  	_, fromMetadata, err := c.replicaMetadataAndClient()
   247  	if err != nil {
   248  		return fmt.Errorf("Error listing replicas in metadata: %v", err)
   249  	}
   250  	if err := c.removeReplicasNotInMetadata(fromMetadata); err != nil {
   251  		return err
   252  	}
   253  
   254  	// Retry replicas in error state
   255  	if err := c.retryErroredReplicas(); err != nil {
   256  		return err
   257  	}
   258  
   259  	// Add new replicas
   260  	return c.addReplicasInMetadata()
   261  }
   262  
   263  func (c *Controller) removeReplicasNotInMetadata(fromMetadata map[string]*replica) error {
   264  	replicasInController, err := c.client.ListReplicas()
   265  	if err != nil {
   266  		return fmt.Errorf("Error listing replicas in controller during remove: %v", err)
   267  	}
   268  	fromController := map[string]rest.Replica{}
   269  	for _, r := range replicasInController {
   270  		fromController[r.Address] = r
   271  	}
   272  
   273  	if len(fromController) > 1 {
   274  		for address := range fromController {
   275  			if _, ok := fromMetadata[address]; !ok {
   276  				logrus.Infof("Replica %v not in metadata. Removing it.", address)
   277  				if _, err := c.client.DeleteReplica(address); err != nil {
   278  					return fmt.Errorf("Error removing replica %v: %v", address, err)
   279  				}
   280  				return c.removeReplicasNotInMetadata(fromMetadata)
   281  			}
   282  		}
   283  	}
   284  
   285  	return nil
   286  }
   287  
   288  func (c *Controller) retryErroredReplicas() error {
   289  	_, fromMetadata, err := c.replicaMetadataAndClient()
   290  	if err != nil {
   291  		return fmt.Errorf("Error listing replicas in metadata during retry: %v", err)
   292  	}
   293  
   294  	replicasInController, err := c.client.ListReplicas()
   295  	if err != nil {
   296  		return fmt.Errorf("Error listing replicas in controller during retry: %v", err)
   297  	}
   298  
   299  	for _, r := range replicasInController {
   300  		if r.Mode != "ERR" {
   301  			continue
   302  		}
   303  
   304  		if retryCount, ok := c.errorRetries[r.Address]; ok && retryCount >= errorRetryMax {
   305  			logrus.Infof("Reached max retry count for replica %v. Ignoring it so that replica helthcheck failure destroys it.", r.Address)
   306  		} else {
   307  			logrus.Infof("Retrying errored replica %v", r.Address)
   308  			c.errorRetries[r.Address] = retryCount + 1
   309  			replicaMD, ok := fromMetadata[r.Address]
   310  			if !ok {
   311  				logrus.Warnf("Cannot find errored replica %v in metadata. Won't attempt to re-add it.", r.Actions)
   312  			} else if err := c.removeAndAdd(r, replicaMD); err != nil {
   313  				return fmt.Errorf("Error performing remove and add for replica %v: %v", r.Address, err)
   314  			} else {
   315  				// remove and add was successful
   316  				delete(c.errorRetries, r.Address)
   317  			}
   318  		}
   319  	}
   320  
   321  	// Cleanup error retires map
   322  	for address := range c.errorRetries {
   323  		if _, ok := fromMetadata[address]; !ok {
   324  			delete(c.errorRetries, address)
   325  		}
   326  	}
   327  
   328  	return nil
   329  }
   330  
   331  func (c *Controller) removeAndAdd(replica rest.Replica, replicaMD *replica) error {
   332  	logrus.Infof("Removing errored replica %v for re-add.", replica.Address)
   333  	if _, err := c.client.DeleteReplica(replica.Address); err != nil {
   334  		return fmt.Errorf("Error removing errored replica %v: %v.", replica.Address, err)
   335  	}
   336  
   337  	freshReplica, err := replicaMD.client.GetReplica()
   338  	if err != nil {
   339  		return fmt.Errorf("Error getting replica %v during removeAndAdd: %v.", replica.Address, err)
   340  	}
   341  
   342  	if _, ok := freshReplica.Actions["close"]; ok {
   343  		err := replicaMD.client.Close()
   344  		if err != nil {
   345  			return fmt.Errorf("Error closing replica %v before adding: %v.", replica.Address, err)
   346  		}
   347  	}
   348  
   349  	return c.addReplica(replicaMD)
   350  }
   351  
   352  func (c *Controller) addReplicasInMetadata() error {
   353  	_, fromMetadata, err := c.replicaMetadataAndClient()
   354  	if err != nil {
   355  		return fmt.Errorf("Error listing replicas in metadata during add: %v", err)
   356  	}
   357  
   358  	replicasInController, err := c.client.ListReplicas()
   359  	if err != nil {
   360  		return fmt.Errorf("Error listing replicas in controller during add: %v", err)
   361  	}
   362  
   363  	fromController := map[string]rest.Replica{}
   364  	for _, r := range replicasInController {
   365  		fromController[r.Address] = r
   366  	}
   367  
   368  	for address, r := range fromMetadata {
   369  		if _, ok := fromController[address]; !ok {
   370  			logrus.Infof("Adding replica %v because it isn't in controller.", address)
   371  			if err := c.addReplica(r); err != nil {
   372  				return fmt.Errorf("Error adding replica %v: %v", address, err)
   373  			}
   374  		}
   375  	}
   376  
   377  	return nil
   378  }
   379  
   380  func (c *Controller) testSyncAgent(host string) error {
   381  	address := fmt.Sprintf("%v:%v", host, 9504)
   382  	conn, err := net.DialTimeout("tcp", address, time.Second*10)
   383  	if err != nil {
   384  		return err
   385  	}
   386  	conn.Close()
   387  	return nil
   388  }
   389  
   390  func (c *Controller) addReplica(r *replica) error {
   391  	replica, err := r.client.GetReplica()
   392  	if err != nil {
   393  		return fmt.Errorf("Error getting replica %v before adding: %v", r.host, err)
   394  	}
   395  
   396  	// Ensure sync-agent is up and running
   397  	if err := c.testSyncAgent(r.host); err != nil {
   398  		return fmt.Errorf("Error while testing sync agent connection: %v", err)
   399  	}
   400  
   401  	if _, ok := replica.Actions["create"]; ok {
   402  		err := r.client.Create(r.size)
   403  		if err != nil {
   404  			return fmt.Errorf("Error opening replica %v before adding: %v", r.host, err)
   405  		}
   406  	} else if _, ok := replica.Actions["close"]; ok {
   407  		err := r.client.Close()
   408  		if err != nil {
   409  			return fmt.Errorf("Error closing replica %v before adding: %v", r.host, err)
   410  		}
   411  	}
   412  
   413  	address := ReplicaAddress(r.host, r.port)
   414  	logrus.Infof("Calling longhorn add cli for replica %v.", address)
   415  	cmd := exec.Command("longhorn", "add", address)
   416  	cmd.Stderr = os.Stderr
   417  	cmd.Stdout = os.Stdout
   418  
   419  	if err := cmd.Run(); err != nil {
   420  		logrus.Warnf("longhorn add cli returned error %v while adding replica %v. Attempting to clean up.", err, address)
   421  		replicas, err2 := c.client.ListReplicas()
   422  		if err2 != nil {
   423  			logrus.Errorf("Error listing replicas while trying to clean up after failed add for replica %v: %v", address, err2)
   424  		} else {
   425  			for _, replica := range replicas {
   426  				if replica.Address == address && replica.Mode != "RW" {
   427  					logrus.Infof("Removing replica %v after having failed to add it. Add failure: %v", address, err)
   428  					if _, err := c.client.DeleteReplica(address); err != nil {
   429  						logrus.Errorf("Error while deleting replica as part of cleanup: %v", err)
   430  					}
   431  				}
   432  			}
   433  		}
   434  		return fmt.Errorf("Error executing add command %v: %v", cmd, err)
   435  	}
   436  	return nil
   437  }
   438  
   439  func (c *Controller) replicaMetadataAndClient() (int, map[string]*replica, error) {
   440  	client, err := metadata.NewClientAndWait(MetadataURL)
   441  	if err != nil {
   442  		return 0, nil, err
   443  	}
   444  	service, err := client.GetSelfServiceByName("replica")
   445  	if err != nil {
   446  		return 0, nil, err
   447  	}
   448  
   449  	// Unmarshalling the metadata as json is forcing it to a bad format
   450  	resp, err := http.Get(MetadataURL + "/self/service/metadata/volume/volume_config/size")
   451  	if err != nil {
   452  		return 0, nil, err
   453  	}
   454  
   455  	size := ""
   456  	if resp.StatusCode == 200 {
   457  		body, err := ioutil.ReadAll(resp.Body)
   458  		if err != nil {
   459  			return 0, nil, err
   460  		}
   461  		size = string(body)
   462  	}
   463  
   464  	if size == "" {
   465  		size = defaultVolumeSize
   466  	}
   467  
   468  	containers := map[string]metadata.Container{}
   469  	for _, container := range service.Containers {
   470  		if c, ok := containers[container.Name]; !ok {
   471  			containers[container.Name] = container
   472  		} else if container.CreateIndex > c.CreateIndex {
   473  			containers[container.Name] = container
   474  		}
   475  	}
   476  
   477  	result := map[string]*replica{}
   478  	for _, container := range containers {
   479  		r := &replica{
   480  			healthState: container.HealthState,
   481  			host:        container.PrimaryIp,
   482  			port:        9502,
   483  			size:        size,
   484  		}
   485  
   486  		address := ReplicaAddress(r.host, r.port)
   487  		replicaClient, err := replicaClient.NewReplicaClient(address)
   488  		if err != nil {
   489  			return 0, nil, fmt.Errorf("Error getting client for replica %v: %v", address, err)
   490  		}
   491  		r.client = replicaClient
   492  		result[address] = r
   493  	}
   494  
   495  	return service.Scale, result, nil
   496  }