gonum.org/v1/gonum@v0.14.0/optimize/cmaes.go

// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"
	"sort"

	"golang.org/x/exp/rand"

	"gonum.org/v1/gonum/floats"
	"gonum.org/v1/gonum/mat"
	"gonum.org/v1/gonum/stat/distmv"
)

// TODO(btracey): If we ever implement the traditional CMA-ES algorithm, provide
// the base explanation there, and modify this description to just
// describe the differences.

// CmaEsChol implements the covariance matrix adaptation evolution strategy (CMA-ES)
// based on the Cholesky decomposition. The full algorithm is described in
//
//	Krause, Oswin, Dídac Rodríguez Arbonès, and Christian Igel. "CMA-ES with
//	optimal covariance update and storage complexity." Advances in Neural
//	Information Processing Systems. 2016.
//	https://papers.nips.cc/paper/6457-cma-es-with-optimal-covariance-update-and-storage-complexity.pdf
//
// CMA-ES is a global optimization method that progressively adapts a population
// of samples, combining techniques from local and global optimization.
// Specifically, the CMA-ES algorithm uses an initial multivariate normal
// distribution to generate a population of input locations. The input locations
// with the lowest function values are used to update the parameters of the normal
// distribution, a new set of input locations is generated, and this procedure
// is iterated until convergence. The initial sampling distribution has
// a mean specified by the initial x location, and a covariance specified by
// the InitCholesky field.
//
// As the normal distribution is progressively updated according to the best samples,
// the mean of the distribution can move in a gradient-descent-like fashion,
// with the covariance shrinking as the population converges.
// It is recommended that the algorithm be run multiple times (with different
// initial x locations) to have a better chance of finding the global minimum.
//
// The CMA-ES-Chol algorithm differs from the standard CMA-ES algorithm in that
// it directly updates the Cholesky decomposition of the normal distribution.
// This changes the runtime from O(dimension^3) to O(dimension^2*population).
// The evolution of the multivariate normal will be similar to the baseline
// CMA-ES algorithm, but the covariance update equation is not identical.
//
// For more information about the CMA-ES algorithm, see
//
//	https://en.wikipedia.org/wiki/CMA-ES
//	https://arxiv.org/pdf/1604.00772.pdf
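//
// A minimal usage sketch (the sphere objective below is only illustrative,
// not part of this package):
//
//	problem := optimize.Problem{
//		Func: func(x []float64) float64 {
//			var sum float64
//			for _, v := range x {
//				sum += v * v
//			}
//			return sum
//		},
//	}
//	initX := []float64{5, 5} // starting mean of the sampling distribution
//	result, err := optimize.Minimize(problem, initX, nil, &optimize.CmaEsChol{})
//	if err != nil {
//		// Handle the error. Note that a MethodConverge status is a
//		// successful stop, not an error.
//	}
//	fmt.Println(result.F, result.X)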
type CmaEsChol struct {
	// InitStepSize sets the initial size of the covariance matrix adaptation.
	// If InitStepSize is 0, a default value of 0.3 is used. InitStepSize cannot
	// be negative, or CmaEsChol will panic.
	InitStepSize float64
	// Population sets the population size for the algorithm. If Population is
	// 0, a default value of 4 + math.Floor(3*math.Log(float64(dim))) is used.
	// Population cannot be negative or CmaEsChol will panic.
	Population int
	// InitCholesky specifies the Cholesky decomposition of the covariance
	// matrix for the initial sampling distribution. If InitCholesky is nil,
	// a default value of I is used. If it is non-nil, then
	// InitCholesky.SymmetricDim() must equal the problem dimension.
	InitCholesky *mat.Cholesky
	// StopLogDet sets the threshold for stopping the optimization if the
	// distribution becomes too peaked. The log determinant is a measure of the
	// (log) "volume" of the normal distribution, and when it is too small
	// the samples are almost the same. If the log determinant of the covariance
	// matrix becomes less than StopLogDet, the optimization run is concluded.
	// If StopLogDet is 0, a default value of dim*log(1e-16) is used.
	// If StopLogDet is NaN, the stopping criterion is not used, though
	// this can cause numeric instabilities in the algorithm.
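	// For example, in two dimensions the default threshold is
	// 2*log(1e-16) ≈ -73.7, while an identity covariance has log
	// determinant 0.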
	StopLogDet float64
	// ForgetBest, when true, does not track the best overall function value found,
	// instead returning the best sample of each iteration. If ForgetBest
	// is false, then the minimum value returned will be the lowest across all
	// iterations, regardless of when that sample was generated.
	ForgetBest bool
	// Src allows a random number generator to be supplied for generating samples.
	// If Src is nil the generator in golang.org/x/exp/rand is used.
	Src rand.Source

	// Fixed algorithm parameters.
	dim                 int
	pop                 int
	weights             []float64
	muEff               float64
	cc, cs, c1, cmu, ds float64
	eChi                float64

	// Function data.
	xs *mat.Dense
	fs []float64

	// Adaptive algorithm parameters.
	invSigma float64 // inverse of the sigma parameter
	pc, ps   []float64
	mean     []float64
	chol     mat.Cholesky

	// Overall best.
	bestX []float64
	bestF float64

	// Synchronization.
	sentIdx     int
	receivedIdx int
	operation   chan<- Task
	updateErr   error
}

var (
	_ Statuser = (*CmaEsChol)(nil)
	_ Method   = (*CmaEsChol)(nil)
)

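// methodConverged reports whether the sampling distribution has become so
// peaked that the run should stop, according to the StopLogDet criterion.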
func (cma *CmaEsChol) methodConverged() Status {
	sd := cma.StopLogDet
	switch {
	case math.IsNaN(sd):
		return NotTerminated
	case sd == 0:
		sd = float64(cma.dim) * -36.8413614879 // ln(1e-16)
	}
	if cma.chol.LogDet() < sd {
		return MethodConverge
	}
	return NotTerminated
}

// Status returns the status of the method.
func (cma *CmaEsChol) Status() (Status, error) {
	if cma.updateErr != nil {
		return Failure, cma.updateErr
	}
	return cma.methodConverged(), nil
}

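// Uses reports that CmaEsChol requires only objective function evaluations;
// it does not use gradient or Hessian information.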
func (*CmaEsChol) Uses(has Available) (uses Available, err error) {
	return has.function()
}

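// Init initializes the method for a problem with the given dimension and
// returns the number of concurrent tasks that will be used, which is at most
// min(tasks, population size).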
func (cma *CmaEsChol) Init(dim, tasks int) int {
	if dim <= 0 {
		panic(nonpositiveDimension)
	}
	if tasks < 0 {
		panic(negativeTasks)
	}

	// Set fixed algorithm parameters.
	// Parameter values are from https://arxiv.org/pdf/1604.00772.pdf .
	cma.dim = dim
	cma.pop = cma.Population
	n := float64(dim)
	if cma.pop == 0 {
		cma.pop = 4 + int(3*math.Log(n)) // Note the implicit floor.
	} else if cma.pop < 0 {
		panic("cma-es-chol: negative population size")
	}
	mu := cma.pop / 2
	cma.weights = resize(cma.weights, mu)
	for i := range cma.weights {
		v := math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
		cma.weights[i] = v
	}
	floats.Scale(1/floats.Sum(cma.weights), cma.weights)
	cma.muEff = 0
	for _, v := range cma.weights {
		cma.muEff += v * v
	}
	cma.muEff = 1 / cma.muEff
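	// For example, with pop = 6 (mu = 3) the unnormalized weights are
	// ln(3.5)-ln(1) ≈ 1.25, ln(3.5)-ln(2) ≈ 0.56, and ln(3.5)-ln(3) ≈ 0.15,
	// so better-ranked samples receive larger weight. The "effective mu",
	// muEff = 1/sum(w_i^2) for the normalized weights, always lies in [1, mu].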

	cma.cc = (4 + cma.muEff/n) / (n + 4 + 2*cma.muEff/n)
	cma.cs = (cma.muEff + 2) / (n + cma.muEff + 5)
	cma.c1 = 2 / ((n+1.3)*(n+1.3) + cma.muEff)
	cma.cmu = math.Min(1-cma.c1, 2*(cma.muEff-2+1/cma.muEff)/((n+2)*(n+2)+cma.muEff))
	cma.ds = 1 + 2*math.Max(0, math.Sqrt((cma.muEff-1)/(n+1))-1) + cma.cs
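	// Here cc and cs are the decay rates for the covariance and step-size
	// evolution paths, c1 and cmu are the learning rates for the rank-one
	// and rank-mu covariance updates, and ds damps the step-size adaptation.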
	// E[chi] is taken from https://en.wikipedia.org/wiki/CMA-ES (there
	// listed as E[||N(0,I)||]).
	cma.eChi = math.Sqrt(n) * (1 - 1.0/(4*n) + 1/(21*n*n))

	// Allocate memory for function data.
	cma.xs = mat.NewDense(cma.pop, dim, nil)
	cma.fs = resize(cma.fs, cma.pop)

	// Allocate and initialize adaptive parameters.
	cma.invSigma = 1 / cma.InitStepSize
	if cma.InitStepSize == 0 {
		cma.invSigma = 10.0 / 3 // 1/0.3, the default initial step size.
	} else if cma.InitStepSize < 0 {
		panic("cma-es-chol: negative initial step size")
	}
	cma.pc = resize(cma.pc, dim)
	for i := range cma.pc {
		cma.pc[i] = 0
	}
	cma.ps = resize(cma.ps, dim)
	for i := range cma.ps {
		cma.ps[i] = 0
	}
	cma.mean = resize(cma.mean, dim) // mean location initialized at the start of Run

	if cma.InitCholesky != nil {
		if cma.InitCholesky.SymmetricDim() != dim {
			panic("cma-es-chol: incorrect InitCholesky size")
		}
		cma.chol.Clone(cma.InitCholesky)
	} else {
		// Set the initial Cholesky to I.
		b := mat.NewDiagDense(dim, nil)
		for i := 0; i < dim; i++ {
			b.SetDiag(i, 1)
		}
		var chol mat.Cholesky
		ok := chol.Factorize(b)
		if !ok {
			panic("cma-es-chol: bad cholesky. shouldn't happen")
		}
		cma.chol = chol
	}

	cma.bestX = resize(cma.bestX, dim)
	cma.bestF = math.Inf(1)

	cma.sentIdx = 0
	cma.receivedIdx = 0
	cma.operation = nil
	cma.updateErr = nil
	t := min(tasks, cma.pop)
	return t
}

func (cma *CmaEsChol) sendInitTasks(tasks []Task) {
	for i, task := range tasks {
		cma.sendTask(i, task)
	}
	cma.sentIdx = len(tasks)
}

// sendTask generates a sample from the current normal distribution, stores it
// in row idx of cma.xs, and sends the task for evaluation. It does not update
// the cma indexes.
func (cma *CmaEsChol) sendTask(idx int, task Task) {
	task.ID = idx
	task.Op = FuncEvaluation
	distmv.NormalRand(cma.xs.RawRowView(idx), cma.mean, &cma.chol, cma.Src)
	copy(task.X, cma.xs.RawRowView(idx))
	cma.operation <- task
}

// bestIdx returns the index of the lowest function value in cma.fs. It
// returns -1 if all values are NaN.
func (cma *CmaEsChol) bestIdx() int {
	best := -1
	bestVal := math.Inf(1)
	for i, v := range cma.fs {
		if math.IsNaN(v) {
			continue
		}
		// Use <= rather than < so that an index is still returned when every
		// value is +Inf.
		if v <= bestVal {
			best = i
			bestVal = v
		}
	}
	return best
}

// findBestAndUpdateTask finds the best task in the current list, updates the
// new best overall, and then stores the best location into task.
func (cma *CmaEsChol) findBestAndUpdateTask(task Task) Task {
	// Find and update the best location.
	// Don't use floats because there may be NaN values.
	best := cma.bestIdx()
	bestF := math.NaN()
	bestX := cma.xs.RawRowView(0)
	if best != -1 {
		bestF = cma.fs[best]
		bestX = cma.xs.RawRowView(best)
	}
	if cma.ForgetBest {
		task.F = bestF
		copy(task.X, bestX)
	} else {
		if bestF < cma.bestF {
			cma.bestF = bestF
			copy(cma.bestX, bestX)
		}
		task.F = cma.bestF
		copy(task.X, cma.bestX)
	}
	return task
}

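// Run implements the optimization event loop: it seeds the distribution mean
// from the first task's X, dispatches one FuncEvaluation per population
// member, and once all evaluations return it updates the distribution and
// reports a MajorIteration (or MethodDone on convergence or error).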
func (cma *CmaEsChol) Run(operations chan<- Task, results <-chan Task, tasks []Task) {
	copy(cma.mean, tasks[0].X)
	cma.operation = operations
	// Send the initial tasks. We know there are at most as many tasks as elements
	// of the population.
	cma.sendInitTasks(tasks)

Loop:
	for {
		result := <-results
		switch result.Op {
		default:
			panic("unknown operation")
		case PostIteration:
			break Loop
		case MajorIteration:
			// The last thing we did was update all of the tasks and send the
			// major iteration. Now we can send a group of tasks again.
			cma.sendInitTasks(tasks)
		case FuncEvaluation:
			cma.receivedIdx++
			cma.fs[result.ID] = result.F
			switch {
			case cma.sentIdx < cma.pop:
				// There are still tasks to evaluate. Send the next.
				cma.sendTask(cma.sentIdx, result)
				cma.sentIdx++
			case cma.receivedIdx < cma.pop:
				// All the tasks have been sent, but not all of them have been received.
				// Need to wait until all are back.
				continue Loop
			default:
				// All of the evaluations have been received.
				if cma.receivedIdx != cma.pop {
					panic("bad logic")
				}
				cma.receivedIdx = 0
				cma.sentIdx = 0

				task := cma.findBestAndUpdateTask(result)
				// Update the parameters and send a MajorIteration or a convergence.
				err := cma.update()
				// Kill the existing data.
				for i := range cma.fs {
					cma.fs[i] = math.NaN()
					cma.xs.Set(i, 0, math.NaN())
				}
				switch {
				case err != nil:
					cma.updateErr = err
					task.Op = MethodDone
				case cma.methodConverged() != NotTerminated:
					task.Op = MethodDone
				default:
					task.Op = MajorIteration
					task.ID = -1
				}
				operations <- task
			}
		}
	}

	// Been told to stop. Clean up by collecting any results that are still
	// outstanding, then report the best evaluated location.
	for task := range results {
		switch task.Op {
		case MajorIteration:
		case FuncEvaluation:
			cma.fs[task.ID] = task.F
		default:
			panic("unknown operation")
		}
	}
	// Send the new best value if the evaluation is better than any we've
	// found so far. Keep this separate from findBestAndUpdateTask so that
	// we only send an iteration if we find a better location.
	if !cma.ForgetBest {
		best := cma.bestIdx()
		if best != -1 && cma.fs[best] < cma.bestF {
			task := tasks[0]
			task.F = cma.fs[best]
			copy(task.X, cma.xs.RawRowView(best))
			task.Op = MajorIteration
			task.ID = -1
			operations <- task
		}
	}
	close(operations)
}

// update computes the new parameters (mean, Cholesky factor, etc.). It does
// not touch the synchronization fields (sentIdx, receivedIdx).
func (cma *CmaEsChol) update() error {
	// Sort the function values to find the elite samples.
	ftmp := make([]float64, cma.pop)
	copy(ftmp, cma.fs)
	indexes := make([]int, cma.pop)
	for i := range indexes {
		indexes[i] = i
	}
	sort.Sort(bestSorter{F: ftmp, Idx: indexes})
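	// After the sort, indexes[:len(cma.weights)] holds the positions in
	// cma.xs of the mu best (lowest function value) samples.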

	meanOld := make([]float64, len(cma.mean))
	copy(meanOld, cma.mean)

	// m_{t+1} = \sum_{i=1}^mu w_i x_i
	for i := range cma.mean {
		cma.mean[i] = 0
	}
	for i, w := range cma.weights {
		idx := indexes[i] // index of the i-th best (elite) sample.
		floats.AddScaled(cma.mean, w, cma.xs.RawRowView(idx))
	}
	meanDiff := make([]float64, len(cma.mean))
	floats.SubTo(meanDiff, cma.mean, meanOld)

	// p_{c,t+1} = (1-c_c) p_{c,t} + \sqrt(c_c*(2-c_c)*mueff) (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cc, cma.pc)
	scaleC := math.Sqrt(cma.cc*(2-cma.cc)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.pc, scaleC, meanDiff)

	// p_{sigma,t+1} = (1-c_s) p_{sigma,t} + \sqrt(c_s*(2-c_s)*mueff) A_t^-1 (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cs, cma.ps)
	// First compute A_t^-1 (m_{t+1}-m_t), then add the scaled vector.
	tmp := make([]float64, cma.dim)
	tmpVec := mat.NewVecDense(cma.dim, tmp)
	diffVec := mat.NewVecDense(cma.dim, meanDiff)
	err := tmpVec.SolveVec(cma.chol.RawU().T(), diffVec)
	if err != nil {
		return err
	}
	scaleS := math.Sqrt(cma.cs*(2-cma.cs)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.ps, scaleS, tmp)

	// Compute the update to A.
	scaleChol := 1 - cma.c1 - cma.cmu
	if scaleChol == 0 {
		scaleChol = math.SmallestNonzeroFloat64 // enough to kill the old data, but still non-zero.
	}
	cma.chol.Scale(scaleChol, &cma.chol)
	cma.chol.SymRankOne(&cma.chol, cma.c1, mat.NewVecDense(cma.dim, cma.pc))
	for i, w := range cma.weights {
		idx := indexes[i]
		floats.SubTo(tmp, cma.xs.RawRowView(idx), meanOld)
		cma.chol.SymRankOne(&cma.chol, cma.cmu*w*cma.invSigma, tmpVec)
	}
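	// Each SymRankOne call updates the Cholesky factor directly in O(dim^2)
	// time, so the whole covariance update costs O(dim^2 * pop) rather than
	// the O(dim^3) of a full re-factorization.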

	// sigma_{t+1} = sigma_t * exp(c_s/d_s * (norm(p_{sigma,t+1})/E[chi] - 1))
	normPs := floats.Norm(cma.ps, 2)
	cma.invSigma /= math.Exp(cma.cs / cma.ds * (normPs/cma.eChi - 1))
	return nil
}

// bestSorter sorts the function values in F into increasing order, swapping
// the entries of Idx alongside so that Idx tracks the original sample indexes.
type bestSorter struct {
	F   []float64
	Idx []int
}

func (b bestSorter) Len() int {
	return len(b.F)
}
func (b bestSorter) Less(i, j int) bool {
	return b.F[i] < b.F[j]
}
func (b bestSorter) Swap(i, j int) {
	b.F[i], b.F[j] = b.F[j], b.F[i]
	b.Idx[i], b.Idx[j] = b.Idx[j], b.Idx[i]
}