github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/optimize/cmaes.go

// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"
	"sort"

	"golang.org/x/exp/rand"

	"github.com/jingcheng-WU/gonum/floats"
	"github.com/jingcheng-WU/gonum/mat"
	"github.com/jingcheng-WU/gonum/stat/distmv"
)
// TODO(btracey): If we ever implement the traditional CMA-ES algorithm, provide
// the base explanation there, and modify this description to just
// describe the differences.

// CmaEsChol implements the covariance matrix adaptation evolution strategy (CMA-ES)
// based on the Cholesky decomposition. The full algorithm is described in
//  Krause, Oswin, Dídac Rodríguez Arbonès, and Christian Igel. "CMA-ES with
//  optimal covariance update and storage complexity." Advances in Neural
//  Information Processing Systems. 2016.
//  https://papers.nips.cc/paper/6457-cma-es-with-optimal-covariance-update-and-storage-complexity.pdf
// CMA-ES is a global optimization method that progressively adapts a population
// of samples, combining techniques from local and global optimization.
// Specifically, the CMA-ES algorithm uses an initial multivariate normal
// distribution to generate a population of input locations. The input locations
// with the lowest function values are used to update the parameters of the normal
// distribution, a new set of input locations is generated, and this procedure
// is iterated until convergence. The initial sampling distribution has a mean
// specified by the initial x location and a covariance specified by the
// InitCholesky field.
//
// As the normal distribution is progressively updated according to the best
// samples, the mean of the distribution tends to be updated in a
// gradient-descent-like fashion, followed by a shrinking of the covariance as
// the method converges.
// It is recommended that the algorithm be run multiple times (with different
// initial x locations) to have a better chance of finding the global minimum.
//
// The CMA-ES-Chol algorithm differs from the standard CMA-ES algorithm in that
// it directly updates the Cholesky decomposition of the normal distribution.
// This changes the runtime from O(dimension^3) to O(dimension^2*population).
// The evolution of the multivariate normal will be similar to the baseline
// CMA-ES algorithm, but the covariance update equation is not identical.
//
// For more information about the CMA-ES algorithm, see
//  https://en.wikipedia.org/wiki/CMA-ES
//  https://arxiv.org/pdf/1604.00772.pdf
type CmaEsChol struct {
	// InitStepSize sets the initial size of the covariance matrix adaptation.
	// If InitStepSize is 0, a default value of 0.3 is used (matching the
	// initialization in Init). InitStepSize cannot be negative, or CmaEsChol
	// will panic.
	InitStepSize float64
	// Population sets the population size for the algorithm. If Population is
	// 0, a default value of 4 + math.Floor(3*math.Log(float64(dim))) is used.
	// Population cannot be negative or CmaEsChol will panic.
	Population int
	// InitCholesky specifies the Cholesky decomposition of the covariance
	// matrix for the initial sampling distribution. If InitCholesky is nil,
	// a default value of I is used. If it is non-nil, then
	// InitCholesky.Symmetric() must equal the problem dimension.
	InitCholesky *mat.Cholesky
	// StopLogDet sets the threshold for stopping the optimization if the
	// distribution becomes too peaked. The log determinant is a measure of the
	// (log) "volume" of the normal distribution, and when it is too small
	// the samples are almost the same. If the log determinant of the covariance
	// matrix becomes less than StopLogDet, the optimization run is concluded.
	// If StopLogDet is 0, a default value of dim*log(1e-16) is used.
	// If StopLogDet is NaN, the stopping criterion is not used, though
	// this can cause numeric instabilities in the algorithm.
	StopLogDet float64
	// ForgetBest, when true, does not track the best overall function value found,
	// instead returning the new best sample in each iteration. If ForgetBest
	// is false, then the minimum value returned will be the lowest across all
	// iterations, regardless of when that sample was generated.
	ForgetBest bool
	// Src allows a random number generator to be supplied for generating samples.
	// If Src is nil the generator in golang.org/x/exp/rand is used.
	Src rand.Source

	// Fixed algorithm parameters.
	dim                 int
	pop                 int
	weights             []float64
	muEff               float64
	cc, cs, c1, cmu, ds float64
	eChi                float64

	// Function data.
	xs *mat.Dense
	fs []float64

	// Adaptive algorithm parameters.
	invSigma float64 // inverse of the sigma parameter
	pc, ps   []float64
	mean     []float64
	chol     mat.Cholesky

	// Overall best.
	bestX []float64
	bestF float64

	// Synchronization.
	sentIdx     int
	receivedIdx int
	operation   chan<- Task
	updateErr   error
}

var (
	_ Statuser = (*CmaEsChol)(nil)
	_ Method   = (*CmaEsChol)(nil)
)
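
// exampleCmaEsChol is an illustrative usage sketch, not part of the original
// file. It assumes this package's Minimize entry point; the sum-of-squares
// objective and the starting location are arbitrary choices for demonstration.
func exampleCmaEsChol() ([]float64, error) {
	problem := Problem{
		Func: func(x []float64) float64 {
			// f(x) = \sum_i x_i^2, minimized at the origin.
			var sum float64
			for _, v := range x {
				sum += v * v
			}
			return sum
		},
	}
	// The initial x sets the mean of the first sampling distribution;
	// InitStepSize, Population, and InitCholesky keep their defaults.
	result, err := Minimize(problem, []float64{5, 5}, nil, &CmaEsChol{})
	if err != nil {
		return nil, err
	}
	return result.X, nil
}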
   120  
   121  func (cma *CmaEsChol) methodConverged() Status {
   122  	sd := cma.StopLogDet
   123  	switch {
   124  	case math.IsNaN(sd):
   125  		return NotTerminated
   126  	case sd == 0:
   127  		sd = float64(cma.dim) * -36.8413614879 // ln(1e-16)
   128  	}
   129  	if cma.chol.LogDet() < sd {
   130  		return MethodConverge
   131  	}
   132  	return NotTerminated
   133  }
   134  
   135  // Status returns the status of the method.
   136  func (cma *CmaEsChol) Status() (Status, error) {
   137  	if cma.updateErr != nil {
   138  		return Failure, cma.updateErr
   139  	}
   140  	return cma.methodConverged(), nil
   141  }
   142  
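// Uses reports the evaluation types the method needs: CmaEsChol requires
// only objective function evaluations.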
func (*CmaEsChol) Uses(has Available) (uses Available, err error) {
	return has.function()
}

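// Init initializes the method for an optimization problem of the given
// dimension and returns the number of concurrent evaluation tasks to use,
// which is at most the population size.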
func (cma *CmaEsChol) Init(dim, tasks int) int {
	if dim <= 0 {
		panic(nonpositiveDimension)
	}
	if tasks < 0 {
		panic(negativeTasks)
	}

	// Set fixed algorithm parameters.
	// Parameter values are from https://arxiv.org/pdf/1604.00772.pdf .
	cma.dim = dim
	cma.pop = cma.Population
	n := float64(dim)
	if cma.pop == 0 {
		cma.pop = 4 + int(3*math.Log(n)) // Note the implicit floor.
	} else if cma.pop < 0 {
		panic("cma-es-chol: negative population size")
	}
	mu := cma.pop / 2
	cma.weights = resize(cma.weights, mu)
	for i := range cma.weights {
		v := math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
		cma.weights[i] = v
	}
	floats.Scale(1/floats.Sum(cma.weights), cma.weights)
	cma.muEff = 0
	for _, v := range cma.weights {
		cma.muEff += v * v
	}
	cma.muEff = 1 / cma.muEff

	cma.cc = (4 + cma.muEff/n) / (n + 4 + 2*cma.muEff/n)
	cma.cs = (cma.muEff + 2) / (n + cma.muEff + 5)
	cma.c1 = 2 / ((n+1.3)*(n+1.3) + cma.muEff)
	cma.cmu = math.Min(1-cma.c1, 2*(cma.muEff-2+1/cma.muEff)/((n+2)*(n+2)+cma.muEff))
	cma.ds = 1 + 2*math.Max(0, math.Sqrt((cma.muEff-1)/(n+1))-1) + cma.cs
	// E[chi] is taken from https://en.wikipedia.org/wiki/CMA-ES (there
	// listed as E[||N(0,1)||]).
	cma.eChi = math.Sqrt(n) * (1 - 1.0/(4*n) + 1/(21*n*n))

	// Allocate memory for function data.
	cma.xs = mat.NewDense(cma.pop, dim, nil)
	cma.fs = resize(cma.fs, cma.pop)

	// Allocate and initialize adaptive parameters.
	cma.invSigma = 1 / cma.InitStepSize
	if cma.InitStepSize == 0 {
		cma.invSigma = 10.0 / 3 // Default initial step size of 0.3.
	} else if cma.InitStepSize < 0 {
		panic("cma-es-chol: negative initial step size")
	}
	cma.pc = resize(cma.pc, dim)
	for i := range cma.pc {
		cma.pc[i] = 0
	}
	cma.ps = resize(cma.ps, dim)
	for i := range cma.ps {
		cma.ps[i] = 0
	}
	cma.mean = resize(cma.mean, dim) // The mean location is initialized at the start of Run.

	if cma.InitCholesky != nil {
		if cma.InitCholesky.Symmetric() != dim {
			panic("cma-es-chol: incorrect InitCholesky size")
		}
		cma.chol.Clone(cma.InitCholesky)
	} else {
		// Set the initial Cholesky to I.
		b := mat.NewDiagDense(dim, nil)
		for i := 0; i < dim; i++ {
			b.SetDiag(i, 1)
		}
		var chol mat.Cholesky
		ok := chol.Factorize(b)
		if !ok {
			panic("cma-es-chol: bad Cholesky of identity; this should not happen")
		}
		cma.chol = chol
	}

	cma.bestX = resize(cma.bestX, dim)
	cma.bestF = math.Inf(1)

	cma.sentIdx = 0
	cma.receivedIdx = 0
	cma.operation = nil
	cma.updateErr = nil
	return min(tasks, cma.pop)
}
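
// defaultSelectionWeights is an illustrative helper, not part of the original
// file. It reproduces the recruitment-weight computation in Init above for a
// given population size, returning the normalized weights of the mu = pop/2
// elite samples together with the variance-effective selection mass muEff.
func defaultSelectionWeights(pop int) (weights []float64, muEff float64) {
	mu := pop / 2
	weights = make([]float64, mu)
	for i := range weights {
		// Better-ranked samples receive logarithmically larger weights.
		weights[i] = math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
	}
	// Normalize so the weights sum to 1.
	floats.Scale(1/floats.Sum(weights), weights)
	var sumSq float64
	for _, v := range weights {
		sumSq += v * v
	}
	return weights, 1 / sumSq
}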
   237  
   238  func (cma *CmaEsChol) sendInitTasks(tasks []Task) {
   239  	for i, task := range tasks {
   240  		cma.sendTask(i, task)
   241  	}
   242  	cma.sentIdx = len(tasks)
   243  }
   244  
   245  // sendTask generates a sample and sends the task. It does not update the cma index.
   246  func (cma *CmaEsChol) sendTask(idx int, task Task) {
   247  	task.ID = idx
   248  	task.Op = FuncEvaluation
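	// Draw a fresh sample x ~ N(mean, A Aᵀ) using the current Cholesky factor,
	// storing it in row idx of xs so that update can reuse the raw sample.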
	distmv.NormalRand(cma.xs.RawRowView(idx), cma.mean, &cma.chol, cma.Src)
	copy(task.X, cma.xs.RawRowView(idx))
	cma.operation <- task
}

// bestIdx returns the index of the best (lowest) value in fs, or -1 if all
// values are NaN.
func (cma *CmaEsChol) bestIdx() int {
	best := -1
	bestVal := math.Inf(1)
	for i, v := range cma.fs {
		if math.IsNaN(v) {
			continue
		}
		// Use <= rather than < so that an index is still chosen when every
		// value evaluates to +Inf.
		if v <= bestVal {
			best = i
			bestVal = v
		}
	}
	return best
}

// findBestAndUpdateTask finds the best task in the current list, updates the
// new best overall, and then stores the best location into task.
func (cma *CmaEsChol) findBestAndUpdateTask(task Task) Task {
	// Find and update the best location.
	// Don't use floats because there may be NaN values.
	best := cma.bestIdx()
	bestF := math.NaN()
	bestX := cma.xs.RawRowView(0)
	if best != -1 {
		bestF = cma.fs[best]
		bestX = cma.xs.RawRowView(best)
	}
	if cma.ForgetBest {
		task.F = bestF
		copy(task.X, bestX)
	} else {
		if bestF < cma.bestF {
			cma.bestF = bestF
			copy(cma.bestX, bestX)
		}
		task.F = cma.bestF
		copy(task.X, cma.bestX)
	}
	return task
}

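// Run implements the Method interface. Candidate locations are sent out on
// operations and completed evaluations are read back from results; at most
// len(tasks) evaluations are in flight at once, and a distribution update is
// performed each time a full population has been received.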
func (cma *CmaEsChol) Run(operations chan<- Task, results <-chan Task, tasks []Task) {
	copy(cma.mean, tasks[0].X)
	cma.operation = operations
	// Send the initial tasks. We know there are at most as many tasks as elements
	// of the population.
	cma.sendInitTasks(tasks)

Loop:
	for {
		result := <-results
		switch result.Op {
		default:
			panic("unknown operation")
		case PostIteration:
			break Loop
		case MajorIteration:
			// The last thing we did was update all of the tasks and send the
			// major iteration. Now we can send a group of tasks again.
			cma.sendInitTasks(tasks)
		case FuncEvaluation:
			cma.receivedIdx++
			cma.fs[result.ID] = result.F
			switch {
			case cma.sentIdx < cma.pop:
				// There are still tasks to evaluate. Send the next.
				cma.sendTask(cma.sentIdx, result)
				cma.sentIdx++
			case cma.receivedIdx < cma.pop:
				// All the tasks have been sent, but not all of them have been received.
				// Need to wait until all are back.
				continue Loop
			default:
				// All of the evaluations have been received.
				if cma.receivedIdx != cma.pop {
					panic("bad logic")
				}
				cma.receivedIdx = 0
				cma.sentIdx = 0

				task := cma.findBestAndUpdateTask(result)
				// Update the parameters and send a MajorIteration or a convergence.
				err := cma.update()
				// Invalidate the existing sample data.
				for i := range cma.fs {
					cma.fs[i] = math.NaN()
					cma.xs.Set(i, 0, math.NaN())
				}
				switch {
				case err != nil:
					cma.updateErr = err
					task.Op = MethodDone
				case cma.methodConverged() != NotTerminated:
					task.Op = MethodDone
				default:
					task.Op = MajorIteration
					task.ID = -1
				}
				operations <- task
			}
		}
	}

	// Been told to stop. Clean up: collect the outstanding results, then check
	// whether any of them improves on the best value seen so far.
	for task := range results {
		switch task.Op {
		case MajorIteration:
		case FuncEvaluation:
			cma.fs[task.ID] = task.F
		default:
			panic("unknown operation")
		}
	}
	// Send the new best value if the evaluation is better than any we've
	// found so far. Keep this separate from findBestAndUpdateTask so that
	// we only send an iteration if we find a better location.
	if !cma.ForgetBest {
		best := cma.bestIdx()
		if best != -1 && cma.fs[best] < cma.bestF {
			task := tasks[0]
			task.F = cma.fs[best]
			copy(task.X, cma.xs.RawRowView(best))
			task.Op = MajorIteration
			task.ID = -1
			operations <- task
		}
	}
	close(operations)
}

// update computes the new parameters (mean, Cholesky, etc.). It does not
// update any of the synchronization fields (sentIdx, receivedIdx).
func (cma *CmaEsChol) update() error {
	// Sort the function values to find the elite samples.
	ftmp := make([]float64, cma.pop)
	copy(ftmp, cma.fs)
	indexes := make([]int, cma.pop)
	for i := range indexes {
		indexes[i] = i
	}
	sort.Sort(bestSorter{F: ftmp, Idx: indexes})

	meanOld := make([]float64, len(cma.mean))
	copy(meanOld, cma.mean)

	// m_{t+1} = \sum_{i=1}^mu w_i x_i
	for i := range cma.mean {
		cma.mean[i] = 0
	}
	for i, w := range cma.weights {
		idx := indexes[i] // index of the i-th best sample.
		floats.AddScaled(cma.mean, w, cma.xs.RawRowView(idx))
	}
	meanDiff := make([]float64, len(cma.mean))
	floats.SubTo(meanDiff, cma.mean, meanOld)

	// p_{c,t+1} = (1-c_c) p_{c,t} + \sqrt(c_c*(2-c_c)*mueff) (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cc, cma.pc)
	scaleC := math.Sqrt(cma.cc*(2-cma.cc)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.pc, scaleC, meanDiff)

	// p_{sigma,t+1} = (1-c_s) p_{sigma,t} + \sqrt(c_s*(2-c_s)*mueff) A_t^-1 (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cs, cma.ps)
	// First compute A_t^-1 (m_{t+1}-m_t), then add the scaled vector.
	tmp := make([]float64, cma.dim)
	tmpVec := mat.NewVecDense(cma.dim, tmp)
	diffVec := mat.NewVecDense(cma.dim, meanDiff)
	err := tmpVec.SolveVec(cma.chol.RawU().T(), diffVec)
	if err != nil {
		return err
	}
	scaleS := math.Sqrt(cma.cs*(2-cma.cs)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.ps, scaleS, tmp)

	// Compute the update to A.
	scaleChol := 1 - cma.c1 - cma.cmu
	if scaleChol == 0 {
		scaleChol = math.SmallestNonzeroFloat64 // Enough to kill the old data, but still non-zero.
	}
	cma.chol.Scale(scaleChol, &cma.chol)
	cma.chol.SymRankOne(&cma.chol, cma.c1, mat.NewVecDense(cma.dim, cma.pc))
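	// Add a scaled rank-one update for the deviation of each of the mu best
	// samples from the old mean, weighted by its recruitment weight.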
	for i, w := range cma.weights {
		idx := indexes[i]
		floats.SubTo(tmp, cma.xs.RawRowView(idx), meanOld)
		cma.chol.SymRankOne(&cma.chol, cma.cmu*w*cma.invSigma, tmpVec)
	}

	// sigma_{t+1} = sigma_t * exp(c_s/d_s * (norm(p_{sigma,t+1})/E[chi] - 1))
	normPs := floats.Norm(cma.ps, 2)
	cma.invSigma /= math.Exp(cma.cs / cma.ds * (normPs/cma.eChi - 1))
	return nil
}

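// stepSizeUpdate is an illustrative sketch, not part of the original file, of
// the cumulative step-size adaptation performed at the end of update: sigma
// grows when the evolution path p_sigma is longer than its expected length
// under pure random selection, and shrinks when it is shorter. (The method
// itself stores and updates invSigma = 1/sigma, so it divides by the same
// exponential factor.)
func stepSizeUpdate(sigma, cs, ds, eChi float64, ps []float64) float64 {
	normPs := floats.Norm(ps, 2)
	return sigma * math.Exp(cs/ds*(normPs/eChi-1))
}
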
// bestSorter sorts function values in increasing order, permuting the
// corresponding sample indices alongside so that Idx[0] names the best
// (lowest-valued) sample after sorting.
type bestSorter struct {
	F   []float64
	Idx []int
}

func (b bestSorter) Len() int {
	return len(b.F)
}
func (b bestSorter) Less(i, j int) bool {
	return b.F[i] < b.F[j]
}
func (b bestSorter) Swap(i, j int) {
	b.F[i], b.F[j] = b.F[j], b.F[i]
	b.Idx[i], b.Idx[j] = b.Idx[j], b.Idx[i]
}