// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"
	"sort"

	"golang.org/x/exp/rand"

	"gonum.org/v1/gonum/floats"
	"gonum.org/v1/gonum/mat"
	"gonum.org/v1/gonum/stat/distmv"
)

// TODO(btracey): If we ever implement the traditional CMA-ES algorithm, provide
// the base explanation there, and modify this description to just
// describe the differences.

// CmaEsChol implements the covariance matrix adaptation evolution strategy (CMA-ES)
// based on the Cholesky decomposition. The full algorithm is described in
//
//	Krause, Oswin, Dídac Rodríguez Arbonès, and Christian Igel. "CMA-ES with
//	optimal covariance update and storage complexity." Advances in Neural
//	Information Processing Systems. 2016.
//	https://papers.nips.cc/paper/6457-cma-es-with-optimal-covariance-update-and-storage-complexity.pdf
//
// CMA-ES is a global optimization method that progressively adapts a population
// of samples, combining techniques from local and global optimization.
// Specifically, the CMA-ES algorithm uses an initial multivariate normal
// distribution to generate a population of input locations. The input locations
// with the lowest function values are used to update the parameters of the
// normal distribution, a new set of input locations is generated, and this
// procedure is iterated until convergence. The initial sampling distribution
// has a mean specified by the initial x location and a covariance specified
// by the InitCholesky field.
//
// As the normal distribution is progressively updated according to the best
// samples, the mean of the distribution moves in a gradient-descent-like
// fashion while the covariance shrinks around the minimum.
// It is recommended that the algorithm be run multiple times (from different
// initial x locations) to have a better chance of finding the global minimum.
//
// The CMA-ES-Chol algorithm differs from the standard CMA-ES algorithm in that
// it directly updates the Cholesky decomposition of the normal distribution.
// This changes the runtime from O(dimension^3) to O(dimension^2*population).
// The evolution of the multivariate normal will be similar to the baseline
// CMA-ES algorithm, but the covariance update equation is not identical.
//
// For more information about the CMA-ES algorithm, see
//
//	https://en.wikipedia.org/wiki/CMA-ES
//	https://arxiv.org/pdf/1604.00772.pdf
type CmaEsChol struct {
	// InitStepSize sets the initial size of the covariance matrix adaptation.
	// If InitStepSize is 0, a default value of 0.3 is used. InitStepSize cannot
	// be negative, or CmaEsChol will panic.
	InitStepSize float64
	// Population sets the population size for the algorithm. If Population is
	// 0, a default value of 4 + math.Floor(3*math.Log(float64(dim))) is used.
	// Population cannot be negative or CmaEsChol will panic.
	Population int
	// InitCholesky specifies the Cholesky decomposition of the covariance
	// matrix for the initial sampling distribution. If InitCholesky is nil,
	// a default value of I is used. If it is non-nil, then
	// InitCholesky.SymmetricDim() must equal the problem dimension.
	InitCholesky *mat.Cholesky
	// StopLogDet sets the threshold for stopping the optimization if the
	// distribution becomes too peaked. The log determinant is a measure of the
	// (log) "volume" of the normal distribution, and when it is too small
	// the samples are almost the same. If the log determinant of the covariance
	// matrix becomes less than StopLogDet, the optimization run is concluded.
	// If StopLogDet is 0, a default value of dim*log(1e-16) is used.
	// If StopLogDet is NaN, the stopping criterion is not used, though
	// this can cause numeric instabilities in the algorithm.
	StopLogDet float64
	// ForgetBest, when true, does not track the best overall function value
	// found, instead returning the best sample of each iteration. If ForgetBest
	// is false, then the minimum value returned will be the lowest across all
	// iterations, regardless of when that sample was generated.
	ForgetBest bool
	// Src allows a random number generator to be supplied for generating
	// samples. If Src is nil the generator in golang.org/x/exp/rand is used.
	Src rand.Source

	// Fixed algorithm parameters.
	dim                 int
	pop                 int
	weights             []float64
	muEff               float64
	cc, cs, c1, cmu, ds float64
	eChi                float64

	// Function data.
	xs *mat.Dense
	fs []float64

	// Adaptive algorithm parameters.
	invSigma float64 // inverse of the sigma parameter
	pc, ps   []float64
	mean     []float64
	chol     mat.Cholesky

	// Overall best.
	bestX []float64
	bestF float64

	// Synchronization.
	sentIdx     int
	receivedIdx int
	operation   chan<- Task
	updateErr   error
}

var (
	_ Statuser = (*CmaEsChol)(nil)
	_ Method   = (*CmaEsChol)(nil)
)
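// A minimal usage sketch, driving CmaEsChol through this package's Minimize
// function. The sphere objective, starting point, population size, and seed
// below are illustrative choices, not requirements of the method:
//
//	problem := Problem{
//		Func: func(x []float64) float64 {
//			return x[0]*x[0] + x[1]*x[1] // minimum at the origin
//		},
//	}
//	method := &CmaEsChol{Population: 20, Src: rand.NewSource(1)}
//	result, err := Minimize(problem, []float64{5, 5}, nil, method)
//	if err != nil {
//		// Handle the error; Status distinguishes convergence from failure.
//	}
//	fmt.Println(result.X, result.F)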
func (cma *CmaEsChol) methodConverged() Status {
	sd := cma.StopLogDet
	switch {
	case math.IsNaN(sd):
		return NotTerminated
	case sd == 0:
		sd = float64(cma.dim) * -36.8413614879 // ln(1e-16)
	}
	if cma.chol.LogDet() < sd {
		return MethodConverge
	}
	return NotTerminated
}

// Status returns the status of the method.
func (cma *CmaEsChol) Status() (Status, error) {
	if cma.updateErr != nil {
		return Failure, cma.updateErr
	}
	return cma.methodConverged(), nil
}

// Uses reports that the method needs only function evaluations.
func (*CmaEsChol) Uses(has Available) (uses Available, err error) {
	return has.function()
}

// Init initializes the method for optimization of a problem with the given
// dimension, and returns the number of concurrent evaluation tasks to use,
// which is at most the population size.
func (cma *CmaEsChol) Init(dim, tasks int) int {
	if dim <= 0 {
		panic(nonpositiveDimension)
	}
	if tasks < 0 {
		panic(negativeTasks)
	}

	// Set fixed algorithm parameters.
	// Parameter values are from https://arxiv.org/pdf/1604.00772.pdf .
	cma.dim = dim
	cma.pop = cma.Population
	n := float64(dim)
	if cma.pop == 0 {
		cma.pop = 4 + int(3*math.Log(n)) // Note the implicit floor.
	} else if cma.pop < 0 {
		panic("cma-es-chol: negative population size")
	}
	mu := cma.pop / 2
	cma.weights = resize(cma.weights, mu)
	for i := range cma.weights {
		v := math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
		cma.weights[i] = v
	}
	floats.Scale(1/floats.Sum(cma.weights), cma.weights)
	cma.muEff = 0
	for _, v := range cma.weights {
		cma.muEff += v * v
	}
	cma.muEff = 1 / cma.muEff

	cma.cc = (4 + cma.muEff/n) / (n + 4 + 2*cma.muEff/n)
	cma.cs = (cma.muEff + 2) / (n + cma.muEff + 5)
	cma.c1 = 2 / ((n+1.3)*(n+1.3) + cma.muEff)
	cma.cmu = math.Min(1-cma.c1, 2*(cma.muEff-2+1/cma.muEff)/((n+2)*(n+2)+cma.muEff))
	cma.ds = 1 + 2*math.Max(0, math.Sqrt((cma.muEff-1)/(n+1))-1) + cma.cs
	// E[chi] is taken from https://en.wikipedia.org/wiki/CMA-ES (there
	// listed as E[||N(0,1)||]).
	cma.eChi = math.Sqrt(n) * (1 - 1.0/(4*n) + 1/(21*n*n))

	// Allocate memory for function data.
	cma.xs = mat.NewDense(cma.pop, dim, nil)
	cma.fs = resize(cma.fs, cma.pop)

	// Allocate and initialize adaptive parameters.
	cma.invSigma = 1 / cma.InitStepSize
	if cma.InitStepSize == 0 {
		cma.invSigma = 10.0 / 3
	} else if cma.InitStepSize < 0 {
		panic("cma-es-chol: negative initial step size")
	}
	cma.pc = resize(cma.pc, dim)
	for i := range cma.pc {
		cma.pc[i] = 0
	}
	cma.ps = resize(cma.ps, dim)
	for i := range cma.ps {
		cma.ps[i] = 0
	}
	cma.mean = resize(cma.mean, dim) // mean location initialized at the start of Run

	if cma.InitCholesky != nil {
		if cma.InitCholesky.SymmetricDim() != dim {
			panic("cma-es-chol: incorrect InitCholesky size")
		}
		cma.chol.Clone(cma.InitCholesky)
	} else {
		// Set the initial Cholesky to I.
		b := mat.NewDiagDense(dim, nil)
		for i := 0; i < dim; i++ {
			b.SetDiag(i, 1)
		}
		var chol mat.Cholesky
		ok := chol.Factorize(b)
		if !ok {
			panic("cma-es-chol: bad Cholesky; this should not happen")
		}
		cma.chol = chol
	}

	cma.bestX = resize(cma.bestX, dim)
	cma.bestF = math.Inf(1)

	cma.sentIdx = 0
	cma.receivedIdx = 0
	cma.operation = nil
	cma.updateErr = nil
	t := min(tasks, cma.pop)
	return t
}
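// As a concrete illustration of the defaults set in Init, consider dim = 10
// with Population = 0; the values below follow directly from the formulas
// above:
//
//	pop   = 4 + floor(3*ln(10)) = 10
//	mu    = pop/2 = 5
//	w_i   ∝ ln(5.5) - ln(i+1) for i = 0..4, normalized to sum to 1
//	muEff = 1 / Σ w_i² ≈ 3.17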
// sendInitTasks sends a task for each element of tasks and records how many
// were sent.
func (cma *CmaEsChol) sendInitTasks(tasks []Task) {
	for i, task := range tasks {
		cma.sendTask(i, task)
	}
	cma.sentIdx = len(tasks)
}

// sendTask generates a sample and sends the task. It does not update the cma index.
func (cma *CmaEsChol) sendTask(idx int, task Task) {
	task.ID = idx
	task.Op = FuncEvaluation
	distmv.NormalRand(cma.xs.RawRowView(idx), cma.mean, &cma.chol, cma.Src)
	copy(task.X, cma.xs.RawRowView(idx))
	cma.operation <- task
}

// bestIdx returns the index of the best function value. It returns -1 if all
// values are NaN.
func (cma *CmaEsChol) bestIdx() int {
	best := -1
	bestVal := math.Inf(1)
	for i, v := range cma.fs {
		if math.IsNaN(v) {
			continue
		}
		// Use <= rather than < so that a valid index is returned even if
		// every value is +Inf.
		if v <= bestVal {
			best = i
			bestVal = v
		}
	}
	return best
}

// findBestAndUpdateTask finds the best task in the current list, updates the
// new best overall, and then stores the best location into task.
func (cma *CmaEsChol) findBestAndUpdateTask(task Task) Task {
	// Find and update the best location.
	// Don't use the floats package here because there may be NaN values.
	best := cma.bestIdx()
	bestF := math.NaN()
	bestX := cma.xs.RawRowView(0)
	if best != -1 {
		bestF = cma.fs[best]
		bestX = cma.xs.RawRowView(best)
	}
	if cma.ForgetBest {
		task.F = bestF
		copy(task.X, bestX)
	} else {
		if bestF < cma.bestF {
			cma.bestF = bestF
			copy(cma.bestX, bestX)
		}
		task.F = cma.bestF
		copy(task.X, cma.bestX)
	}
	return task
}

// Run implements the optimization loop: it samples the population, collects
// the function evaluations, and updates the distribution parameters after
// each full generation.
func (cma *CmaEsChol) Run(operations chan<- Task, results <-chan Task, tasks []Task) {
	copy(cma.mean, tasks[0].X)
	cma.operation = operations
	// Send the initial tasks. We know there are at most as many tasks as elements
	// of the population.
	cma.sendInitTasks(tasks)

Loop:
	for {
		result := <-results
		switch result.Op {
		default:
			panic("unknown operation")
		case PostIteration:
			break Loop
		case MajorIteration:
			// The last thing we did was update all of the tasks and send the
			// major iteration. Now we can send a group of tasks again.
			cma.sendInitTasks(tasks)
		case FuncEvaluation:
			cma.receivedIdx++
			cma.fs[result.ID] = result.F
			switch {
			case cma.sentIdx < cma.pop:
				// There are still tasks to evaluate. Send the next.
				cma.sendTask(cma.sentIdx, result)
				cma.sentIdx++
			case cma.receivedIdx < cma.pop:
				// All the tasks have been sent, but not all of them have been
				// received. Wait until all are back.
				continue Loop
			default:
				// All of the evaluations have been received.
				if cma.receivedIdx != cma.pop {
					panic("bad logic")
				}
				cma.receivedIdx = 0
				cma.sentIdx = 0

				task := cma.findBestAndUpdateTask(result)
				// Update the parameters and send a MajorIteration or a convergence.
				err := cma.update()
				// Invalidate the existing data.
				for i := range cma.fs {
					cma.fs[i] = math.NaN()
					cma.xs.Set(i, 0, math.NaN())
				}
				switch {
				case err != nil:
					cma.updateErr = err
					task.Op = MethodDone
				case cma.methodConverged() != NotTerminated:
					task.Op = MethodDone
				default:
					task.Op = MajorIteration
					task.ID = -1
				}
				operations <- task
			}
		}
	}

	// Been told to stop. Clean up by draining the remaining results and
	// recording any function evaluations still in flight.
	for task := range results {
		switch task.Op {
		case MajorIteration:
		case FuncEvaluation:
			cma.fs[task.ID] = task.F
		default:
			panic("unknown operation")
		}
	}
	// Send the new best value if the evaluation is better than any we've
	// found so far. Keep this separate from findBestAndUpdateTask so that
	// we only send an iteration if we find a better location.
	if !cma.ForgetBest {
		best := cma.bestIdx()
		if best != -1 && cma.fs[best] < cma.bestF {
			task := tasks[0]
			task.F = cma.fs[best]
			copy(task.X, cma.xs.RawRowView(best))
			task.Op = MajorIteration
			task.ID = -1
			operations <- task
		}
	}
	close(operations)
}
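// For illustration, one generation of the Run loop with pop = 4 and two
// concurrent tasks proceeds roughly as follows (a hypothetical trace of the
// logic above, not output of the code):
//
//	sendInitTasks: send x0, x1
//	receive f(x0) -> sentIdx < pop, send x2
//	receive f(x1) -> sentIdx < pop, send x3
//	receive f(x2) -> receivedIdx < pop, wait
//	receive f(x3) -> all received: update(), emit MajorIteration
//	MajorIteration echoed back -> sendInitTasks for the next generation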
// update computes the new parameters (mean, Cholesky, etc.). It does not
// update any of the synchronization parameters (sentIdx, receivedIdx).
func (cma *CmaEsChol) update() error {
	// Sort the function values to find the elite samples.
	ftmp := make([]float64, cma.pop)
	copy(ftmp, cma.fs)
	indexes := make([]int, cma.pop)
	for i := range indexes {
		indexes[i] = i
	}
	sort.Sort(bestSorter{F: ftmp, Idx: indexes})

	meanOld := make([]float64, len(cma.mean))
	copy(meanOld, cma.mean)

	// m_{t+1} = \sum_{i=1}^mu w_i x_i
	for i := range cma.mean {
		cma.mean[i] = 0
	}
	for i, w := range cma.weights {
		idx := indexes[i] // index of the i-th best (elite) sample.
		floats.AddScaled(cma.mean, w, cma.xs.RawRowView(idx))
	}
	meanDiff := make([]float64, len(cma.mean))
	floats.SubTo(meanDiff, cma.mean, meanOld)

	// p_{c,t+1} = (1-c_c) p_{c,t} + \sqrt(c_c*(2-c_c)*mu_eff) (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cc, cma.pc)
	scaleC := math.Sqrt(cma.cc*(2-cma.cc)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.pc, scaleC, meanDiff)

	// p_{sigma,t+1} = (1-c_sigma) p_{sigma,t} + \sqrt(c_sigma*(2-c_sigma)*mu_eff) A_t^-1 (m_{t+1}-m_t)/sigma_t
	floats.Scale(1-cma.cs, cma.ps)
	// First compute A_t^-1 (m_{t+1}-m_t), then add the scaled vector.
	tmp := make([]float64, cma.dim)
	tmpVec := mat.NewVecDense(cma.dim, tmp)
	diffVec := mat.NewVecDense(cma.dim, meanDiff)
	err := tmpVec.SolveVec(cma.chol.RawU().T(), diffVec)
	if err != nil {
		return err
	}
	scaleS := math.Sqrt(cma.cs*(2-cma.cs)*cma.muEff) * cma.invSigma
	floats.AddScaled(cma.ps, scaleS, tmp)

	// Compute the update to A.
	scaleChol := 1 - cma.c1 - cma.cmu
	if scaleChol == 0 {
		scaleChol = math.SmallestNonzeroFloat64 // enough to kill the old data, but still non-zero.
	}
	cma.chol.Scale(scaleChol, &cma.chol)
	cma.chol.SymRankOne(&cma.chol, cma.c1, mat.NewVecDense(cma.dim, cma.pc))
	for i, w := range cma.weights {
		idx := indexes[i]
		floats.SubTo(tmp, cma.xs.RawRowView(idx), meanOld)
		cma.chol.SymRankOne(&cma.chol, cma.cmu*w*cma.invSigma, tmpVec)
	}

	// sigma_{t+1} = sigma_t * exp((c_sigma/d_sigma) * (norm(p_{sigma,t+1})/E[chi] - 1))
	// Since invSigma stores 1/sigma, dividing invSigma by the exponential
	// factor multiplies sigma by it.
	normPs := floats.Norm(cma.ps, 2)
	cma.invSigma /= math.Exp(cma.cs / cma.ds * (normPs/cma.eChi - 1))
	return nil
}
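// bestSorter sorts function values in increasing order, carrying the
// corresponding sample indexes along. A small illustration with hypothetical
// values:
//
//	f := []float64{3, 1, 2}
//	idx := []int{0, 1, 2}
//	sort.Sort(bestSorter{F: f, Idx: idx})
//	// f is now [1 2 3] and idx is [1 2 0]; idx[0] indexes the best sample.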
type bestSorter struct {
	F   []float64
	Idx []int
}

func (b bestSorter) Len() int {
	return len(b.F)
}
func (b bestSorter) Less(i, j int) bool {
	return b.F[i] < b.F[j]
}
func (b bestSorter) Swap(i, j int) {
	b.F[i], b.F[j] = b.F[j], b.F[i]
	b.Idx[i], b.Idx[j] = b.Idx[j], b.Idx[i]
}