github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/optimize/cmaes.go

// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
    "math"
    "sort"

    "golang.org/x/exp/rand"

    "github.com/jingcheng-WU/gonum/floats"
    "github.com/jingcheng-WU/gonum/mat"
    "github.com/jingcheng-WU/gonum/stat/distmv"
)

var _ Method = (*CmaEsChol)(nil)

// TODO(btracey): If we ever implement the traditional CMA-ES algorithm, provide
// the base explanation there, and modify this description to just
// describe the differences.

// CmaEsChol implements the covariance matrix adaptation evolution strategy (CMA-ES)
// based on the Cholesky decomposition. The full algorithm is described in
// Krause, Oswin, Dídac Rodríguez Arbonès, and Christian Igel. "CMA-ES with
// optimal covariance update and storage complexity." Advances in Neural
// Information Processing Systems. 2016.
// https://papers.nips.cc/paper/6457-cma-es-with-optimal-covariance-update-and-storage-complexity.pdf
// CMA-ES is a global optimization method that progressively adapts a population
// of samples, combining techniques from local and global optimization.
// Specifically, the CMA-ES algorithm uses a multivariate normal distribution
// to generate a population of input locations. The input locations with the
// lowest function values are used to update the parameters of the normal
// distribution, a new set of input locations is generated, and this procedure
// is iterated until convergence. The initial sampling distribution has a mean
// specified by the initial x location and a covariance specified by the
// InitCholesky field.
//
// As the normal distribution is progressively updated according to the best
// samples, the mean of the distribution tends to move in a gradient-descent-like
// fashion, followed by a shrinking covariance.
// It is recommended that the algorithm be run multiple times (with different
// initial x locations) to have a better chance of finding the global minimum.
//
// The CMA-ES-Chol algorithm differs from the standard CMA-ES algorithm in that
// it directly updates the Cholesky decomposition of the normal distribution.
// This changes the runtime from O(dimension^3) to O(dimension^2*population).
// The evolution of the multivariate normal will be similar to the baseline
// CMA-ES algorithm, but the covariance update equation is not identical.
//
// For more information about the CMA-ES algorithm, see
// https://en.wikipedia.org/wiki/CMA-ES
// https://arxiv.org/pdf/1604.00772.pdf
type CmaEsChol struct {
    // InitStepSize sets the initial size of the covariance matrix adaptation.
    // If InitStepSize is 0, a default value of 0.3 is used (the stored
    // invSigma defaults to 10/3, matching the initialization in Init).
    // InitStepSize cannot be negative, or CmaEsChol will panic.
    InitStepSize float64
    // Population sets the population size for the algorithm. If Population is
    // 0, a default value of 4 + math.Floor(3*math.Log(float64(dim))) is used.
    // Population cannot be negative or CmaEsChol will panic.
    Population int
    // InitCholesky specifies the Cholesky decomposition of the covariance
    // matrix for the initial sampling distribution. If InitCholesky is nil,
    // a default value of I is used. If it is non-nil, its size must equal
    // the problem dimension.
    InitCholesky *mat.Cholesky
    // StopLogDet sets the threshold for stopping the optimization if the
    // distribution becomes too peaked. The log determinant is a measure of the
    // (log) "volume" of the normal distribution, and when it is too small
    // the samples are almost the same. If the log determinant of the covariance
    // matrix becomes less than StopLogDet, the optimization run is concluded.
    // If StopLogDet is 0, a default value of dim*log(1e-16) is used.
    // If StopLogDet is NaN, the stopping criterion is not used, though
    // this can cause numeric instabilities in the algorithm.
    StopLogDet float64
    // ForgetBest, when true, does not track the best overall function value found,
    // instead returning the new best sample in each iteration. If ForgetBest
    // is false, then the minimum value returned will be the lowest across all
    // iterations, regardless of when that sample was generated.
    ForgetBest bool
    // Src allows a random number generator to be supplied for generating samples.
    // If Src is nil the generator in golang.org/x/exp/rand is used.
    Src rand.Source

    // Fixed algorithm parameters.
    dim     int
    pop     int
    weights []float64
    muEff   float64
    cc, cs, c1, cmu, ds float64
    eChi    float64

    // Function data.
    xs *mat.Dense
    fs []float64

    // Adaptive algorithm parameters.
    invSigma float64 // inverse of the sigma parameter
    pc, ps   []float64
    mean     []float64
    chol     mat.Cholesky

    // Overall best.
    bestX []float64
    bestF float64

    // Synchronization.
    sentIdx     int
    receivedIdx int
    operation   chan<- Task
    updateErr   error
}
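// The function below is an illustrative usage sketch, not part of the
// original file: it minimizes an assumed quadratic bowl with CmaEsChol
// through this package's Minimize entry point. The function name, the
// objective, and the chosen values are examples only.
func exampleCmaEsCholUsage() ([]float64, error) {
    problem := Problem{
        // Sum-of-squares objective with its minimum at the origin.
        Func: func(x []float64) float64 {
            var sum float64
            for _, v := range x {
                sum += v * v
            }
            return sum
        },
    }
    method := &CmaEsChol{
        Population: 0, // 0 selects the default 4 + floor(3*ln(dim)).
    }
    // Start the sampling distribution's mean away from the minimum.
    result, err := Minimize(problem, []float64{5, 5, 5}, nil, method)
    if err != nil {
        return nil, err
    }
    return result.X, nil // approximate location of the minimum
}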
var (
    _ Statuser = (*CmaEsChol)(nil)
    _ Method   = (*CmaEsChol)(nil)
)

func (cma *CmaEsChol) methodConverged() Status {
    sd := cma.StopLogDet
    switch {
    case math.IsNaN(sd):
        return NotTerminated
    case sd == 0:
        sd = float64(cma.dim) * -36.8413614879 // ln(1e-16)
    }
    if cma.chol.LogDet() < sd {
        return MethodConverge
    }
    return NotTerminated
}

// Status returns the status of the method.
func (cma *CmaEsChol) Status() (Status, error) {
    if cma.updateErr != nil {
        return Failure, cma.updateErr
    }
    return cma.methodConverged(), nil
}

func (*CmaEsChol) Uses(has Available) (uses Available, err error) {
    return has.function()
}

func (cma *CmaEsChol) Init(dim, tasks int) int {
    if dim <= 0 {
        panic(nonpositiveDimension)
    }
    if tasks < 0 {
        panic(negativeTasks)
    }

    // Set fixed algorithm parameters.
    // Parameter values are from https://arxiv.org/pdf/1604.00772.pdf .
    cma.dim = dim
    cma.pop = cma.Population
    n := float64(dim)
    if cma.pop == 0 {
        cma.pop = 4 + int(3*math.Log(n)) // Note the implicit floor.
    } else if cma.pop < 0 {
        panic("cma-es-chol: negative population size")
    }
    mu := cma.pop / 2
    cma.weights = resize(cma.weights, mu)
    for i := range cma.weights {
        v := math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
        cma.weights[i] = v
    }
    floats.Scale(1/floats.Sum(cma.weights), cma.weights)
    cma.muEff = 0
    for _, v := range cma.weights {
        cma.muEff += v * v
    }
    cma.muEff = 1 / cma.muEff

    cma.cc = (4 + cma.muEff/n) / (n + 4 + 2*cma.muEff/n)
    cma.cs = (cma.muEff + 2) / (n + cma.muEff + 5)
    cma.c1 = 2 / ((n+1.3)*(n+1.3) + cma.muEff)
    cma.cmu = math.Min(1-cma.c1, 2*(cma.muEff-2+1/cma.muEff)/((n+2)*(n+2)+cma.muEff))
    cma.ds = 1 + 2*math.Max(0, math.Sqrt((cma.muEff-1)/(n+1))-1) + cma.cs
    // E[chi] is taken from https://en.wikipedia.org/wiki/CMA-ES (there
    // listed as E[||N(0,1)||]).
    cma.eChi = math.Sqrt(n) * (1 - 1.0/(4*n) + 1/(21*n*n))

    // Allocate memory for function data.
    cma.xs = mat.NewDense(cma.pop, dim, nil)
    cma.fs = resize(cma.fs, cma.pop)

    // Allocate and initialize adaptive parameters.
    cma.invSigma = 1 / cma.InitStepSize
    if cma.InitStepSize == 0 {
        cma.invSigma = 10.0 / 3
    } else if cma.InitStepSize < 0 {
        panic("cma-es-chol: negative initial step size")
    }
    cma.pc = resize(cma.pc, dim)
    for i := range cma.pc {
        cma.pc[i] = 0
    }
    cma.ps = resize(cma.ps, dim)
    for i := range cma.ps {
        cma.ps[i] = 0
    }
    cma.mean = resize(cma.mean, dim) // mean location initialized at the start of Run

    if cma.InitCholesky != nil {
        if cma.InitCholesky.Symmetric() != dim {
            panic("cma-es-chol: incorrect InitCholesky size")
        }
        cma.chol.Clone(cma.InitCholesky)
    } else {
        // Set the initial Cholesky to I.
        b := mat.NewDiagDense(dim, nil)
        for i := 0; i < dim; i++ {
            b.SetDiag(i, 1)
        }
        var chol mat.Cholesky
        ok := chol.Factorize(b)
        if !ok {
            panic("cma-es-chol: bad cholesky. shouldn't happen")
        }
        cma.chol = chol
    }

    cma.bestX = resize(cma.bestX, dim)
    cma.bestF = math.Inf(1)

    cma.sentIdx = 0
    cma.receivedIdx = 0
    cma.operation = nil
    cma.updateErr = nil
    t := min(tasks, cma.pop)
    return t
}
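// The helper below is an illustrative sketch, not part of the original file.
// It mirrors the weight initialization in Init for a given dimension, showing
// the default population size and the normalized log-linear recombination
// weights that favor the better-ranked samples.
func cmaDefaultWeights(dim int) (pop int, weights []float64) {
    n := float64(dim)
    pop = 4 + int(3*math.Log(n)) // default population size (implicit floor)
    mu := pop / 2                // number of elite samples used in the update
    weights = make([]float64, mu)
    for i := range weights {
        weights[i] = math.Log(float64(mu)+0.5) - math.Log(float64(i)+1)
    }
    floats.Scale(1/floats.Sum(weights), weights) // normalize to sum to 1
    return pop, weights
}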
func (cma *CmaEsChol) sendInitTasks(tasks []Task) {
    for i, task := range tasks {
        cma.sendTask(i, task)
    }
    cma.sentIdx = len(tasks)
}

// sendTask generates a sample and sends the task. It does not update the cma index.
func (cma *CmaEsChol) sendTask(idx int, task Task) {
    task.ID = idx
    task.Op = FuncEvaluation
    distmv.NormalRand(cma.xs.RawRowView(idx), cma.mean, &cma.chol, cma.Src)
    copy(task.X, cma.xs.RawRowView(idx))
    cma.operation <- task
}

// bestIdx returns the best index in the functions. Returns -1 if all values
// are NaN.
func (cma *CmaEsChol) bestIdx() int {
    best := -1
    bestVal := math.Inf(1)
    for i, v := range cma.fs {
        if math.IsNaN(v) {
            continue
        }
        // Use <= rather than < so that an index is still chosen when every
        // value evaluates to +Inf.
        if v <= bestVal {
            best = i
            bestVal = v
        }
    }
    return best
}

// findBestAndUpdateTask finds the best task in the current list, updates the
// new best overall, and then stores the best location into task.
func (cma *CmaEsChol) findBestAndUpdateTask(task Task) Task {
    // Find and update the best location.
    // Don't use the floats package helpers because there may be NaN values.
    best := cma.bestIdx()
    bestF := math.NaN()
    bestX := cma.xs.RawRowView(0)
    if best != -1 {
        bestF = cma.fs[best]
        bestX = cma.xs.RawRowView(best)
    }
    if cma.ForgetBest {
        task.F = bestF
        copy(task.X, bestX)
    } else {
        if bestF < cma.bestF {
            cma.bestF = bestF
            copy(cma.bestX, bestX)
        }
        task.F = cma.bestF
        copy(task.X, cma.bestX)
    }
    return task
}
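// sampleNormalChol is an illustrative sketch, not part of the original file.
// It spells out the draw that distmv.NormalRand performs in sendTask above:
// with covariance Sigma = Uᵀ U from the Cholesky decomposition, a sample is
// x = mean + Uᵀ z, where z has independent standard normal entries.
func sampleNormalChol(mean []float64, chol *mat.Cholesky, rnd *rand.Rand) []float64 {
    dim := len(mean)
    z := mat.NewVecDense(dim, nil)
    for i := 0; i < dim; i++ {
        z.SetVec(i, rnd.NormFloat64())
    }
    var ux mat.VecDense
    ux.MulVec(chol.RawU().T(), z) // Uᵀ z
    x := make([]float64, dim)
    for i := range x {
        x[i] = mean[i] + ux.AtVec(i)
    }
    return x
}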
func (cma *CmaEsChol) Run(operations chan<- Task, results <-chan Task, tasks []Task) {
    copy(cma.mean, tasks[0].X)
    cma.operation = operations
    // Send the initial tasks. We know there are at most as many tasks as elements
    // of the population.
    cma.sendInitTasks(tasks)

Loop:
    for {
        result := <-results
        switch result.Op {
        default:
            panic("unknown operation")
        case PostIteration:
            break Loop
        case MajorIteration:
            // The last thing we did was update all of the tasks and send the
            // major iteration. Now we can send a group of tasks again.
            cma.sendInitTasks(tasks)
        case FuncEvaluation:
            cma.receivedIdx++
            cma.fs[result.ID] = result.F
            switch {
            case cma.sentIdx < cma.pop:
                // There are still tasks to evaluate. Send the next.
                cma.sendTask(cma.sentIdx, result)
                cma.sentIdx++
            case cma.receivedIdx < cma.pop:
                // All the tasks have been sent, but not all of them have been received.
                // Need to wait until all are back.
                continue Loop
            default:
                // All of the evaluations have been received.
                if cma.receivedIdx != cma.pop {
                    panic("bad logic")
                }
                cma.receivedIdx = 0
                cma.sentIdx = 0

                task := cma.findBestAndUpdateTask(result)
                // Update the parameters and send a MajorIteration or a convergence.
                err := cma.update()
                // Kill the existing data.
                for i := range cma.fs {
                    cma.fs[i] = math.NaN()
                    cma.xs.Set(i, 0, math.NaN())
                }
                switch {
                case err != nil:
                    cma.updateErr = err
                    task.Op = MethodDone
                case cma.methodConverged() != NotTerminated:
                    task.Op = MethodDone
                default:
                    task.Op = MajorIteration
                    task.ID = -1
                }
                operations <- task
            }
        }
    }

    // We have been told to stop. Clean up by draining the remaining results,
    // then check whether any outstanding evaluation is a new best.
    for task := range results {
        switch task.Op {
        case MajorIteration:
        case FuncEvaluation:
            cma.fs[task.ID] = task.F
        default:
            panic("unknown operation")
        }
    }
    // Send the new best value if the evaluation is better than any we've
    // found so far. Keep this separate from findBestAndUpdateTask so that
    // we only send an iteration if we find a better location.
    if !cma.ForgetBest {
        best := cma.bestIdx()
        if best != -1 && cma.fs[best] < cma.bestF {
            task := tasks[0]
            task.F = cma.fs[best]
            copy(task.X, cma.xs.RawRowView(best))
            task.Op = MajorIteration
            task.ID = -1
            operations <- task
        }
    }
    close(operations)
}
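// cmaSigmaStep is an illustrative sketch, not part of the original file. It
// isolates the step-size adaptation applied at the end of update below:
// sigma_{t+1} = sigma_t * exp(cs/ds * (||p_sigma|| / E[chi] - 1)). Since the
// method stores 1/sigma, the stored value is divided by the exponential.
func cmaSigmaStep(invSigma, cs, ds, eChi float64, ps []float64) float64 {
    normPs := floats.Norm(ps, 2)
    return invSigma / math.Exp(cs/ds*(normPs/eChi-1))
}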
// update computes the new parameters (mean, cholesky, etc.). Does not update
// any of the synchronization parameters (taskIdx).
func (cma *CmaEsChol) update() error {
    // Sort the function values to find the elite samples.
    ftmp := make([]float64, cma.pop)
    copy(ftmp, cma.fs)
    indexes := make([]int, cma.pop)
    for i := range indexes {
        indexes[i] = i
    }
    sort.Sort(bestSorter{F: ftmp, Idx: indexes})

    meanOld := make([]float64, len(cma.mean))
    copy(meanOld, cma.mean)

    // m_{t+1} = \sum_{i=1}^mu w_i x_i
    for i := range cma.mean {
        cma.mean[i] = 0
    }
    for i, w := range cma.weights {
        idx := indexes[i] // index of the i-th best sample.
        floats.AddScaled(cma.mean, w, cma.xs.RawRowView(idx))
    }
    meanDiff := make([]float64, len(cma.mean))
    floats.SubTo(meanDiff, cma.mean, meanOld)

    // p_{c,t+1} = (1-c_c) p_{c,t} + \sqrt(c_c*(2-c_c)*mueff) (m_{t+1}-m_t)/sigma_t
    floats.Scale(1-cma.cc, cma.pc)
    scaleC := math.Sqrt(cma.cc*(2-cma.cc)*cma.muEff) * cma.invSigma
    floats.AddScaled(cma.pc, scaleC, meanDiff)

    // p_{sigma,t+1} = (1-c_sigma) p_{sigma,t} + \sqrt(c_s*(2-c_s)*mueff) A_t^-1 (m_{t+1}-m_t)/sigma_t
    floats.Scale(1-cma.cs, cma.ps)
    // First compute A_t^-1 (m_{t+1}-m_t), then add the scaled vector.
    tmp := make([]float64, cma.dim)
    tmpVec := mat.NewVecDense(cma.dim, tmp)
    diffVec := mat.NewVecDense(cma.dim, meanDiff)
    err := tmpVec.SolveVec(cma.chol.RawU().T(), diffVec)
    if err != nil {
        return err
    }
    scaleS := math.Sqrt(cma.cs*(2-cma.cs)*cma.muEff) * cma.invSigma
    floats.AddScaled(cma.ps, scaleS, tmp)

    // Compute the update to A.
    scaleChol := 1 - cma.c1 - cma.cmu
    if scaleChol == 0 {
        scaleChol = math.SmallestNonzeroFloat64 // enough to kill the old data, but still non-zero.
    }
    cma.chol.Scale(scaleChol, &cma.chol)
    cma.chol.SymRankOne(&cma.chol, cma.c1, mat.NewVecDense(cma.dim, cma.pc))
    for i, w := range cma.weights {
        idx := indexes[i]
        floats.SubTo(tmp, cma.xs.RawRowView(idx), meanOld)
        cma.chol.SymRankOne(&cma.chol, cma.cmu*w*cma.invSigma, tmpVec)
    }

    // sigma_{t+1} = sigma_t * exp(c_sigma/d_sigma * (norm(p_{sigma,t+1})/E[chi] - 1))
    normPs := floats.Norm(cma.ps, 2)
    cma.invSigma /= math.Exp(cma.cs / cma.ds * (normPs/cma.eChi - 1))
    return nil
}

type bestSorter struct {
    F   []float64
    Idx []int
}

func (b bestSorter) Len() int {
    return len(b.F)
}
func (b bestSorter) Less(i, j int) bool {
    return b.F[i] < b.F[j]
}
func (b bestSorter) Swap(i, j int) {
    b.F[i], b.F[j] = b.F[j], b.F[i]
    b.Idx[i], b.Idx[j] = b.Idx[j], b.Idx[i]
}
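// The function below is an illustrative sketch, not part of the original
// file. It shows how update uses bestSorter: sorting a copy of the function
// values carries the original row indexes along, so indexes[0] names the row
// of the best (lowest-valued) sample.
func exampleBestSorter() []int {
    fs := []float64{3.2, 0.5, 1.7}
    indexes := []int{0, 1, 2}
    sort.Sort(bestSorter{F: fs, Idx: indexes})
    // fs is now [0.5, 1.7, 3.2]; indexes is now [1, 2, 0].
    return indexes
}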