gonum.org/v1/gonum@v0.15.1-0.20240517103525-f853624cb1bb/optimize/cg.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"

	"gonum.org/v1/gonum/floats"
)

const (
	iterationRestartFactor = 6
	angleRestartThreshold  = -0.9
)

var (
	_ Method          = (*CG)(nil)
	_ localMethod     = (*CG)(nil)
	_ NextDirectioner = (*CG)(nil)
)

// CGVariant calculates the scaling parameter, β, used for updating the
// conjugate direction in the nonlinear conjugate gradient (CG) method.
type CGVariant interface {
	// Init is called at the first iteration and provides a way to initialize
	// any internal state.
	Init(loc *Location)
	// Beta returns the value of the scaling parameter that is computed
	// according to the particular variant of the CG method.
	Beta(grad, gradPrev, dirPrev []float64) float64
}

var (
	_ CGVariant = (*FletcherReeves)(nil)
	_ CGVariant = (*PolakRibierePolyak)(nil)
	_ CGVariant = (*HestenesStiefel)(nil)
	_ CGVariant = (*DaiYuan)(nil)
	_ CGVariant = (*HagerZhang)(nil)
)
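
// unclampedPolakRibiere is a hedged illustrative sketch, not part of the
// original file: a user-defined CGVariant implementing the classical,
// unclamped Polak-Ribière formula
//
//	β_k = ∇f_{k+1}·y_k / |∇f_k|^2, y_k = ∇f_{k+1} - ∇f_k,
//
// which, unlike the PolakRibierePolyak variant below, keeps negative β_k
// values and relies on CG's descent-direction check to trigger restarts.
type unclampedPolakRibiere struct {
	prevNorm float64
}

func (pr *unclampedPolakRibiere) Init(loc *Location) {
	pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *unclampedPolakRibiere) Beta(grad, gradPrev, _ []float64) float64 {
	// ∇f_{k+1}·y_k expands to |∇f_{k+1}|^2 - ∇f_{k+1}·∇f_k.
	norm := floats.Norm(grad, 2)
	beta := (norm*norm - floats.Dot(grad, gradPrev)) / (pr.prevNorm * pr.prevNorm)
	pr.prevNorm = norm
	return beta
}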

// CG implements the nonlinear conjugate gradient method for solving nonlinear
// unconstrained optimization problems. It is a line search method that
// generates the search directions d_k according to the formula
//
//	d_{k+1} = -∇f_{k+1} + β_k*d_k, d_0 = -∇f_0.
//
// Variants of the conjugate gradient method differ in the choice of the
// parameter β_k. The conjugate gradient method usually requires fewer function
// evaluations than the gradient descent method and no matrix storage, but
// L-BFGS is usually more efficient.
//
// CG implements a restart strategy that takes the steepest descent direction
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
//
//   - A certain number of iterations has elapsed without a restart. This number
//     is controllable via IterationRestartFactor and if equal to 0, it is set to
//     a reasonable default based on the problem dimension.
//   - The angle between the gradients at two consecutive iterations ∇f_k and
//     ∇f_{k+1} is too large.
//   - The direction d_{k+1} is not a descent direction.
//   - β_k returned from CGVariant.Beta is equal to zero.
//
// The line search for CG must yield step sizes that satisfy the strong Wolfe
// conditions at every iteration, otherwise the generated search direction
// might fail to be a descent direction. The line search should be more
// stringent than for Newton-like methods, which can be achieved by setting
// the gradient constant in the strong Wolfe conditions to a small value.
//
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
// references therein.
type CG struct {
	// Linesearcher must satisfy the strong Wolfe conditions at every iteration.
	// If Linesearcher == nil, an appropriate default is chosen.
	Linesearcher Linesearcher
	// Variant implements the particular CG formula for computing β_k.
	// If Variant is nil, an appropriate default is chosen.
	Variant CGVariant
	// InitialStep estimates the initial line search step size, because the CG
	// method does not generate well-scaled search directions.
	// If InitialStep is nil, an appropriate default is chosen.
	InitialStep StepSizer

	// IterationRestartFactor determines the frequency of restarts based on the
	// problem dimension. The negative gradient direction is taken whenever
	// ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed
	// without a restart. For medium- and large-scale problems
	// IterationRestartFactor should be set to 1; for low-dimensional problems
	// a larger value should be chosen. Note that if the ceil function returns 1,
	// CG will be identical to gradient descent.
	// If IterationRestartFactor is 0, it will be set to 6.
	// CG will panic if IterationRestartFactor is negative.
	IterationRestartFactor float64
	// AngleRestartThreshold sets the threshold angle for restart. The method
	// is restarted if the cosine of the angle between two consecutive
	// gradients is smaller than or equal to AngleRestartThreshold, that is, if
	// ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
	// A value of AngleRestartThreshold closer to -1 (successive gradients in
	// exact opposite directions) will tend to reduce the number of restarts.
	// If AngleRestartThreshold is 0, it will be set to -0.9.
	// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
	AngleRestartThreshold float64
	// GradStopThreshold sets the threshold for stopping if the gradient norm
	// gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and
	// if it is NaN the setting is not used.
	GradStopThreshold float64

	ls *LinesearchMethod

	status Status
	err    error

	restartAfter    int
	iterFromRestart int

	dirPrev      []float64
	gradPrev     []float64
	gradPrevNorm float64
}

func (cg *CG) Status() (Status, error) {
	return cg.status, cg.err
}

func (*CG) Uses(has Available) (uses Available, err error) {
	return has.gradient()
}

func (cg *CG) Init(dim, tasks int) int {
	cg.status = NotTerminated
	cg.err = nil
	return 1
}

func (cg *CG) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
	cg.status, cg.err = localOptimizer{}.run(cg, cg.GradStopThreshold, operation, result, tasks)
	close(operation)
}

func (cg *CG) initLocal(loc *Location) (Operation, error) {
	if cg.IterationRestartFactor < 0 {
		panic("cg: IterationRestartFactor is negative")
	}
	if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
		panic("cg: AngleRestartThreshold not in [-1, 0]")
	}

	if cg.Linesearcher == nil {
		cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
	}
	if cg.Variant == nil {
		cg.Variant = &HestenesStiefel{}
	}
	if cg.InitialStep == nil {
		cg.InitialStep = &FirstOrderStepSize{}
	}

	if cg.IterationRestartFactor == 0 {
		cg.IterationRestartFactor = iterationRestartFactor
	}
	if cg.AngleRestartThreshold == 0 {
		cg.AngleRestartThreshold = angleRestartThreshold
	}

	if cg.ls == nil {
		cg.ls = &LinesearchMethod{}
	}
	cg.ls.Linesearcher = cg.Linesearcher
	cg.ls.NextDirectioner = cg

	return cg.ls.Init(loc)
}

func (cg *CG) iterateLocal(loc *Location) (Operation, error) {
	return cg.ls.Iterate(loc)
}
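
// exampleCGMinimize is a hedged usage sketch, not part of the original file:
// user code (outside this package, with the names qualified by the optimize
// package) would minimize the two-dimensional Rosenbrock function with CG
// like this. A zero-valued CG selects the defaults chosen in initLocal: a
// MoreThuente line search, the HestenesStiefel variant and a
// FirstOrderStepSize step sizer.
func exampleCGMinimize() ([]float64, error) {
	p := Problem{
		Func: func(x []float64) float64 {
			a, b := 1-x[0], x[1]-x[0]*x[0]
			return a*a + 100*b*b
		},
		Grad: func(grad, x []float64) {
			grad[0] = -2*(1-x[0]) - 400*x[0]*(x[1]-x[0]*x[0])
			grad[1] = 200 * (x[1] - x[0]*x[0])
		},
	}
	// nil settings use the package defaults; the method is passed explicitly.
	result, err := Minimize(p, []float64{-1.2, 1}, nil, &CG{})
	if err != nil {
		return nil, err
	}
	return result.X, nil
}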

func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := len(loc.X)

	cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
	cg.iterFromRestart = 0

	// The initial direction is always the negative gradient.
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.dirPrev = resize(cg.dirPrev, dim)
	copy(cg.dirPrev, dir)
	cg.gradPrev = resize(cg.gradPrev, dim)
	copy(cg.gradPrev, loc.Gradient)
	cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)

	cg.Variant.Init(loc)
	return cg.InitialStep.Init(loc, dir)
}

func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.iterFromRestart++
	var restart bool
	if cg.iterFromRestart == cg.restartAfter {
		// Restart because too many iterations have been taken without a restart.
		restart = true
	}

	gDot := floats.Dot(loc.Gradient, cg.gradPrev)
	gNorm := floats.Norm(loc.Gradient, 2)
	if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
		// Restart because the angle between the last two gradients is too large.
		restart = true
	}

	// Compute the scaling factor β_k even when restarting, because cg.Variant
	// may be keeping an inner state that needs to be updated at every iteration.
	beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
	if beta == 0 {
		// β_k == 0 means that the steepest descent direction will be taken, so
		// indicate that the method is in fact being restarted.
		restart = true
	}
	if !restart {
		// The method is not being restarted, so update the descent direction.
		floats.AddScaled(dir, beta, cg.dirPrev)
		if floats.Dot(loc.Gradient, dir) >= 0 {
			// Restart because the new direction is not a descent direction.
			restart = true
			copy(dir, loc.Gradient)
			floats.Scale(-1, dir)
		}
	}

	// Get the initial line search step size from the StepSizer even if the
	// method was restarted, because StepSizers need to see every iteration.
	stepSize = cg.InitialStep.StepSize(loc, dir)
	if restart {
		// The method was restarted and since the steepest descent direction is
		// not related to the previous direction, discard the estimated step
		// size from cg.InitialStep and use a step size of 1 instead.
		stepSize = 1
		// Reset the counter of iterations taken since the last restart.
		cg.iterFromRestart = 0
	}

	copy(cg.gradPrev, loc.Gradient)
	copy(cg.dirPrev, dir)
	cg.gradPrevNorm = gNorm
	return stepSize
}

func (*CG) needs() struct {
	Gradient bool
	Hessian  bool
} {
	return struct {
		Gradient bool
		Hessian  bool
	}{true, false}
}

// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
// computes the scaling parameter β_k according to the formula
//
//	β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
type FletcherReeves struct {
	prevNorm float64
}

func (fr *FletcherReeves) Init(loc *Location) {
	fr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
	fr.prevNorm = norm
	return beta
}
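
// exampleFletcherReevesBeta is a hedged numeric sketch, not part of the
// original file: with |∇f_k| = 5 (from the gradient (3, 4)) and
// |∇f_{k+1}| = 10 (from (0, 10)), the Fletcher-Reeves formula gives
// β_k = 10^2/5^2 = 4.
func exampleFletcherReevesBeta() float64 {
	fr := &FletcherReeves{}
	fr.Init(&Location{Gradient: []float64{3, 4}}) // |∇f_k| = 5
	return fr.Beta([]float64{0, 10}, nil, nil)    // (10/5)^2 = 4
}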

// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
// method that computes the scaling parameter β_k according to the formula
//
//	β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type PolakRibierePolyak struct {
	prevNorm float64
}

func (pr *PolakRibierePolyak) Init(loc *Location) {
	pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	dot := floats.Dot(grad, gradPrev)
	beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
	pr.prevNorm = norm
	return math.Max(0, beta)
}

// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
// that computes the scaling parameter β_k according to the formula
//
//	β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type HestenesStiefel struct {
	y []float64
}

func (hs *HestenesStiefel) Init(loc *Location) {
	hs.y = resize(hs.y, len(loc.Gradient))
}

func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hs.y, grad, gradPrev)
	beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
	return math.Max(0, beta)
}

// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
// scaling parameter β_k according to the formula
//
//	β_k = |∇f_{k+1}|^2 / d_k·y_k,
//
// where y_k = ∇f_{k+1} - ∇f_k.
type DaiYuan struct {
	y []float64
}

func (dy *DaiYuan) Init(loc *Location) {
	dy.y = resize(dy.y, len(loc.Gradient))
}

func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(dy.y, grad, gradPrev)
	norm := floats.Norm(grad, 2)
	return norm * norm / floats.Dot(dirPrev, dy.y)
}

// HagerZhang implements the Hager-Zhang variant of the CG method that computes
// the scaling parameter β_k according to the formula
//
//	β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type HagerZhang struct {
	y []float64
}

func (hz *HagerZhang) Init(loc *Location) {
	hz.y = resize(hz.y, len(loc.Gradient))
}

func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hz.y, grad, gradPrev)
	dirDotY := floats.Dot(dirPrev, hz.y)
	gDotY := floats.Dot(grad, hz.y)
	gDotDir := floats.Dot(grad, dirPrev)
	yNorm := floats.Norm(hz.y, 2)
	return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
}
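
// exampleCGConfig is a hedged configuration sketch, not part of the original
// file: a CG tuned for a low-dimensional problem, selecting the
// PolakRibierePolyak variant, allowing ceil(10*dim) iterations between forced
// restarts, and restarting as soon as the cosine of the angle between
// consecutive gradients drops to -0.5 or below.
func exampleCGConfig() *CG {
	return &CG{
		Variant:                &PolakRibierePolyak{},
		IterationRestartFactor: 10,   // larger factor suits low dimensions
		AngleRestartThreshold:  -0.5, // restart earlier than the default -0.9
	}
}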