github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/optimize/cg.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"

	"github.com/jingcheng-WU/gonum/floats"
)

const (
	iterationRestartFactor = 6
	angleRestartThreshold  = -0.9
)

var (
	_ Method          = (*CG)(nil)
	_ localMethod     = (*CG)(nil)
	_ NextDirectioner = (*CG)(nil)
)

// CGVariant calculates the scaling parameter, β, used for updating the
// conjugate direction in the nonlinear conjugate gradient (CG) method.
type CGVariant interface {
	// Init is called at the first iteration and provides a way to initialize
	// any internal state.
	Init(loc *Location)
	// Beta returns the value of the scaling parameter that is computed
	// according to the particular variant of the CG method.
	Beta(grad, gradPrev, dirPrev []float64) float64
}

var (
	_ CGVariant = (*FletcherReeves)(nil)
	_ CGVariant = (*PolakRibierePolyak)(nil)
	_ CGVariant = (*HestenesStiefel)(nil)
	_ CGVariant = (*DaiYuan)(nil)
	_ CGVariant = (*HagerZhang)(nil)
)

// CG implements the nonlinear conjugate gradient method for solving nonlinear
// unconstrained optimization problems. It is a line search method that
// generates the search directions d_k according to the formula
//  d_{k+1} = -∇f_{k+1} + β_k*d_k,   d_0 = -∇f_0.
// Variants of the conjugate gradient method differ in the choice of the
// parameter β_k. The conjugate gradient method usually requires fewer function
// evaluations than the gradient descent method and no matrix storage, but
// L-BFGS is usually more efficient.
//
// CG implements a restart strategy that takes the steepest descent direction
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
//
//  - A certain number of iterations has elapsed without a restart. This number
//    can be controlled via IterationRestartFactor; if it is 0, a reasonable
//    default based on the problem dimension is used.
//  - The angle between the gradients at two consecutive iterations ∇f_k and
//    ∇f_{k+1} is too large.
//  - The direction d_{k+1} is not a descent direction.
//  - β_k returned from CGVariant.Beta is equal to zero.
//
// The line search for CG must yield step sizes that satisfy the strong Wolfe
// conditions at every iteration; otherwise the generated search direction
// might fail to be a descent direction. The line search should be more
// stringent than for Newton-like methods, which can be achieved by setting the
// gradient constant in the strong Wolfe conditions to a small value.
//
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
// references therein.
type CG struct {
	// Linesearcher must satisfy the strong Wolfe conditions at every iteration.
	// If Linesearcher == nil, an appropriate default is chosen.
	Linesearcher Linesearcher
	// Variant implements the particular CG formula for computing β_k.
	// If Variant is nil, an appropriate default is chosen.
	Variant CGVariant
	// InitialStep estimates the initial line search step size, because the CG
	// method does not generate well-scaled search directions.
	// If InitialStep is nil, an appropriate default is chosen.
	InitialStep StepSizer

	// IterationRestartFactor determines the frequency of restarts based on the
	// problem dimension. The negative gradient direction is taken whenever
	// ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed
	// without a restart. For medium and large-scale problems
	// IterationRestartFactor should be set to 1; for low-dimensional problems
	// a larger value should be chosen. Note that if the ceil function returns
	// 1, CG will be identical to gradient descent.
	// If IterationRestartFactor is 0, it will be set to 6.
	// CG will panic if IterationRestartFactor is negative.
	IterationRestartFactor float64
	// AngleRestartThreshold sets the threshold angle for restart. The method
	// is restarted if the cosine of the angle between two consecutive
	// gradients is smaller than or equal to AngleRestartThreshold, that is, if
	//  ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
	// A value of AngleRestartThreshold closer to -1 (successive gradients in
	// exactly opposite directions) will tend to reduce the number of restarts.
	// If AngleRestartThreshold is 0, it will be set to -0.9.
	// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
	AngleRestartThreshold float64
	// GradStopThreshold sets the threshold for stopping if the gradient norm
	// gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and
	// if it is NaN the setting is not used.
	GradStopThreshold float64

	ls *LinesearchMethod

	status Status
	err    error

	restartAfter    int
	iterFromRestart int

	dirPrev      []float64
	gradPrev     []float64
	gradPrevNorm float64
}
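
// The following function is an illustrative sketch added for this write-up,
// not part of the upstream file. It shows how the restart-related knobs
// documented above might be set for a low-dimensional problem. The particular
// values are assumptions chosen for illustration, not recommendations from
// the Gonum authors, although they stay within the documented valid ranges.
func exampleCGConfiguration() *CG {
	return &CG{
		// A stringent line search, matching the strong Wolfe condition
		// advice in the CG doc comment (and the package default).
		Linesearcher: &MoreThuente{CurvatureFactor: 0.1},
		// Restart (take steepest descent) after ceil(6*dim) iterations
		// without a restart; 6 is the documented default.
		IterationRestartFactor: 6,
		// Restart when successive gradients point in nearly opposite
		// directions, i.e. when their cosine is <= -0.95. Must lie in [-1, 0].
		AngleRestartThreshold: -0.95,
	}
}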

func (cg *CG) Status() (Status, error) {
	return cg.status, cg.err
}

func (*CG) Uses(has Available) (uses Available, err error) {
	return has.gradient()
}

func (cg *CG) Init(dim, tasks int) int {
	cg.status = NotTerminated
	cg.err = nil
	return 1
}

func (cg *CG) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
	cg.status, cg.err = localOptimizer{}.run(cg, cg.GradStopThreshold, operation, result, tasks)
	close(operation)
}

func (cg *CG) initLocal(loc *Location) (Operation, error) {
	if cg.IterationRestartFactor < 0 {
		panic("cg: IterationRestartFactor is negative")
	}
	if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
		panic("cg: AngleRestartThreshold not in [-1, 0]")
	}

	if cg.Linesearcher == nil {
		cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
	}
	if cg.Variant == nil {
		cg.Variant = &HestenesStiefel{}
	}
	if cg.InitialStep == nil {
		cg.InitialStep = &FirstOrderStepSize{}
	}

	if cg.IterationRestartFactor == 0 {
		cg.IterationRestartFactor = iterationRestartFactor
	}
	if cg.AngleRestartThreshold == 0 {
		cg.AngleRestartThreshold = angleRestartThreshold
	}

	if cg.ls == nil {
		cg.ls = &LinesearchMethod{}
	}
	cg.ls.Linesearcher = cg.Linesearcher
	cg.ls.NextDirectioner = cg

	return cg.ls.Init(loc)
}

func (cg *CG) iterateLocal(loc *Location) (Operation, error) {
	return cg.ls.Iterate(loc)
}

func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := len(loc.X)

	cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
	cg.iterFromRestart = 0

	// The initial direction is always the negative gradient.
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.dirPrev = resize(cg.dirPrev, dim)
	copy(cg.dirPrev, dir)
	cg.gradPrev = resize(cg.gradPrev, dim)
	copy(cg.gradPrev, loc.Gradient)
	cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)

	cg.Variant.Init(loc)
	return cg.InitialStep.Init(loc, dir)
}

func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.iterFromRestart++
	var restart bool
	if cg.iterFromRestart == cg.restartAfter {
		// Restart because too many iterations have been taken without a restart.
		restart = true
	}

	gDot := floats.Dot(loc.Gradient, cg.gradPrev)
	gNorm := floats.Norm(loc.Gradient, 2)
	if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
		// Restart because the angle between the last two gradients is too large.
		restart = true
	}

	// Compute the scaling factor β_k even when restarting, because cg.Variant
	// may be keeping an inner state that needs to be updated at every iteration.
	beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
	if beta == 0 {
		// β_k == 0 means that the steepest descent direction will be taken, so
		// indicate that the method is in fact being restarted.
		restart = true
	}
	if !restart {
		// The method is not being restarted, so update the descent direction.
		floats.AddScaled(dir, beta, cg.dirPrev)
		if floats.Dot(loc.Gradient, dir) >= 0 {
			// Restart because the new direction is not a descent direction.
			restart = true
			copy(dir, loc.Gradient)
			floats.Scale(-1, dir)
		}
	}

	// Get the initial line search step size from the StepSizer even if the
	// method was restarted, because StepSizers need to see every iteration.
	stepSize = cg.InitialStep.StepSize(loc, dir)
	if restart {
		// The method was restarted and since the steepest descent direction is
		// not related to the previous direction, discard the estimated step
		// size from cg.InitialStep and use a step size of 1 instead.
		stepSize = 1
		// Reset the counter of iterations taken since the last restart.
		cg.iterFromRestart = 0
	}

	copy(cg.gradPrev, loc.Gradient)
	copy(cg.dirPrev, dir)
	cg.gradPrevNorm = gNorm
	return stepSize
}

func (*CG) needs() struct {
	Gradient bool
	Hessian  bool
} {
	return struct {
		Gradient bool
		Hessian  bool
	}{true, false}
}
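
// The following function is an illustrative sketch added for this write-up,
// not part of the upstream file. Assuming the usual gonum/optimize entry
// points (Problem and Minimize), it shows how CG is typically driven; the
// Rosenbrock objective below is a stock test function, not a fixture of this
// package. A zero-valued CG picks the documented defaults: a MoreThuente line
// search, the Hestenes-Stiefel β variant and FirstOrderStepSize.
func exampleCGMinimize() ([]float64, error) {
	problem := Problem{
		Func: func(x []float64) float64 {
			a, b := 1-x[0], x[1]-x[0]*x[0]
			return a*a + 100*b*b
		},
		Grad: func(grad, x []float64) {
			grad[0] = -2*(1-x[0]) - 400*x[0]*(x[1]-x[0]*x[0])
			grad[1] = 200 * (x[1] - x[0]*x[0])
		},
	}
	result, err := Minimize(problem, []float64{-1.2, 1}, nil, &CG{})
	if err != nil {
		return nil, err
	}
	return result.X, nil // converges towards the minimizer (1, 1)
}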

// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
// computes the scaling parameter β_k according to the formula
//  β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
type FletcherReeves struct {
	prevNorm float64
}

func (fr *FletcherReeves) Init(loc *Location) {
	fr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
	fr.prevNorm = norm
	return beta
}

// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
// method that computes the scaling parameter β_k according to the formula
//  β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
// where y_k = ∇f_{k+1} - ∇f_k.
type PolakRibierePolyak struct {
	prevNorm float64
}

func (pr *PolakRibierePolyak) Init(loc *Location) {
	pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	dot := floats.Dot(grad, gradPrev)
	beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
	pr.prevNorm = norm
	return math.Max(0, beta)
}

// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
// that computes the scaling parameter β_k according to the formula
//  β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
// where y_k = ∇f_{k+1} - ∇f_k.
type HestenesStiefel struct {
	y []float64
}

func (hs *HestenesStiefel) Init(loc *Location) {
	hs.y = resize(hs.y, len(loc.Gradient))
}

func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hs.y, grad, gradPrev)
	beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
	return math.Max(0, beta)
}

// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
// scaling parameter β_k according to the formula
//  β_k = |∇f_{k+1}|^2 / d_k·y_k,
// where y_k = ∇f_{k+1} - ∇f_k.
type DaiYuan struct {
	y []float64
}

func (dy *DaiYuan) Init(loc *Location) {
	dy.y = resize(dy.y, len(loc.Gradient))
}

func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(dy.y, grad, gradPrev)
	norm := floats.Norm(grad, 2)
	return norm * norm / floats.Dot(dirPrev, dy.y)
}
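
// The following function is an illustrative sketch added for this write-up,
// not part of the upstream file. It checks the Fletcher-Reeves formula above
// by hand on a toy pair of gradients: with ∇f_k = (3, 4) and ∇f_{k+1} = (1, 2),
//  β = |∇f_{k+1}|^2 / |∇f_k|^2 = 5/25 = 0.2.
func exampleFletcherReevesBeta() float64 {
	fr := &FletcherReeves{}
	// Init records |∇f_k| = 5 from the starting location.
	fr.Init(&Location{Gradient: []float64{3, 4}})
	// Beta ignores its gradPrev and dirPrev arguments for this variant.
	return fr.Beta([]float64{1, 2}, nil, nil) // 0.2
}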

// HagerZhang implements the Hager-Zhang variant of the CG method that
// computes the scaling parameter β_k according to the formula
//  β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
// where y_k = ∇f_{k+1} - ∇f_k.
type HagerZhang struct {
	y []float64
}

func (hz *HagerZhang) Init(loc *Location) {
	hz.y = resize(hz.y, len(loc.Gradient))
}

func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hz.y, grad, gradPrev)
	dirDotY := floats.Dot(dirPrev, hz.y)
	gDotY := floats.Dot(grad, hz.y)
	gDotDir := floats.Dot(grad, dirPrev)
	yNorm := floats.Norm(hz.y, 2)
	return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
}
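
// The following function is an illustrative sketch added for this write-up,
// not part of the upstream file. It evaluates the Hager-Zhang formula above
// on toy inputs: with ∇f_k = (3, 4), ∇f_{k+1} = (1, 2) and d_k = (-3, -4),
// we get y_k = (-2, -2), d_k·y_k = 14, ∇f_{k+1}·y_k = -6, ∇f_{k+1}·d_k = -11
// and |y_k|^2 = 8, so
//  β = (-6 - 2*(-11)*8/14) / 14 = 23/49 ≈ 0.469.
func exampleHagerZhangBeta() float64 {
	hz := &HagerZhang{}
	// Init only sizes the internal y_k buffer from the gradient length.
	hz.Init(&Location{Gradient: []float64{3, 4}})
	return hz.Beta([]float64{1, 2}, []float64{3, 4}, []float64{-3, -4}) // 23/49
}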