gonum.org/v1/gonum@v0.15.1-0.20240517103525-f853624cb1bb/optimize/cg.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"

	"gonum.org/v1/gonum/floats"
)

const (
	iterationRestartFactor = 6
	angleRestartThreshold  = -0.9
)

var (
	_ Method          = (*CG)(nil)
	_ localMethod     = (*CG)(nil)
	_ NextDirectioner = (*CG)(nil)
)

// CGVariant calculates the scaling parameter, β, used for updating the
// conjugate direction in the nonlinear conjugate gradient (CG) method.
type CGVariant interface {
	// Init is called at the first iteration and provides a way to initialize
	// any internal state.
	Init(loc *Location)
	// Beta returns the value of the scaling parameter that is computed
	// according to the particular variant of the CG method.
	Beta(grad, gradPrev, dirPrev []float64) float64
}

var (
	_ CGVariant = (*FletcherReeves)(nil)
	_ CGVariant = (*PolakRibierePolyak)(nil)
	_ CGVariant = (*HestenesStiefel)(nil)
	_ CGVariant = (*DaiYuan)(nil)
	_ CGVariant = (*HagerZhang)(nil)
)

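// conjugateDescent is a minimal sketch, not part of the original file, of a
// user-defined CGVariant. It implements Fletcher's Conjugate Descent formula
//
//	β_k = |∇f_{k+1}|^2 / (-d_k·∇f_k),
//
// purely to illustrate how the CGVariant interface can be satisfied by
// external code. It keeps no internal state, so Init is a no-op.
type conjugateDescent struct{}

func (conjugateDescent) Init(*Location) {}

func (conjugateDescent) Beta(grad, gradPrev, dirPrev []float64) float64 {
	norm := floats.Norm(grad, 2)
	// For a descent direction d_k, -d_k·∇f_k is positive, so β_k >= 0.
	return norm * norm / -floats.Dot(dirPrev, gradPrev)
}

var _ CGVariant = conjugateDescent{}
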
// CG implements the nonlinear conjugate gradient method for solving nonlinear
// unconstrained optimization problems. It is a line search method that
// generates the search directions d_k according to the formula
//
//	d_{k+1} = -∇f_{k+1} + β_k*d_k,   d_0 = -∇f_0.
//
// Variants of the conjugate gradient method differ in the choice of the
// parameter β_k. The conjugate gradient method usually requires fewer function
// evaluations than the gradient descent method and no matrix storage, but
// L-BFGS is usually more efficient.
//
// CG implements a restart strategy that takes the steepest descent direction
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
//
//   - A certain number of iterations has elapsed without a restart. This number
//     is controllable via IterationRestartFactor and if equal to 0, it is set to
//     a reasonable default based on the problem dimension.
//   - The angle between the gradients at two consecutive iterations ∇f_k and
//     ∇f_{k+1} is too large.
//   - The direction d_{k+1} is not a descent direction.
//   - β_k returned from CGVariant.Beta is equal to zero.
//
// The line search for CG must yield step sizes that satisfy the strong Wolfe
// conditions at every iteration; otherwise, the generated search direction
// might fail to be a descent direction. The line search should be more
// stringent than those used for Newton-like methods, which can be achieved
// by setting the gradient constant in the strong Wolfe conditions to a small
// value.
//
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
// references therein.
type CG struct {
	// Linesearcher must satisfy the strong Wolfe conditions at every iteration.
	// If Linesearcher == nil, an appropriate default is chosen.
	Linesearcher Linesearcher
	// Variant implements the particular CG formula for computing β_k.
	// If Variant is nil, an appropriate default is chosen.
	Variant CGVariant
	// InitialStep estimates the initial line search step size, because the CG
	// method does not generate well-scaled search directions.
	// If InitialStep is nil, an appropriate default is chosen.
	InitialStep StepSizer

	// IterationRestartFactor determines the frequency of restarts based on the
	// problem dimension. The negative gradient direction is taken whenever
	// ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed
	// without a restart. For medium- and large-scale problems
	// IterationRestartFactor should be set to 1; for low-dimensional problems
	// a larger value should be chosen. Note that if the ceil function returns
	// 1, CG will be identical to gradient descent.
	// If IterationRestartFactor is 0, it will be set to 6.
	// CG will panic if IterationRestartFactor is negative.
	IterationRestartFactor float64
	// AngleRestartThreshold sets the threshold angle for restart. The method
	// is restarted if the cosine of the angle between two consecutive
	// gradients is smaller than or equal to AngleRestartThreshold, that is, if
	//  ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
	// A value of AngleRestartThreshold closer to -1 (successive gradients in
	// exact opposite directions) will tend to reduce the number of restarts.
	// If AngleRestartThreshold is 0, it will be set to -0.9.
	// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
	AngleRestartThreshold float64
	// GradStopThreshold sets the threshold for stopping if the gradient norm
	// gets too small. If GradStopThreshold is 0, it defaults to 1e-12, and
	// if it is NaN the setting is not used.
	GradStopThreshold float64

	ls *LinesearchMethod

	status Status
	err    error

	restartAfter    int
	iterFromRestart int

	dirPrev      []float64
	gradPrev     []float64
	gradPrevNorm float64
}

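// exampleCGMinimize is a minimal sketch, not part of the original file,
// showing how a configured CG might be passed to Minimize. The quadratic
// objective and the parameter values are assumptions chosen for illustration.
func exampleCGMinimize() []float64 {
	p := Problem{
		Func: func(x []float64) float64 {
			return x[0]*x[0] + 10*x[1]*x[1]
		},
		Grad: func(grad, x []float64) {
			grad[0] = 2 * x[0]
			grad[1] = 20 * x[1]
		},
	}
	method := &CG{
		Variant:                &PolakRibierePolyak{},
		IterationRestartFactor: 1,     // Restart every ceil(dim) iterations.
		AngleRestartThreshold:  -0.95, // Restart slightly less eagerly than the default -0.9.
	}
	result, err := Minimize(p, []float64{1, 1}, nil, method)
	if err != nil {
		panic(err)
	}
	return result.X // Approximately [0, 0] for this convex quadratic.
}
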
func (cg *CG) Status() (Status, error) {
	return cg.status, cg.err
}

func (*CG) Uses(has Available) (uses Available, err error) {
	return has.gradient()
}

func (cg *CG) Init(dim, tasks int) int {
	cg.status = NotTerminated
	cg.err = nil
	return 1
}

func (cg *CG) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
	cg.status, cg.err = localOptimizer{}.run(cg, cg.GradStopThreshold, operation, result, tasks)
	close(operation)
}

func (cg *CG) initLocal(loc *Location) (Operation, error) {
	if cg.IterationRestartFactor < 0 {
		panic("cg: IterationRestartFactor is negative")
	}
	if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
		panic("cg: AngleRestartThreshold not in [-1, 0]")
	}

	if cg.Linesearcher == nil {
		cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
	}
	if cg.Variant == nil {
		cg.Variant = &HestenesStiefel{}
	}
	if cg.InitialStep == nil {
		cg.InitialStep = &FirstOrderStepSize{}
	}

	if cg.IterationRestartFactor == 0 {
		cg.IterationRestartFactor = iterationRestartFactor
	}
	if cg.AngleRestartThreshold == 0 {
		cg.AngleRestartThreshold = angleRestartThreshold
	}

	if cg.ls == nil {
		cg.ls = &LinesearchMethod{}
	}
	cg.ls.Linesearcher = cg.Linesearcher
	cg.ls.NextDirectioner = cg

	return cg.ls.Init(loc)
}

func (cg *CG) iterateLocal(loc *Location) (Operation, error) {
	return cg.ls.Iterate(loc)
}

func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := len(loc.X)

	cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
	cg.iterFromRestart = 0

	// The initial direction is always the negative gradient.
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.dirPrev = resize(cg.dirPrev, dim)
	copy(cg.dirPrev, dir)
	cg.gradPrev = resize(cg.gradPrev, dim)
	copy(cg.gradPrev, loc.Gradient)
	cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)

	cg.Variant.Init(loc)
	return cg.InitialStep.Init(loc, dir)
}

func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.iterFromRestart++
	var restart bool
	if cg.iterFromRestart == cg.restartAfter {
		// Restart because too many iterations have been taken without a restart.
		restart = true
	}

	gDot := floats.Dot(loc.Gradient, cg.gradPrev)
	gNorm := floats.Norm(loc.Gradient, 2)
	if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
		// Restart because the angle between the last two gradients is too large.
		restart = true
	}

	// Compute the scaling factor β_k even when restarting, because cg.Variant
	// may be keeping internal state that needs to be updated at every iteration.
	beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
	if beta == 0 {
		// β_k == 0 means that the steepest descent direction will be taken, so
		// indicate that the method is in fact being restarted.
		restart = true
	}
	if !restart {
		// The method is not being restarted, so update the descent direction.
		floats.AddScaled(dir, beta, cg.dirPrev)
		if floats.Dot(loc.Gradient, dir) >= 0 {
			// Restart because the new direction is not a descent direction.
			restart = true
			copy(dir, loc.Gradient)
			floats.Scale(-1, dir)
		}
	}

	// Get the initial line search step size from the StepSizer even if the
	// method was restarted, because StepSizers need to see every iteration.
	stepSize = cg.InitialStep.StepSize(loc, dir)
	if restart {
		// The method was restarted, and since the steepest descent direction
		// is not related to the previous direction, discard the estimated
		// step size from cg.InitialStep and use a step size of 1 instead.
		stepSize = 1
		// Reset the counter of iterations taken since the last restart.
		cg.iterFromRestart = 0
	}

	copy(cg.gradPrev, loc.Gradient)
	copy(cg.dirPrev, dir)
	cg.gradPrevNorm = gNorm
	return stepSize
}

func (*CG) needs() struct {
	Gradient bool
	Hessian  bool
} {
	return struct {
		Gradient bool
		Hessian  bool
	}{true, false}
}

// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
// computes the scaling parameter β_k according to the formula
//
//	β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
type FletcherReeves struct {
	prevNorm float64
}

func (fr *FletcherReeves) Init(loc *Location) {
	fr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
	fr.prevNorm = norm
	return beta
}

// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
// method that computes the scaling parameter β_k according to the formula
//
//	β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type PolakRibierePolyak struct {
	prevNorm float64
}

func (pr *PolakRibierePolyak) Init(loc *Location) {
	pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	dot := floats.Dot(grad, gradPrev)
	beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
	pr.prevNorm = norm
	return math.Max(0, beta)
}

// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
// that computes the scaling parameter β_k according to the formula
//
//	β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type HestenesStiefel struct {
	y []float64
}

func (hs *HestenesStiefel) Init(loc *Location) {
	hs.y = resize(hs.y, len(loc.Gradient))
}

func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hs.y, grad, gradPrev)
	beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
	return math.Max(0, beta)
}

// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
// scaling parameter β_k according to the formula
//
//	β_k = |∇f_{k+1}|^2 / d_k·y_k,
//
// where y_k = ∇f_{k+1} - ∇f_k.
type DaiYuan struct {
	y []float64
}

func (dy *DaiYuan) Init(loc *Location) {
	dy.y = resize(dy.y, len(loc.Gradient))
}

func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(dy.y, grad, gradPrev)
	norm := floats.Norm(grad, 2)
	return norm * norm / floats.Dot(dirPrev, dy.y)
}

// HagerZhang implements the Hager-Zhang variant of the CG method that computes
// the scaling parameter β_k according to the formula
//
//	β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
//
// where y_k = ∇f_{k+1} - ∇f_k.
type HagerZhang struct {
	y []float64
}

func (hz *HagerZhang) Init(loc *Location) {
	hz.y = resize(hz.y, len(loc.Gradient))
}

func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hz.y, grad, gradPrev)
	dirDotY := floats.Dot(dirPrev, hz.y)
	gDotY := floats.Dot(grad, hz.y)
	gDotDir := floats.Dot(grad, dirPrev)
	yNorm := floats.Norm(hz.y, 2)
	return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
}
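
// exampleHagerZhangBeta is a minimal sketch, not part of the original file,
// that evaluates the Hager-Zhang β on small made-up vectors to illustrate the
// formula above. All numbers are arbitrary assumptions.
func exampleHagerZhangBeta() float64 {
	gradPrev := []float64{1, -2} // ∇f_k
	grad := []float64{0.5, 1}    // ∇f_{k+1}
	dirPrev := []float64{-1, 2}  // d_k, the previous search direction.

	var hz HagerZhang
	hz.Init(&Location{Gradient: gradPrev})
	// Here y_k = ∇f_{k+1} - ∇f_k = {-0.5, 3} and d_k·y_k = 6.5, so
	// β_k = (∇f_{k+1}·y_k - 2 ∇f_{k+1}·d_k |y_k|^2/(d_k·y_k)) / (d_k·y_k).
	return hz.Beta(grad, gradPrev, dirPrev)
}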