github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/optimize/cg.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"

	"github.com/jingcheng-WU/gonum/floats"
)

const (
	iterationRestartFactor = 6
	angleRestartThreshold  = -0.9
)

var (
	_ Method          = (*CG)(nil)
	_ localMethod     = (*CG)(nil)
	_ NextDirectioner = (*CG)(nil)
)

// CGVariant calculates the scaling parameter, β, used for updating the
// conjugate direction in the nonlinear conjugate gradient (CG) method.
type CGVariant interface {
	// Init is called at the first iteration and provides a way to initialize
	// any internal state.
	Init(loc *Location)
	// Beta returns the value of the scaling parameter that is computed
	// according to the particular variant of the CG method.
	Beta(grad, gradPrev, dirPrev []float64) float64
}

var (
	_ CGVariant = (*FletcherReeves)(nil)
	_ CGVariant = (*PolakRibierePolyak)(nil)
	_ CGVariant = (*HestenesStiefel)(nil)
	_ CGVariant = (*DaiYuan)(nil)
	_ CGVariant = (*HagerZhang)(nil)
)
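
// The following type is an illustrative sketch, not part of the original
// file: it shows the shape of a user-defined CGVariant. The name
// steepestDescentVariant is hypothetical. Because CG restarts whenever Beta
// returns 0 (see the restart conditions documented on CG below), always
// returning 0 reduces CG to gradient descent.
type steepestDescentVariant struct{}

var _ CGVariant = steepestDescentVariant{}

// Init has no internal state to initialize in this sketch.
func (steepestDescentVariant) Init(loc *Location) {}

// Beta always reports β_k = 0, which signals a restart so that
// d_{k+1} = -∇f_{k+1}.
func (steepestDescentVariant) Beta(grad, gradPrev, dirPrev []float64) float64 {
	return 0
}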

// CG implements the nonlinear conjugate gradient method for solving nonlinear
// unconstrained optimization problems. It is a line search method that
// generates the search directions d_k according to the formula
//  d_{k+1} = -∇f_{k+1} + β_k*d_k,   d_0 = -∇f_0.
// Variants of the conjugate gradient method differ in the choice of the
// parameter β_k. The conjugate gradient method usually requires fewer function
// evaluations than the gradient descent method and no matrix storage, but
// L-BFGS is usually more efficient.
//
// CG implements a restart strategy that takes the steepest descent direction
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
//
//  - A certain number of iterations has elapsed without a restart. This number
//    is controlled by IterationRestartFactor; if that is 0, it is set to a
//    reasonable default based on the problem dimension.
//  - The angle between the gradients at two consecutive iterations ∇f_k and
//    ∇f_{k+1} is too large.
//  - The direction d_{k+1} is not a descent direction.
//  - β_k returned from CGVariant.Beta is equal to zero.
//
// The line search for CG must yield step sizes that satisfy the strong Wolfe
// conditions at every iteration; otherwise the generated search direction
// might fail to be a descent direction. The line search should be more
// stringent than those used for Newton-like methods, which can be achieved by
// setting the gradient constant in the strong Wolfe conditions to a small
// value.
//
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
// references therein.
type CG struct {
	// Linesearcher must satisfy the strong Wolfe conditions at every iteration.
	// If Linesearcher == nil, an appropriate default is chosen.
	Linesearcher Linesearcher
	// Variant implements the particular CG formula for computing β_k.
	// If Variant is nil, an appropriate default is chosen.
	Variant CGVariant
	// InitialStep estimates the initial line search step size, because the CG
	// method does not generate well-scaled search directions.
	// If InitialStep is nil, an appropriate default is chosen.
	InitialStep StepSizer

	// IterationRestartFactor determines the frequency of restarts based on the
	// problem dimension. The negative gradient direction is taken whenever
	// ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed
	// without a restart. For medium and large-scale problems
	// IterationRestartFactor should be set to 1; for low-dimensional problems
	// a larger value should be chosen. Note that if the ceil function returns
	// 1, CG will be identical to gradient descent.
	// If IterationRestartFactor is 0, it will be set to 6.
	// CG will panic if IterationRestartFactor is negative.
	IterationRestartFactor float64
	// AngleRestartThreshold sets the threshold angle for restart. The method
	// is restarted if the cosine of the angle between two consecutive
	// gradients is smaller than or equal to AngleRestartThreshold, that is, if
	//  ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
	// A value of AngleRestartThreshold closer to -1 (successive gradients in
	// exactly opposite directions) will tend to reduce the number of restarts.
	// If AngleRestartThreshold is 0, it will be set to -0.9.
	// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
	AngleRestartThreshold float64
	// GradStopThreshold sets the threshold for stopping if the gradient norm
	// gets too small. If GradStopThreshold is 0, it defaults to 1e-12; if it
	// is NaN, the setting is not used.
	GradStopThreshold float64

	ls *LinesearchMethod

	status Status
	err    error

	restartAfter    int
	iterFromRestart int

	dirPrev      []float64
	gradPrev     []float64
	gradPrevNorm float64
}
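
// exampleCGUsage is a hedged usage sketch, not part of the original file: it
// shows how a zero-valued CG might be passed to this package's Minimize to
// optimize the two-dimensional Rosenbrock function. The function name,
// objective, and starting point are illustrative assumptions; the zero value
// of CG picks the defaults set in initLocal below (MoreThuente with a small
// curvature factor, HestenesStiefel, and FirstOrderStepSize).
func exampleCGUsage() ([]float64, error) {
	p := Problem{
		// f(x, y) = (1-x)^2 + 100(y-x^2)^2, minimized at (1, 1).
		Func: func(x []float64) float64 {
			a, b := 1-x[0], x[1]-x[0]*x[0]
			return a*a + 100*b*b
		},
		Grad: func(grad, x []float64) {
			grad[0] = -2*(1-x[0]) - 400*x[0]*(x[1]-x[0]*x[0])
			grad[1] = 200 * (x[1] - x[0]*x[0])
		},
	}
	result, err := Minimize(p, []float64{-1.2, 1}, nil, &CG{})
	if err != nil {
		return nil, err
	}
	return result.X, nil // expected to approach the minimizer (1, 1)
}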

func (cg *CG) Status() (Status, error) {
	return cg.status, cg.err
}

func (*CG) Uses(has Available) (uses Available, err error) {
	return has.gradient()
}

func (cg *CG) Init(dim, tasks int) int {
	cg.status = NotTerminated
	cg.err = nil
	return 1
}

func (cg *CG) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
	cg.status, cg.err = localOptimizer{}.run(cg, cg.GradStopThreshold, operation, result, tasks)
	close(operation)
}

func (cg *CG) initLocal(loc *Location) (Operation, error) {
	if cg.IterationRestartFactor < 0 {
		panic("cg: IterationRestartFactor is negative")
	}
	if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
		panic("cg: AngleRestartThreshold not in [-1, 0]")
	}

	if cg.Linesearcher == nil {
		cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
	}
	if cg.Variant == nil {
		cg.Variant = &HestenesStiefel{}
	}
	if cg.InitialStep == nil {
		cg.InitialStep = &FirstOrderStepSize{}
	}

	if cg.IterationRestartFactor == 0 {
		cg.IterationRestartFactor = iterationRestartFactor
	}
	if cg.AngleRestartThreshold == 0 {
		cg.AngleRestartThreshold = angleRestartThreshold
	}

	if cg.ls == nil {
		cg.ls = &LinesearchMethod{}
	}
	cg.ls.Linesearcher = cg.Linesearcher
	cg.ls.NextDirectioner = cg

	return cg.ls.Init(loc)
}

func (cg *CG) iterateLocal(loc *Location) (Operation, error) {
	return cg.ls.Iterate(loc)
}

func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := len(loc.X)

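	// For example, the default IterationRestartFactor of 6 with dim = 10
	// (an illustrative dimension) gives restartAfter = ceil(6*10) = 60
	// iterations between forced restarts.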
	cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
	cg.iterFromRestart = 0

	// The initial direction is always the negative gradient.
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.dirPrev = resize(cg.dirPrev, dim)
	copy(cg.dirPrev, dir)
	cg.gradPrev = resize(cg.gradPrev, dim)
	copy(cg.gradPrev, loc.Gradient)
	cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)

	cg.Variant.Init(loc)
	return cg.InitialStep.Init(loc, dir)
}

func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	copy(dir, loc.Gradient)
	floats.Scale(-1, dir)

	cg.iterFromRestart++
	var restart bool
	if cg.iterFromRestart == cg.restartAfter {
		// Restart because too many iterations have been taken without a restart.
		restart = true
	}

	gDot := floats.Dot(loc.Gradient, cg.gradPrev)
	gNorm := floats.Norm(loc.Gradient, 2)
	if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
		// Restart because the angle between the last two gradients is too large.
		restart = true
	}

	// Compute the scaling factor β_k even when restarting, because cg.Variant
	// may keep internal state that needs to be updated at every iteration.
	beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
	if beta == 0 {
		// β_k == 0 means that the steepest descent direction will be taken, so
		// indicate that the method is in fact being restarted.
		restart = true
	}
	if !restart {
		// The method is not being restarted, so update the descent direction.
		floats.AddScaled(dir, beta, cg.dirPrev)
		if floats.Dot(loc.Gradient, dir) >= 0 {
			// Restart because the new direction is not a descent direction.
			restart = true
			copy(dir, loc.Gradient)
			floats.Scale(-1, dir)
		}
	}

	// Get the initial line search step size from the StepSizer even if the
	// method was restarted, because StepSizers need to see every iteration.
	stepSize = cg.InitialStep.StepSize(loc, dir)
	if restart {
		// The method was restarted and, since the steepest descent direction
		// is not related to the previous direction, discard the estimated step
		// size from cg.InitialStep and use a step size of 1 instead.
		stepSize = 1
		// Reset the counter of iterations taken since the last restart.
		cg.iterFromRestart = 0
	}

	copy(cg.gradPrev, loc.Gradient)
	copy(cg.dirPrev, dir)
	cg.gradPrevNorm = gNorm
	return stepSize
}

func (*CG) needs() struct {
	Gradient bool
	Hessian  bool
} {
	return struct {
		Gradient bool
		Hessian  bool
	}{true, false}
}

// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
// computes the scaling parameter β_k according to the formula
//  β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
type FletcherReeves struct {
	prevNorm float64
}

func (fr *FletcherReeves) Init(loc *Location) {
	fr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
	fr.prevNorm = norm
	return beta
}
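
// exampleFletcherReevesBeta is a small worked check, not part of the original
// file (the function name is hypothetical): with ∇f_k = (3, 4) and
// ∇f_{k+1} = (6, 8), the gradient norms are 5 and 10, so the Fletcher-Reeves
// formula gives β_k = 10^2/5^2 = 4.
func exampleFletcherReevesBeta() float64 {
	var fr FletcherReeves
	fr.Init(&Location{Gradient: []float64{3, 4}})
	return fr.Beta([]float64{6, 8}, nil, nil) // returns 4
}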

// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
// method that computes the scaling parameter β_k according to the formula
//  β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
// where y_k = ∇f_{k+1} - ∇f_k.
type PolakRibierePolyak struct {
	prevNorm float64
}

func (pr *PolakRibierePolyak) Init(loc *Location) {
	pr.prevNorm = floats.Norm(loc.Gradient, 2)
}

func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
	norm := floats.Norm(grad, 2)
	dot := floats.Dot(grad, gradPrev)
	// ∇f_{k+1}·y_k expands to |∇f_{k+1}|^2 - ∇f_{k+1}·∇f_k, which avoids
	// forming y_k explicitly.
	beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
	pr.prevNorm = norm
	return math.Max(0, beta)
}

// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
// that computes the scaling parameter β_k according to the formula
//  β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
// where y_k = ∇f_{k+1} - ∇f_k.
type HestenesStiefel struct {
	y []float64
}

func (hs *HestenesStiefel) Init(loc *Location) {
	hs.y = resize(hs.y, len(loc.Gradient))
}

func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hs.y, grad, gradPrev)
	beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
	return math.Max(0, beta)
}

// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
// scaling parameter β_k according to the formula
//  β_k = |∇f_{k+1}|^2 / d_k·y_k,
// where y_k = ∇f_{k+1} - ∇f_k.
type DaiYuan struct {
	y []float64
}

func (dy *DaiYuan) Init(loc *Location) {
	dy.y = resize(dy.y, len(loc.Gradient))
}

func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(dy.y, grad, gradPrev)
	norm := floats.Norm(grad, 2)
	return norm * norm / floats.Dot(dirPrev, dy.y)
}

// HagerZhang implements the Hager-Zhang variant of the CG method that
// computes the scaling parameter β_k according to the formula
//  β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
// where y_k = ∇f_{k+1} - ∇f_k.
type HagerZhang struct {
	y []float64
}

func (hz *HagerZhang) Init(loc *Location) {
	hz.y = resize(hz.y, len(loc.Gradient))
}

func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
	floats.SubTo(hz.y, grad, gradPrev)
	dirDotY := floats.Dot(dirPrev, hz.y)
	gDotY := floats.Dot(grad, hz.y)
	gDotDir := floats.Dot(grad, dirPrev)
	yNorm := floats.Norm(hz.y, 2)
	return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
}