gonum.org/v1/gonum@v0.15.1-0.20240517103525-f853624cb1bb/stat/pca_cca.go (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package stat
     6  
     7  import (
     8  	"errors"
     9  	"math"
    10  
    11  	"gonum.org/v1/gonum/floats"
    12  	"gonum.org/v1/gonum/mat"
    13  )
    14  
    15  // PC is a type for computing and extracting the principal components of a
    16  // matrix. The results of the principal components analysis are only valid
    17  // if the call to PrincipalComponents was successful.
    18  type PC struct {
    19  	n, d    int
    20  	weights []float64
    21  	svd     *mat.SVD
    22  	ok      bool
    23  }
    24  
    25  // PrincipalComponents performs a weighted principal components analysis on the
    26  // matrix of the input data which is represented as an n×d matrix a where each
    27  // row is an observation and each column is a variable.
    28  //
    29  // PrincipalComponents centers the variables but does not scale the variance.
    30  //
    31  // The weights slice is used to weight the observations. If weights is nil, each
    32  // weight is considered to have a value of one, otherwise the length of weights
    33  // must match the number of observations or PrincipalComponents will panic.
    34  //
    35  // PrincipalComponents returns whether the analysis was successful.
    36  func (c *PC) PrincipalComponents(a mat.Matrix, weights []float64) (ok bool) {
    37  	c.n, c.d = a.Dims()
    38  	if weights != nil && len(weights) != c.n {
    39  		panic("stat: len(weights) != observations")
    40  	}
    41  
    42  	c.svd, c.ok = svdFactorizeCentered(c.svd, a, weights)
    43  	if c.ok {
    44  		c.weights = append(c.weights[:0], weights...)
    45  	}
    46  	return c.ok
    47  }
    48  
    49  // VectorsTo returns the component direction vectors of a principal components
    50  // analysis. The vectors are returned in the columns of a d×min(n, d) matrix.
    51  //
    52  // If dst is empty, VectorsTo will resize dst to be d×min(n, d). When dst is
    53  // non-empty, VectorsTo will panic if dst is not d×min(n, d). VectorsTo will also
    54  // panic if the receiver does not contain a successful PC.
    55  func (c *PC) VectorsTo(dst *mat.Dense) {
    56  	if !c.ok {
    57  		panic("stat: use of unsuccessful principal components analysis")
    58  	}
    59  
    60  	if dst.IsEmpty() {
    61  		dst.ReuseAs(c.d, min(c.n, c.d))
    62  	} else {
    63  		if d, n := dst.Dims(); d != c.d || n != min(c.n, c.d) {
    64  			panic(mat.ErrShape)
    65  		}
    66  	}
    67  	c.svd.VTo(dst)
    68  }
    69  
    70  // VarsTo returns the column variances of the principal component scores,
    71  // b * vecs, where b is a matrix with centered columns. Variances are returned
    72  // in descending order.
    73  // If dst is not nil it is used to store the variances and returned.
    74  // Vars will panic if the receiver has not successfully performed a principal
    75  // components analysis or dst is not nil and the length of dst is not min(n, d).
    76  func (c *PC) VarsTo(dst []float64) []float64 {
    77  	if !c.ok {
    78  		panic("stat: use of unsuccessful principal components analysis")
    79  	}
    80  	if dst != nil && len(dst) != min(c.n, c.d) {
    81  		panic("stat: length of slice does not match analysis")
    82  	}
    83  
    84  	dst = c.svd.Values(dst)
    85  	var f float64
    86  	if c.weights == nil {
    87  		f = 1 / float64(c.n-1)
    88  	} else {
    89  		f = 1 / (floats.Sum(c.weights) - 1)
    90  	}
    91  	for i, v := range dst {
    92  		dst[i] = f * v * v
    93  	}
    94  	return dst
    95  }
    96  
    97  // CC is a type for computing the canonical correlations of a pair of matrices.
    98  // The results of the canonical correlation analysis are only valid
    99  // if the call to CanonicalCorrelations was successful.
   100  type CC struct {
   101  	// n is the number of observations used to
   102  	// construct the canonical correlations.
   103  	n int
   104  
   105  	// xd and yd are used for size checks.
   106  	xd, yd int
   107  
   108  	x, y, c *mat.SVD
   109  	ok      bool
   110  }
   111  
   112  // CanonicalCorrelations performs a canonical correlation analysis of the
   113  // input data x and y, columns of which should be interpretable as two sets
   114  // of measurements on the same observations (rows). These observations are
   115  // optionally weighted by weights. The result of the analysis is stored in
   116  // the receiver if the analysis is successful.
   117  //
   118  // Canonical correlation analysis finds associations between two sets of
   119  // variables on the same observations by finding linear combinations of the two
   120  // sphered datasets that maximize the correlation between them.
   121  //
   122  // Some notation: let Xc and Yc denote the centered input data matrices x
   123  // and y (column means subtracted from each column), let Sx and Sy denote the
   124  // sample covariance matrices within x and y respectively, and let Sxy denote
   125  // the covariance matrix between x and y. The sphered data can then be expressed
   126  // as Xc * Sx^{-1/2} and Yc * Sy^{-1/2} respectively, and the correlation matrix
   127  // between the sphered data is called the canonical correlation matrix,
   128  // Sx^{-1/2} * Sxy * Sy^{-1/2}. In cases where S^{-1/2} is ambiguous for some
   129  // covariance matrix S, S^{-1/2} is taken to be E * D^{-1/2} * Eᵀ where S can
   130  // be eigendecomposed as S = E * D * Eᵀ.
   131  //
   132  // The canonical correlations are the correlations between the corresponding
   133  // pairs of canonical variables and can be obtained with c.Corrs(). Canonical
   134  // variables can be obtained by projecting the sphered data into the left and
   135  // right eigenvectors of the canonical correlation matrix, and these
   136  // eigenvectors can be obtained with c.Left(m, true) and c.Right(m, true)
   137  // respectively. The canonical variables can also be obtained directly from the
   138  // centered raw data by using the back-transformed eigenvectors which can be
   139  // obtained with c.Left(m, false) and c.Right(m, false) respectively.
   140  //
   141  // The first pair of left and right eigenvectors of the canonical correlation
   142  // matrix can be interpreted as directions into which the respective sphered
   143  // data can be projected such that the correlation between the two projections
   144  // is maximized. The second pair and onwards solve the same optimization but
   145  // under the constraint that they are uncorrelated (orthogonal in sphered space)
   146  // to previous projections.
   147  //
   148  // CanonicalCorrelations will panic if the inputs x and y do not have the same
   149  // number of rows.
   150  //
   151  // The slice weights is used to weight the observations. If weights is nil, each
   152  // weight is considered to have a value of one, otherwise the length of weights
   153  // must match the number of observations (rows of both x and y) or
   154  // CanonicalCorrelations will panic.
   155  //
   156  // More details can be found at
   157  // https://en.wikipedia.org/wiki/Canonical_correlation
   158  // or in Chapter 3 of
   159  // Koch, Inge. Analysis of multivariate and high-dimensional data.
   160  // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939
   161  func (c *CC) CanonicalCorrelations(x, y mat.Matrix, weights []float64) error {
   162  	var yn int
   163  	c.n, c.xd = x.Dims()
   164  	yn, c.yd = y.Dims()
   165  	if c.n != yn {
   166  		panic("stat: unequal number of observations")
   167  	}
   168  	if weights != nil && len(weights) != c.n {
   169  		panic("stat: len(weights) != observations")
   170  	}
   171  
   172  	// Center and factorize x and y.
   173  	c.x, c.ok = svdFactorizeCentered(c.x, x, weights)
   174  	if !c.ok {
   175  		return errors.New("stat: failed to factorize x")
   176  	}
   177  	c.y, c.ok = svdFactorizeCentered(c.y, y, weights)
   178  	if !c.ok {
   179  		return errors.New("stat: failed to factorize y")
   180  	}
   181  	var xu, xv, yu, yv mat.Dense
   182  	c.x.UTo(&xu)
   183  	c.x.VTo(&xv)
   184  	c.y.UTo(&yu)
   185  	c.y.VTo(&yv)
   186  
   187  	// Calculate and factorise the canonical correlation matrix.
   188  	var ccor mat.Dense
   189  	ccor.Product(&xv, xu.T(), &yu, yv.T())
   190  	if c.c == nil {
   191  		c.c = &mat.SVD{}
   192  	}
   193  	c.ok = c.c.Factorize(&ccor, mat.SVDThin)
   194  	if !c.ok {
   195  		return errors.New("stat: failed to factorize ccor")
   196  	}
   197  	return nil
   198  }
   199  
   200  // CorrsTo returns the canonical correlations, using dst if it is not nil.
   201  // If dst is not nil and len(dst) does not match the number of columns in
   202  // the y input matrix, Corrs will panic.
   203  func (c *CC) CorrsTo(dst []float64) []float64 {
   204  	if !c.ok {
   205  		panic("stat: canonical correlations missing or invalid")
   206  	}
   207  
   208  	if dst != nil && len(dst) != c.yd {
   209  		panic("stat: length of destination does not match input dimension")
   210  	}
   211  	return c.c.Values(dst)
   212  }
   213  
   214  // LeftTo returns the left eigenvectors of the canonical correlation matrix if
   215  // spheredSpace is true. If spheredSpace is false it returns these eigenvectors
   216  // back-transformed to the original data space.
   217  //
   218  // If dst is empty, LeftTo will resize dst to be xd×yd. When dst is
   219  // non-empty, LeftTo will panic if dst is not xd×yd. LeftTo will also
   220  // panic if the receiver does not contain a successful CC.
   221  func (c *CC) LeftTo(dst *mat.Dense, spheredSpace bool) {
   222  	if !c.ok || c.n < 2 {
   223  		panic("stat: canonical correlations missing or invalid")
   224  	}
   225  
   226  	if dst.IsEmpty() {
   227  		dst.ReuseAs(c.xd, c.yd)
   228  	} else {
   229  		if d, n := dst.Dims(); d != c.xd || n != c.yd {
   230  			panic(mat.ErrShape)
   231  		}
   232  	}
   233  	c.c.UTo(dst)
   234  	if spheredSpace {
   235  		return
   236  	}
   237  
   238  	xs := c.x.Values(nil)
   239  	xv := &mat.Dense{}
   240  	c.x.VTo(xv)
   241  
   242  	scaleColsReciSqrt(xv, xs)
   243  
   244  	dst.Product(xv, xv.T(), dst)
   245  	dst.Scale(math.Sqrt(float64(c.n-1)), dst)
   246  }
   247  
   248  // RightTo returns the right eigenvectors of the canonical correlation matrix if
   249  // spheredSpace is true. If spheredSpace is false it returns these eigenvectors
   250  // back-transformed to the original data space.
   251  //
   252  // If dst is empty, RightTo will resize dst to be yd×yd. When dst is
   253  // non-empty, RightTo will panic if dst is not yd×yd. RightTo will also
   254  // panic if the receiver does not contain a successful CC.
   255  func (c *CC) RightTo(dst *mat.Dense, spheredSpace bool) {
   256  	if !c.ok || c.n < 2 {
   257  		panic("stat: canonical correlations missing or invalid")
   258  	}
   259  
   260  	if dst.IsEmpty() {
   261  		dst.ReuseAs(c.yd, c.yd)
   262  	} else {
   263  		if d, n := dst.Dims(); d != c.yd || n != c.yd {
   264  			panic(mat.ErrShape)
   265  		}
   266  	}
   267  	c.c.VTo(dst)
   268  	if spheredSpace {
   269  		return
   270  	}
   271  
   272  	ys := c.y.Values(nil)
   273  	yv := &mat.Dense{}
   274  	c.y.VTo(yv)
   275  
   276  	scaleColsReciSqrt(yv, ys)
   277  
   278  	dst.Product(yv, yv.T(), dst)
   279  	dst.Scale(math.Sqrt(float64(c.n-1)), dst)
   280  }
   281  
   282  func svdFactorizeCentered(work *mat.SVD, m mat.Matrix, weights []float64) (svd *mat.SVD, ok bool) {
   283  	n, d := m.Dims()
   284  	centered := mat.NewDense(n, d, nil)
   285  	col := make([]float64, n)
   286  	for j := 0; j < d; j++ {
   287  		mat.Col(col, j, m)
   288  		floats.AddConst(-Mean(col, weights), col)
   289  		centered.SetCol(j, col)
   290  	}
   291  	for i, w := range weights {
   292  		floats.Scale(math.Sqrt(w), centered.RawRowView(i))
   293  	}
   294  	if work == nil {
   295  		work = &mat.SVD{}
   296  	}
   297  	ok = work.Factorize(centered, mat.SVDThin)
   298  	return work, ok
   299  }
   300  
   301  // scaleColsReciSqrt scales the columns of cols
   302  // by the reciprocal square-root of vals.
   303  func scaleColsReciSqrt(cols *mat.Dense, vals []float64) {
   304  	if cols == nil {
   305  		panic("stat: input nil")
   306  	}
   307  	n, d := cols.Dims()
   308  	if len(vals) != d {
   309  		panic("stat: input length mismatch")
   310  	}
   311  	col := make([]float64, n)
   312  	for j := 0; j < d; j++ {
   313  		mat.Col(col, j, cols)
   314  		floats.Scale(math.Sqrt(1/vals[j]), col)
   315  		cols.SetCol(j, col)
   316  	}
   317  }