github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/stat/pca_cca.go (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package stat
     6  
     7  import (
     8  	"errors"
     9  	"math"
    10  
    11  	"github.com/jingcheng-WU/gonum/floats"
    12  	"github.com/jingcheng-WU/gonum/mat"
    13  )
    14  
    15  // PC is a type for computing and extracting the principal components of a
    16  // matrix. The results of the principal components analysis are only valid
    17  // if the call to PrincipalComponents was successful.
    18  type PC struct {
    19  	n, d    int
    20  	weights []float64
    21  	svd     *mat.SVD
    22  	ok      bool
    23  }
    24  
    25  // PrincipalComponents performs a weighted principal components analysis on the
    26  // matrix of the input data which is represented as an n×d matrix a where each
    27  // row is an observation and each column is a variable.
    28  //
    29  // PrincipalComponents centers the variables but does not scale the variance.
    30  //
    31  // The weights slice is used to weight the observations. If weights is nil, each
    32  // weight is considered to have a value of one, otherwise the length of weights
    33  // must match the number of observations or PrincipalComponents will panic.
    34  //
    35  // PrincipalComponents returns whether the analysis was successful.
    36  func (c *PC) PrincipalComponents(a mat.Matrix, weights []float64) (ok bool) {
    37  	c.n, c.d = a.Dims()
    38  	if weights != nil && len(weights) != c.n {
    39  		panic("stat: len(weights) != observations")
    40  	}
    41  
    42  	c.svd, c.ok = svdFactorizeCentered(c.svd, a, weights)
    43  	if c.ok {
    44  		c.weights = append(c.weights[:0], weights...)
    45  	}
    46  	return c.ok
    47  }
    48  
    49  // VectorsTo returns the component direction vectors of a principal components
    50  // analysis. The vectors are returned in the columns of a d×min(n, d) matrix.
    51  //
    52  // If dst is empty, VectorsTo will resize dst to be d×min(n, d). When dst is
    53  // non-empty, VectorsTo will panic if dst is not d×min(n, d). VectorsTo will also
    54  // panic if the receiver does not contain a successful PC.
    55  func (c *PC) VectorsTo(dst *mat.Dense) {
    56  	if !c.ok {
    57  		panic("stat: use of unsuccessful principal components analysis")
    58  	}
    59  
    60  	if dst.IsEmpty() {
    61  		dst.ReuseAs(c.d, min(c.n, c.d))
    62  	} else {
    63  		if d, n := dst.Dims(); d != c.d || n != min(c.n, c.d) {
    64  			panic(mat.ErrShape)
    65  		}
    66  	}
    67  	c.svd.VTo(dst)
    68  }
    69  
    70  // VarsTo returns the column variances of the principal component scores,
    71  // b * vecs, where b is a matrix with centered columns. Variances are returned
    72  // in descending order.
    73  // If dst is not nil it is used to store the variances and returned.
    74  // Vars will panic if the receiver has not successfully performed a principal
    75  // components analysis or dst is not nil and the length of dst is not min(n, d).
    76  func (c *PC) VarsTo(dst []float64) []float64 {
    77  	if !c.ok {
    78  		panic("stat: use of unsuccessful principal components analysis")
    79  	}
    80  	if dst != nil && len(dst) != min(c.n, c.d) {
    81  		panic("stat: length of slice does not match analysis")
    82  	}
    83  
    84  	dst = c.svd.Values(dst)
    85  	var f float64
    86  	if c.weights == nil {
    87  		f = 1 / float64(c.n-1)
    88  	} else {
    89  		f = 1 / (floats.Sum(c.weights) - 1)
    90  	}
    91  	for i, v := range dst {
    92  		dst[i] = f * v * v
    93  	}
    94  	return dst
    95  }
    96  
    97  func min(a, b int) int {
    98  	if a < b {
    99  		return a
   100  	}
   101  	return b
   102  }
   103  
   104  // CC is a type for computing the canonical correlations of a pair of matrices.
   105  // The results of the canonical correlation analysis are only valid
   106  // if the call to CanonicalCorrelations was successful.
   107  type CC struct {
   108  	// n is the number of observations used to
   109  	// construct the canonical correlations.
   110  	n int
   111  
   112  	// xd and yd are used for size checks.
   113  	xd, yd int
   114  
   115  	x, y, c *mat.SVD
   116  	ok      bool
   117  }
   118  
   119  // CanonicalCorrelations performs a canonical correlation analysis of the
   120  // input data x and y, columns of which should be interpretable as two sets
   121  // of measurements on the same observations (rows). These observations are
   122  // optionally weighted by weights. The result of the analysis is stored in
   123  // the receiver if the analysis is successful.
   124  //
   125  // Canonical correlation analysis finds associations between two sets of
   126  // variables on the same observations by finding linear combinations of the two
   127  // sphered datasets that maximize the correlation between them.
   128  //
   129  // Some notation: let Xc and Yc denote the centered input data matrices x
   130  // and y (column means subtracted from each column), let Sx and Sy denote the
   131  // sample covariance matrices within x and y respectively, and let Sxy denote
   132  // the covariance matrix between x and y. The sphered data can then be expressed
   133  // as Xc * Sx^{-1/2} and Yc * Sy^{-1/2} respectively, and the correlation matrix
   134  // between the sphered data is called the canonical correlation matrix,
   135  // Sx^{-1/2} * Sxy * Sy^{-1/2}. In cases where S^{-1/2} is ambiguous for some
   136  // covariance matrix S, S^{-1/2} is taken to be E * D^{-1/2} * Eᵀ where S can
   137  // be eigendecomposed as S = E * D * Eᵀ.
   138  //
   139  // The canonical correlations are the correlations between the corresponding
   140  // pairs of canonical variables and can be obtained with c.Corrs(). Canonical
   141  // variables can be obtained by projecting the sphered data into the left and
   142  // right eigenvectors of the canonical correlation matrix, and these
   143  // eigenvectors can be obtained with c.Left(m, true) and c.Right(m, true)
   144  // respectively. The canonical variables can also be obtained directly from the
   145  // centered raw data by using the back-transformed eigenvectors which can be
   146  // obtained with c.Left(m, false) and c.Right(m, false) respectively.
   147  //
   148  // The first pair of left and right eigenvectors of the canonical correlation
   149  // matrix can be interpreted as directions into which the respective sphered
   150  // data can be projected such that the correlation between the two projections
   151  // is maximized. The second pair and onwards solve the same optimization but
   152  // under the constraint that they are uncorrelated (orthogonal in sphered space)
   153  // to previous projections.
   154  //
   155  // CanonicalCorrelations will panic if the inputs x and y do not have the same
   156  // number of rows.
   157  //
   158  // The slice weights is used to weight the observations. If weights is nil, each
   159  // weight is considered to have a value of one, otherwise the length of weights
   160  // must match the number of observations (rows of both x and y) or
   161  // CanonicalCorrelations will panic.
   162  //
   163  // More details can be found at
   164  // https://en.wikipedia.org/wiki/Canonical_correlation
   165  // or in Chapter 3 of
   166  // Koch, Inge. Analysis of multivariate and high-dimensional data.
   167  // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939
   168  func (c *CC) CanonicalCorrelations(x, y mat.Matrix, weights []float64) error {
   169  	var yn int
   170  	c.n, c.xd = x.Dims()
   171  	yn, c.yd = y.Dims()
   172  	if c.n != yn {
   173  		panic("stat: unequal number of observations")
   174  	}
   175  	if weights != nil && len(weights) != c.n {
   176  		panic("stat: len(weights) != observations")
   177  	}
   178  
   179  	// Center and factorize x and y.
   180  	c.x, c.ok = svdFactorizeCentered(c.x, x, weights)
   181  	if !c.ok {
   182  		return errors.New("stat: failed to factorize x")
   183  	}
   184  	c.y, c.ok = svdFactorizeCentered(c.y, y, weights)
   185  	if !c.ok {
   186  		return errors.New("stat: failed to factorize y")
   187  	}
   188  	var xu, xv, yu, yv mat.Dense
   189  	c.x.UTo(&xu)
   190  	c.x.VTo(&xv)
   191  	c.y.UTo(&yu)
   192  	c.y.VTo(&yv)
   193  
   194  	// Calculate and factorise the canonical correlation matrix.
   195  	var ccor mat.Dense
   196  	ccor.Product(&xv, xu.T(), &yu, yv.T())
   197  	if c.c == nil {
   198  		c.c = &mat.SVD{}
   199  	}
   200  	c.ok = c.c.Factorize(&ccor, mat.SVDThin)
   201  	if !c.ok {
   202  		return errors.New("stat: failed to factorize ccor")
   203  	}
   204  	return nil
   205  }
   206  
   207  // CorrsTo returns the canonical correlations, using dst if it is not nil.
   208  // If dst is not nil and len(dst) does not match the number of columns in
   209  // the y input matrix, Corrs will panic.
   210  func (c *CC) CorrsTo(dst []float64) []float64 {
   211  	if !c.ok {
   212  		panic("stat: canonical correlations missing or invalid")
   213  	}
   214  
   215  	if dst != nil && len(dst) != c.yd {
   216  		panic("stat: length of destination does not match input dimension")
   217  	}
   218  	return c.c.Values(dst)
   219  }
   220  
   221  // LeftTo returns the left eigenvectors of the canonical correlation matrix if
   222  // spheredSpace is true. If spheredSpace is false it returns these eigenvectors
   223  // back-transformed to the original data space.
   224  //
   225  // If dst is empty, LeftTo will resize dst to be xd×yd. When dst is
   226  // non-empty, LeftTo will panic if dst is not xd×yd. LeftTo will also
   227  // panic if the receiver does not contain a successful CC.
   228  func (c *CC) LeftTo(dst *mat.Dense, spheredSpace bool) {
   229  	if !c.ok || c.n < 2 {
   230  		panic("stat: canonical correlations missing or invalid")
   231  	}
   232  
   233  	if dst.IsEmpty() {
   234  		dst.ReuseAs(c.xd, c.yd)
   235  	} else {
   236  		if d, n := dst.Dims(); d != c.xd || n != c.yd {
   237  			panic(mat.ErrShape)
   238  		}
   239  	}
   240  	c.c.UTo(dst)
   241  	if spheredSpace {
   242  		return
   243  	}
   244  
   245  	xs := c.x.Values(nil)
   246  	xv := &mat.Dense{}
   247  	c.x.VTo(xv)
   248  
   249  	scaleColsReciSqrt(xv, xs)
   250  
   251  	dst.Product(xv, xv.T(), dst)
   252  	dst.Scale(math.Sqrt(float64(c.n-1)), dst)
   253  }
   254  
   255  // RightTo returns the right eigenvectors of the canonical correlation matrix if
   256  // spheredSpace is true. If spheredSpace is false it returns these eigenvectors
   257  // back-transformed to the original data space.
   258  //
   259  // If dst is empty, RightTo will resize dst to be yd×yd. When dst is
   260  // non-empty, RightTo will panic if dst is not yd×yd. RightTo will also
   261  // panic if the receiver does not contain a successful CC.
   262  func (c *CC) RightTo(dst *mat.Dense, spheredSpace bool) {
   263  	if !c.ok || c.n < 2 {
   264  		panic("stat: canonical correlations missing or invalid")
   265  	}
   266  
   267  	if dst.IsEmpty() {
   268  		dst.ReuseAs(c.yd, c.yd)
   269  	} else {
   270  		if d, n := dst.Dims(); d != c.yd || n != c.yd {
   271  			panic(mat.ErrShape)
   272  		}
   273  	}
   274  	c.c.VTo(dst)
   275  	if spheredSpace {
   276  		return
   277  	}
   278  
   279  	ys := c.y.Values(nil)
   280  	yv := &mat.Dense{}
   281  	c.y.VTo(yv)
   282  
   283  	scaleColsReciSqrt(yv, ys)
   284  
   285  	dst.Product(yv, yv.T(), dst)
   286  	dst.Scale(math.Sqrt(float64(c.n-1)), dst)
   287  }
   288  
   289  func svdFactorizeCentered(work *mat.SVD, m mat.Matrix, weights []float64) (svd *mat.SVD, ok bool) {
   290  	n, d := m.Dims()
   291  	centered := mat.NewDense(n, d, nil)
   292  	col := make([]float64, n)
   293  	for j := 0; j < d; j++ {
   294  		mat.Col(col, j, m)
   295  		floats.AddConst(-Mean(col, weights), col)
   296  		centered.SetCol(j, col)
   297  	}
   298  	for i, w := range weights {
   299  		floats.Scale(math.Sqrt(w), centered.RawRowView(i))
   300  	}
   301  	if work == nil {
   302  		work = &mat.SVD{}
   303  	}
   304  	ok = work.Factorize(centered, mat.SVDThin)
   305  	return work, ok
   306  }
   307  
   308  // scaleColsReciSqrt scales the columns of cols
   309  // by the reciprocal square-root of vals.
   310  func scaleColsReciSqrt(cols *mat.Dense, vals []float64) {
   311  	if cols == nil {
   312  		panic("stat: input nil")
   313  	}
   314  	n, d := cols.Dims()
   315  	if len(vals) != d {
   316  		panic("stat: input length mismatch")
   317  	}
   318  	col := make([]float64, n)
   319  	for j := 0; j < d; j++ {
   320  		mat.Col(col, j, cols)
   321  		floats.Scale(math.Sqrt(1/vals[j]), col)
   322  		cols.SetCol(j, col)
   323  	}
   324  }