github.com/gopherd/gonum@v0.0.4/blas/gonum/level3float64.go (about)

     1  // Copyright ©2014 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package gonum
     6  
     7  import (
     8  	"github.com/gopherd/gonum/blas"
     9  	"github.com/gopherd/gonum/internal/asm/f64"
    10  )
    11  
    12  var _ blas.Float64Level3 = Implementation{}
    13  
    14  // Dtrsm solves one of the matrix equations
    15  //  A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
    16  //  Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
    17  //  X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
    18  //  X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
    19  // where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a
    20  // scalar.
    21  //
    22  // At entry to the function, X contains the values of B, and the result is
    23  // stored in-place into X.
    24  //
    25  // No check is made that A is invertible.
    26  func (Implementation) Dtrsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
    27  	if s != blas.Left && s != blas.Right {
    28  		panic(badSide)
    29  	}
    30  	if ul != blas.Lower && ul != blas.Upper {
    31  		panic(badUplo)
    32  	}
    33  	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
    34  		panic(badTranspose)
    35  	}
    36  	if d != blas.NonUnit && d != blas.Unit {
    37  		panic(badDiag)
    38  	}
    39  	if m < 0 {
    40  		panic(mLT0)
    41  	}
    42  	if n < 0 {
    43  		panic(nLT0)
    44  	}
    45  	k := n
    46  	if s == blas.Left {
    47  		k = m
    48  	}
    49  	if lda < max(1, k) {
    50  		panic(badLdA)
    51  	}
    52  	if ldb < max(1, n) {
    53  		panic(badLdB)
    54  	}
    55  
    56  	// Quick return if possible.
    57  	if m == 0 || n == 0 {
    58  		return
    59  	}
    60  
    61  	// For zero matrix size the following slice length checks are trivially satisfied.
    62  	if len(a) < lda*(k-1)+k {
    63  		panic(shortA)
    64  	}
    65  	if len(b) < ldb*(m-1)+n {
    66  		panic(shortB)
    67  	}
    68  
    69  	if alpha == 0 {
    70  		for i := 0; i < m; i++ {
    71  			btmp := b[i*ldb : i*ldb+n]
    72  			for j := range btmp {
    73  				btmp[j] = 0
    74  			}
    75  		}
    76  		return
    77  	}
    78  	nonUnit := d == blas.NonUnit
    79  	if s == blas.Left {
    80  		if tA == blas.NoTrans {
    81  			if ul == blas.Upper {
    82  				for i := m - 1; i >= 0; i-- {
    83  					btmp := b[i*ldb : i*ldb+n]
    84  					if alpha != 1 {
    85  						f64.ScalUnitary(alpha, btmp)
    86  					}
    87  					for ka, va := range a[i*lda+i+1 : i*lda+m] {
    88  						if va != 0 {
    89  							k := ka + i + 1
    90  							f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
    91  						}
    92  					}
    93  					if nonUnit {
    94  						tmp := 1 / a[i*lda+i]
    95  						f64.ScalUnitary(tmp, btmp)
    96  					}
    97  				}
    98  				return
    99  			}
   100  			for i := 0; i < m; i++ {
   101  				btmp := b[i*ldb : i*ldb+n]
   102  				if alpha != 1 {
   103  					f64.ScalUnitary(alpha, btmp)
   104  				}
   105  				for k, va := range a[i*lda : i*lda+i] {
   106  					if va != 0 {
   107  						f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
   108  					}
   109  				}
   110  				if nonUnit {
   111  					tmp := 1 / a[i*lda+i]
   112  					f64.ScalUnitary(tmp, btmp)
   113  				}
   114  			}
   115  			return
   116  		}
   117  		// Cases where a is transposed
   118  		if ul == blas.Upper {
   119  			for k := 0; k < m; k++ {
   120  				btmpk := b[k*ldb : k*ldb+n]
   121  				if nonUnit {
   122  					tmp := 1 / a[k*lda+k]
   123  					f64.ScalUnitary(tmp, btmpk)
   124  				}
   125  				for ia, va := range a[k*lda+k+1 : k*lda+m] {
   126  					if va != 0 {
   127  						i := ia + k + 1
   128  						f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
   129  					}
   130  				}
   131  				if alpha != 1 {
   132  					f64.ScalUnitary(alpha, btmpk)
   133  				}
   134  			}
   135  			return
   136  		}
   137  		for k := m - 1; k >= 0; k-- {
   138  			btmpk := b[k*ldb : k*ldb+n]
   139  			if nonUnit {
   140  				tmp := 1 / a[k*lda+k]
   141  				f64.ScalUnitary(tmp, btmpk)
   142  			}
   143  			for i, va := range a[k*lda : k*lda+k] {
   144  				if va != 0 {
   145  					f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
   146  				}
   147  			}
   148  			if alpha != 1 {
   149  				f64.ScalUnitary(alpha, btmpk)
   150  			}
   151  		}
   152  		return
   153  	}
   154  	// Cases where a is to the right of X.
   155  	if tA == blas.NoTrans {
   156  		if ul == blas.Upper {
   157  			for i := 0; i < m; i++ {
   158  				btmp := b[i*ldb : i*ldb+n]
   159  				if alpha != 1 {
   160  					f64.ScalUnitary(alpha, btmp)
   161  				}
   162  				for k, vb := range btmp {
   163  					if vb == 0 {
   164  						continue
   165  					}
   166  					if nonUnit {
   167  						btmp[k] /= a[k*lda+k]
   168  					}
   169  					f64.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
   170  				}
   171  			}
   172  			return
   173  		}
   174  		for i := 0; i < m; i++ {
   175  			btmp := b[i*ldb : i*ldb+n]
   176  			if alpha != 1 {
   177  				f64.ScalUnitary(alpha, btmp)
   178  			}
   179  			for k := n - 1; k >= 0; k-- {
   180  				if btmp[k] == 0 {
   181  					continue
   182  				}
   183  				if nonUnit {
   184  					btmp[k] /= a[k*lda+k]
   185  				}
   186  				f64.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
   187  			}
   188  		}
   189  		return
   190  	}
   191  	// Cases where a is transposed.
   192  	if ul == blas.Upper {
   193  		for i := 0; i < m; i++ {
   194  			btmp := b[i*ldb : i*ldb+n]
   195  			for j := n - 1; j >= 0; j-- {
   196  				tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
   197  				if nonUnit {
   198  					tmp /= a[j*lda+j]
   199  				}
   200  				btmp[j] = tmp
   201  			}
   202  		}
   203  		return
   204  	}
   205  	for i := 0; i < m; i++ {
   206  		btmp := b[i*ldb : i*ldb+n]
   207  		for j := 0; j < n; j++ {
   208  			tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
   209  			if nonUnit {
   210  				tmp /= a[j*lda+j]
   211  			}
   212  			btmp[j] = tmp
   213  		}
   214  	}
   215  }
   216  
   217  // Dsymm performs one of the matrix-matrix operations
   218  //  C = alpha * A * B + beta * C  if side == blas.Left
   219  //  C = alpha * B * A + beta * C  if side == blas.Right
   220  // where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha
   221  // is a scalar.
   222  func (Implementation) Dsymm(s blas.Side, ul blas.Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
   223  	if s != blas.Right && s != blas.Left {
   224  		panic(badSide)
   225  	}
   226  	if ul != blas.Lower && ul != blas.Upper {
   227  		panic(badUplo)
   228  	}
   229  	if m < 0 {
   230  		panic(mLT0)
   231  	}
   232  	if n < 0 {
   233  		panic(nLT0)
   234  	}
   235  	k := n
   236  	if s == blas.Left {
   237  		k = m
   238  	}
   239  	if lda < max(1, k) {
   240  		panic(badLdA)
   241  	}
   242  	if ldb < max(1, n) {
   243  		panic(badLdB)
   244  	}
   245  	if ldc < max(1, n) {
   246  		panic(badLdC)
   247  	}
   248  
   249  	// Quick return if possible.
   250  	if m == 0 || n == 0 {
   251  		return
   252  	}
   253  
   254  	// For zero matrix size the following slice length checks are trivially satisfied.
   255  	if len(a) < lda*(k-1)+k {
   256  		panic(shortA)
   257  	}
   258  	if len(b) < ldb*(m-1)+n {
   259  		panic(shortB)
   260  	}
   261  	if len(c) < ldc*(m-1)+n {
   262  		panic(shortC)
   263  	}
   264  
   265  	// Quick return if possible.
   266  	if alpha == 0 && beta == 1 {
   267  		return
   268  	}
   269  
   270  	if beta == 0 {
   271  		for i := 0; i < m; i++ {
   272  			ctmp := c[i*ldc : i*ldc+n]
   273  			for j := range ctmp {
   274  				ctmp[j] = 0
   275  			}
   276  		}
   277  	}
   278  
   279  	if alpha == 0 {
   280  		if beta != 0 {
   281  			for i := 0; i < m; i++ {
   282  				ctmp := c[i*ldc : i*ldc+n]
   283  				for j := 0; j < n; j++ {
   284  					ctmp[j] *= beta
   285  				}
   286  			}
   287  		}
   288  		return
   289  	}
   290  
   291  	isUpper := ul == blas.Upper
   292  	if s == blas.Left {
   293  		for i := 0; i < m; i++ {
   294  			atmp := alpha * a[i*lda+i]
   295  			btmp := b[i*ldb : i*ldb+n]
   296  			ctmp := c[i*ldc : i*ldc+n]
   297  			for j, v := range btmp {
   298  				ctmp[j] *= beta
   299  				ctmp[j] += atmp * v
   300  			}
   301  
   302  			for k := 0; k < i; k++ {
   303  				var atmp float64
   304  				if isUpper {
   305  					atmp = a[k*lda+i]
   306  				} else {
   307  					atmp = a[i*lda+k]
   308  				}
   309  				atmp *= alpha
   310  				f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
   311  			}
   312  			for k := i + 1; k < m; k++ {
   313  				var atmp float64
   314  				if isUpper {
   315  					atmp = a[i*lda+k]
   316  				} else {
   317  					atmp = a[k*lda+i]
   318  				}
   319  				atmp *= alpha
   320  				f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
   321  			}
   322  		}
   323  		return
   324  	}
   325  	if isUpper {
   326  		for i := 0; i < m; i++ {
   327  			for j := n - 1; j >= 0; j-- {
   328  				tmp := alpha * b[i*ldb+j]
   329  				var tmp2 float64
   330  				atmp := a[j*lda+j+1 : j*lda+n]
   331  				btmp := b[i*ldb+j+1 : i*ldb+n]
   332  				ctmp := c[i*ldc+j+1 : i*ldc+n]
   333  				for k, v := range atmp {
   334  					ctmp[k] += tmp * v
   335  					tmp2 += btmp[k] * v
   336  				}
   337  				c[i*ldc+j] *= beta
   338  				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
   339  			}
   340  		}
   341  		return
   342  	}
   343  	for i := 0; i < m; i++ {
   344  		for j := 0; j < n; j++ {
   345  			tmp := alpha * b[i*ldb+j]
   346  			var tmp2 float64
   347  			atmp := a[j*lda : j*lda+j]
   348  			btmp := b[i*ldb : i*ldb+j]
   349  			ctmp := c[i*ldc : i*ldc+j]
   350  			for k, v := range atmp {
   351  				ctmp[k] += tmp * v
   352  				tmp2 += btmp[k] * v
   353  			}
   354  			c[i*ldc+j] *= beta
   355  			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
   356  		}
   357  	}
   358  }
   359  
   360  // Dsyrk performs one of the symmetric rank-k operations
   361  //  C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
   362  //  C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
   363  // where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
   364  // beta are scalars.
   365  func (Implementation) Dsyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) {
   366  	if ul != blas.Lower && ul != blas.Upper {
   367  		panic(badUplo)
   368  	}
   369  	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
   370  		panic(badTranspose)
   371  	}
   372  	if n < 0 {
   373  		panic(nLT0)
   374  	}
   375  	if k < 0 {
   376  		panic(kLT0)
   377  	}
   378  	row, col := k, n
   379  	if tA == blas.NoTrans {
   380  		row, col = n, k
   381  	}
   382  	if lda < max(1, col) {
   383  		panic(badLdA)
   384  	}
   385  	if ldc < max(1, n) {
   386  		panic(badLdC)
   387  	}
   388  
   389  	// Quick return if possible.
   390  	if n == 0 {
   391  		return
   392  	}
   393  
   394  	// For zero matrix size the following slice length checks are trivially satisfied.
   395  	if len(a) < lda*(row-1)+col {
   396  		panic(shortA)
   397  	}
   398  	if len(c) < ldc*(n-1)+n {
   399  		panic(shortC)
   400  	}
   401  
   402  	if alpha == 0 {
   403  		if beta == 0 {
   404  			if ul == blas.Upper {
   405  				for i := 0; i < n; i++ {
   406  					ctmp := c[i*ldc+i : i*ldc+n]
   407  					for j := range ctmp {
   408  						ctmp[j] = 0
   409  					}
   410  				}
   411  				return
   412  			}
   413  			for i := 0; i < n; i++ {
   414  				ctmp := c[i*ldc : i*ldc+i+1]
   415  				for j := range ctmp {
   416  					ctmp[j] = 0
   417  				}
   418  			}
   419  			return
   420  		}
   421  		if ul == blas.Upper {
   422  			for i := 0; i < n; i++ {
   423  				ctmp := c[i*ldc+i : i*ldc+n]
   424  				for j := range ctmp {
   425  					ctmp[j] *= beta
   426  				}
   427  			}
   428  			return
   429  		}
   430  		for i := 0; i < n; i++ {
   431  			ctmp := c[i*ldc : i*ldc+i+1]
   432  			for j := range ctmp {
   433  				ctmp[j] *= beta
   434  			}
   435  		}
   436  		return
   437  	}
   438  	if tA == blas.NoTrans {
   439  		if ul == blas.Upper {
   440  			for i := 0; i < n; i++ {
   441  				ctmp := c[i*ldc+i : i*ldc+n]
   442  				atmp := a[i*lda : i*lda+k]
   443  				if beta == 0 {
   444  					for jc := range ctmp {
   445  						j := jc + i
   446  						ctmp[jc] = alpha * f64.DotUnitary(atmp, a[j*lda:j*lda+k])
   447  					}
   448  				} else {
   449  					for jc, vc := range ctmp {
   450  						j := jc + i
   451  						ctmp[jc] = vc*beta + alpha*f64.DotUnitary(atmp, a[j*lda:j*lda+k])
   452  					}
   453  				}
   454  			}
   455  			return
   456  		}
   457  		for i := 0; i < n; i++ {
   458  			ctmp := c[i*ldc : i*ldc+i+1]
   459  			atmp := a[i*lda : i*lda+k]
   460  			if beta == 0 {
   461  				for j := range ctmp {
   462  					ctmp[j] = alpha * f64.DotUnitary(a[j*lda:j*lda+k], atmp)
   463  				}
   464  			} else {
   465  				for j, vc := range ctmp {
   466  					ctmp[j] = vc*beta + alpha*f64.DotUnitary(a[j*lda:j*lda+k], atmp)
   467  				}
   468  			}
   469  		}
   470  		return
   471  	}
   472  	// Cases where a is transposed.
   473  	if ul == blas.Upper {
   474  		for i := 0; i < n; i++ {
   475  			ctmp := c[i*ldc+i : i*ldc+n]
   476  			if beta == 0 {
   477  				for j := range ctmp {
   478  					ctmp[j] = 0
   479  				}
   480  			} else if beta != 1 {
   481  				for j := range ctmp {
   482  					ctmp[j] *= beta
   483  				}
   484  			}
   485  			for l := 0; l < k; l++ {
   486  				tmp := alpha * a[l*lda+i]
   487  				if tmp != 0 {
   488  					f64.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
   489  				}
   490  			}
   491  		}
   492  		return
   493  	}
   494  	for i := 0; i < n; i++ {
   495  		ctmp := c[i*ldc : i*ldc+i+1]
   496  		if beta != 1 {
   497  			for j := range ctmp {
   498  				ctmp[j] *= beta
   499  			}
   500  		}
   501  		for l := 0; l < k; l++ {
   502  			tmp := alpha * a[l*lda+i]
   503  			if tmp != 0 {
   504  				f64.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
   505  			}
   506  		}
   507  	}
   508  }
   509  
   510  // Dsyr2k performs one of the symmetric rank 2k operations
   511  //  C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
   512  //  C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
   513  // where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and
   514  // alpha and beta are scalars.
   515  func (Implementation) Dsyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
   516  	if ul != blas.Lower && ul != blas.Upper {
   517  		panic(badUplo)
   518  	}
   519  	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
   520  		panic(badTranspose)
   521  	}
   522  	if n < 0 {
   523  		panic(nLT0)
   524  	}
   525  	if k < 0 {
   526  		panic(kLT0)
   527  	}
   528  	row, col := k, n
   529  	if tA == blas.NoTrans {
   530  		row, col = n, k
   531  	}
   532  	if lda < max(1, col) {
   533  		panic(badLdA)
   534  	}
   535  	if ldb < max(1, col) {
   536  		panic(badLdB)
   537  	}
   538  	if ldc < max(1, n) {
   539  		panic(badLdC)
   540  	}
   541  
   542  	// Quick return if possible.
   543  	if n == 0 {
   544  		return
   545  	}
   546  
   547  	// For zero matrix size the following slice length checks are trivially satisfied.
   548  	if len(a) < lda*(row-1)+col {
   549  		panic(shortA)
   550  	}
   551  	if len(b) < ldb*(row-1)+col {
   552  		panic(shortB)
   553  	}
   554  	if len(c) < ldc*(n-1)+n {
   555  		panic(shortC)
   556  	}
   557  
   558  	if alpha == 0 {
   559  		if beta == 0 {
   560  			if ul == blas.Upper {
   561  				for i := 0; i < n; i++ {
   562  					ctmp := c[i*ldc+i : i*ldc+n]
   563  					for j := range ctmp {
   564  						ctmp[j] = 0
   565  					}
   566  				}
   567  				return
   568  			}
   569  			for i := 0; i < n; i++ {
   570  				ctmp := c[i*ldc : i*ldc+i+1]
   571  				for j := range ctmp {
   572  					ctmp[j] = 0
   573  				}
   574  			}
   575  			return
   576  		}
   577  		if ul == blas.Upper {
   578  			for i := 0; i < n; i++ {
   579  				ctmp := c[i*ldc+i : i*ldc+n]
   580  				for j := range ctmp {
   581  					ctmp[j] *= beta
   582  				}
   583  			}
   584  			return
   585  		}
   586  		for i := 0; i < n; i++ {
   587  			ctmp := c[i*ldc : i*ldc+i+1]
   588  			for j := range ctmp {
   589  				ctmp[j] *= beta
   590  			}
   591  		}
   592  		return
   593  	}
   594  	if tA == blas.NoTrans {
   595  		if ul == blas.Upper {
   596  			for i := 0; i < n; i++ {
   597  				atmp := a[i*lda : i*lda+k]
   598  				btmp := b[i*ldb : i*ldb+k]
   599  				ctmp := c[i*ldc+i : i*ldc+n]
   600  				if beta == 0 {
   601  					for jc := range ctmp {
   602  						j := i + jc
   603  						var tmp1, tmp2 float64
   604  						binner := b[j*ldb : j*ldb+k]
   605  						for l, v := range a[j*lda : j*lda+k] {
   606  							tmp1 += v * btmp[l]
   607  							tmp2 += atmp[l] * binner[l]
   608  						}
   609  						ctmp[jc] = alpha * (tmp1 + tmp2)
   610  					}
   611  				} else {
   612  					for jc := range ctmp {
   613  						j := i + jc
   614  						var tmp1, tmp2 float64
   615  						binner := b[j*ldb : j*ldb+k]
   616  						for l, v := range a[j*lda : j*lda+k] {
   617  							tmp1 += v * btmp[l]
   618  							tmp2 += atmp[l] * binner[l]
   619  						}
   620  						ctmp[jc] *= beta
   621  						ctmp[jc] += alpha * (tmp1 + tmp2)
   622  					}
   623  				}
   624  			}
   625  			return
   626  		}
   627  		for i := 0; i < n; i++ {
   628  			atmp := a[i*lda : i*lda+k]
   629  			btmp := b[i*ldb : i*ldb+k]
   630  			ctmp := c[i*ldc : i*ldc+i+1]
   631  			if beta == 0 {
   632  				for j := 0; j <= i; j++ {
   633  					var tmp1, tmp2 float64
   634  					binner := b[j*ldb : j*ldb+k]
   635  					for l, v := range a[j*lda : j*lda+k] {
   636  						tmp1 += v * btmp[l]
   637  						tmp2 += atmp[l] * binner[l]
   638  					}
   639  					ctmp[j] = alpha * (tmp1 + tmp2)
   640  				}
   641  			} else {
   642  				for j := 0; j <= i; j++ {
   643  					var tmp1, tmp2 float64
   644  					binner := b[j*ldb : j*ldb+k]
   645  					for l, v := range a[j*lda : j*lda+k] {
   646  						tmp1 += v * btmp[l]
   647  						tmp2 += atmp[l] * binner[l]
   648  					}
   649  					ctmp[j] *= beta
   650  					ctmp[j] += alpha * (tmp1 + tmp2)
   651  				}
   652  			}
   653  		}
   654  		return
   655  	}
   656  	if ul == blas.Upper {
   657  		for i := 0; i < n; i++ {
   658  			ctmp := c[i*ldc+i : i*ldc+n]
   659  			switch beta {
   660  			case 0:
   661  				for j := range ctmp {
   662  					ctmp[j] = 0
   663  				}
   664  			case 1:
   665  			default:
   666  				for j := range ctmp {
   667  					ctmp[j] *= beta
   668  				}
   669  			}
   670  			for l := 0; l < k; l++ {
   671  				tmp1 := alpha * b[l*ldb+i]
   672  				tmp2 := alpha * a[l*lda+i]
   673  				btmp := b[l*ldb+i : l*ldb+n]
   674  				if tmp1 != 0 || tmp2 != 0 {
   675  					for j, v := range a[l*lda+i : l*lda+n] {
   676  						ctmp[j] += v*tmp1 + btmp[j]*tmp2
   677  					}
   678  				}
   679  			}
   680  		}
   681  		return
   682  	}
   683  	for i := 0; i < n; i++ {
   684  		ctmp := c[i*ldc : i*ldc+i+1]
   685  		switch beta {
   686  		case 0:
   687  			for j := range ctmp {
   688  				ctmp[j] = 0
   689  			}
   690  		case 1:
   691  		default:
   692  			for j := range ctmp {
   693  				ctmp[j] *= beta
   694  			}
   695  		}
   696  		for l := 0; l < k; l++ {
   697  			tmp1 := alpha * b[l*ldb+i]
   698  			tmp2 := alpha * a[l*lda+i]
   699  			btmp := b[l*ldb : l*ldb+i+1]
   700  			if tmp1 != 0 || tmp2 != 0 {
   701  				for j, v := range a[l*lda : l*lda+i+1] {
   702  					ctmp[j] += v*tmp1 + btmp[j]*tmp2
   703  				}
   704  			}
   705  		}
   706  	}
   707  }
   708  
   709  // Dtrmm performs one of the matrix-matrix operations
   710  //  B = alpha * A * B   if tA == blas.NoTrans and side == blas.Left
   711  //  B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
   712  //  B = alpha * B * A   if tA == blas.NoTrans and side == blas.Right
   713  //  B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
   714  // where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
   715  func (Implementation) Dtrmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
   716  	if s != blas.Left && s != blas.Right {
   717  		panic(badSide)
   718  	}
   719  	if ul != blas.Lower && ul != blas.Upper {
   720  		panic(badUplo)
   721  	}
   722  	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
   723  		panic(badTranspose)
   724  	}
   725  	if d != blas.NonUnit && d != blas.Unit {
   726  		panic(badDiag)
   727  	}
   728  	if m < 0 {
   729  		panic(mLT0)
   730  	}
   731  	if n < 0 {
   732  		panic(nLT0)
   733  	}
   734  	k := n
   735  	if s == blas.Left {
   736  		k = m
   737  	}
   738  	if lda < max(1, k) {
   739  		panic(badLdA)
   740  	}
   741  	if ldb < max(1, n) {
   742  		panic(badLdB)
   743  	}
   744  
   745  	// Quick return if possible.
   746  	if m == 0 || n == 0 {
   747  		return
   748  	}
   749  
   750  	// For zero matrix size the following slice length checks are trivially satisfied.
   751  	if len(a) < lda*(k-1)+k {
   752  		panic(shortA)
   753  	}
   754  	if len(b) < ldb*(m-1)+n {
   755  		panic(shortB)
   756  	}
   757  
   758  	if alpha == 0 {
   759  		for i := 0; i < m; i++ {
   760  			btmp := b[i*ldb : i*ldb+n]
   761  			for j := range btmp {
   762  				btmp[j] = 0
   763  			}
   764  		}
   765  		return
   766  	}
   767  
   768  	nonUnit := d == blas.NonUnit
   769  	if s == blas.Left {
   770  		if tA == blas.NoTrans {
   771  			if ul == blas.Upper {
   772  				for i := 0; i < m; i++ {
   773  					tmp := alpha
   774  					if nonUnit {
   775  						tmp *= a[i*lda+i]
   776  					}
   777  					btmp := b[i*ldb : i*ldb+n]
   778  					f64.ScalUnitary(tmp, btmp)
   779  					for ka, va := range a[i*lda+i+1 : i*lda+m] {
   780  						k := ka + i + 1
   781  						if va != 0 {
   782  							f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
   783  						}
   784  					}
   785  				}
   786  				return
   787  			}
   788  			for i := m - 1; i >= 0; i-- {
   789  				tmp := alpha
   790  				if nonUnit {
   791  					tmp *= a[i*lda+i]
   792  				}
   793  				btmp := b[i*ldb : i*ldb+n]
   794  				f64.ScalUnitary(tmp, btmp)
   795  				for k, va := range a[i*lda : i*lda+i] {
   796  					if va != 0 {
   797  						f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
   798  					}
   799  				}
   800  			}
   801  			return
   802  		}
   803  		// Cases where a is transposed.
   804  		if ul == blas.Upper {
   805  			for k := m - 1; k >= 0; k-- {
   806  				btmpk := b[k*ldb : k*ldb+n]
   807  				for ia, va := range a[k*lda+k+1 : k*lda+m] {
   808  					i := ia + k + 1
   809  					btmp := b[i*ldb : i*ldb+n]
   810  					if va != 0 {
   811  						f64.AxpyUnitary(alpha*va, btmpk, btmp)
   812  					}
   813  				}
   814  				tmp := alpha
   815  				if nonUnit {
   816  					tmp *= a[k*lda+k]
   817  				}
   818  				if tmp != 1 {
   819  					f64.ScalUnitary(tmp, btmpk)
   820  				}
   821  			}
   822  			return
   823  		}
   824  		for k := 0; k < m; k++ {
   825  			btmpk := b[k*ldb : k*ldb+n]
   826  			for i, va := range a[k*lda : k*lda+k] {
   827  				btmp := b[i*ldb : i*ldb+n]
   828  				if va != 0 {
   829  					f64.AxpyUnitary(alpha*va, btmpk, btmp)
   830  				}
   831  			}
   832  			tmp := alpha
   833  			if nonUnit {
   834  				tmp *= a[k*lda+k]
   835  			}
   836  			if tmp != 1 {
   837  				f64.ScalUnitary(tmp, btmpk)
   838  			}
   839  		}
   840  		return
   841  	}
   842  	// Cases where a is on the right
   843  	if tA == blas.NoTrans {
   844  		if ul == blas.Upper {
   845  			for i := 0; i < m; i++ {
   846  				btmp := b[i*ldb : i*ldb+n]
   847  				for k := n - 1; k >= 0; k-- {
   848  					tmp := alpha * btmp[k]
   849  					if tmp == 0 {
   850  						continue
   851  					}
   852  					btmp[k] = tmp
   853  					if nonUnit {
   854  						btmp[k] *= a[k*lda+k]
   855  					}
   856  					f64.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
   857  				}
   858  			}
   859  			return
   860  		}
   861  		for i := 0; i < m; i++ {
   862  			btmp := b[i*ldb : i*ldb+n]
   863  			for k := 0; k < n; k++ {
   864  				tmp := alpha * btmp[k]
   865  				if tmp == 0 {
   866  					continue
   867  				}
   868  				btmp[k] = tmp
   869  				if nonUnit {
   870  					btmp[k] *= a[k*lda+k]
   871  				}
   872  				f64.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
   873  			}
   874  		}
   875  		return
   876  	}
   877  	// Cases where a is transposed.
   878  	if ul == blas.Upper {
   879  		for i := 0; i < m; i++ {
   880  			btmp := b[i*ldb : i*ldb+n]
   881  			for j, vb := range btmp {
   882  				tmp := vb
   883  				if nonUnit {
   884  					tmp *= a[j*lda+j]
   885  				}
   886  				tmp += f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
   887  				btmp[j] = alpha * tmp
   888  			}
   889  		}
   890  		return
   891  	}
   892  	for i := 0; i < m; i++ {
   893  		btmp := b[i*ldb : i*ldb+n]
   894  		for j := n - 1; j >= 0; j-- {
   895  			tmp := btmp[j]
   896  			if nonUnit {
   897  				tmp *= a[j*lda+j]
   898  			}
   899  			tmp += f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
   900  			btmp[j] = alpha * tmp
   901  		}
   902  	}
   903  }