github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/blas/gonum/level3float32.go (about)

     1  // Code generated by "go generate github.com/jingcheng-WU/gonum/blas/gonum”; DO NOT EDIT.
     2  
     3  // Copyright ©2014 The Gonum Authors. All rights reserved.
     4  // Use of this source code is governed by a BSD-style
     5  // license that can be found in the LICENSE file.
     6  
     7  package gonum
     8  
     9  import (
    10  	"github.com/jingcheng-WU/gonum/blas"
    11  	"github.com/jingcheng-WU/gonum/internal/asm/f32"
    12  )
    13  
    14  var _ blas.Float32Level3 = Implementation{}
    15  
    16  // Strsm solves one of the matrix equations
    17  //  A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
    18  //  Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
    19  //  X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
    20  //  X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
    21  // where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a
    22  // scalar.
    23  //
    24  // At entry to the function, X contains the values of B, and the result is
    25  // stored in-place into X.
    26  //
    27  // No check is made that A is invertible.
    28  //
    29  // Float32 implementations are autogenerated and not directly tested.
    30  func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
    31  	if s != blas.Left && s != blas.Right {
    32  		panic(badSide)
    33  	}
    34  	if ul != blas.Lower && ul != blas.Upper {
    35  		panic(badUplo)
    36  	}
    37  	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
    38  		panic(badTranspose)
    39  	}
    40  	if d != blas.NonUnit && d != blas.Unit {
    41  		panic(badDiag)
    42  	}
    43  	if m < 0 {
    44  		panic(mLT0)
    45  	}
    46  	if n < 0 {
    47  		panic(nLT0)
    48  	}
    49  	k := n
    50  	if s == blas.Left {
    51  		k = m
    52  	}
    53  	if lda < max(1, k) {
    54  		panic(badLdA)
    55  	}
    56  	if ldb < max(1, n) {
    57  		panic(badLdB)
    58  	}
    59  
    60  	// Quick return if possible.
    61  	if m == 0 || n == 0 {
    62  		return
    63  	}
    64  
    65  	// For zero matrix size the following slice length checks are trivially satisfied.
    66  	if len(a) < lda*(k-1)+k {
    67  		panic(shortA)
    68  	}
    69  	if len(b) < ldb*(m-1)+n {
    70  		panic(shortB)
    71  	}
    72  
    73  	if alpha == 0 {
    74  		for i := 0; i < m; i++ {
    75  			btmp := b[i*ldb : i*ldb+n]
    76  			for j := range btmp {
    77  				btmp[j] = 0
    78  			}
    79  		}
    80  		return
    81  	}
    82  	nonUnit := d == blas.NonUnit
    83  	if s == blas.Left {
    84  		if tA == blas.NoTrans {
    85  			if ul == blas.Upper {
    86  				for i := m - 1; i >= 0; i-- {
    87  					btmp := b[i*ldb : i*ldb+n]
    88  					if alpha != 1 {
    89  						f32.ScalUnitary(alpha, btmp)
    90  					}
    91  					for ka, va := range a[i*lda+i+1 : i*lda+m] {
    92  						if va != 0 {
    93  							k := ka + i + 1
    94  							f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
    95  						}
    96  					}
    97  					if nonUnit {
    98  						tmp := 1 / a[i*lda+i]
    99  						f32.ScalUnitary(tmp, btmp)
   100  					}
   101  				}
   102  				return
   103  			}
   104  			for i := 0; i < m; i++ {
   105  				btmp := b[i*ldb : i*ldb+n]
   106  				if alpha != 1 {
   107  					f32.ScalUnitary(alpha, btmp)
   108  				}
   109  				for k, va := range a[i*lda : i*lda+i] {
   110  					if va != 0 {
   111  						f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
   112  					}
   113  				}
   114  				if nonUnit {
   115  					tmp := 1 / a[i*lda+i]
   116  					f32.ScalUnitary(tmp, btmp)
   117  				}
   118  			}
   119  			return
   120  		}
   121  		// Cases where a is transposed
   122  		if ul == blas.Upper {
   123  			for k := 0; k < m; k++ {
   124  				btmpk := b[k*ldb : k*ldb+n]
   125  				if nonUnit {
   126  					tmp := 1 / a[k*lda+k]
   127  					f32.ScalUnitary(tmp, btmpk)
   128  				}
   129  				for ia, va := range a[k*lda+k+1 : k*lda+m] {
   130  					if va != 0 {
   131  						i := ia + k + 1
   132  						f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
   133  					}
   134  				}
   135  				if alpha != 1 {
   136  					f32.ScalUnitary(alpha, btmpk)
   137  				}
   138  			}
   139  			return
   140  		}
   141  		for k := m - 1; k >= 0; k-- {
   142  			btmpk := b[k*ldb : k*ldb+n]
   143  			if nonUnit {
   144  				tmp := 1 / a[k*lda+k]
   145  				f32.ScalUnitary(tmp, btmpk)
   146  			}
   147  			for i, va := range a[k*lda : k*lda+k] {
   148  				if va != 0 {
   149  					f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
   150  				}
   151  			}
   152  			if alpha != 1 {
   153  				f32.ScalUnitary(alpha, btmpk)
   154  			}
   155  		}
   156  		return
   157  	}
   158  	// Cases where a is to the right of X.
   159  	if tA == blas.NoTrans {
   160  		if ul == blas.Upper {
   161  			for i := 0; i < m; i++ {
   162  				btmp := b[i*ldb : i*ldb+n]
   163  				if alpha != 1 {
   164  					f32.ScalUnitary(alpha, btmp)
   165  				}
   166  				for k, vb := range btmp {
   167  					if vb == 0 {
   168  						continue
   169  					}
   170  					if nonUnit {
   171  						btmp[k] /= a[k*lda+k]
   172  					}
   173  					f32.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
   174  				}
   175  			}
   176  			return
   177  		}
   178  		for i := 0; i < m; i++ {
   179  			btmp := b[i*ldb : i*ldb+n]
   180  			if alpha != 1 {
   181  				f32.ScalUnitary(alpha, btmp)
   182  			}
   183  			for k := n - 1; k >= 0; k-- {
   184  				if btmp[k] == 0 {
   185  					continue
   186  				}
   187  				if nonUnit {
   188  					btmp[k] /= a[k*lda+k]
   189  				}
   190  				f32.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
   191  			}
   192  		}
   193  		return
   194  	}
   195  	// Cases where a is transposed.
   196  	if ul == blas.Upper {
   197  		for i := 0; i < m; i++ {
   198  			btmp := b[i*ldb : i*ldb+n]
   199  			for j := n - 1; j >= 0; j-- {
   200  				tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
   201  				if nonUnit {
   202  					tmp /= a[j*lda+j]
   203  				}
   204  				btmp[j] = tmp
   205  			}
   206  		}
   207  		return
   208  	}
   209  	for i := 0; i < m; i++ {
   210  		btmp := b[i*ldb : i*ldb+n]
   211  		for j := 0; j < n; j++ {
   212  			tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
   213  			if nonUnit {
   214  				tmp /= a[j*lda+j]
   215  			}
   216  			btmp[j] = tmp
   217  		}
   218  	}
   219  }
   220  
   221  // Ssymm performs one of the matrix-matrix operations
   222  //  C = alpha * A * B + beta * C  if side == blas.Left
   223  //  C = alpha * B * A + beta * C  if side == blas.Right
   224  // where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha
   225  // is a scalar.
   226  //
   227  // Float32 implementations are autogenerated and not directly tested.
   228  func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
   229  	if s != blas.Right && s != blas.Left {
   230  		panic(badSide)
   231  	}
   232  	if ul != blas.Lower && ul != blas.Upper {
   233  		panic(badUplo)
   234  	}
   235  	if m < 0 {
   236  		panic(mLT0)
   237  	}
   238  	if n < 0 {
   239  		panic(nLT0)
   240  	}
   241  	k := n
   242  	if s == blas.Left {
   243  		k = m
   244  	}
   245  	if lda < max(1, k) {
   246  		panic(badLdA)
   247  	}
   248  	if ldb < max(1, n) {
   249  		panic(badLdB)
   250  	}
   251  	if ldc < max(1, n) {
   252  		panic(badLdC)
   253  	}
   254  
   255  	// Quick return if possible.
   256  	if m == 0 || n == 0 {
   257  		return
   258  	}
   259  
   260  	// For zero matrix size the following slice length checks are trivially satisfied.
   261  	if len(a) < lda*(k-1)+k {
   262  		panic(shortA)
   263  	}
   264  	if len(b) < ldb*(m-1)+n {
   265  		panic(shortB)
   266  	}
   267  	if len(c) < ldc*(m-1)+n {
   268  		panic(shortC)
   269  	}
   270  
   271  	// Quick return if possible.
   272  	if alpha == 0 && beta == 1 {
   273  		return
   274  	}
   275  
   276  	if beta == 0 {
   277  		for i := 0; i < m; i++ {
   278  			ctmp := c[i*ldc : i*ldc+n]
   279  			for j := range ctmp {
   280  				ctmp[j] = 0
   281  			}
   282  		}
   283  	}
   284  
   285  	if alpha == 0 {
   286  		if beta != 0 {
   287  			for i := 0; i < m; i++ {
   288  				ctmp := c[i*ldc : i*ldc+n]
   289  				for j := 0; j < n; j++ {
   290  					ctmp[j] *= beta
   291  				}
   292  			}
   293  		}
   294  		return
   295  	}
   296  
   297  	isUpper := ul == blas.Upper
   298  	if s == blas.Left {
   299  		for i := 0; i < m; i++ {
   300  			atmp := alpha * a[i*lda+i]
   301  			btmp := b[i*ldb : i*ldb+n]
   302  			ctmp := c[i*ldc : i*ldc+n]
   303  			for j, v := range btmp {
   304  				ctmp[j] *= beta
   305  				ctmp[j] += atmp * v
   306  			}
   307  
   308  			for k := 0; k < i; k++ {
   309  				var atmp float32
   310  				if isUpper {
   311  					atmp = a[k*lda+i]
   312  				} else {
   313  					atmp = a[i*lda+k]
   314  				}
   315  				atmp *= alpha
   316  				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
   317  			}
   318  			for k := i + 1; k < m; k++ {
   319  				var atmp float32
   320  				if isUpper {
   321  					atmp = a[i*lda+k]
   322  				} else {
   323  					atmp = a[k*lda+i]
   324  				}
   325  				atmp *= alpha
   326  				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
   327  			}
   328  		}
   329  		return
   330  	}
   331  	if isUpper {
   332  		for i := 0; i < m; i++ {
   333  			for j := n - 1; j >= 0; j-- {
   334  				tmp := alpha * b[i*ldb+j]
   335  				var tmp2 float32
   336  				atmp := a[j*lda+j+1 : j*lda+n]
   337  				btmp := b[i*ldb+j+1 : i*ldb+n]
   338  				ctmp := c[i*ldc+j+1 : i*ldc+n]
   339  				for k, v := range atmp {
   340  					ctmp[k] += tmp * v
   341  					tmp2 += btmp[k] * v
   342  				}
   343  				c[i*ldc+j] *= beta
   344  				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
   345  			}
   346  		}
   347  		return
   348  	}
   349  	for i := 0; i < m; i++ {
   350  		for j := 0; j < n; j++ {
   351  			tmp := alpha * b[i*ldb+j]
   352  			var tmp2 float32
   353  			atmp := a[j*lda : j*lda+j]
   354  			btmp := b[i*ldb : i*ldb+j]
   355  			ctmp := c[i*ldc : i*ldc+j]
   356  			for k, v := range atmp {
   357  				ctmp[k] += tmp * v
   358  				tmp2 += btmp[k] * v
   359  			}
   360  			c[i*ldc+j] *= beta
   361  			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
   362  		}
   363  	}
   364  }
   365  
   366  // Ssyrk performs one of the symmetric rank-k operations
   367  //  C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
   368  //  C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
   369  // where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
   370  // beta are scalars.
   371  //
   372  // Float32 implementations are autogenerated and not directly tested.
   373  func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) {
   374  	if ul != blas.Lower && ul != blas.Upper {
   375  		panic(badUplo)
   376  	}
   377  	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
   378  		panic(badTranspose)
   379  	}
   380  	if n < 0 {
   381  		panic(nLT0)
   382  	}
   383  	if k < 0 {
   384  		panic(kLT0)
   385  	}
   386  	row, col := k, n
   387  	if tA == blas.NoTrans {
   388  		row, col = n, k
   389  	}
   390  	if lda < max(1, col) {
   391  		panic(badLdA)
   392  	}
   393  	if ldc < max(1, n) {
   394  		panic(badLdC)
   395  	}
   396  
   397  	// Quick return if possible.
   398  	if n == 0 {
   399  		return
   400  	}
   401  
   402  	// For zero matrix size the following slice length checks are trivially satisfied.
   403  	if len(a) < lda*(row-1)+col {
   404  		panic(shortA)
   405  	}
   406  	if len(c) < ldc*(n-1)+n {
   407  		panic(shortC)
   408  	}
   409  
   410  	if alpha == 0 {
   411  		if beta == 0 {
   412  			if ul == blas.Upper {
   413  				for i := 0; i < n; i++ {
   414  					ctmp := c[i*ldc+i : i*ldc+n]
   415  					for j := range ctmp {
   416  						ctmp[j] = 0
   417  					}
   418  				}
   419  				return
   420  			}
   421  			for i := 0; i < n; i++ {
   422  				ctmp := c[i*ldc : i*ldc+i+1]
   423  				for j := range ctmp {
   424  					ctmp[j] = 0
   425  				}
   426  			}
   427  			return
   428  		}
   429  		if ul == blas.Upper {
   430  			for i := 0; i < n; i++ {
   431  				ctmp := c[i*ldc+i : i*ldc+n]
   432  				for j := range ctmp {
   433  					ctmp[j] *= beta
   434  				}
   435  			}
   436  			return
   437  		}
   438  		for i := 0; i < n; i++ {
   439  			ctmp := c[i*ldc : i*ldc+i+1]
   440  			for j := range ctmp {
   441  				ctmp[j] *= beta
   442  			}
   443  		}
   444  		return
   445  	}
   446  	if tA == blas.NoTrans {
   447  		if ul == blas.Upper {
   448  			for i := 0; i < n; i++ {
   449  				ctmp := c[i*ldc+i : i*ldc+n]
   450  				atmp := a[i*lda : i*lda+k]
   451  				if beta == 0 {
   452  					for jc := range ctmp {
   453  						j := jc + i
   454  						ctmp[jc] = alpha * f32.DotUnitary(atmp, a[j*lda:j*lda+k])
   455  					}
   456  				} else {
   457  					for jc, vc := range ctmp {
   458  						j := jc + i
   459  						ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k])
   460  					}
   461  				}
   462  			}
   463  			return
   464  		}
   465  		for i := 0; i < n; i++ {
   466  			ctmp := c[i*ldc : i*ldc+i+1]
   467  			atmp := a[i*lda : i*lda+k]
   468  			if beta == 0 {
   469  				for j := range ctmp {
   470  					ctmp[j] = alpha * f32.DotUnitary(a[j*lda:j*lda+k], atmp)
   471  				}
   472  			} else {
   473  				for j, vc := range ctmp {
   474  					ctmp[j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp)
   475  				}
   476  			}
   477  		}
   478  		return
   479  	}
   480  	// Cases where a is transposed.
   481  	if ul == blas.Upper {
   482  		for i := 0; i < n; i++ {
   483  			ctmp := c[i*ldc+i : i*ldc+n]
   484  			if beta == 0 {
   485  				for j := range ctmp {
   486  					ctmp[j] = 0
   487  				}
   488  			} else if beta != 1 {
   489  				for j := range ctmp {
   490  					ctmp[j] *= beta
   491  				}
   492  			}
   493  			for l := 0; l < k; l++ {
   494  				tmp := alpha * a[l*lda+i]
   495  				if tmp != 0 {
   496  					f32.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
   497  				}
   498  			}
   499  		}
   500  		return
   501  	}
   502  	for i := 0; i < n; i++ {
   503  		ctmp := c[i*ldc : i*ldc+i+1]
   504  		if beta != 1 {
   505  			for j := range ctmp {
   506  				ctmp[j] *= beta
   507  			}
   508  		}
   509  		for l := 0; l < k; l++ {
   510  			tmp := alpha * a[l*lda+i]
   511  			if tmp != 0 {
   512  				f32.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
   513  			}
   514  		}
   515  	}
   516  }
   517  
   518  // Ssyr2k performs one of the symmetric rank 2k operations
   519  //  C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
   520  //  C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
   521  // where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and
   522  // alpha and beta are scalars.
   523  //
   524  // Float32 implementations are autogenerated and not directly tested.
   525  func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
   526  	if ul != blas.Lower && ul != blas.Upper {
   527  		panic(badUplo)
   528  	}
   529  	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
   530  		panic(badTranspose)
   531  	}
   532  	if n < 0 {
   533  		panic(nLT0)
   534  	}
   535  	if k < 0 {
   536  		panic(kLT0)
   537  	}
   538  	row, col := k, n
   539  	if tA == blas.NoTrans {
   540  		row, col = n, k
   541  	}
   542  	if lda < max(1, col) {
   543  		panic(badLdA)
   544  	}
   545  	if ldb < max(1, col) {
   546  		panic(badLdB)
   547  	}
   548  	if ldc < max(1, n) {
   549  		panic(badLdC)
   550  	}
   551  
   552  	// Quick return if possible.
   553  	if n == 0 {
   554  		return
   555  	}
   556  
   557  	// For zero matrix size the following slice length checks are trivially satisfied.
   558  	if len(a) < lda*(row-1)+col {
   559  		panic(shortA)
   560  	}
   561  	if len(b) < ldb*(row-1)+col {
   562  		panic(shortB)
   563  	}
   564  	if len(c) < ldc*(n-1)+n {
   565  		panic(shortC)
   566  	}
   567  
   568  	if alpha == 0 {
   569  		if beta == 0 {
   570  			if ul == blas.Upper {
   571  				for i := 0; i < n; i++ {
   572  					ctmp := c[i*ldc+i : i*ldc+n]
   573  					for j := range ctmp {
   574  						ctmp[j] = 0
   575  					}
   576  				}
   577  				return
   578  			}
   579  			for i := 0; i < n; i++ {
   580  				ctmp := c[i*ldc : i*ldc+i+1]
   581  				for j := range ctmp {
   582  					ctmp[j] = 0
   583  				}
   584  			}
   585  			return
   586  		}
   587  		if ul == blas.Upper {
   588  			for i := 0; i < n; i++ {
   589  				ctmp := c[i*ldc+i : i*ldc+n]
   590  				for j := range ctmp {
   591  					ctmp[j] *= beta
   592  				}
   593  			}
   594  			return
   595  		}
   596  		for i := 0; i < n; i++ {
   597  			ctmp := c[i*ldc : i*ldc+i+1]
   598  			for j := range ctmp {
   599  				ctmp[j] *= beta
   600  			}
   601  		}
   602  		return
   603  	}
   604  	if tA == blas.NoTrans {
   605  		if ul == blas.Upper {
   606  			for i := 0; i < n; i++ {
   607  				atmp := a[i*lda : i*lda+k]
   608  				btmp := b[i*ldb : i*ldb+k]
   609  				ctmp := c[i*ldc+i : i*ldc+n]
   610  				if beta == 0 {
   611  					for jc := range ctmp {
   612  						j := i + jc
   613  						var tmp1, tmp2 float32
   614  						binner := b[j*ldb : j*ldb+k]
   615  						for l, v := range a[j*lda : j*lda+k] {
   616  							tmp1 += v * btmp[l]
   617  							tmp2 += atmp[l] * binner[l]
   618  						}
   619  						ctmp[jc] = alpha * (tmp1 + tmp2)
   620  					}
   621  				} else {
   622  					for jc := range ctmp {
   623  						j := i + jc
   624  						var tmp1, tmp2 float32
   625  						binner := b[j*ldb : j*ldb+k]
   626  						for l, v := range a[j*lda : j*lda+k] {
   627  							tmp1 += v * btmp[l]
   628  							tmp2 += atmp[l] * binner[l]
   629  						}
   630  						ctmp[jc] *= beta
   631  						ctmp[jc] += alpha * (tmp1 + tmp2)
   632  					}
   633  				}
   634  			}
   635  			return
   636  		}
   637  		for i := 0; i < n; i++ {
   638  			atmp := a[i*lda : i*lda+k]
   639  			btmp := b[i*ldb : i*ldb+k]
   640  			ctmp := c[i*ldc : i*ldc+i+1]
   641  			if beta == 0 {
   642  				for j := 0; j <= i; j++ {
   643  					var tmp1, tmp2 float32
   644  					binner := b[j*ldb : j*ldb+k]
   645  					for l, v := range a[j*lda : j*lda+k] {
   646  						tmp1 += v * btmp[l]
   647  						tmp2 += atmp[l] * binner[l]
   648  					}
   649  					ctmp[j] = alpha * (tmp1 + tmp2)
   650  				}
   651  			} else {
   652  				for j := 0; j <= i; j++ {
   653  					var tmp1, tmp2 float32
   654  					binner := b[j*ldb : j*ldb+k]
   655  					for l, v := range a[j*lda : j*lda+k] {
   656  						tmp1 += v * btmp[l]
   657  						tmp2 += atmp[l] * binner[l]
   658  					}
   659  					ctmp[j] *= beta
   660  					ctmp[j] += alpha * (tmp1 + tmp2)
   661  				}
   662  			}
   663  		}
   664  		return
   665  	}
   666  	if ul == blas.Upper {
   667  		for i := 0; i < n; i++ {
   668  			ctmp := c[i*ldc+i : i*ldc+n]
   669  			switch beta {
   670  			case 0:
   671  				for j := range ctmp {
   672  					ctmp[j] = 0
   673  				}
   674  			case 1:
   675  			default:
   676  				for j := range ctmp {
   677  					ctmp[j] *= beta
   678  				}
   679  			}
   680  			for l := 0; l < k; l++ {
   681  				tmp1 := alpha * b[l*ldb+i]
   682  				tmp2 := alpha * a[l*lda+i]
   683  				btmp := b[l*ldb+i : l*ldb+n]
   684  				if tmp1 != 0 || tmp2 != 0 {
   685  					for j, v := range a[l*lda+i : l*lda+n] {
   686  						ctmp[j] += v*tmp1 + btmp[j]*tmp2
   687  					}
   688  				}
   689  			}
   690  		}
   691  		return
   692  	}
   693  	for i := 0; i < n; i++ {
   694  		ctmp := c[i*ldc : i*ldc+i+1]
   695  		switch beta {
   696  		case 0:
   697  			for j := range ctmp {
   698  				ctmp[j] = 0
   699  			}
   700  		case 1:
   701  		default:
   702  			for j := range ctmp {
   703  				ctmp[j] *= beta
   704  			}
   705  		}
   706  		for l := 0; l < k; l++ {
   707  			tmp1 := alpha * b[l*ldb+i]
   708  			tmp2 := alpha * a[l*lda+i]
   709  			btmp := b[l*ldb : l*ldb+i+1]
   710  			if tmp1 != 0 || tmp2 != 0 {
   711  				for j, v := range a[l*lda : l*lda+i+1] {
   712  					ctmp[j] += v*tmp1 + btmp[j]*tmp2
   713  				}
   714  			}
   715  		}
   716  	}
   717  }
   718  
   719  // Strmm performs one of the matrix-matrix operations
   720  //  B = alpha * A * B   if tA == blas.NoTrans and side == blas.Left
   721  //  B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
   722  //  B = alpha * B * A   if tA == blas.NoTrans and side == blas.Right
   723  //  B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
   724  // where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
   725  //
   726  // Float32 implementations are autogenerated and not directly tested.
   727  func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
   728  	if s != blas.Left && s != blas.Right {
   729  		panic(badSide)
   730  	}
   731  	if ul != blas.Lower && ul != blas.Upper {
   732  		panic(badUplo)
   733  	}
   734  	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
   735  		panic(badTranspose)
   736  	}
   737  	if d != blas.NonUnit && d != blas.Unit {
   738  		panic(badDiag)
   739  	}
   740  	if m < 0 {
   741  		panic(mLT0)
   742  	}
   743  	if n < 0 {
   744  		panic(nLT0)
   745  	}
   746  	k := n
   747  	if s == blas.Left {
   748  		k = m
   749  	}
   750  	if lda < max(1, k) {
   751  		panic(badLdA)
   752  	}
   753  	if ldb < max(1, n) {
   754  		panic(badLdB)
   755  	}
   756  
   757  	// Quick return if possible.
   758  	if m == 0 || n == 0 {
   759  		return
   760  	}
   761  
   762  	// For zero matrix size the following slice length checks are trivially satisfied.
   763  	if len(a) < lda*(k-1)+k {
   764  		panic(shortA)
   765  	}
   766  	if len(b) < ldb*(m-1)+n {
   767  		panic(shortB)
   768  	}
   769  
   770  	if alpha == 0 {
   771  		for i := 0; i < m; i++ {
   772  			btmp := b[i*ldb : i*ldb+n]
   773  			for j := range btmp {
   774  				btmp[j] = 0
   775  			}
   776  		}
   777  		return
   778  	}
   779  
   780  	nonUnit := d == blas.NonUnit
   781  	if s == blas.Left {
   782  		if tA == blas.NoTrans {
   783  			if ul == blas.Upper {
   784  				for i := 0; i < m; i++ {
   785  					tmp := alpha
   786  					if nonUnit {
   787  						tmp *= a[i*lda+i]
   788  					}
   789  					btmp := b[i*ldb : i*ldb+n]
   790  					f32.ScalUnitary(tmp, btmp)
   791  					for ka, va := range a[i*lda+i+1 : i*lda+m] {
   792  						k := ka + i + 1
   793  						if va != 0 {
   794  							f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
   795  						}
   796  					}
   797  				}
   798  				return
   799  			}
   800  			for i := m - 1; i >= 0; i-- {
   801  				tmp := alpha
   802  				if nonUnit {
   803  					tmp *= a[i*lda+i]
   804  				}
   805  				btmp := b[i*ldb : i*ldb+n]
   806  				f32.ScalUnitary(tmp, btmp)
   807  				for k, va := range a[i*lda : i*lda+i] {
   808  					if va != 0 {
   809  						f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
   810  					}
   811  				}
   812  			}
   813  			return
   814  		}
   815  		// Cases where a is transposed.
   816  		if ul == blas.Upper {
   817  			for k := m - 1; k >= 0; k-- {
   818  				btmpk := b[k*ldb : k*ldb+n]
   819  				for ia, va := range a[k*lda+k+1 : k*lda+m] {
   820  					i := ia + k + 1
   821  					btmp := b[i*ldb : i*ldb+n]
   822  					if va != 0 {
   823  						f32.AxpyUnitary(alpha*va, btmpk, btmp)
   824  					}
   825  				}
   826  				tmp := alpha
   827  				if nonUnit {
   828  					tmp *= a[k*lda+k]
   829  				}
   830  				if tmp != 1 {
   831  					f32.ScalUnitary(tmp, btmpk)
   832  				}
   833  			}
   834  			return
   835  		}
   836  		for k := 0; k < m; k++ {
   837  			btmpk := b[k*ldb : k*ldb+n]
   838  			for i, va := range a[k*lda : k*lda+k] {
   839  				btmp := b[i*ldb : i*ldb+n]
   840  				if va != 0 {
   841  					f32.AxpyUnitary(alpha*va, btmpk, btmp)
   842  				}
   843  			}
   844  			tmp := alpha
   845  			if nonUnit {
   846  				tmp *= a[k*lda+k]
   847  			}
   848  			if tmp != 1 {
   849  				f32.ScalUnitary(tmp, btmpk)
   850  			}
   851  		}
   852  		return
   853  	}
   854  	// Cases where a is on the right
   855  	if tA == blas.NoTrans {
   856  		if ul == blas.Upper {
   857  			for i := 0; i < m; i++ {
   858  				btmp := b[i*ldb : i*ldb+n]
   859  				for k := n - 1; k >= 0; k-- {
   860  					tmp := alpha * btmp[k]
   861  					if tmp == 0 {
   862  						continue
   863  					}
   864  					btmp[k] = tmp
   865  					if nonUnit {
   866  						btmp[k] *= a[k*lda+k]
   867  					}
   868  					f32.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
   869  				}
   870  			}
   871  			return
   872  		}
   873  		for i := 0; i < m; i++ {
   874  			btmp := b[i*ldb : i*ldb+n]
   875  			for k := 0; k < n; k++ {
   876  				tmp := alpha * btmp[k]
   877  				if tmp == 0 {
   878  					continue
   879  				}
   880  				btmp[k] = tmp
   881  				if nonUnit {
   882  					btmp[k] *= a[k*lda+k]
   883  				}
   884  				f32.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
   885  			}
   886  		}
   887  		return
   888  	}
   889  	// Cases where a is transposed.
   890  	if ul == blas.Upper {
   891  		for i := 0; i < m; i++ {
   892  			btmp := b[i*ldb : i*ldb+n]
   893  			for j, vb := range btmp {
   894  				tmp := vb
   895  				if nonUnit {
   896  					tmp *= a[j*lda+j]
   897  				}
   898  				tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
   899  				btmp[j] = alpha * tmp
   900  			}
   901  		}
   902  		return
   903  	}
   904  	for i := 0; i < m; i++ {
   905  		btmp := b[i*ldb : i*ldb+n]
   906  		for j := n - 1; j >= 0; j-- {
   907  			tmp := btmp[j]
   908  			if nonUnit {
   909  				tmp *= a[j*lda+j]
   910  			}
   911  			tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
   912  			btmp[j] = alpha * tmp
   913  		}
   914  	}
   915  }