gonum.org/v1/gonum@v0.14.0/lapack/gonum/dlaqr5.go (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package gonum
     6  
     7  import (
     8  	"math"
     9  
    10  	"gonum.org/v1/gonum/blas"
    11  	"gonum.org/v1/gonum/blas/blas64"
    12  )
    13  
    14  // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
    15  // block of a Hessenberg matrix.
    16  //
    17  // wantt and wantz determine whether the quasi-triangular Schur factor and the
    18  // orthogonal Schur factor, respectively, will be computed.
    19  //
    20  // kacc22 specifies the computation mode of far-from-diagonal orthogonal
    21  // updates. Permitted values are:
    22  //
    23  //	0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
    24  //	   multiply to update far-from-diagonal matrix entries.
    25  //	1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
    26  //	   update far-from-diagonal matrix entries.
    27  //	2: Same as kacc22=1. This option used to enable exploiting the 2×2 structure
    28  //	   during matrix multiplications, but this is no longer supported.
    29  //
    30  // For other values of kacc2 Dlaqr5 will panic.
    31  //
    32  // n is the order of the Hessenberg matrix H.
    33  //
    34  // ktop and kbot are indices of the first and last row and column of an isolated
    35  // diagonal block upon which the QR sweep will be applied. It must hold that
    36  //
    37  //	ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
    38  //	kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
    39  //
    40  // otherwise Dlaqr5 will panic.
    41  //
    42  // nshfts is the number of simultaneous shifts. It must be positive and even,
    43  // otherwise Dlaqr5 will panic.
    44  //
    45  // sr and si contain the real and imaginary parts, respectively, of the shifts
    46  // of origin that define the multi-shift QR sweep. On return both slices may be
    47  // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
    48  // will panic.
    49  //
    50  // h and ldh represent the Hessenberg matrix H of size n×n. On return
    51  // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
    52  // diagonal block in rows and columns ktop through kbot, inclusive.
    53  //
    54  // iloz and ihiz specify the rows of Z to which transformations will be applied
    55  // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
    56  // will panic.
    57  //
    58  // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
    59  // orthogonal similarity transformation is accumulated into
    60  // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced.
    61  //
    62  // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
    63  // is transposed with respect to the reference netlib implementation.
    64  //
    65  // u and ldu represent an auxiliary matrix of size (2*nshfts)×(2*nshfts).
    66  //
    67  // wh and ldwh represent an auxiliary matrix of size (2*nshfts-1)×nh.
    68  //
    69  // wv and ldwv represent an auxiliary matrix of size nv×(2*nshfts-1).
    70  //
    71  // Dlaqr5 is an internal routine. It is exported for testing purposes.
    72  func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
    73  	switch {
    74  	case kacc22 != 0 && kacc22 != 1 && kacc22 != 2:
    75  		panic(badKacc22)
    76  	case n < 0:
    77  		panic(nLT0)
    78  	case ktop < 0 || n <= ktop:
    79  		panic(badKtop)
    80  	case kbot < 0 || n <= kbot:
    81  		panic(badKbot)
    82  
    83  	case nshfts < 0:
    84  		panic(nshftsLT0)
    85  	case nshfts&0x1 != 0:
    86  		panic(nshftsOdd)
    87  	case len(sr) != nshfts:
    88  		panic(badLenSr)
    89  	case len(si) != nshfts:
    90  		panic(badLenSi)
    91  
    92  	case ldh < max(1, n):
    93  		panic(badLdH)
    94  	case len(h) < (n-1)*ldh+n:
    95  		panic(shortH)
    96  
    97  	case wantz && ihiz >= n:
    98  		panic(badIhiz)
    99  	case wantz && iloz < 0 || ihiz < iloz:
   100  		panic(badIloz)
   101  	case ldz < 1, wantz && ldz < n:
   102  		panic(badLdZ)
   103  	case wantz && len(z) < (n-1)*ldz+n:
   104  		panic(shortZ)
   105  
   106  	case ldv < 3:
   107  		// V is transposed w.r.t. reference lapack.
   108  		panic(badLdV)
   109  	case len(v) < (nshfts/2-1)*ldv+3:
   110  		panic(shortV)
   111  
   112  	case ldu < max(1, 2*nshfts):
   113  		panic(badLdU)
   114  	case len(u) < (2*nshfts-1)*ldu+2*nshfts:
   115  		panic(shortU)
   116  
   117  	case nv < 0:
   118  		panic(nvLT0)
   119  	case ldwv < max(1, 2*nshfts):
   120  		panic(badLdWV)
   121  	case len(wv) < (nv-1)*ldwv+2*nshfts:
   122  		panic(shortWV)
   123  
   124  	case nh < 0:
   125  		panic(nhLT0)
   126  	case ldwh < max(1, nh):
   127  		panic(badLdWH)
   128  	case len(wh) < (2*nshfts-1)*ldwh+nh:
   129  		panic(shortWH)
   130  
   131  	case ktop > 0 && h[ktop*ldh+ktop-1] != 0:
   132  		panic(notIsolated)
   133  	case kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0:
   134  		panic(notIsolated)
   135  	}
   136  
   137  	// If there are no shifts, then there is nothing to do.
   138  	if nshfts < 2 {
   139  		return
   140  	}
   141  	// If the active block is empty or 1×1, then there is nothing to do.
   142  	if ktop >= kbot {
   143  		return
   144  	}
   145  
   146  	// Shuffle shifts into pairs of real shifts and pairs of complex
   147  	// conjugate shifts assuming complex conjugate shifts are already
   148  	// adjacent to one another.
   149  	for i := 0; i < nshfts-2; i += 2 {
   150  		if si[i] == -si[i+1] {
   151  			continue
   152  		}
   153  		sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
   154  		si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
   155  	}
   156  
   157  	// Note: lapack says that nshfts must be even but allows it to be odd
   158  	// anyway. We panic above if nshfts is not even, so reducing it by one
   159  	// is unnecessary. The only caller Dlaqr04 uses only even nshfts.
   160  	//
   161  	// The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
   162  	// *     ==== NSHFTS is supposed to be even, but if it is odd,
   163  	// *     .    then simply reduce it by one.  The shuffle above
   164  	// *     .    ensures that the dropped shift is real and that
   165  	// *     .    the remaining shifts are paired. ====
   166  	// *
   167  	//      NS = NSHFTS - MOD( NSHFTS, 2 )
   168  	ns := nshfts
   169  
   170  	safmin := dlamchS
   171  	ulp := dlamchP
   172  	smlnum := safmin * float64(n) / ulp
   173  
   174  	// Use accumulated reflections to update far-from-diagonal entries?
   175  	accum := kacc22 == 1 || kacc22 == 2
   176  
   177  	// Clear trash.
   178  	if ktop+2 <= kbot {
   179  		h[(ktop+2)*ldh+ktop] = 0
   180  	}
   181  
   182  	// nbmps = number of 2-shift bulges in the chain.
   183  	nbmps := ns / 2
   184  
   185  	// kdu = width of slab.
   186  	kdu := 4 * nbmps
   187  
   188  	// Create and chase chains of nbmps bulges.
   189  	for incol := ktop - 2*nbmps + 1; incol <= kbot-2; incol += 2 * nbmps {
   190  		// jtop is an index from which updates from the right start.
   191  		var jtop int
   192  		switch {
   193  		case accum:
   194  			jtop = max(ktop, incol)
   195  		case wantt:
   196  		default:
   197  			jtop = ktop
   198  		}
   199  		ndcol := incol + kdu
   200  		if accum {
   201  			impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
   202  		}
   203  		// Near-the-diagonal bulge chase. The following loop performs
   204  		// the near-the-diagonal part of a small bulge multi-shift QR
   205  		// sweep. Each 4*nbmps column diagonal chunk extends from
   206  		// column incol to column ndcol (including both column incol and
   207  		// column ndcol). The following loop chases a 2*nbmps+1 column
   208  		// long chain of nbmps bulges 2*nbmps columns to the right.
   209  		// (incol may be less than ktop and ndcol may be greater than
   210  		// kbot indicating phantom columns from which to chase bulges
   211  		// before they are actually introduced or to which to chase
   212  		// bulges beyond column kbot.)
   213  		for krcol := incol; krcol <= min(incol+2*nbmps-1, kbot-2); krcol++ {
   214  			// Bulges number mtop to mbot are active double implicit
   215  			// shift bulges. There may or may not also be small 2×2
   216  			// bulge, if there is room. The inactive bulges (if any)
   217  			// must wait until the active bulges have moved down the
   218  			// diagonal to make room. The phantom matrix paradigm
   219  			// described above helps keep track.
   220  			mtop := max(0, (ktop-krcol)/2)
   221  			mbot := min(nbmps, (kbot-krcol-1)/2) - 1
   222  			m22 := mbot + 1
   223  			bmp22 := (mbot < nbmps-1) && (krcol+2*m22 == kbot-2)
   224  			// Generate reflections to chase the chain right one column.
   225  			// The minimum value of k is ktop-1.
   226  			if bmp22 {
   227  				// Special case: 2×2 reflection at bottom treated separately.
   228  				k := krcol + 2*m22
   229  				if k == ktop-1 {
   230  					impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
   231  						sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
   232  						v[m22*ldv:m22*ldv+2])
   233  					beta := v[m22*ldv]
   234  					_, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
   235  				} else {
   236  					beta := h[(k+1)*ldh+k]
   237  					v[m22*ldv+1] = h[(k+2)*ldh+k]
   238  					beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
   239  					h[(k+1)*ldh+k] = beta
   240  					h[(k+2)*ldh+k] = 0
   241  				}
   242  				// Perform update from right within computational window.
   243  				t1 := v[m22*ldv]
   244  				t2 := t1 * v[m22*ldv+1]
   245  				for j := jtop; j <= min(kbot, k+3); j++ {
   246  					refsum := h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2]
   247  					h[j*ldh+k+1] -= refsum * t1
   248  					h[j*ldh+k+2] -= refsum * t2
   249  				}
   250  				// Perform update from left within computational window.
   251  				var jbot int
   252  				switch {
   253  				case accum:
   254  					jbot = min(ndcol, kbot)
   255  				case wantt:
   256  					jbot = n - 1
   257  				default:
   258  					jbot = kbot
   259  				}
   260  				t1 = v[m22*ldv]
   261  				t2 = t1 * v[m22*ldv+1]
   262  				for j := k + 1; j <= jbot; j++ {
   263  					refsum := h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j]
   264  					h[(k+1)*ldh+j] -= refsum * t1
   265  					h[(k+2)*ldh+j] -= refsum * t2
   266  				}
   267  				// The following convergence test requires that the traditional
   268  				// small-compared-to-nearby-diagonals criterion and the Ahues &
   269  				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
   270  				// improves accuracy in some examples. Falling back on an alternate
   271  				// convergence criterion when tst1 or tst2 is zero (as done here) is
   272  				// traditional but probably unnecessary.
   273  				if k >= ktop && h[(k+1)*ldh+k] != 0 {
   274  					tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
   275  					if tst1 == 0 {
   276  						if k >= ktop+1 {
   277  							tst1 += math.Abs(h[k*ldh+k-1])
   278  						}
   279  						if k >= ktop+2 {
   280  							tst1 += math.Abs(h[k*ldh+k-2])
   281  						}
   282  						if k >= ktop+3 {
   283  							tst1 += math.Abs(h[k*ldh+k-3])
   284  						}
   285  						if k <= kbot-2 {
   286  							tst1 += math.Abs(h[(k+2)*ldh+k+1])
   287  						}
   288  						if k <= kbot-3 {
   289  							tst1 += math.Abs(h[(k+3)*ldh+k+1])
   290  						}
   291  						if k <= kbot-4 {
   292  							tst1 += math.Abs(h[(k+4)*ldh+k+1])
   293  						}
   294  					}
   295  					if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
   296  						h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   297  						h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   298  						h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   299  						h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   300  						scl := h11 + h12
   301  						tst2 := h22 * (h11 / scl)
   302  						if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
   303  							h[(k+1)*ldh+k] = 0
   304  						}
   305  					}
   306  				}
   307  				// Accumulate orthogonal transformations.
   308  				if accum {
   309  					kms := k - incol - 1
   310  					t1 = v[m22*ldv]
   311  					t2 = t1 * v[m22*ldv+1]
   312  					for j := max(0, ktop-incol-1); j < kdu; j++ {
   313  						refsum := u[j*ldu+kms+1] + v[m22*ldv+1]*u[j*ldu+kms+2]
   314  						u[j*ldu+kms+1] -= refsum * t1
   315  						u[j*ldu+kms+2] -= refsum * t2
   316  					}
   317  				} else if wantz {
   318  					t1 = v[m22*ldv]
   319  					t2 = t1 * v[m22*ldv+1]
   320  					for j := iloz; j <= ihiz; j++ {
   321  						refsum := z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2]
   322  						z[j*ldz+k+1] -= refsum * t1
   323  						z[j*ldz+k+2] -= refsum * t2
   324  					}
   325  				}
   326  			}
   327  			// Normal case: Chain of 3×3 reflections.
   328  			for m := mbot; m >= mtop; m-- {
   329  				k := krcol + 2*m
   330  				if k == ktop-1 {
   331  					impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
   332  						sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
   333  						v[m*ldv:m*ldv+3])
   334  					alpha := v[m*ldv]
   335  					_, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
   336  				} else {
   337  					// Perform delayed transformation of row below m-th bulge.
   338  					// Exploit fact that first two elements of row are actually
   339  					// zero.
   340  					t1 := v[m*ldv]
   341  					t2 := t1 * v[m*ldv+1]
   342  					t3 := t1 * v[m*ldv+2]
   343  					refsum := v[m*ldv+2] * h[(k+3)*ldh+k+2]
   344  					h[(k+3)*ldh+k] = -refsum * t1
   345  					h[(k+3)*ldh+k+1] = -refsum * t2
   346  					h[(k+3)*ldh+k+2] -= refsum * t3
   347  					// Calculate reflection to move m-th bulge one step.
   348  					beta := h[(k+1)*ldh+k]
   349  					v[m*ldv+1] = h[(k+2)*ldh+k]
   350  					v[m*ldv+2] = h[(k+3)*ldh+k]
   351  					beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)
   352  					// A bulge may collapse because of vigilant deflation or
   353  					// destructive underflow. In the underflow case, try the
   354  					// two-small-subdiagonals trick to try to reinflate the
   355  					// bulge.
   356  					if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
   357  						// Typical case: not collapsed (yet).
   358  						h[(k+1)*ldh+k] = beta
   359  						h[(k+2)*ldh+k] = 0
   360  						h[(k+3)*ldh+k] = 0
   361  					} else {
   362  						// Atypical case: collapsed. Attempt to reintroduce
   363  						// ignoring H[k+1,k] and H[k+2,k]. If the fill resulting
   364  						// from the new reflector is too large, then abandon it.
   365  						// Otherwise, use the new one.
   366  						var vt [3]float64
   367  						impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh,
   368  							sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
   369  							vt[:])
   370  						_, vt[0] = impl.Dlarfg(3, vt[0], vt[1:3], 1)
   371  						t1 = vt[0]
   372  						t2 = t1 * vt[1]
   373  						t3 = t1 * vt[2]
   374  						refsum = h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k]
   375  						dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
   376  						if math.Abs(h[(k+2)*ldh+k]-refsum*t2)+math.Abs(refsum*t3) > ulp*dsum {
   377  							// Starting a new bulge here would create
   378  							// non-negligible fill. Use the old one with
   379  							// trepidation.
   380  							h[(k+1)*ldh+k] = beta
   381  							h[(k+2)*ldh+k] = 0
   382  							h[(k+3)*ldh+k] = 0
   383  						} else {
   384  							// Starting a new bulge here would create only
   385  							// negligible fill. Replace the old reflector with
   386  							// the new one.
   387  							h[(k+1)*ldh+k] -= refsum * t1
   388  							h[(k+2)*ldh+k] = 0
   389  							h[(k+3)*ldh+k] = 0
   390  							v[m*ldv] = vt[0]
   391  							v[m*ldv+1] = vt[1]
   392  							v[m*ldv+2] = vt[2]
   393  						}
   394  					}
   395  				}
   396  				// Apply reflection from the right and the first column of
   397  				// update from the left. These updates are required for the
   398  				// vigilant deflation check. We still delay most of the updates
   399  				// from the left for efficiency.
   400  				t1 := v[m*ldv]
   401  				t2 := t1 * v[m*ldv+1]
   402  				t3 := t1 * v[m*ldv+2]
   403  				for j := jtop; j <= min(kbot, k+3); j++ {
   404  					refsum := h[j*ldh+k+1] + v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3]
   405  					h[j*ldh+k+1] -= refsum * t1
   406  					h[j*ldh+k+2] -= refsum * t2
   407  					h[j*ldh+k+3] -= refsum * t3
   408  				}
   409  				// Perform update from left for subsequent column.
   410  				refsum := h[(k+1)*ldh+k+1] + v[m*ldv+1]*h[(k+2)*ldh+k+1] + v[m*ldv+2]*h[(k+3)*ldh+k+1]
   411  				h[(k+1)*ldh+k+1] -= refsum * t1
   412  				h[(k+2)*ldh+k+1] -= refsum * t2
   413  				h[(k+3)*ldh+k+1] -= refsum * t3
   414  				// The following convergence test requires that the tradition
   415  				// small-compared-to-nearby-diagonals criterion and the Ahues &
   416  				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The
   417  				// latter improves accuracy in some examples. Falling back on an
   418  				// alternate convergence criterion when tst1 or tst2 is zero (as
   419  				// done here) is traditional but probably unnecessary.
   420  				if k < ktop {
   421  					continue
   422  				}
   423  				if h[(k+1)*ldh+k] != 0 {
   424  					tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
   425  					if tst1 == 0 {
   426  						if k >= ktop+1 {
   427  							tst1 += math.Abs(h[k*ldh+k-1])
   428  						}
   429  						if k >= ktop+2 {
   430  							tst1 += math.Abs(h[k*ldh+k-2])
   431  						}
   432  						if k >= ktop+3 {
   433  							tst1 += math.Abs(h[k*ldh+k-3])
   434  						}
   435  						if k <= kbot-2 {
   436  							tst1 += math.Abs(h[(k+2)*ldh+k+1])
   437  						}
   438  						if k <= kbot-3 {
   439  							tst1 += math.Abs(h[(k+3)*ldh+k+1])
   440  						}
   441  						if k <= kbot-4 {
   442  							tst1 += math.Abs(h[(k+4)*ldh+k+1])
   443  						}
   444  					}
   445  					if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
   446  						h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   447  						h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   448  						h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   449  						h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   450  						scl := h11 + h12
   451  						tst2 := h22 * (h11 / scl)
   452  						if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
   453  							h[(k+1)*ldh+k] = 0
   454  						}
   455  					}
   456  				}
   457  			}
   458  			// Multiply H by reflections from the left.
   459  			var jbot int
   460  			switch {
   461  			case accum:
   462  				jbot = min(ndcol, kbot)
   463  			case wantt:
   464  				jbot = n - 1
   465  			default:
   466  				jbot = kbot
   467  			}
   468  			for m := mbot; m >= mtop; m-- {
   469  				k := krcol + 2*m
   470  				t1 := v[m*ldv]
   471  				t2 := t1 * v[m*ldv+1]
   472  				t3 := t1 * v[m*ldv+2]
   473  				for j := max(ktop, krcol+2*(m+1)); j <= jbot; j++ {
   474  					refsum := h[(k+1)*ldh+j] + v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j]
   475  					h[(k+1)*ldh+j] -= refsum * t1
   476  					h[(k+2)*ldh+j] -= refsum * t2
   477  					h[(k+3)*ldh+j] -= refsum * t3
   478  				}
   479  			}
   480  			// Accumulate orthogonal transformations.
   481  			if accum {
   482  				// Accumulate U. If necessary, update Z later with an
   483  				// efficient matrix-matrix multiply.
   484  				for m := mbot; m >= mtop; m-- {
   485  					k := krcol + 2*m
   486  					kms := k - incol - 1
   487  					i2 := max(0, ktop-incol-1)
   488  					i2 = max(i2, kms-(krcol-incol))
   489  					i4 := min(kdu, krcol+2*mbot-incol+5)
   490  					t1 := v[m*ldv]
   491  					t2 := t1 * v[m*ldv+1]
   492  					t3 := t1 * v[m*ldv+2]
   493  					for j := i2; j < i4; j++ {
   494  						refsum := u[j*ldu+kms+1] + v[m*ldv+1]*u[j*ldu+kms+2] + v[m*ldv+2]*u[j*ldu+kms+3]
   495  						u[j*ldu+kms+1] -= refsum * t1
   496  						u[j*ldu+kms+2] -= refsum * t2
   497  						u[j*ldu+kms+3] -= refsum * t3
   498  					}
   499  				}
   500  			} else if wantz {
   501  				// U is not accumulated, so update Z now by multiplying by
   502  				// reflections from the right.
   503  				for m := mbot; m >= mtop; m-- {
   504  					k := krcol + 2*m
   505  					t1 := v[m*ldv]
   506  					t2 := t1 * v[m*ldv+1]
   507  					t3 := t1 * v[m*ldv+2]
   508  					for j := iloz; j <= ihiz; j++ {
   509  						refsum := z[j*ldz+k+1] + v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3]
   510  						z[j*ldz+k+1] -= refsum * t1
   511  						z[j*ldz+k+2] -= refsum * t2
   512  						z[j*ldz+k+3] -= refsum * t3
   513  					}
   514  				}
   515  			}
   516  		}
   517  		// Use U (if accumulated) to update far-from-diagonal entries in H.
   518  		// If required, use U to update Z as well.
   519  		if !accum {
   520  			continue
   521  		}
   522  		jtop, jbot := ktop, kbot
   523  		if wantt {
   524  			jtop = 0
   525  			jbot = n - 1
   526  		}
   527  		bi := blas64.Implementation()
   528  		k1 := max(0, ktop-incol-1)
   529  		nu := kdu - max(0, ndcol-kbot) - k1
   530  		// Horizontal multiply.
   531  		for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
   532  			jlen := min(nh, jbot-jcol+1)
   533  			bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
   534  				1, u[k1*ldu+k1:], ldu,
   535  				h[(incol+k1+1)*ldh+jcol:], ldh,
   536  				0, wh, ldwh)
   537  			impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k1+1)*ldh+jcol:], ldh)
   538  		}
   539  		// Vertical multiply.
   540  		for jrow := jtop; jrow < max(ktop, incol); jrow += nv {
   541  			jlen := min(nv, max(ktop, incol)-jrow)
   542  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
   543  				1, h[jrow*ldh+incol+k1+1:], ldh,
   544  				u[k1*ldu+k1:], ldu,
   545  				0, wv, ldwv)
   546  			impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k1+1:], ldh)
   547  		}
   548  		// Z multiply (also vertical).
   549  		if wantz {
   550  			for jrow := iloz; jrow <= ihiz; jrow += nv {
   551  				jlen := min(nv, ihiz-jrow+1)
   552  				bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
   553  					1, z[jrow*ldz+incol+k1+1:], ldz,
   554  					u[k1*ldu+k1:], ldu,
   555  					0, wv, ldwv)
   556  				impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k1+1:], ldz)
   557  			}
   558  		}
   559  	}
   560  }