github.com/gopherd/gonum@v0.0.4/lapack/gonum/dlaqr5.go (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package gonum
     6  
     7  import (
     8  	"math"
     9  
    10  	"github.com/gopherd/gonum/blas"
    11  	"github.com/gopherd/gonum/blas/blas64"
    12  )
    13  
    14  // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
    15  // block of a Hessenberg matrix.
    16  //
    17  // wantt and wantz determine whether the quasi-triangular Schur factor and the
    18  // orthogonal Schur factor, respectively, will be computed.
    19  //
    20  // kacc22 specifies the computation mode of far-from-diagonal orthogonal
    21  // updates. Permitted values are:
    22  //  0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
    23  //     multiply to update far-from-diagonal matrix entries.
    24  //  1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
    25  //     update far-from-diagonal matrix entries.
    26  //  2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update
    27  //     far-from-diagonal matrix entries, and take advantage of 2×2 block
    28  //     structure during matrix multiplies.
    29  // For other values of kacc2 Dlaqr5 will panic.
    30  //
    31  // n is the order of the Hessenberg matrix H.
    32  //
    33  // ktop and kbot are indices of the first and last row and column of an isolated
    34  // diagonal block upon which the QR sweep will be applied. It must hold that
    35  //  ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
    36  //  kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
    37  // otherwise Dlaqr5 will panic.
    38  //
    39  // nshfts is the number of simultaneous shifts. It must be positive and even,
    40  // otherwise Dlaqr5 will panic.
    41  //
    42  // sr and si contain the real and imaginary parts, respectively, of the shifts
    43  // of origin that define the multi-shift QR sweep. On return both slices may be
    44  // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
    45  // will panic.
    46  //
    47  // h and ldh represent the Hessenberg matrix H of size n×n. On return
    48  // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
    49  // diagonal block in rows and columns ktop through kbot, inclusive.
    50  //
    51  // iloz and ihiz specify the rows of Z to which transformations will be applied
    52  // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
    53  // will panic.
    54  //
    55  // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
    56  // orthogonal similarity transformation is accumulated into
    57  // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced.
    58  //
    59  // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
    60  // is transposed with respect to the reference netlib implementation.
    61  //
    62  // u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3).
    63  //
    64  // wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh.
    65  //
    66  // wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3).
    67  //
    68  // Dlaqr5 is an internal routine. It is exported for testing purposes.
    69  func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
    70  	switch {
    71  	case kacc22 != 0 && kacc22 != 1 && kacc22 != 2:
    72  		panic(badKacc22)
    73  	case n < 0:
    74  		panic(nLT0)
    75  	case ktop < 0 || n <= ktop:
    76  		panic(badKtop)
    77  	case kbot < 0 || n <= kbot:
    78  		panic(badKbot)
    79  
    80  	case nshfts < 0:
    81  		panic(nshftsLT0)
    82  	case nshfts&0x1 != 0:
    83  		panic(nshftsOdd)
    84  	case len(sr) != nshfts:
    85  		panic(badLenSr)
    86  	case len(si) != nshfts:
    87  		panic(badLenSi)
    88  
    89  	case ldh < max(1, n):
    90  		panic(badLdH)
    91  	case len(h) < (n-1)*ldh+n:
    92  		panic(shortH)
    93  
    94  	case wantz && ihiz >= n:
    95  		panic(badIhiz)
    96  	case wantz && iloz < 0 || ihiz < iloz:
    97  		panic(badIloz)
    98  	case ldz < 1, wantz && ldz < n:
    99  		panic(badLdZ)
   100  	case wantz && len(z) < (n-1)*ldz+n:
   101  		panic(shortZ)
   102  
   103  	case ldv < 3:
   104  		// V is transposed w.r.t. reference lapack.
   105  		panic(badLdV)
   106  	case len(v) < (nshfts/2-1)*ldv+3:
   107  		panic(shortV)
   108  
   109  	case ldu < max(1, 3*nshfts-3):
   110  		panic(badLdU)
   111  	case len(u) < (3*nshfts-3-1)*ldu+3*nshfts-3:
   112  		panic(shortU)
   113  
   114  	case nv < 0:
   115  		panic(nvLT0)
   116  	case ldwv < max(1, 3*nshfts-3):
   117  		panic(badLdWV)
   118  	case len(wv) < (nv-1)*ldwv+3*nshfts-3:
   119  		panic(shortWV)
   120  
   121  	case nh < 0:
   122  		panic(nhLT0)
   123  	case ldwh < max(1, nh):
   124  		panic(badLdWH)
   125  	case len(wh) < (3*nshfts-3-1)*ldwh+nh:
   126  		panic(shortWH)
   127  
   128  	case ktop > 0 && h[ktop*ldh+ktop-1] != 0:
   129  		panic(notIsolated)
   130  	case kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0:
   131  		panic(notIsolated)
   132  	}
   133  
   134  	// If there are no shifts, then there is nothing to do.
   135  	if nshfts < 2 {
   136  		return
   137  	}
   138  	// If the active block is empty or 1×1, then there is nothing to do.
   139  	if ktop >= kbot {
   140  		return
   141  	}
   142  
   143  	// Shuffle shifts into pairs of real shifts and pairs of complex
   144  	// conjugate shifts assuming complex conjugate shifts are already
   145  	// adjacent to one another.
   146  	for i := 0; i < nshfts-2; i += 2 {
   147  		if si[i] == -si[i+1] {
   148  			continue
   149  		}
   150  		sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
   151  		si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
   152  	}
   153  
   154  	// Note: lapack says that nshfts must be even but allows it to be odd
   155  	// anyway. We panic above if nshfts is not even, so reducing it by one
   156  	// is unnecessary. The only caller Dlaqr04 uses only even nshfts.
   157  	//
   158  	// The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
   159  	// *     ==== NSHFTS is supposed to be even, but if it is odd,
   160  	// *     .    then simply reduce it by one.  The shuffle above
   161  	// *     .    ensures that the dropped shift is real and that
   162  	// *     .    the remaining shifts are paired. ====
   163  	// *
   164  	//      NS = NSHFTS - MOD( NSHFTS, 2 )
   165  	ns := nshfts
   166  
   167  	safmin := dlamchS
   168  	ulp := dlamchP
   169  	smlnum := safmin * float64(n) / ulp
   170  
   171  	// Use accumulated reflections to update far-from-diagonal entries?
   172  	accum := kacc22 == 1 || kacc22 == 2
   173  	// If so, exploit the 2×2 block structure?
   174  	blk22 := ns > 2 && kacc22 == 2
   175  
   176  	// Clear trash.
   177  	if ktop+2 <= kbot {
   178  		h[(ktop+2)*ldh+ktop] = 0
   179  	}
   180  
   181  	// nbmps = number of 2-shift bulges in the chain.
   182  	nbmps := ns / 2
   183  
   184  	// kdu = width of slab.
   185  	kdu := 6*nbmps - 3
   186  
   187  	// Create and chase chains of nbmps bulges.
   188  	for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 {
   189  		ndcol := incol + kdu
   190  		if accum {
   191  			impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
   192  		}
   193  
   194  		// Near-the-diagonal bulge chase. The following loop performs
   195  		// the near-the-diagonal part of a small bulge multi-shift QR
   196  		// sweep. Each 6*nbmps-2 column diagonal chunk extends from
   197  		// column incol to column ndcol (including both column incol and
   198  		// column ndcol). The following loop chases a 3*nbmps column
   199  		// long chain of nbmps bulges 3*nbmps-2 columns to the right.
   200  		// (incol may be less than ktop and ndcol may be greater than
   201  		// kbot indicating phantom columns from which to chase bulges
   202  		// before they are actually introduced or to which to chase
   203  		// bulges beyond column kbot.)
   204  		for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ {
   205  			// Bulges number mtop to mbot are active double implicit
   206  			// shift bulges. There may or may not also be small 2×2
   207  			// bulge, if there is room. The inactive bulges (if any)
   208  			// must wait until the active bulges have moved down the
   209  			// diagonal to make room. The phantom matrix paradigm
   210  			// described above helps keep track.
   211  
   212  			mtop := max(0, ((ktop-1)-krcol+2)/3)
   213  			mbot := min(nbmps, (kbot-krcol)/3) - 1
   214  			m22 := mbot + 1
   215  			bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2)
   216  
   217  			// Generate reflections to chase the chain right one
   218  			// column. (The minimum value of k is ktop-1.)
   219  			for m := mtop; m <= mbot; m++ {
   220  				k := krcol + 3*m
   221  				if k == ktop-1 {
   222  					impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
   223  						sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
   224  						v[m*ldv:m*ldv+3])
   225  					alpha := v[m*ldv]
   226  					_, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
   227  					continue
   228  				}
   229  				beta := h[(k+1)*ldh+k]
   230  				v[m*ldv+1] = h[(k+2)*ldh+k]
   231  				v[m*ldv+2] = h[(k+3)*ldh+k]
   232  				beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)
   233  
   234  				// A bulge may collapse because of vigilant deflation or
   235  				// destructive underflow. In the underflow case, try the
   236  				// two-small-subdiagonals trick to try to reinflate the
   237  				// bulge.
   238  				if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
   239  					// Typical case: not collapsed (yet).
   240  					h[(k+1)*ldh+k] = beta
   241  					h[(k+2)*ldh+k] = 0
   242  					h[(k+3)*ldh+k] = 0
   243  					continue
   244  				}
   245  
   246  				// Atypical case: collapsed. Attempt to reintroduce
   247  				// ignoring H[k+1,k] and H[k+2,k]. If the fill
   248  				// resulting from the new reflector is too large,
   249  				// then abandon it. Otherwise, use the new one.
   250  				var vt [3]float64
   251  				impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m],
   252  					si[2*m], sr[2*m+1], si[2*m+1], vt[:])
   253  				alpha := vt[0]
   254  				_, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1)
   255  				refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k])
   256  
   257  				dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
   258  				if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum {
   259  					// Starting a new bulge here would create
   260  					// non-negligible fill. Use the old one with
   261  					// trepidation.
   262  					h[(k+1)*ldh+k] = beta
   263  					h[(k+2)*ldh+k] = 0
   264  					h[(k+3)*ldh+k] = 0
   265  					continue
   266  				} else {
   267  					// Starting a new bulge here would create
   268  					// only negligible fill. Replace the old
   269  					// reflector with the new one.
   270  					h[(k+1)*ldh+k] -= refsum
   271  					h[(k+2)*ldh+k] = 0
   272  					h[(k+3)*ldh+k] = 0
   273  					v[m*ldv] = vt[0]
   274  					v[m*ldv+1] = vt[1]
   275  					v[m*ldv+2] = vt[2]
   276  				}
   277  			}
   278  
   279  			// Generate a 2×2 reflection, if needed.
   280  			if bmp22 {
   281  				k := krcol + 3*m22
   282  				if k == ktop-1 {
   283  					impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
   284  						sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
   285  						v[m22*ldv:m22*ldv+2])
   286  					beta := v[m22*ldv]
   287  					_, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
   288  				} else {
   289  					beta := h[(k+1)*ldh+k]
   290  					v[m22*ldv+1] = h[(k+2)*ldh+k]
   291  					beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
   292  					h[(k+1)*ldh+k] = beta
   293  					h[(k+2)*ldh+k] = 0
   294  				}
   295  			}
   296  
   297  			// Multiply H by reflections from the left.
   298  			var jbot int
   299  			switch {
   300  			case accum:
   301  				jbot = min(ndcol, kbot)
   302  			case wantt:
   303  				jbot = n - 1
   304  			default:
   305  				jbot = kbot
   306  			}
   307  			for j := max(ktop, krcol); j <= jbot; j++ {
   308  				mend := min(mbot+1, (j-krcol+2)/3) - 1
   309  				for m := mtop; m <= mend; m++ {
   310  					k := krcol + 3*m
   311  					refsum := v[m*ldv] * (h[(k+1)*ldh+j] +
   312  						v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j])
   313  					h[(k+1)*ldh+j] -= refsum
   314  					h[(k+2)*ldh+j] -= refsum * v[m*ldv+1]
   315  					h[(k+3)*ldh+j] -= refsum * v[m*ldv+2]
   316  				}
   317  			}
   318  			if bmp22 {
   319  				k := krcol + 3*m22
   320  				for j := max(k+1, ktop); j <= jbot; j++ {
   321  					refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j])
   322  					h[(k+1)*ldh+j] -= refsum
   323  					h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1]
   324  				}
   325  			}
   326  
   327  			// Multiply H by reflections from the right. Delay filling in the last row
   328  			// until the vigilant deflation check is complete.
   329  			var jtop int
   330  			switch {
   331  			case accum:
   332  				jtop = max(ktop, incol)
   333  			case wantt:
   334  				jtop = 0
   335  			default:
   336  				jtop = ktop
   337  			}
   338  			for m := mtop; m <= mbot; m++ {
   339  				if v[m*ldv] == 0 {
   340  					continue
   341  				}
   342  				k := krcol + 3*m
   343  				for j := jtop; j <= min(kbot, k+3); j++ {
   344  					refsum := v[m*ldv] * (h[j*ldh+k+1] +
   345  						v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3])
   346  					h[j*ldh+k+1] -= refsum
   347  					h[j*ldh+k+2] -= refsum * v[m*ldv+1]
   348  					h[j*ldh+k+3] -= refsum * v[m*ldv+2]
   349  				}
   350  				if accum {
   351  					// Accumulate U. (If necessary, update Z later with an
   352  					// efficient matrix-matrix multiply.)
   353  					kms := k - incol
   354  					for j := max(0, ktop-incol-1); j < kdu; j++ {
   355  						refsum := v[m*ldv] * (u[j*ldu+kms] +
   356  							v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2])
   357  						u[j*ldu+kms] -= refsum
   358  						u[j*ldu+kms+1] -= refsum * v[m*ldv+1]
   359  						u[j*ldu+kms+2] -= refsum * v[m*ldv+2]
   360  					}
   361  				} else if wantz {
   362  					// U is not accumulated, so update Z now by multiplying by
   363  					// reflections from the right.
   364  					for j := iloz; j <= ihiz; j++ {
   365  						refsum := v[m*ldv] * (z[j*ldz+k+1] +
   366  							v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3])
   367  						z[j*ldz+k+1] -= refsum
   368  						z[j*ldz+k+2] -= refsum * v[m*ldv+1]
   369  						z[j*ldz+k+3] -= refsum * v[m*ldv+2]
   370  					}
   371  				}
   372  			}
   373  
   374  			// Special case: 2×2 reflection (if needed).
   375  			if bmp22 && v[m22*ldv] != 0 {
   376  				k := krcol + 3*m22
   377  				for j := jtop; j <= min(kbot, k+3); j++ {
   378  					refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2])
   379  					h[j*ldh+k+1] -= refsum
   380  					h[j*ldh+k+2] -= refsum * v[m22*ldv+1]
   381  				}
   382  				if accum {
   383  					kms := k - incol
   384  					for j := max(0, ktop-incol-1); j < kdu; j++ {
   385  						refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1])
   386  						u[j*ldu+kms] -= refsum
   387  						u[j*ldu+kms+1] -= refsum * v[m22*ldv+1]
   388  					}
   389  				} else if wantz {
   390  					for j := iloz; j <= ihiz; j++ {
   391  						refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2])
   392  						z[j*ldz+k+1] -= refsum
   393  						z[j*ldz+k+2] -= refsum * v[m22*ldv+1]
   394  					}
   395  				}
   396  			}
   397  
   398  			// Vigilant deflation check.
   399  			mstart := mtop
   400  			if krcol+3*mstart < ktop {
   401  				mstart++
   402  			}
   403  			mend := mbot
   404  			if bmp22 {
   405  				mend++
   406  			}
   407  			if krcol == kbot-2 {
   408  				mend++
   409  			}
   410  			for m := mstart; m <= mend; m++ {
   411  				k := min(kbot-1, krcol+3*m)
   412  
   413  				// The following convergence test requires that the tradition
   414  				// small-compared-to-nearby-diagonals criterion and the Ahues &
   415  				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
   416  				// improves accuracy in some examples. Falling back on an alternate
   417  				// convergence criterion when tst1 or tst2 is zero (as done here) is
   418  				// traditional but probably unnecessary.
   419  
   420  				if h[(k+1)*ldh+k] == 0 {
   421  					continue
   422  				}
   423  				tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
   424  				if tst1 == 0 {
   425  					if k >= ktop+1 {
   426  						tst1 += math.Abs(h[k*ldh+k-1])
   427  					}
   428  					if k >= ktop+2 {
   429  						tst1 += math.Abs(h[k*ldh+k-2])
   430  					}
   431  					if k >= ktop+3 {
   432  						tst1 += math.Abs(h[k*ldh+k-3])
   433  					}
   434  					if k <= kbot-2 {
   435  						tst1 += math.Abs(h[(k+2)*ldh+k+1])
   436  					}
   437  					if k <= kbot-3 {
   438  						tst1 += math.Abs(h[(k+3)*ldh+k+1])
   439  					}
   440  					if k <= kbot-4 {
   441  						tst1 += math.Abs(h[(k+4)*ldh+k+1])
   442  					}
   443  				}
   444  				if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
   445  					h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   446  					h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   447  					h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   448  					h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   449  					scl := h11 + h12
   450  					tst2 := h22 * (h11 / scl)
   451  					if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
   452  						h[(k+1)*ldh+k] = 0
   453  					}
   454  				}
   455  			}
   456  
   457  			// Fill in the last row of each bulge.
   458  			mend = min(nbmps, (kbot-krcol-1)/3) - 1
   459  			for m := mtop; m <= mend; m++ {
   460  				k := krcol + 3*m
   461  				refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3]
   462  				h[(k+4)*ldh+k+1] = -refsum
   463  				h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1]
   464  				h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2]
   465  			}
   466  		}
   467  
   468  		// Use U (if accumulated) to update far-from-diagonal entries in H.
   469  		// If required, use U to update Z as well.
   470  		if !accum {
   471  			continue
   472  		}
   473  		var jtop, jbot int
   474  		if wantt {
   475  			jtop = 0
   476  			jbot = n - 1
   477  		} else {
   478  			jtop = ktop
   479  			jbot = kbot
   480  		}
   481  		bi := blas64.Implementation()
   482  		if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 {
   483  			// Updates not exploiting the 2×2 block structure of U. k0 and nu keep track
   484  			// of the location and size of U in the special cases of introducing bulges
   485  			// and chasing bulges off the bottom. In these special cases and in case the
   486  			// number of shifts is ns = 2, there is no 2×2 block structure to exploit.
   487  
   488  			k0 := max(0, ktop-incol-1)
   489  			nu := kdu - max(0, ndcol-kbot) - k0
   490  
   491  			// Horizontal multiply.
   492  			for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
   493  				jlen := min(nh, jbot-jcol+1)
   494  				bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
   495  					1, u[k0*ldu+k0:], ldu,
   496  					h[(incol+k0+1)*ldh+jcol:], ldh,
   497  					0, wh, ldwh)
   498  				impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh)
   499  			}
   500  
   501  			// Vertical multiply.
   502  			for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv {
   503  				jlen := min(nv, max(ktop, incol)-jrow)
   504  				bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
   505  					1, h[jrow*ldh+incol+k0+1:], ldh,
   506  					u[k0*ldu+k0:], ldu,
   507  					0, wv, ldwv)
   508  				impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh)
   509  			}
   510  
   511  			// Z multiply (also vertical).
   512  			if wantz {
   513  				for jrow := iloz; jrow <= ihiz; jrow += nv {
   514  					jlen := min(nv, ihiz-jrow+1)
   515  					bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
   516  						1, z[jrow*ldz+incol+k0+1:], ldz,
   517  						u[k0*ldu+k0:], ldu,
   518  						0, wv, ldwv)
   519  					impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz)
   520  				}
   521  			}
   522  
   523  			continue
   524  		}
   525  
   526  		// Updates exploiting U's 2×2 block structure.
   527  
   528  		// i2, i4, j2, j4 are the last rows and columns of the blocks.
   529  		i2 := (kdu + 1) / 2
   530  		i4 := kdu
   531  		j2 := i4 - i2
   532  		j4 := kdu
   533  
   534  		// kzs and knz deal with the band of zeros along the diagonal of one of the
   535  		// triangular blocks.
   536  		kzs := (j4 - j2) - (ns + 1)
   537  		knz := ns + 1
   538  
   539  		// Horizontal multiply.
   540  		for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
   541  			jlen := min(nh, jbot-jcol+1)
   542  
   543  			// Copy bottom of H to top+kzs of scratch (the first kzs
   544  			// rows get multiplied by zero).
   545  			impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh)
   546  
   547  			// Multiply by U21ᵀ.
   548  			impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh)
   549  			bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen,
   550  				1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh)
   551  
   552  			// Multiply top of H by U11ᵀ.
   553  			bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2,
   554  				1, u, ldu, h[(incol+1)*ldh+jcol:], ldh,
   555  				1, wh, ldwh)
   556  
   557  			// Copy top of H to bottom of WH.
   558  			impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh)
   559  
   560  			// Multiply by U21ᵀ.
   561  			bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen,
   562  				1, u[i2:], ldu, wh[i2*ldwh:], ldwh)
   563  
   564  			// Multiply by U22.
   565  			bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2,
   566  				1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh,
   567  				1, wh[i2*ldwh:], ldwh)
   568  
   569  			// Copy it back.
   570  			impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh)
   571  		}
   572  
   573  		// Vertical multiply.
   574  		for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv {
   575  			jlen := min(nv, max(incol, ktop)-jrow)
   576  
   577  			// Copy right of H to scratch (the first kzs columns get multiplied
   578  			// by zero).
   579  			impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv)
   580  
   581  			// Multiply by U21.
   582  			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
   583  			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
   584  				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)
   585  
   586  			// Multiply by U11.
   587  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
   588  				1, h[jrow*ldh+incol+1:], ldh, u, ldu,
   589  				1, wv, ldwv)
   590  
   591  			// Copy left of H to right of scratch.
   592  			impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv)
   593  
   594  			// Multiply by U21.
   595  			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
   596  				1, u[i2:], ldu, wv[i2:], ldwv)
   597  
   598  			// Multiply by U22.
   599  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
   600  				1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu,
   601  				1, wv[i2:], ldwv)
   602  
   603  			// Copy it back.
   604  			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh)
   605  		}
   606  
   607  		if !wantz {
   608  			continue
   609  		}
   610  		// Multiply Z (also vertical).
   611  		for jrow := iloz; jrow <= ihiz; jrow += nv {
   612  			jlen := min(nv, ihiz-jrow+1)
   613  
   614  			// Copy right of Z to left of scratch (first kzs columns get
   615  			// multiplied by zero).
   616  			impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv)
   617  
   618  			// Multiply by U12.
   619  			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
   620  			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
   621  				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)
   622  
   623  			// Multiply by U11.
   624  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
   625  				1, z[jrow*ldz+incol+1:], ldz, u, ldu,
   626  				1, wv, ldwv)
   627  
   628  			// Copy left of Z to right of scratch.
   629  			impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv)
   630  
   631  			// Multiply by U21.
   632  			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
   633  				1, u[i2:], ldu, wv[i2:], ldwv)
   634  
   635  			// Multiply by U22.
   636  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
   637  				1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu,
   638  				1, wv[i2:], ldwv)
   639  
   640  			// Copy the result back to Z.
   641  			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz)
   642  		}
   643  	}
   644  }