github.com/gonum/lapack@v0.0.0-20181123203213-e4cdc5a0bff9/native/dlaqr5.go

github.com/gonum/lapack@v0.0.0-20181123203213-e4cdc5a0bff9/native/dlaqr5.go (about)

     1  // Copyright ©2016 The gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package native
     6  
     7  import (
     8  	"math"
     9  
    10  	"github.com/gonum/blas"
    11  	"github.com/gonum/blas/blas64"
    12  )
    13  
    14  // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
    15  // block of a Hessenberg matrix.
    16  //
    17  // wantt and wantz determine whether the quasi-triangular Schur factor and the
    18  // orthogonal Schur factor, respectively, will be computed.
    19  //
    20  // kacc22 specifies the computation mode of far-from-diagonal orthogonal
    21  // updates. Permitted values are:
    22  //  0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
    23  //     multiply to update far-from-diagonal matrix entries.
    24  //  1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
    25  //     update far-from-diagonal matrix entries.
    26  //  2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update
    27  //     far-from-diagonal matrix entries, and take advantage of 2×2 block
    28  //     structure during matrix multiplies.
    29  // For other values of kacc2 Dlaqr5 will panic.
    30  //
    31  // n is the order of the Hessenberg matrix H.
    32  //
    33  // ktop and kbot are indices of the first and last row and column of an isolated
    34  // diagonal block upon which the QR sweep will be applied. It must hold that
    35  //  ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
    36  //  kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
    37  // otherwise Dlaqr5 will panic.
    38  //
    39  // nshfts is the number of simultaneous shifts. It must be positive and even,
    40  // otherwise Dlaqr5 will panic.
    41  //
    42  // sr and si contain the real and imaginary parts, respectively, of the shifts
    43  // of origin that define the multi-shift QR sweep. On return both slices may be
    44  // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
    45  // will panic.
    46  //
    47  // h and ldh represent the Hessenberg matrix H of size n×n. On return
    48  // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
    49  // diagonal block in rows and columns ktop through kbot, inclusive.
    50  //
    51  // iloz and ihiz specify the rows of Z to which transformations will be applied
    52  // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
    53  // will panic.
    54  //
    55  // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
    56  // orthogonal similarity transformation is accumulated into
    57  // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced.
    58  //
    59  // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
    60  // is transposed with respect to the reference netlib implementation.
    61  //
    62  // u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3).
    63  //
    64  // wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh.
    65  //
    66  // wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3).
    67  //
    68  // Dlaqr5 is an internal routine. It is exported for testing purposes.
    69  func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
    70  	checkMatrix(n, n, h, ldh)
    71  	if ktop < 0 || n <= ktop {
    72  		panic("lapack: invalid value of ktop")
    73  	}
    74  	if ktop > 0 && h[ktop*ldh+ktop-1] != 0 {
    75  		panic("lapack: diagonal block is not isolated")
    76  	}
    77  	if kbot < 0 || n <= kbot {
    78  		panic("lapack: invalid value of kbot")
    79  	}
    80  	if kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0 {
    81  		panic("lapack: diagonal block is not isolated")
    82  	}
    83  	if nshfts < 0 || nshfts&0x1 != 0 {
    84  		panic("lapack: invalid number of shifts")
    85  	}
    86  	if len(sr) != nshfts || len(si) != nshfts {
    87  		panic(badSlice) // TODO(vladimir-ch) Another message?
    88  	}
    89  	if wantz {
    90  		if ihiz >= n {
    91  			panic("lapack: invalid value of ihiz")
    92  		}
    93  		if iloz < 0 || ihiz < iloz {
    94  			panic("lapack: invalid value of iloz")
    95  		}
    96  		checkMatrix(n, n, z, ldz)
    97  	}
    98  	checkMatrix(nshfts/2, 3, v, ldv) // Transposed w.r.t. lapack.
    99  	checkMatrix(3*nshfts-3, 3*nshfts-3, u, ldu)
   100  	checkMatrix(nv, 3*nshfts-3, wv, ldwv)
   101  	checkMatrix(3*nshfts-3, nh, wh, ldwh)
   102  	if kacc22 != 0 && kacc22 != 1 && kacc22 != 2 {
   103  		panic("lapack: invalid value of kacc22")
   104  	}
   105  
   106  	// If there are no shifts, then there is nothing to do.
   107  	if nshfts < 2 {
   108  		return
   109  	}
   110  	// If the active block is empty or 1×1, then there is nothing to do.
   111  	if ktop >= kbot {
   112  		return
   113  	}
   114  
   115  	// Shuffle shifts into pairs of real shifts and pairs of complex
   116  	// conjugate shifts assuming complex conjugate shifts are already
   117  	// adjacent to one another.
   118  	for i := 0; i < nshfts-2; i += 2 {
   119  		if si[i] == -si[i+1] {
   120  			continue
   121  		}
   122  		sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
   123  		si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
   124  	}
   125  
   126  	// Note: lapack says that nshfts must be even but allows it to be odd
   127  	// anyway. We panic above if nshfts is not even, so reducing it by one
   128  	// is unnecessary. The only caller Dlaqr04 uses only even nshfts.
   129  	//
   130  	// The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
   131  	// *     ==== NSHFTS is supposed to be even, but if it is odd,
   132  	// *     .    then simply reduce it by one.  The shuffle above
   133  	// *     .    ensures that the dropped shift is real and that
   134  	// *     .    the remaining shifts are paired. ====
   135  	// *
   136  	//      NS = NSHFTS - MOD( NSHFTS, 2 )
   137  	ns := nshfts
   138  
   139  	safmin := dlamchS
   140  	ulp := dlamchP
   141  	smlnum := safmin * float64(n) / ulp
   142  
   143  	// Use accumulated reflections to update far-from-diagonal entries?
   144  	accum := kacc22 == 1 || kacc22 == 2
   145  	// If so, exploit the 2×2 block structure?
   146  	blk22 := ns > 2 && kacc22 == 2
   147  
   148  	// Clear trash.
   149  	if ktop+2 <= kbot {
   150  		h[(ktop+2)*ldh+ktop] = 0
   151  	}
   152  
   153  	// nbmps = number of 2-shift bulges in the chain.
   154  	nbmps := ns / 2
   155  
   156  	// kdu = width of slab.
   157  	kdu := 6*nbmps - 3
   158  
   159  	// Create and chase chains of nbmps bulges.
   160  	for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 {
   161  		ndcol := incol + kdu
   162  		if accum {
   163  			impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
   164  		}
   165  
   166  		// Near-the-diagonal bulge chase. The following loop performs
   167  		// the near-the-diagonal part of a small bulge multi-shift QR
   168  		// sweep. Each 6*nbmps-2 column diagonal chunk extends from
   169  		// column incol to column ndcol (including both column incol and
   170  		// column ndcol). The following loop chases a 3*nbmps column
   171  		// long chain of nbmps bulges 3*nbmps-2 columns to the right.
   172  		// (incol may be less than ktop and ndcol may be greater than
   173  		// kbot indicating phantom columns from which to chase bulges
   174  		// before they are actually introduced or to which to chase
   175  		// bulges beyond column kbot.)
   176  		for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ {
   177  			// Bulges number mtop to mbot are active double implicit
   178  			// shift bulges. There may or may not also be small 2×2
   179  			// bulge, if there is room. The inactive bulges (if any)
   180  			// must wait until the active bulges have moved down the
   181  			// diagonal to make room. The phantom matrix paradigm
   182  			// described above helps keep track.
   183  
   184  			mtop := max(0, ((ktop-1)-krcol+2)/3)
   185  			mbot := min(nbmps, (kbot-krcol)/3) - 1
   186  			m22 := mbot + 1
   187  			bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2)
   188  
   189  			// Generate reflections to chase the chain right one
   190  			// column. (The minimum value of k is ktop-1.)
   191  			for m := mtop; m <= mbot; m++ {
   192  				k := krcol + 3*m
   193  				if k == ktop-1 {
   194  					impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
   195  						sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
   196  						v[m*ldv:m*ldv+3])
   197  					alpha := v[m*ldv]
   198  					_, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
   199  					continue
   200  				}
   201  				beta := h[(k+1)*ldh+k]
   202  				v[m*ldv+1] = h[(k+2)*ldh+k]
   203  				v[m*ldv+2] = h[(k+3)*ldh+k]
   204  				beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)
   205  
   206  				// A bulge may collapse because of vigilant deflation or
   207  				// destructive underflow. In the underflow case, try the
   208  				// two-small-subdiagonals trick to try to reinflate the
   209  				// bulge.
   210  				if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
   211  					// Typical case: not collapsed (yet).
   212  					h[(k+1)*ldh+k] = beta
   213  					h[(k+2)*ldh+k] = 0
   214  					h[(k+3)*ldh+k] = 0
   215  					continue
   216  				}
   217  
   218  				// Atypical case: collapsed. Attempt to reintroduce
   219  				// ignoring H[k+1,k] and H[k+2,k]. If the fill
   220  				// resulting from the new reflector is too large,
   221  				// then abandon it. Otherwise, use the new one.
   222  				var vt [3]float64
   223  				impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m],
   224  					si[2*m], sr[2*m+1], si[2*m+1], vt[:])
   225  				alpha := vt[0]
   226  				_, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1)
   227  				refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k])
   228  
   229  				dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
   230  				if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum {
   231  					// Starting a new bulge here would create
   232  					// non-negligible fill. Use the old one with
   233  					// trepidation.
   234  					h[(k+1)*ldh+k] = beta
   235  					h[(k+2)*ldh+k] = 0
   236  					h[(k+3)*ldh+k] = 0
   237  					continue
   238  				} else {
   239  					// Starting a new bulge here would create
   240  					// only negligible fill. Replace the old
   241  					// reflector with the new one.
   242  					h[(k+1)*ldh+k] -= refsum
   243  					h[(k+2)*ldh+k] = 0
   244  					h[(k+3)*ldh+k] = 0
   245  					v[m*ldv] = vt[0]
   246  					v[m*ldv+1] = vt[1]
   247  					v[m*ldv+2] = vt[2]
   248  				}
   249  			}
   250  
   251  			// Generate a 2×2 reflection, if needed.
   252  			if bmp22 {
   253  				k := krcol + 3*m22
   254  				if k == ktop-1 {
   255  					impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
   256  						sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
   257  						v[m22*ldv:m22*ldv+2])
   258  					beta := v[m22*ldv]
   259  					_, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
   260  				} else {
   261  					beta := h[(k+1)*ldh+k]
   262  					v[m22*ldv+1] = h[(k+2)*ldh+k]
   263  					beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
   264  					h[(k+1)*ldh+k] = beta
   265  					h[(k+2)*ldh+k] = 0
   266  				}
   267  			}
   268  
   269  			// Multiply H by reflections from the left.
   270  			var jbot int
   271  			switch {
   272  			case accum:
   273  				jbot = min(ndcol, kbot)
   274  			case wantt:
   275  				jbot = n - 1
   276  			default:
   277  				jbot = kbot
   278  			}
   279  			for j := max(ktop, krcol); j <= jbot; j++ {
   280  				mend := min(mbot+1, (j-krcol+2)/3) - 1
   281  				for m := mtop; m <= mend; m++ {
   282  					k := krcol + 3*m
   283  					refsum := v[m*ldv] * (h[(k+1)*ldh+j] +
   284  						v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j])
   285  					h[(k+1)*ldh+j] -= refsum
   286  					h[(k+2)*ldh+j] -= refsum * v[m*ldv+1]
   287  					h[(k+3)*ldh+j] -= refsum * v[m*ldv+2]
   288  				}
   289  			}
   290  			if bmp22 {
   291  				k := krcol + 3*m22
   292  				for j := max(k+1, ktop); j <= jbot; j++ {
   293  					refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j])
   294  					h[(k+1)*ldh+j] -= refsum
   295  					h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1]
   296  				}
   297  			}
   298  
   299  			// Multiply H by reflections from the right. Delay filling in the last row
   300  			// until the vigilant deflation check is complete.
   301  			var jtop int
   302  			switch {
   303  			case accum:
   304  				jtop = max(ktop, incol)
   305  			case wantt:
   306  				jtop = 0
   307  			default:
   308  				jtop = ktop
   309  			}
   310  			for m := mtop; m <= mbot; m++ {
   311  				if v[m*ldv] == 0 {
   312  					continue
   313  				}
   314  				k := krcol + 3*m
   315  				for j := jtop; j <= min(kbot, k+3); j++ {
   316  					refsum := v[m*ldv] * (h[j*ldh+k+1] +
   317  						v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3])
   318  					h[j*ldh+k+1] -= refsum
   319  					h[j*ldh+k+2] -= refsum * v[m*ldv+1]
   320  					h[j*ldh+k+3] -= refsum * v[m*ldv+2]
   321  				}
   322  				if accum {
   323  					// Accumulate U. (If necessary, update Z later with with an
   324  					// efficient matrix-matrix multiply.)
   325  					kms := k - incol
   326  					for j := max(0, ktop-incol-1); j < kdu; j++ {
   327  						refsum := v[m*ldv] * (u[j*ldu+kms] +
   328  							v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2])
   329  						u[j*ldu+kms] -= refsum
   330  						u[j*ldu+kms+1] -= refsum * v[m*ldv+1]
   331  						u[j*ldu+kms+2] -= refsum * v[m*ldv+2]
   332  					}
   333  				} else if wantz {
   334  					// U is not accumulated, so update Z now by multiplying by
   335  					// reflections from the right.
   336  					for j := iloz; j <= ihiz; j++ {
   337  						refsum := v[m*ldv] * (z[j*ldz+k+1] +
   338  							v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3])
   339  						z[j*ldz+k+1] -= refsum
   340  						z[j*ldz+k+2] -= refsum * v[m*ldv+1]
   341  						z[j*ldz+k+3] -= refsum * v[m*ldv+2]
   342  					}
   343  				}
   344  			}
   345  
   346  			// Special case: 2×2 reflection (if needed).
   347  			if bmp22 && v[m22*ldv] != 0 {
   348  				k := krcol + 3*m22
   349  				for j := jtop; j <= min(kbot, k+3); j++ {
   350  					refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2])
   351  					h[j*ldh+k+1] -= refsum
   352  					h[j*ldh+k+2] -= refsum * v[m22*ldv+1]
   353  				}
   354  				if accum {
   355  					kms := k - incol
   356  					for j := max(0, ktop-incol-1); j < kdu; j++ {
   357  						refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1])
   358  						u[j*ldu+kms] -= refsum
   359  						u[j*ldu+kms+1] -= refsum * v[m22*ldv+1]
   360  					}
   361  				} else if wantz {
   362  					for j := iloz; j <= ihiz; j++ {
   363  						refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2])
   364  						z[j*ldz+k+1] -= refsum
   365  						z[j*ldz+k+2] -= refsum * v[m22*ldv+1]
   366  					}
   367  				}
   368  			}
   369  
   370  			// Vigilant deflation check.
   371  			mstart := mtop
   372  			if krcol+3*mstart < ktop {
   373  				mstart++
   374  			}
   375  			mend := mbot
   376  			if bmp22 {
   377  				mend++
   378  			}
   379  			if krcol == kbot-2 {
   380  				mend++
   381  			}
   382  			for m := mstart; m <= mend; m++ {
   383  				k := min(kbot-1, krcol+3*m)
   384  
   385  				// The following convergence test requires that the tradition
   386  				// small-compared-to-nearby-diagonals criterion and the Ahues &
   387  				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
   388  				// improves accuracy in some examples. Falling back on an alternate
   389  				// convergence criterion when tst1 or tst2 is zero (as done here) is
   390  				// traditional but probably unnecessary.
   391  
   392  				if h[(k+1)*ldh+k] == 0 {
   393  					continue
   394  				}
   395  				tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
   396  				if tst1 == 0 {
   397  					if k >= ktop+1 {
   398  						tst1 += math.Abs(h[k*ldh+k-1])
   399  					}
   400  					if k >= ktop+2 {
   401  						tst1 += math.Abs(h[k*ldh+k-2])
   402  					}
   403  					if k >= ktop+3 {
   404  						tst1 += math.Abs(h[k*ldh+k-3])
   405  					}
   406  					if k <= kbot-2 {
   407  						tst1 += math.Abs(h[(k+2)*ldh+k+1])
   408  					}
   409  					if k <= kbot-3 {
   410  						tst1 += math.Abs(h[(k+3)*ldh+k+1])
   411  					}
   412  					if k <= kbot-4 {
   413  						tst1 += math.Abs(h[(k+4)*ldh+k+1])
   414  					}
   415  				}
   416  				if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
   417  					h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   418  					h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
   419  					h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   420  					h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
   421  					scl := h11 + h12
   422  					tst2 := h22 * (h11 / scl)
   423  					if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
   424  						h[(k+1)*ldh+k] = 0
   425  					}
   426  				}
   427  			}
   428  
   429  			// Fill in the last row of each bulge.
   430  			mend = min(nbmps, (kbot-krcol-1)/3) - 1
   431  			for m := mtop; m <= mend; m++ {
   432  				k := krcol + 3*m
   433  				refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3]
   434  				h[(k+4)*ldh+k+1] = -refsum
   435  				h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1]
   436  				h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2]
   437  			}
   438  		}
   439  
   440  		// Use U (if accumulated) to update far-from-diagonal entries in H.
   441  		// If required, use U to update Z as well.
   442  		if !accum {
   443  			continue
   444  		}
   445  		var jtop, jbot int
   446  		if wantt {
   447  			jtop = 0
   448  			jbot = n - 1
   449  		} else {
   450  			jtop = ktop
   451  			jbot = kbot
   452  		}
   453  		bi := blas64.Implementation()
   454  		if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 {
   455  			// Updates not exploiting the 2×2 block structure of U. k0 and nu keep track
   456  			// of the location and size of U in the special cases of introducing bulges
   457  			// and chasing bulges off the bottom. In these special cases and in case the
   458  			// number of shifts is ns = 2, there is no 2×2 block structure to exploit.
   459  
   460  			k0 := max(0, ktop-incol-1)
   461  			nu := kdu - max(0, ndcol-kbot) - k0
   462  
   463  			// Horizontal multiply.
   464  			for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
   465  				jlen := min(nh, jbot-jcol+1)
   466  				bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
   467  					1, u[k0*ldu+k0:], ldu,
   468  					h[(incol+k0+1)*ldh+jcol:], ldh,
   469  					0, wh, ldwh)
   470  				impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh)
   471  			}
   472  
   473  			// Vertical multiply.
   474  			for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv {
   475  				jlen := min(nv, max(ktop, incol)-jrow)
   476  				bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
   477  					1, h[jrow*ldh+incol+k0+1:], ldh,
   478  					u[k0*ldu+k0:], ldu,
   479  					0, wv, ldwv)
   480  				impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh)
   481  			}
   482  
   483  			// Z multiply (also vertical).
   484  			if wantz {
   485  				for jrow := iloz; jrow <= ihiz; jrow += nv {
   486  					jlen := min(nv, ihiz-jrow+1)
   487  					bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
   488  						1, z[jrow*ldz+incol+k0+1:], ldz,
   489  						u[k0*ldu+k0:], ldu,
   490  						0, wv, ldwv)
   491  					impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz)
   492  				}
   493  			}
   494  
   495  			continue
   496  		}
   497  
   498  		// Updates exploiting U's 2×2 block structure.
   499  
   500  		// i2, i4, j2, j4 are the last rows and columns of the blocks.
   501  		i2 := (kdu + 1) / 2
   502  		i4 := kdu
   503  		j2 := i4 - i2
   504  		j4 := kdu
   505  
   506  		// kzs and knz deal with the band of zeros along the diagonal of one of the
   507  		// triangular blocks.
   508  		kzs := (j4 - j2) - (ns + 1)
   509  		knz := ns + 1
   510  
   511  		// Horizontal multiply.
   512  		for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
   513  			jlen := min(nh, jbot-jcol+1)
   514  
   515  			// Copy bottom of H to top+kzs of scratch (the first kzs
   516  			// rows get multiplied by zero).
   517  			impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh)
   518  
   519  			// Multiply by U21^T.
   520  			impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh)
   521  			bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen,
   522  				1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh)
   523  
   524  			// Multiply top of H by U11^T.
   525  			bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2,
   526  				1, u, ldu, h[(incol+1)*ldh+jcol:], ldh,
   527  				1, wh, ldwh)
   528  
   529  			// Copy top of H to bottom of WH.
   530  			impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh)
   531  
   532  			// Multiply by U21^T.
   533  			bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen,
   534  				1, u[i2:], ldu, wh[i2*ldwh:], ldwh)
   535  
   536  			// Multiply by U22.
   537  			bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2,
   538  				1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh,
   539  				1, wh[i2*ldwh:], ldwh)
   540  
   541  			// Copy it back.
   542  			impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh)
   543  		}
   544  
   545  		// Vertical multiply.
   546  		for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv {
   547  			jlen := min(nv, max(incol, ktop)-jrow)
   548  
   549  			// Copy right of H to scratch (the first kzs columns get multiplied
   550  			// by zero).
   551  			impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv)
   552  
   553  			// Multiply by U21.
   554  			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
   555  			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
   556  				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)
   557  
   558  			// Multiply by U11.
   559  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
   560  				1, h[jrow*ldh+incol+1:], ldh, u, ldu,
   561  				1, wv, ldwv)
   562  
   563  			// Copy left of H to right of scratch.
   564  			impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv)
   565  
   566  			// Multiply by U21.
   567  			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
   568  				1, u[i2:], ldu, wv[i2:], ldwv)
   569  
   570  			// Multiply by U22.
   571  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
   572  				1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu,
   573  				1, wv[i2:], ldwv)
   574  
   575  			// Copy it back.
   576  			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh)
   577  		}
   578  
   579  		if !wantz {
   580  			continue
   581  		}
   582  		// Multiply Z (also vertical).
   583  		for jrow := iloz; jrow <= ihiz; jrow += nv {
   584  			jlen := min(nv, ihiz-jrow+1)
   585  
   586  			// Copy right of Z to left of scratch (first kzs columns get
   587  			// multiplied by zero).
   588  			impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv)
   589  
   590  			// Multiply by U12.
   591  			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
   592  			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
   593  				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)
   594  
   595  			// Multiply by U11.
   596  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
   597  				1, z[jrow*ldz+incol+1:], ldz, u, ldu,
   598  				1, wv, ldwv)
   599  
   600  			// Copy left of Z to right of scratch.
   601  			impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv)
   602  
   603  			// Multiply by U21.
   604  			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
   605  				1, u[i2:], ldu, wv[i2:], ldwv)
   606  
   607  			// Multiply by U22.
   608  			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
   609  				1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu,
   610  				1, wv[i2:], ldwv)
   611  
   612  			// Copy the result back to Z.
   613  			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz)
   614  		}
   615  	}
   616  }