github.com/gopherd/gonum@v0.0.4/lapack/gonum/dlaqr5.go

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gonum

import (
	"math"

	"github.com/gopherd/gonum/blas"
	"github.com/gopherd/gonum/blas/blas64"
)

// Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
// block of a Hessenberg matrix.
//
// wantt and wantz determine whether the quasi-triangular Schur factor and the
// orthogonal Schur factor, respectively, will be computed.
//
// kacc22 specifies the computation mode of far-from-diagonal orthogonal
// updates. Permitted values are:
//  0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
//     multiply to update far-from-diagonal matrix entries.
//  1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
//     update far-from-diagonal matrix entries.
//  2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update
//     far-from-diagonal matrix entries, and take advantage of 2×2 block
//     structure during matrix multiplies.
// For other values of kacc22 Dlaqr5 will panic.
//
// n is the order of the Hessenberg matrix H.
//
// ktop and kbot are indices of the first and last row and column of an
// isolated diagonal block upon which the QR sweep will be applied. It must
// hold that
//  ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
//  kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
// otherwise Dlaqr5 will panic.
//
// nshfts is the number of simultaneous shifts. It must be positive and even,
// otherwise Dlaqr5 will panic.
//
// sr and si contain the real and imaginary parts, respectively, of the shifts
// of origin that define the multi-shift QR sweep. On return both slices may be
// reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
// will panic.
//
// h and ldh represent the Hessenberg matrix H of size n×n. On return the
// multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
// diagonal block in rows and columns ktop through kbot, inclusive.
//
// iloz and ihiz specify the rows of Z to which transformations will be applied
// if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
// will panic.
//
// z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
// orthogonal similarity transformation is accumulated into
// z[iloz:ihiz,iloz:ihiz] from the right, otherwise z is not referenced.
//
// v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
// is transposed with respect to the reference netlib implementation.
//
// u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3).
//
// wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh.
//
// wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3).
//
// Dlaqr5 is an internal routine. It is exported for testing purposes.
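//
// As an illustrative sizing only, following the dimensions stated above: with
// nshfts = 4 the auxiliary matrices are V of size 2×3, U of size 9×9, WH of
// size 9×nh, and WV of size nv×9.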
func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
	switch {
	case kacc22 != 0 && kacc22 != 1 && kacc22 != 2:
		panic(badKacc22)
	case n < 0:
		panic(nLT0)
	case ktop < 0 || n <= ktop:
		panic(badKtop)
	case kbot < 0 || n <= kbot:
		panic(badKbot)

	case nshfts < 0:
		panic(nshftsLT0)
	case nshfts&0x1 != 0:
		panic(nshftsOdd)
	case len(sr) != nshfts:
		panic(badLenSr)
	case len(si) != nshfts:
		panic(badLenSi)

	case ldh < max(1, n):
		panic(badLdH)
	case len(h) < (n-1)*ldh+n:
		panic(shortH)

	case wantz && ihiz >= n:
		panic(badIhiz)
	case wantz && iloz < 0 || ihiz < iloz:
		panic(badIloz)
	case ldz < 1, wantz && ldz < n:
		panic(badLdZ)
	case wantz && len(z) < (n-1)*ldz+n:
		panic(shortZ)

	case ldv < 3:
		// V is transposed w.r.t. reference lapack.
		panic(badLdV)
	case len(v) < (nshfts/2-1)*ldv+3:
		panic(shortV)

	case ldu < max(1, 3*nshfts-3):
		panic(badLdU)
	case len(u) < (3*nshfts-3-1)*ldu+3*nshfts-3:
		panic(shortU)

	case nv < 0:
		panic(nvLT0)
	case ldwv < max(1, 3*nshfts-3):
		panic(badLdWV)
	case len(wv) < (nv-1)*ldwv+3*nshfts-3:
		panic(shortWV)

	case nh < 0:
		panic(nhLT0)
	case ldwh < max(1, nh):
		panic(badLdWH)
	case len(wh) < (3*nshfts-3-1)*ldwh+nh:
		panic(shortWH)

	case ktop > 0 && h[ktop*ldh+ktop-1] != 0:
		panic(notIsolated)
	case kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0:
		panic(notIsolated)
	}

	// If there are no shifts, then there is nothing to do.
	if nshfts < 2 {
		return
	}
	// If the active block is empty or 1×1, then there is nothing to do.
	if ktop >= kbot {
		return
	}

	// Shuffle shifts into pairs of real shifts and pairs of complex
	// conjugate shifts assuming complex conjugate shifts are already
	// adjacent to one another.
	for i := 0; i < nshfts-2; i += 2 {
		if si[i] == -si[i+1] {
			continue
		}
		sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
		si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
	}

	// Note: lapack says that nshfts must be even but allows it to be odd
	// anyway. We panic above if nshfts is not even, so reducing it by one
	// is unnecessary. The only caller Dlaqr04 uses only even nshfts.
	//
	// The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
	// *     ==== NSHFTS is supposed to be even, but if it is odd,
	// *     .    then simply reduce it by one.  The shuffle above
	// *     .    ensures that the dropped shift is real and that
	// *     .    the remaining shifts are paired. ====
	// *
	//       NS = NSHFTS - MOD( NSHFTS, 2 )
	ns := nshfts

	safmin := dlamchS
	ulp := dlamchP
	smlnum := safmin * float64(n) / ulp

	// Use accumulated reflections to update far-from-diagonal entries?
	accum := kacc22 == 1 || kacc22 == 2
	// If so, exploit the 2×2 block structure?
	blk22 := ns > 2 && kacc22 == 2

	// Clear trash.
	if ktop+2 <= kbot {
		h[(ktop+2)*ldh+ktop] = 0
	}

	// nbmps = number of 2-shift bulges in the chain.
	nbmps := ns / 2

	// kdu = width of slab.
	kdu := 6*nbmps - 3

	// Create and chase chains of nbmps bulges.
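	// Each pass of the outer loop below introduces the chain to the left of
	// the active block, chases it 3*nbmps-2 columns to the right, and, when
	// accum is true, accumulates the corresponding orthogonal transformation
	// in U so that far-from-diagonal entries can be updated with matrix-matrix
	// multiplies afterwards.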
	for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 {
		ndcol := incol + kdu
		if accum {
			impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
		}

		// Near-the-diagonal bulge chase. The following loop performs
		// the near-the-diagonal part of a small bulge multi-shift QR
		// sweep. Each 6*nbmps-2 column diagonal chunk extends from
		// column incol to column ndcol (including both column incol and
		// column ndcol). The following loop chases a 3*nbmps column
		// long chain of nbmps bulges 3*nbmps-2 columns to the right.
		// (incol may be less than ktop and ndcol may be greater than
		// kbot indicating phantom columns from which to chase bulges
		// before they are actually introduced or to which to chase
		// bulges beyond column kbot.)
		for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ {
			// Bulges number mtop to mbot are active double implicit
			// shift bulges. There may or may not also be a small 2×2
			// bulge, if there is room. The inactive bulges (if any)
			// must wait until the active bulges have moved down the
			// diagonal to make room. The phantom matrix paradigm
			// described above helps keep track.

			mtop := max(0, ((ktop-1)-krcol+2)/3)
			mbot := min(nbmps, (kbot-krcol)/3) - 1
			m22 := mbot + 1
			bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2)

			// Generate reflections to chase the chain right one
			// column. (The minimum value of k is ktop-1.)
			for m := mtop; m <= mbot; m++ {
				k := krcol + 3*m
				if k == ktop-1 {
					impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
						sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
						v[m*ldv:m*ldv+3])
					alpha := v[m*ldv]
					_, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
					continue
				}
				beta := h[(k+1)*ldh+k]
				v[m*ldv+1] = h[(k+2)*ldh+k]
				v[m*ldv+2] = h[(k+3)*ldh+k]
				beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)

				// A bulge may collapse because of vigilant deflation or
				// destructive underflow. In the underflow case, try the
				// two-small-subdiagonals trick to try to reinflate the
				// bulge.
				if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
					// Typical case: not collapsed (yet).
					h[(k+1)*ldh+k] = beta
					h[(k+2)*ldh+k] = 0
					h[(k+3)*ldh+k] = 0
					continue
				}

				// Atypical case: collapsed. Attempt to reintroduce
				// ignoring H[k+1,k] and H[k+2,k]. If the fill
				// resulting from the new reflector is too large,
				// then abandon it. Otherwise, use the new one.
				var vt [3]float64
				impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m],
					si[2*m], sr[2*m+1], si[2*m+1], vt[:])
				alpha := vt[0]
				_, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1)
				refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k])

				dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
				if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum {
					// Starting a new bulge here would create
					// non-negligible fill. Use the old one with
					// trepidation.
					h[(k+1)*ldh+k] = beta
					h[(k+2)*ldh+k] = 0
					h[(k+3)*ldh+k] = 0
					continue
				} else {
					// Starting a new bulge here would create
					// only negligible fill. Replace the old
					// reflector with the new one.
					h[(k+1)*ldh+k] -= refsum
					h[(k+2)*ldh+k] = 0
					h[(k+3)*ldh+k] = 0
					v[m*ldv] = vt[0]
					v[m*ldv+1] = vt[1]
					v[m*ldv+2] = vt[2]
				}
			}

			// Generate a 2×2 reflection, if needed.
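			// (A lone 2×2 bulge is used when the last shift pair reaches the
			// bottom of the block and there is room for only a 2-row bulge.)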
			if bmp22 {
				k := krcol + 3*m22
				if k == ktop-1 {
					impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
						sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
						v[m22*ldv:m22*ldv+2])
					beta := v[m22*ldv]
					_, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
				} else {
					beta := h[(k+1)*ldh+k]
					v[m22*ldv+1] = h[(k+2)*ldh+k]
					beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
					h[(k+1)*ldh+k] = beta
					h[(k+2)*ldh+k] = 0
				}
			}

			// Multiply H by reflections from the left.
			var jbot int
			switch {
			case accum:
				jbot = min(ndcol, kbot)
			case wantt:
				jbot = n - 1
			default:
				jbot = kbot
			}
			for j := max(ktop, krcol); j <= jbot; j++ {
				mend := min(mbot+1, (j-krcol+2)/3) - 1
				for m := mtop; m <= mend; m++ {
					k := krcol + 3*m
					refsum := v[m*ldv] * (h[(k+1)*ldh+j] +
						v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j])
					h[(k+1)*ldh+j] -= refsum
					h[(k+2)*ldh+j] -= refsum * v[m*ldv+1]
					h[(k+3)*ldh+j] -= refsum * v[m*ldv+2]
				}
			}
			if bmp22 {
				k := krcol + 3*m22
				for j := max(k+1, ktop); j <= jbot; j++ {
					refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j])
					h[(k+1)*ldh+j] -= refsum
					h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1]
				}
			}

			// Multiply H by reflections from the right. Delay filling in the last row
			// until the vigilant deflation check is complete.
			var jtop int
			switch {
			case accum:
				jtop = max(ktop, incol)
			case wantt:
				jtop = 0
			default:
				jtop = ktop
			}
			for m := mtop; m <= mbot; m++ {
				if v[m*ldv] == 0 {
					continue
				}
				k := krcol + 3*m
				for j := jtop; j <= min(kbot, k+3); j++ {
					refsum := v[m*ldv] * (h[j*ldh+k+1] +
						v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3])
					h[j*ldh+k+1] -= refsum
					h[j*ldh+k+2] -= refsum * v[m*ldv+1]
					h[j*ldh+k+3] -= refsum * v[m*ldv+2]
				}
				if accum {
					// Accumulate U. (If necessary, update Z later with an
					// efficient matrix-matrix multiply.)
					kms := k - incol
					for j := max(0, ktop-incol-1); j < kdu; j++ {
						refsum := v[m*ldv] * (u[j*ldu+kms] +
							v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2])
						u[j*ldu+kms] -= refsum
						u[j*ldu+kms+1] -= refsum * v[m*ldv+1]
						u[j*ldu+kms+2] -= refsum * v[m*ldv+2]
					}
				} else if wantz {
					// U is not accumulated, so update Z now by multiplying by
					// reflections from the right.
					for j := iloz; j <= ihiz; j++ {
						refsum := v[m*ldv] * (z[j*ldz+k+1] +
							v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3])
						z[j*ldz+k+1] -= refsum
						z[j*ldz+k+2] -= refsum * v[m*ldv+1]
						z[j*ldz+k+3] -= refsum * v[m*ldv+2]
					}
				}
			}

			// Special case: 2×2 reflection (if needed).
			if bmp22 && v[m22*ldv] != 0 {
				k := krcol + 3*m22
				for j := jtop; j <= min(kbot, k+3); j++ {
					refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2])
					h[j*ldh+k+1] -= refsum
					h[j*ldh+k+2] -= refsum * v[m22*ldv+1]
				}
				if accum {
					kms := k - incol
					for j := max(0, ktop-incol-1); j < kdu; j++ {
						refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1])
						u[j*ldu+kms] -= refsum
						u[j*ldu+kms+1] -= refsum * v[m22*ldv+1]
					}
				} else if wantz {
					for j := iloz; j <= ihiz; j++ {
						refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2])
						z[j*ldz+k+1] -= refsum
						z[j*ldz+k+2] -= refsum * v[m22*ldv+1]
					}
				}
			}

			// Vigilant deflation check.
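			//
			// Check whether any subdiagonal entry in the path of the active
			// bulges (and of the 2×2 bulge, if present) has become negligible;
			// if so, set it to zero to deflate the problem early.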
			mstart := mtop
			if krcol+3*mstart < ktop {
				mstart++
			}
			mend := mbot
			if bmp22 {
				mend++
			}
			if krcol == kbot-2 {
				mend++
			}
			for m := mstart; m <= mend; m++ {
				k := min(kbot-1, krcol+3*m)

				// The following convergence test requires that the traditional
				// small-compared-to-nearby-diagonals criterion and the Ahues &
				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
				// improves accuracy in some examples. Falling back on an alternate
				// convergence criterion when tst1 or tst2 is zero (as done here) is
				// traditional but probably unnecessary.

				if h[(k+1)*ldh+k] == 0 {
					continue
				}
				tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
				if tst1 == 0 {
					if k >= ktop+1 {
						tst1 += math.Abs(h[k*ldh+k-1])
					}
					if k >= ktop+2 {
						tst1 += math.Abs(h[k*ldh+k-2])
					}
					if k >= ktop+3 {
						tst1 += math.Abs(h[k*ldh+k-3])
					}
					if k <= kbot-2 {
						tst1 += math.Abs(h[(k+2)*ldh+k+1])
					}
					if k <= kbot-3 {
						tst1 += math.Abs(h[(k+3)*ldh+k+1])
					}
					if k <= kbot-4 {
						tst1 += math.Abs(h[(k+4)*ldh+k+1])
					}
				}
				if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
					h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
					h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
					h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
					h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
					scl := h11 + h12
					tst2 := h22 * (h11 / scl)
					if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
						h[(k+1)*ldh+k] = 0
					}
				}
			}

			// Fill in the last row of each bulge.
			mend = min(nbmps, (kbot-krcol-1)/3) - 1
			for m := mtop; m <= mend; m++ {
				k := krcol + 3*m
				refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3]
				h[(k+4)*ldh+k+1] = -refsum
				h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1]
				h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2]
			}
		}

		// Use U (if accumulated) to update far-from-diagonal entries in H.
		// If required, use U to update Z as well.
		if !accum {
			continue
		}
		var jtop, jbot int
		if wantt {
			jtop = 0
			jbot = n - 1
		} else {
			jtop = ktop
			jbot = kbot
		}
		bi := blas64.Implementation()
		if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 {
			// Updates not exploiting the 2×2 block structure of U. k0 and nu keep track
			// of the location and size of U in the special cases of introducing bulges
			// and chasing bulges off the bottom. In these special cases and in case the
			// number of shifts is ns = 2, there is no 2×2 block structure to exploit.

			k0 := max(0, ktop-incol-1)
			nu := kdu - max(0, ndcol-kbot) - k0

			// Horizontal multiply.
			for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
				jlen := min(nh, jbot-jcol+1)
				bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
					1, u[k0*ldu+k0:], ldu,
					h[(incol+k0+1)*ldh+jcol:], ldh,
					0, wh, ldwh)
				impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh)
			}

			// Vertical multiply.
			for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv {
				jlen := min(nv, max(ktop, incol)-jrow)
				bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
					1, h[jrow*ldh+incol+k0+1:], ldh,
					u[k0*ldu+k0:], ldu,
					0, wv, ldwv)
				impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh)
			}

			// Z multiply (also vertical).
			if wantz {
				for jrow := iloz; jrow <= ihiz; jrow += nv {
					jlen := min(nv, ihiz-jrow+1)
					bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
						1, z[jrow*ldz+incol+k0+1:], ldz,
						u[k0*ldu+k0:], ldu,
						0, wv, ldwv)
					impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz)
				}
			}

			continue
		}

		// Updates exploiting U's 2×2 block structure.

		// i2, i4, j2, j4 are the last rows and columns of the blocks.
		i2 := (kdu + 1) / 2
		i4 := kdu
		j2 := i4 - i2
		j4 := kdu

		// kzs and knz deal with the band of zeros along the diagonal of one of the
		// triangular blocks.
		kzs := (j4 - j2) - (ns + 1)
		knz := ns + 1

		// Horizontal multiply.
		for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
			jlen := min(nh, jbot-jcol+1)

			// Copy bottom of H to top+kzs of scratch (the first kzs
			// rows get multiplied by zero).
			impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh)

			// Multiply by U21ᵀ.
			impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh)
			bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen,
				1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh)

			// Multiply top of H by U11ᵀ.
			bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2,
				1, u, ldu, h[(incol+1)*ldh+jcol:], ldh,
				1, wh, ldwh)

			// Copy top of H to bottom of WH.
			impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh)

			// Multiply by U21ᵀ.
			bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen,
				1, u[i2:], ldu, wh[i2*ldwh:], ldwh)

			// Multiply by U22.
			bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2,
				1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh,
				1, wh[i2*ldwh:], ldwh)

			// Copy it back.
			impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh)
		}

		// Vertical multiply.
		for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv {
			jlen := min(nv, max(incol, ktop)-jrow)

			// Copy right of H to scratch (the first kzs columns get multiplied
			// by zero).
			impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv)

			// Multiply by U21.
			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)

			// Multiply by U11.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
				1, h[jrow*ldh+incol+1:], ldh, u, ldu,
				1, wv, ldwv)

			// Copy left of H to right of scratch.
			impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv)

			// Multiply by U21.
			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
				1, u[i2:], ldu, wv[i2:], ldwv)

			// Multiply by U22.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
				1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu,
				1, wv[i2:], ldwv)

			// Copy it back.
			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh)
		}

		if !wantz {
			continue
		}
		// Multiply Z (also vertical).
		for jrow := iloz; jrow <= ihiz; jrow += nv {
			jlen := min(nv, ihiz-jrow+1)

			// Copy right of Z to left of scratch (first kzs columns get
			// multiplied by zero).
			impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv)

			// Multiply by U12.
			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)

			// Multiply by U11.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
				1, z[jrow*ldz+incol+1:], ldz, u, ldu,
				1, wv, ldwv)

			// Copy left of Z to right of scratch.
			impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv)

			// Multiply by U21.
			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
				1, u[i2:], ldu, wv[i2:], ldwv)

			// Multiply by U22.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
				1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu,
				1, wv[i2:], ldwv)

			// Copy the result back to Z.
			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz)
		}
	}
}