github.com/gonum/lapack@v0.0.0-20181123203213-e4cdc5a0bff9/native/dlaqr5.go (about) 1 // Copyright ©2016 The gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package native 6 7 import ( 8 "math" 9 10 "github.com/gonum/blas" 11 "github.com/gonum/blas/blas64" 12 ) 13 14 // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated 15 // block of a Hessenberg matrix. 16 // 17 // wantt and wantz determine whether the quasi-triangular Schur factor and the 18 // orthogonal Schur factor, respectively, will be computed. 19 // 20 // kacc22 specifies the computation mode of far-from-diagonal orthogonal 21 // updates. Permitted values are: 22 // 0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix 23 // multiply to update far-from-diagonal matrix entries. 24 // 1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to 25 // update far-from-diagonal matrix entries. 26 // 2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update 27 // far-from-diagonal matrix entries, and take advantage of 2×2 block 28 // structure during matrix multiplies. 29 // For other values of kacc2 Dlaqr5 will panic. 30 // 31 // n is the order of the Hessenberg matrix H. 32 // 33 // ktop and kbot are indices of the first and last row and column of an isolated 34 // diagonal block upon which the QR sweep will be applied. It must hold that 35 // ktop == 0, or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and 36 // kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0, 37 // otherwise Dlaqr5 will panic. 38 // 39 // nshfts is the number of simultaneous shifts. It must be positive and even, 40 // otherwise Dlaqr5 will panic. 41 // 42 // sr and si contain the real and imaginary parts, respectively, of the shifts 43 // of origin that define the multi-shift QR sweep. On return both slices may be 44 // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5 45 // will panic. 46 // 47 // h and ldh represent the Hessenberg matrix H of size n×n. On return 48 // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated 49 // diagonal block in rows and columns ktop through kbot, inclusive. 50 // 51 // iloz and ihiz specify the rows of Z to which transformations will be applied 52 // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5 53 // will panic. 54 // 55 // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep 56 // orthogonal similarity transformation is accumulated into 57 // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced. 58 // 59 // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V 60 // is transposed with respect to the reference netlib implementation. 61 // 62 // u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3). 63 // 64 // wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh. 65 // 66 // wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3). 67 // 68 // Dlaqr5 is an internal routine. It is exported for testing purposes. 69 func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) { 70 checkMatrix(n, n, h, ldh) 71 if ktop < 0 || n <= ktop { 72 panic("lapack: invalid value of ktop") 73 } 74 if ktop > 0 && h[ktop*ldh+ktop-1] != 0 { 75 panic("lapack: diagonal block is not isolated") 76 } 77 if kbot < 0 || n <= kbot { 78 panic("lapack: invalid value of kbot") 79 } 80 if kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0 { 81 panic("lapack: diagonal block is not isolated") 82 } 83 if nshfts < 0 || nshfts&0x1 != 0 { 84 panic("lapack: invalid number of shifts") 85 } 86 if len(sr) != nshfts || len(si) != nshfts { 87 panic(badSlice) // TODO(vladimir-ch) Another message? 88 } 89 if wantz { 90 if ihiz >= n { 91 panic("lapack: invalid value of ihiz") 92 } 93 if iloz < 0 || ihiz < iloz { 94 panic("lapack: invalid value of iloz") 95 } 96 checkMatrix(n, n, z, ldz) 97 } 98 checkMatrix(nshfts/2, 3, v, ldv) // Transposed w.r.t. lapack. 99 checkMatrix(3*nshfts-3, 3*nshfts-3, u, ldu) 100 checkMatrix(nv, 3*nshfts-3, wv, ldwv) 101 checkMatrix(3*nshfts-3, nh, wh, ldwh) 102 if kacc22 != 0 && kacc22 != 1 && kacc22 != 2 { 103 panic("lapack: invalid value of kacc22") 104 } 105 106 // If there are no shifts, then there is nothing to do. 107 if nshfts < 2 { 108 return 109 } 110 // If the active block is empty or 1×1, then there is nothing to do. 111 if ktop >= kbot { 112 return 113 } 114 115 // Shuffle shifts into pairs of real shifts and pairs of complex 116 // conjugate shifts assuming complex conjugate shifts are already 117 // adjacent to one another. 118 for i := 0; i < nshfts-2; i += 2 { 119 if si[i] == -si[i+1] { 120 continue 121 } 122 sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i] 123 si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i] 124 } 125 126 // Note: lapack says that nshfts must be even but allows it to be odd 127 // anyway. We panic above if nshfts is not even, so reducing it by one 128 // is unnecessary. The only caller Dlaqr04 uses only even nshfts. 129 // 130 // The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341: 131 // * ==== NSHFTS is supposed to be even, but if it is odd, 132 // * . then simply reduce it by one. The shuffle above 133 // * . ensures that the dropped shift is real and that 134 // * . the remaining shifts are paired. ==== 135 // * 136 // NS = NSHFTS - MOD( NSHFTS, 2 ) 137 ns := nshfts 138 139 safmin := dlamchS 140 ulp := dlamchP 141 smlnum := safmin * float64(n) / ulp 142 143 // Use accumulated reflections to update far-from-diagonal entries? 144 accum := kacc22 == 1 || kacc22 == 2 145 // If so, exploit the 2×2 block structure? 146 blk22 := ns > 2 && kacc22 == 2 147 148 // Clear trash. 149 if ktop+2 <= kbot { 150 h[(ktop+2)*ldh+ktop] = 0 151 } 152 153 // nbmps = number of 2-shift bulges in the chain. 154 nbmps := ns / 2 155 156 // kdu = width of slab. 157 kdu := 6*nbmps - 3 158 159 // Create and chase chains of nbmps bulges. 160 for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 { 161 ndcol := incol + kdu 162 if accum { 163 impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu) 164 } 165 166 // Near-the-diagonal bulge chase. The following loop performs 167 // the near-the-diagonal part of a small bulge multi-shift QR 168 // sweep. Each 6*nbmps-2 column diagonal chunk extends from 169 // column incol to column ndcol (including both column incol and 170 // column ndcol). The following loop chases a 3*nbmps column 171 // long chain of nbmps bulges 3*nbmps-2 columns to the right. 172 // (incol may be less than ktop and ndcol may be greater than 173 // kbot indicating phantom columns from which to chase bulges 174 // before they are actually introduced or to which to chase 175 // bulges beyond column kbot.) 176 for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ { 177 // Bulges number mtop to mbot are active double implicit 178 // shift bulges. There may or may not also be small 2×2 179 // bulge, if there is room. The inactive bulges (if any) 180 // must wait until the active bulges have moved down the 181 // diagonal to make room. The phantom matrix paradigm 182 // described above helps keep track. 183 184 mtop := max(0, ((ktop-1)-krcol+2)/3) 185 mbot := min(nbmps, (kbot-krcol)/3) - 1 186 m22 := mbot + 1 187 bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2) 188 189 // Generate reflections to chase the chain right one 190 // column. (The minimum value of k is ktop-1.) 191 for m := mtop; m <= mbot; m++ { 192 k := krcol + 3*m 193 if k == ktop-1 { 194 impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh, 195 sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], 196 v[m*ldv:m*ldv+3]) 197 alpha := v[m*ldv] 198 _, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1) 199 continue 200 } 201 beta := h[(k+1)*ldh+k] 202 v[m*ldv+1] = h[(k+2)*ldh+k] 203 v[m*ldv+2] = h[(k+3)*ldh+k] 204 beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1) 205 206 // A bulge may collapse because of vigilant deflation or 207 // destructive underflow. In the underflow case, try the 208 // two-small-subdiagonals trick to try to reinflate the 209 // bulge. 210 if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 { 211 // Typical case: not collapsed (yet). 212 h[(k+1)*ldh+k] = beta 213 h[(k+2)*ldh+k] = 0 214 h[(k+3)*ldh+k] = 0 215 continue 216 } 217 218 // Atypical case: collapsed. Attempt to reintroduce 219 // ignoring H[k+1,k] and H[k+2,k]. If the fill 220 // resulting from the new reflector is too large, 221 // then abandon it. Otherwise, use the new one. 222 var vt [3]float64 223 impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m], 224 si[2*m], sr[2*m+1], si[2*m+1], vt[:]) 225 alpha := vt[0] 226 _, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1) 227 refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k]) 228 229 dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2]) 230 if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum { 231 // Starting a new bulge here would create 232 // non-negligible fill. Use the old one with 233 // trepidation. 234 h[(k+1)*ldh+k] = beta 235 h[(k+2)*ldh+k] = 0 236 h[(k+3)*ldh+k] = 0 237 continue 238 } else { 239 // Starting a new bulge here would create 240 // only negligible fill. Replace the old 241 // reflector with the new one. 242 h[(k+1)*ldh+k] -= refsum 243 h[(k+2)*ldh+k] = 0 244 h[(k+3)*ldh+k] = 0 245 v[m*ldv] = vt[0] 246 v[m*ldv+1] = vt[1] 247 v[m*ldv+2] = vt[2] 248 } 249 } 250 251 // Generate a 2×2 reflection, if needed. 252 if bmp22 { 253 k := krcol + 3*m22 254 if k == ktop-1 { 255 impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh, 256 sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1], 257 v[m22*ldv:m22*ldv+2]) 258 beta := v[m22*ldv] 259 _, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) 260 } else { 261 beta := h[(k+1)*ldh+k] 262 v[m22*ldv+1] = h[(k+2)*ldh+k] 263 beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) 264 h[(k+1)*ldh+k] = beta 265 h[(k+2)*ldh+k] = 0 266 } 267 } 268 269 // Multiply H by reflections from the left. 270 var jbot int 271 switch { 272 case accum: 273 jbot = min(ndcol, kbot) 274 case wantt: 275 jbot = n - 1 276 default: 277 jbot = kbot 278 } 279 for j := max(ktop, krcol); j <= jbot; j++ { 280 mend := min(mbot+1, (j-krcol+2)/3) - 1 281 for m := mtop; m <= mend; m++ { 282 k := krcol + 3*m 283 refsum := v[m*ldv] * (h[(k+1)*ldh+j] + 284 v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j]) 285 h[(k+1)*ldh+j] -= refsum 286 h[(k+2)*ldh+j] -= refsum * v[m*ldv+1] 287 h[(k+3)*ldh+j] -= refsum * v[m*ldv+2] 288 } 289 } 290 if bmp22 { 291 k := krcol + 3*m22 292 for j := max(k+1, ktop); j <= jbot; j++ { 293 refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j]) 294 h[(k+1)*ldh+j] -= refsum 295 h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1] 296 } 297 } 298 299 // Multiply H by reflections from the right. Delay filling in the last row 300 // until the vigilant deflation check is complete. 301 var jtop int 302 switch { 303 case accum: 304 jtop = max(ktop, incol) 305 case wantt: 306 jtop = 0 307 default: 308 jtop = ktop 309 } 310 for m := mtop; m <= mbot; m++ { 311 if v[m*ldv] == 0 { 312 continue 313 } 314 k := krcol + 3*m 315 for j := jtop; j <= min(kbot, k+3); j++ { 316 refsum := v[m*ldv] * (h[j*ldh+k+1] + 317 v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3]) 318 h[j*ldh+k+1] -= refsum 319 h[j*ldh+k+2] -= refsum * v[m*ldv+1] 320 h[j*ldh+k+3] -= refsum * v[m*ldv+2] 321 } 322 if accum { 323 // Accumulate U. (If necessary, update Z later with with an 324 // efficient matrix-matrix multiply.) 325 kms := k - incol 326 for j := max(0, ktop-incol-1); j < kdu; j++ { 327 refsum := v[m*ldv] * (u[j*ldu+kms] + 328 v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2]) 329 u[j*ldu+kms] -= refsum 330 u[j*ldu+kms+1] -= refsum * v[m*ldv+1] 331 u[j*ldu+kms+2] -= refsum * v[m*ldv+2] 332 } 333 } else if wantz { 334 // U is not accumulated, so update Z now by multiplying by 335 // reflections from the right. 336 for j := iloz; j <= ihiz; j++ { 337 refsum := v[m*ldv] * (z[j*ldz+k+1] + 338 v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3]) 339 z[j*ldz+k+1] -= refsum 340 z[j*ldz+k+2] -= refsum * v[m*ldv+1] 341 z[j*ldz+k+3] -= refsum * v[m*ldv+2] 342 } 343 } 344 } 345 346 // Special case: 2×2 reflection (if needed). 347 if bmp22 && v[m22*ldv] != 0 { 348 k := krcol + 3*m22 349 for j := jtop; j <= min(kbot, k+3); j++ { 350 refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2]) 351 h[j*ldh+k+1] -= refsum 352 h[j*ldh+k+2] -= refsum * v[m22*ldv+1] 353 } 354 if accum { 355 kms := k - incol 356 for j := max(0, ktop-incol-1); j < kdu; j++ { 357 refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1]) 358 u[j*ldu+kms] -= refsum 359 u[j*ldu+kms+1] -= refsum * v[m22*ldv+1] 360 } 361 } else if wantz { 362 for j := iloz; j <= ihiz; j++ { 363 refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2]) 364 z[j*ldz+k+1] -= refsum 365 z[j*ldz+k+2] -= refsum * v[m22*ldv+1] 366 } 367 } 368 } 369 370 // Vigilant deflation check. 371 mstart := mtop 372 if krcol+3*mstart < ktop { 373 mstart++ 374 } 375 mend := mbot 376 if bmp22 { 377 mend++ 378 } 379 if krcol == kbot-2 { 380 mend++ 381 } 382 for m := mstart; m <= mend; m++ { 383 k := min(kbot-1, krcol+3*m) 384 385 // The following convergence test requires that the tradition 386 // small-compared-to-nearby-diagonals criterion and the Ahues & 387 // Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter 388 // improves accuracy in some examples. Falling back on an alternate 389 // convergence criterion when tst1 or tst2 is zero (as done here) is 390 // traditional but probably unnecessary. 391 392 if h[(k+1)*ldh+k] == 0 { 393 continue 394 } 395 tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) 396 if tst1 == 0 { 397 if k >= ktop+1 { 398 tst1 += math.Abs(h[k*ldh+k-1]) 399 } 400 if k >= ktop+2 { 401 tst1 += math.Abs(h[k*ldh+k-2]) 402 } 403 if k >= ktop+3 { 404 tst1 += math.Abs(h[k*ldh+k-3]) 405 } 406 if k <= kbot-2 { 407 tst1 += math.Abs(h[(k+2)*ldh+k+1]) 408 } 409 if k <= kbot-3 { 410 tst1 += math.Abs(h[(k+3)*ldh+k+1]) 411 } 412 if k <= kbot-4 { 413 tst1 += math.Abs(h[(k+4)*ldh+k+1]) 414 } 415 } 416 if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { 417 h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) 418 h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) 419 h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) 420 h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) 421 scl := h11 + h12 422 tst2 := h22 * (h11 / scl) 423 if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { 424 h[(k+1)*ldh+k] = 0 425 } 426 } 427 } 428 429 // Fill in the last row of each bulge. 430 mend = min(nbmps, (kbot-krcol-1)/3) - 1 431 for m := mtop; m <= mend; m++ { 432 k := krcol + 3*m 433 refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3] 434 h[(k+4)*ldh+k+1] = -refsum 435 h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1] 436 h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2] 437 } 438 } 439 440 // Use U (if accumulated) to update far-from-diagonal entries in H. 441 // If required, use U to update Z as well. 442 if !accum { 443 continue 444 } 445 var jtop, jbot int 446 if wantt { 447 jtop = 0 448 jbot = n - 1 449 } else { 450 jtop = ktop 451 jbot = kbot 452 } 453 bi := blas64.Implementation() 454 if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 { 455 // Updates not exploiting the 2×2 block structure of U. k0 and nu keep track 456 // of the location and size of U in the special cases of introducing bulges 457 // and chasing bulges off the bottom. In these special cases and in case the 458 // number of shifts is ns = 2, there is no 2×2 block structure to exploit. 459 460 k0 := max(0, ktop-incol-1) 461 nu := kdu - max(0, ndcol-kbot) - k0 462 463 // Horizontal multiply. 464 for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { 465 jlen := min(nh, jbot-jcol+1) 466 bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu, 467 1, u[k0*ldu+k0:], ldu, 468 h[(incol+k0+1)*ldh+jcol:], ldh, 469 0, wh, ldwh) 470 impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh) 471 } 472 473 // Vertical multiply. 474 for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv { 475 jlen := min(nv, max(ktop, incol)-jrow) 476 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, 477 1, h[jrow*ldh+incol+k0+1:], ldh, 478 u[k0*ldu+k0:], ldu, 479 0, wv, ldwv) 480 impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh) 481 } 482 483 // Z multiply (also vertical). 484 if wantz { 485 for jrow := iloz; jrow <= ihiz; jrow += nv { 486 jlen := min(nv, ihiz-jrow+1) 487 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, 488 1, z[jrow*ldz+incol+k0+1:], ldz, 489 u[k0*ldu+k0:], ldu, 490 0, wv, ldwv) 491 impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz) 492 } 493 } 494 495 continue 496 } 497 498 // Updates exploiting U's 2×2 block structure. 499 500 // i2, i4, j2, j4 are the last rows and columns of the blocks. 501 i2 := (kdu + 1) / 2 502 i4 := kdu 503 j2 := i4 - i2 504 j4 := kdu 505 506 // kzs and knz deal with the band of zeros along the diagonal of one of the 507 // triangular blocks. 508 kzs := (j4 - j2) - (ns + 1) 509 knz := ns + 1 510 511 // Horizontal multiply. 512 for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { 513 jlen := min(nh, jbot-jcol+1) 514 515 // Copy bottom of H to top+kzs of scratch (the first kzs 516 // rows get multiplied by zero). 517 impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh) 518 519 // Multiply by U21^T. 520 impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh) 521 bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen, 522 1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh) 523 524 // Multiply top of H by U11^T. 525 bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2, 526 1, u, ldu, h[(incol+1)*ldh+jcol:], ldh, 527 1, wh, ldwh) 528 529 // Copy top of H to bottom of WH. 530 impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh) 531 532 // Multiply by U21^T. 533 bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen, 534 1, u[i2:], ldu, wh[i2*ldwh:], ldwh) 535 536 // Multiply by U22. 537 bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2, 538 1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh, 539 1, wh[i2*ldwh:], ldwh) 540 541 // Copy it back. 542 impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh) 543 } 544 545 // Vertical multiply. 546 for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv { 547 jlen := min(nv, max(incol, ktop)-jrow) 548 549 // Copy right of H to scratch (the first kzs columns get multiplied 550 // by zero). 551 impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv) 552 553 // Multiply by U21. 554 impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv) 555 bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz, 556 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv) 557 558 // Multiply by U11. 559 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2, 560 1, h[jrow*ldh+incol+1:], ldh, u, ldu, 561 1, wv, ldwv) 562 563 // Copy left of H to right of scratch. 564 impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv) 565 566 // Multiply by U21. 567 bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2, 568 1, u[i2:], ldu, wv[i2:], ldwv) 569 570 // Multiply by U22. 571 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2, 572 1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu, 573 1, wv[i2:], ldwv) 574 575 // Copy it back. 576 impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh) 577 } 578 579 if !wantz { 580 continue 581 } 582 // Multiply Z (also vertical). 583 for jrow := iloz; jrow <= ihiz; jrow += nv { 584 jlen := min(nv, ihiz-jrow+1) 585 586 // Copy right of Z to left of scratch (first kzs columns get 587 // multiplied by zero). 588 impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv) 589 590 // Multiply by U12. 591 impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv) 592 bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz, 593 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv) 594 595 // Multiply by U11. 596 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2, 597 1, z[jrow*ldz+incol+1:], ldz, u, ldu, 598 1, wv, ldwv) 599 600 // Copy left of Z to right of scratch. 601 impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv) 602 603 // Multiply by U21. 604 bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2, 605 1, u[i2:], ldu, wv[i2:], ldwv) 606 607 // Multiply by U22. 608 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2, 609 1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu, 610 1, wv[i2:], ldwv) 611 612 // Copy the result back to Z. 613 impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz) 614 } 615 } 616 }