gonum.org/v1/gonum@v0.14.0/lapack/gonum/dlaqr5.go (about) 1 // Copyright ©2016 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package gonum 6 7 import ( 8 "math" 9 10 "gonum.org/v1/gonum/blas" 11 "gonum.org/v1/gonum/blas/blas64" 12 ) 13 14 // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated 15 // block of a Hessenberg matrix. 16 // 17 // wantt and wantz determine whether the quasi-triangular Schur factor and the 18 // orthogonal Schur factor, respectively, will be computed. 19 // 20 // kacc22 specifies the computation mode of far-from-diagonal orthogonal 21 // updates. Permitted values are: 22 // 23 // 0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix 24 // multiply to update far-from-diagonal matrix entries. 25 // 1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to 26 // update far-from-diagonal matrix entries. 27 // 2: Same as kacc22=1. This option used to enable exploiting the 2×2 structure 28 // during matrix multiplications, but this is no longer supported. 29 // 30 // For other values of kacc2 Dlaqr5 will panic. 31 // 32 // n is the order of the Hessenberg matrix H. 33 // 34 // ktop and kbot are indices of the first and last row and column of an isolated 35 // diagonal block upon which the QR sweep will be applied. It must hold that 36 // 37 // ktop == 0, or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and 38 // kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0, 39 // 40 // otherwise Dlaqr5 will panic. 41 // 42 // nshfts is the number of simultaneous shifts. It must be positive and even, 43 // otherwise Dlaqr5 will panic. 44 // 45 // sr and si contain the real and imaginary parts, respectively, of the shifts 46 // of origin that define the multi-shift QR sweep. On return both slices may be 47 // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5 48 // will panic. 49 // 50 // h and ldh represent the Hessenberg matrix H of size n×n. On return 51 // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated 52 // diagonal block in rows and columns ktop through kbot, inclusive. 53 // 54 // iloz and ihiz specify the rows of Z to which transformations will be applied 55 // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5 56 // will panic. 57 // 58 // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep 59 // orthogonal similarity transformation is accumulated into 60 // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced. 61 // 62 // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V 63 // is transposed with respect to the reference netlib implementation. 64 // 65 // u and ldu represent an auxiliary matrix of size (2*nshfts)×(2*nshfts). 66 // 67 // wh and ldwh represent an auxiliary matrix of size (2*nshfts-1)×nh. 68 // 69 // wv and ldwv represent an auxiliary matrix of size nv×(2*nshfts-1). 70 // 71 // Dlaqr5 is an internal routine. It is exported for testing purposes. 72 func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) { 73 switch { 74 case kacc22 != 0 && kacc22 != 1 && kacc22 != 2: 75 panic(badKacc22) 76 case n < 0: 77 panic(nLT0) 78 case ktop < 0 || n <= ktop: 79 panic(badKtop) 80 case kbot < 0 || n <= kbot: 81 panic(badKbot) 82 83 case nshfts < 0: 84 panic(nshftsLT0) 85 case nshfts&0x1 != 0: 86 panic(nshftsOdd) 87 case len(sr) != nshfts: 88 panic(badLenSr) 89 case len(si) != nshfts: 90 panic(badLenSi) 91 92 case ldh < max(1, n): 93 panic(badLdH) 94 case len(h) < (n-1)*ldh+n: 95 panic(shortH) 96 97 case wantz && ihiz >= n: 98 panic(badIhiz) 99 case wantz && iloz < 0 || ihiz < iloz: 100 panic(badIloz) 101 case ldz < 1, wantz && ldz < n: 102 panic(badLdZ) 103 case wantz && len(z) < (n-1)*ldz+n: 104 panic(shortZ) 105 106 case ldv < 3: 107 // V is transposed w.r.t. reference lapack. 108 panic(badLdV) 109 case len(v) < (nshfts/2-1)*ldv+3: 110 panic(shortV) 111 112 case ldu < max(1, 2*nshfts): 113 panic(badLdU) 114 case len(u) < (2*nshfts-1)*ldu+2*nshfts: 115 panic(shortU) 116 117 case nv < 0: 118 panic(nvLT0) 119 case ldwv < max(1, 2*nshfts): 120 panic(badLdWV) 121 case len(wv) < (nv-1)*ldwv+2*nshfts: 122 panic(shortWV) 123 124 case nh < 0: 125 panic(nhLT0) 126 case ldwh < max(1, nh): 127 panic(badLdWH) 128 case len(wh) < (2*nshfts-1)*ldwh+nh: 129 panic(shortWH) 130 131 case ktop > 0 && h[ktop*ldh+ktop-1] != 0: 132 panic(notIsolated) 133 case kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0: 134 panic(notIsolated) 135 } 136 137 // If there are no shifts, then there is nothing to do. 138 if nshfts < 2 { 139 return 140 } 141 // If the active block is empty or 1×1, then there is nothing to do. 142 if ktop >= kbot { 143 return 144 } 145 146 // Shuffle shifts into pairs of real shifts and pairs of complex 147 // conjugate shifts assuming complex conjugate shifts are already 148 // adjacent to one another. 149 for i := 0; i < nshfts-2; i += 2 { 150 if si[i] == -si[i+1] { 151 continue 152 } 153 sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i] 154 si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i] 155 } 156 157 // Note: lapack says that nshfts must be even but allows it to be odd 158 // anyway. We panic above if nshfts is not even, so reducing it by one 159 // is unnecessary. The only caller Dlaqr04 uses only even nshfts. 160 // 161 // The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341: 162 // * ==== NSHFTS is supposed to be even, but if it is odd, 163 // * . then simply reduce it by one. The shuffle above 164 // * . ensures that the dropped shift is real and that 165 // * . the remaining shifts are paired. ==== 166 // * 167 // NS = NSHFTS - MOD( NSHFTS, 2 ) 168 ns := nshfts 169 170 safmin := dlamchS 171 ulp := dlamchP 172 smlnum := safmin * float64(n) / ulp 173 174 // Use accumulated reflections to update far-from-diagonal entries? 175 accum := kacc22 == 1 || kacc22 == 2 176 177 // Clear trash. 178 if ktop+2 <= kbot { 179 h[(ktop+2)*ldh+ktop] = 0 180 } 181 182 // nbmps = number of 2-shift bulges in the chain. 183 nbmps := ns / 2 184 185 // kdu = width of slab. 186 kdu := 4 * nbmps 187 188 // Create and chase chains of nbmps bulges. 189 for incol := ktop - 2*nbmps + 1; incol <= kbot-2; incol += 2 * nbmps { 190 // jtop is an index from which updates from the right start. 191 var jtop int 192 switch { 193 case accum: 194 jtop = max(ktop, incol) 195 case wantt: 196 default: 197 jtop = ktop 198 } 199 ndcol := incol + kdu 200 if accum { 201 impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu) 202 } 203 // Near-the-diagonal bulge chase. The following loop performs 204 // the near-the-diagonal part of a small bulge multi-shift QR 205 // sweep. Each 4*nbmps column diagonal chunk extends from 206 // column incol to column ndcol (including both column incol and 207 // column ndcol). The following loop chases a 2*nbmps+1 column 208 // long chain of nbmps bulges 2*nbmps columns to the right. 209 // (incol may be less than ktop and ndcol may be greater than 210 // kbot indicating phantom columns from which to chase bulges 211 // before they are actually introduced or to which to chase 212 // bulges beyond column kbot.) 213 for krcol := incol; krcol <= min(incol+2*nbmps-1, kbot-2); krcol++ { 214 // Bulges number mtop to mbot are active double implicit 215 // shift bulges. There may or may not also be small 2×2 216 // bulge, if there is room. The inactive bulges (if any) 217 // must wait until the active bulges have moved down the 218 // diagonal to make room. The phantom matrix paradigm 219 // described above helps keep track. 220 mtop := max(0, (ktop-krcol)/2) 221 mbot := min(nbmps, (kbot-krcol-1)/2) - 1 222 m22 := mbot + 1 223 bmp22 := (mbot < nbmps-1) && (krcol+2*m22 == kbot-2) 224 // Generate reflections to chase the chain right one column. 225 // The minimum value of k is ktop-1. 226 if bmp22 { 227 // Special case: 2×2 reflection at bottom treated separately. 228 k := krcol + 2*m22 229 if k == ktop-1 { 230 impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh, 231 sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1], 232 v[m22*ldv:m22*ldv+2]) 233 beta := v[m22*ldv] 234 _, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) 235 } else { 236 beta := h[(k+1)*ldh+k] 237 v[m22*ldv+1] = h[(k+2)*ldh+k] 238 beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) 239 h[(k+1)*ldh+k] = beta 240 h[(k+2)*ldh+k] = 0 241 } 242 // Perform update from right within computational window. 243 t1 := v[m22*ldv] 244 t2 := t1 * v[m22*ldv+1] 245 for j := jtop; j <= min(kbot, k+3); j++ { 246 refsum := h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2] 247 h[j*ldh+k+1] -= refsum * t1 248 h[j*ldh+k+2] -= refsum * t2 249 } 250 // Perform update from left within computational window. 251 var jbot int 252 switch { 253 case accum: 254 jbot = min(ndcol, kbot) 255 case wantt: 256 jbot = n - 1 257 default: 258 jbot = kbot 259 } 260 t1 = v[m22*ldv] 261 t2 = t1 * v[m22*ldv+1] 262 for j := k + 1; j <= jbot; j++ { 263 refsum := h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j] 264 h[(k+1)*ldh+j] -= refsum * t1 265 h[(k+2)*ldh+j] -= refsum * t2 266 } 267 // The following convergence test requires that the traditional 268 // small-compared-to-nearby-diagonals criterion and the Ahues & 269 // Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter 270 // improves accuracy in some examples. Falling back on an alternate 271 // convergence criterion when tst1 or tst2 is zero (as done here) is 272 // traditional but probably unnecessary. 273 if k >= ktop && h[(k+1)*ldh+k] != 0 { 274 tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) 275 if tst1 == 0 { 276 if k >= ktop+1 { 277 tst1 += math.Abs(h[k*ldh+k-1]) 278 } 279 if k >= ktop+2 { 280 tst1 += math.Abs(h[k*ldh+k-2]) 281 } 282 if k >= ktop+3 { 283 tst1 += math.Abs(h[k*ldh+k-3]) 284 } 285 if k <= kbot-2 { 286 tst1 += math.Abs(h[(k+2)*ldh+k+1]) 287 } 288 if k <= kbot-3 { 289 tst1 += math.Abs(h[(k+3)*ldh+k+1]) 290 } 291 if k <= kbot-4 { 292 tst1 += math.Abs(h[(k+4)*ldh+k+1]) 293 } 294 } 295 if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { 296 h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) 297 h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) 298 h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) 299 h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) 300 scl := h11 + h12 301 tst2 := h22 * (h11 / scl) 302 if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { 303 h[(k+1)*ldh+k] = 0 304 } 305 } 306 } 307 // Accumulate orthogonal transformations. 308 if accum { 309 kms := k - incol - 1 310 t1 = v[m22*ldv] 311 t2 = t1 * v[m22*ldv+1] 312 for j := max(0, ktop-incol-1); j < kdu; j++ { 313 refsum := u[j*ldu+kms+1] + v[m22*ldv+1]*u[j*ldu+kms+2] 314 u[j*ldu+kms+1] -= refsum * t1 315 u[j*ldu+kms+2] -= refsum * t2 316 } 317 } else if wantz { 318 t1 = v[m22*ldv] 319 t2 = t1 * v[m22*ldv+1] 320 for j := iloz; j <= ihiz; j++ { 321 refsum := z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2] 322 z[j*ldz+k+1] -= refsum * t1 323 z[j*ldz+k+2] -= refsum * t2 324 } 325 } 326 } 327 // Normal case: Chain of 3×3 reflections. 328 for m := mbot; m >= mtop; m-- { 329 k := krcol + 2*m 330 if k == ktop-1 { 331 impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh, 332 sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], 333 v[m*ldv:m*ldv+3]) 334 alpha := v[m*ldv] 335 _, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1) 336 } else { 337 // Perform delayed transformation of row below m-th bulge. 338 // Exploit fact that first two elements of row are actually 339 // zero. 340 t1 := v[m*ldv] 341 t2 := t1 * v[m*ldv+1] 342 t3 := t1 * v[m*ldv+2] 343 refsum := v[m*ldv+2] * h[(k+3)*ldh+k+2] 344 h[(k+3)*ldh+k] = -refsum * t1 345 h[(k+3)*ldh+k+1] = -refsum * t2 346 h[(k+3)*ldh+k+2] -= refsum * t3 347 // Calculate reflection to move m-th bulge one step. 348 beta := h[(k+1)*ldh+k] 349 v[m*ldv+1] = h[(k+2)*ldh+k] 350 v[m*ldv+2] = h[(k+3)*ldh+k] 351 beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1) 352 // A bulge may collapse because of vigilant deflation or 353 // destructive underflow. In the underflow case, try the 354 // two-small-subdiagonals trick to try to reinflate the 355 // bulge. 356 if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 { 357 // Typical case: not collapsed (yet). 358 h[(k+1)*ldh+k] = beta 359 h[(k+2)*ldh+k] = 0 360 h[(k+3)*ldh+k] = 0 361 } else { 362 // Atypical case: collapsed. Attempt to reintroduce 363 // ignoring H[k+1,k] and H[k+2,k]. If the fill resulting 364 // from the new reflector is too large, then abandon it. 365 // Otherwise, use the new one. 366 var vt [3]float64 367 impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, 368 sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], 369 vt[:]) 370 _, vt[0] = impl.Dlarfg(3, vt[0], vt[1:3], 1) 371 t1 = vt[0] 372 t2 = t1 * vt[1] 373 t3 = t1 * vt[2] 374 refsum = h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k] 375 dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2]) 376 if math.Abs(h[(k+2)*ldh+k]-refsum*t2)+math.Abs(refsum*t3) > ulp*dsum { 377 // Starting a new bulge here would create 378 // non-negligible fill. Use the old one with 379 // trepidation. 380 h[(k+1)*ldh+k] = beta 381 h[(k+2)*ldh+k] = 0 382 h[(k+3)*ldh+k] = 0 383 } else { 384 // Starting a new bulge here would create only 385 // negligible fill. Replace the old reflector with 386 // the new one. 387 h[(k+1)*ldh+k] -= refsum * t1 388 h[(k+2)*ldh+k] = 0 389 h[(k+3)*ldh+k] = 0 390 v[m*ldv] = vt[0] 391 v[m*ldv+1] = vt[1] 392 v[m*ldv+2] = vt[2] 393 } 394 } 395 } 396 // Apply reflection from the right and the first column of 397 // update from the left. These updates are required for the 398 // vigilant deflation check. We still delay most of the updates 399 // from the left for efficiency. 400 t1 := v[m*ldv] 401 t2 := t1 * v[m*ldv+1] 402 t3 := t1 * v[m*ldv+2] 403 for j := jtop; j <= min(kbot, k+3); j++ { 404 refsum := h[j*ldh+k+1] + v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3] 405 h[j*ldh+k+1] -= refsum * t1 406 h[j*ldh+k+2] -= refsum * t2 407 h[j*ldh+k+3] -= refsum * t3 408 } 409 // Perform update from left for subsequent column. 410 refsum := h[(k+1)*ldh+k+1] + v[m*ldv+1]*h[(k+2)*ldh+k+1] + v[m*ldv+2]*h[(k+3)*ldh+k+1] 411 h[(k+1)*ldh+k+1] -= refsum * t1 412 h[(k+2)*ldh+k+1] -= refsum * t2 413 h[(k+3)*ldh+k+1] -= refsum * t3 414 // The following convergence test requires that the tradition 415 // small-compared-to-nearby-diagonals criterion and the Ahues & 416 // Tisseur (LAWN 122, 1997) criteria both be satisfied. The 417 // latter improves accuracy in some examples. Falling back on an 418 // alternate convergence criterion when tst1 or tst2 is zero (as 419 // done here) is traditional but probably unnecessary. 420 if k < ktop { 421 continue 422 } 423 if h[(k+1)*ldh+k] != 0 { 424 tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) 425 if tst1 == 0 { 426 if k >= ktop+1 { 427 tst1 += math.Abs(h[k*ldh+k-1]) 428 } 429 if k >= ktop+2 { 430 tst1 += math.Abs(h[k*ldh+k-2]) 431 } 432 if k >= ktop+3 { 433 tst1 += math.Abs(h[k*ldh+k-3]) 434 } 435 if k <= kbot-2 { 436 tst1 += math.Abs(h[(k+2)*ldh+k+1]) 437 } 438 if k <= kbot-3 { 439 tst1 += math.Abs(h[(k+3)*ldh+k+1]) 440 } 441 if k <= kbot-4 { 442 tst1 += math.Abs(h[(k+4)*ldh+k+1]) 443 } 444 } 445 if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { 446 h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) 447 h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) 448 h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) 449 h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) 450 scl := h11 + h12 451 tst2 := h22 * (h11 / scl) 452 if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { 453 h[(k+1)*ldh+k] = 0 454 } 455 } 456 } 457 } 458 // Multiply H by reflections from the left. 459 var jbot int 460 switch { 461 case accum: 462 jbot = min(ndcol, kbot) 463 case wantt: 464 jbot = n - 1 465 default: 466 jbot = kbot 467 } 468 for m := mbot; m >= mtop; m-- { 469 k := krcol + 2*m 470 t1 := v[m*ldv] 471 t2 := t1 * v[m*ldv+1] 472 t3 := t1 * v[m*ldv+2] 473 for j := max(ktop, krcol+2*(m+1)); j <= jbot; j++ { 474 refsum := h[(k+1)*ldh+j] + v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j] 475 h[(k+1)*ldh+j] -= refsum * t1 476 h[(k+2)*ldh+j] -= refsum * t2 477 h[(k+3)*ldh+j] -= refsum * t3 478 } 479 } 480 // Accumulate orthogonal transformations. 481 if accum { 482 // Accumulate U. If necessary, update Z later with an 483 // efficient matrix-matrix multiply. 484 for m := mbot; m >= mtop; m-- { 485 k := krcol + 2*m 486 kms := k - incol - 1 487 i2 := max(0, ktop-incol-1) 488 i2 = max(i2, kms-(krcol-incol)) 489 i4 := min(kdu, krcol+2*mbot-incol+5) 490 t1 := v[m*ldv] 491 t2 := t1 * v[m*ldv+1] 492 t3 := t1 * v[m*ldv+2] 493 for j := i2; j < i4; j++ { 494 refsum := u[j*ldu+kms+1] + v[m*ldv+1]*u[j*ldu+kms+2] + v[m*ldv+2]*u[j*ldu+kms+3] 495 u[j*ldu+kms+1] -= refsum * t1 496 u[j*ldu+kms+2] -= refsum * t2 497 u[j*ldu+kms+3] -= refsum * t3 498 } 499 } 500 } else if wantz { 501 // U is not accumulated, so update Z now by multiplying by 502 // reflections from the right. 503 for m := mbot; m >= mtop; m-- { 504 k := krcol + 2*m 505 t1 := v[m*ldv] 506 t2 := t1 * v[m*ldv+1] 507 t3 := t1 * v[m*ldv+2] 508 for j := iloz; j <= ihiz; j++ { 509 refsum := z[j*ldz+k+1] + v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3] 510 z[j*ldz+k+1] -= refsum * t1 511 z[j*ldz+k+2] -= refsum * t2 512 z[j*ldz+k+3] -= refsum * t3 513 } 514 } 515 } 516 } 517 // Use U (if accumulated) to update far-from-diagonal entries in H. 518 // If required, use U to update Z as well. 519 if !accum { 520 continue 521 } 522 jtop, jbot := ktop, kbot 523 if wantt { 524 jtop = 0 525 jbot = n - 1 526 } 527 bi := blas64.Implementation() 528 k1 := max(0, ktop-incol-1) 529 nu := kdu - max(0, ndcol-kbot) - k1 530 // Horizontal multiply. 531 for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { 532 jlen := min(nh, jbot-jcol+1) 533 bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu, 534 1, u[k1*ldu+k1:], ldu, 535 h[(incol+k1+1)*ldh+jcol:], ldh, 536 0, wh, ldwh) 537 impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k1+1)*ldh+jcol:], ldh) 538 } 539 // Vertical multiply. 540 for jrow := jtop; jrow < max(ktop, incol); jrow += nv { 541 jlen := min(nv, max(ktop, incol)-jrow) 542 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, 543 1, h[jrow*ldh+incol+k1+1:], ldh, 544 u[k1*ldu+k1:], ldu, 545 0, wv, ldwv) 546 impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k1+1:], ldh) 547 } 548 // Z multiply (also vertical). 549 if wantz { 550 for jrow := iloz; jrow <= ihiz; jrow += nv { 551 jlen := min(nv, ihiz-jrow+1) 552 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, 553 1, z[jrow*ldz+incol+k1+1:], ldz, 554 u[k1*ldu+k1:], ldu, 555 0, wv, ldwv) 556 impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k1+1:], ldz) 557 } 558 } 559 } 560 }