dnl  Listing origin:
dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/sqr_basecase.asm

dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO:
C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
C    scheduling could improve things by several cycles per outer iteration.
C  * In the Lam3...Lam1 code, keep accumulation operands in registers, without
C    storing intermediates to rp.
C  * We might want to keep 32 in a free mm register, since the register form is
C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
C  * Look into different loop alignment, we now expand the code about 50 bytes
C    with possibly needless alignment.
C  * Use OSP, should solve feed-in latency problems.
C  * Address relative slowness for un<=3 for Pentium M.  The old code is there
C    considerably faster.  (1:20/14, 2:34/32, 3:66/57)

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C un		sp + 12

C OVERVIEW (derived from the code in this file)
C
C The three arguments are read from the stack.  Limbs are 32 bits; rp
C receives 2*un limbs.  Dispatch is on un:
C
C   un = 1	L(un1): a single mul, stores rp[0..1] and returns.  An un of 0
C		takes this path too, the cmp/jc does not tell them apart.
C   un = 2	L(un2): straight-line MMX code, stores rp[0..3] and returns.
C   un = 3, 4	L(un3), L(un4): store up[0]*up[1..un-1] at rp[1..un], then
C		join the shared tail at L(am1), L(am2) respectively.
C   un >= 5	L(big): the same first row, done by one of four 4-way unrolled
C		store-only loops, then further rows through the add-multiply
C		passes chained under L(outer), then the L(am3), L(am2), L(am1)
C		tail.
C
C Every path except un = 1 and un = 2 ends in the "diag" code.  That code
C reloads the arguments from the stack, doubles the limbs accumulated in
C rp[1..2*un-2], adds in the squares up[i]^2 and stores rp[0..2*un-1].
C
C Registers on the L(big) path:
C   esi, edi	up and rp position of the current row.  Pushed on entry
C		together with ebx, popped again just before L(am3).
C   eax, edx	working up and rp pointers inside a row.
C   ebx		starts as un-4, is copied to ecx as the inner loop count at
C		the start of each row, and drops by 4 each time the am 0 pass
C		completes.
C   mm7		multiplier limb of the current row.
C   mm6		running 64-bit sum: its low 32 bits are stored, then it is
C		shifted right by 32 to become the carry into the next limb.
C
C All MMX paths execute emms before ret.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C un

C Dispatch on un.  Each cmp serves two branches: jc is taken when un is
C below the constant, jz when it is equal.
	cmp	$2, %ecx
	jc	L(un1)
	jz	L(un2)
	cmp	$4, %ecx
	jc	L(un3)
	jz	L(un4)
	jmp	L(big)

C un = 1: rp[1]:rp[0] = up[0]^2 with the integer multiplier, no MMX used.
L(un1):	mov	(%eax), %eax
	mov	%edx, %ecx		C keep rp in ecx, mul overwrites edx
	mul	%eax
	mov	%eax, (%ecx)
	mov	%edx, 4(%ecx)
	ret

C un = 2: three products up0^2, up0*up1, up1^2.  The cross product has to be
C doubled, which would not fit in 64 bits, so it is split into its low 31
C bits (shifted left by one) and its high 33 bits (taken with a right shift
C by 31) and the two parts are added at different limb positions.
L(un2):	movd	(%eax), %mm0		C un=2
	movd	(%eax), %mm2		C un=2
	movd	4(%eax), %mm1		C un=2
	pmuludq	%mm0, %mm0		C 64b weight 0 un=2
	pmuludq	%mm1, %mm2		C 64b weight 32 un=2
	pmuludq	%mm1, %mm1		C 64b weight 64 un=2
	movd	%mm0, (%edx)		C un=2
	psrlq	$32, %mm0		C 32b weight 32 un=2
	pcmpeqd	%mm7, %mm7		C un=2
	psrlq	$33, %mm7		C 0x000000007FFFFFFF un=2
	pand	%mm2, %mm7		C 31b weight 32 un=2
	psrlq	$31, %mm2		C 33b weight 65 un=2
	psllq	$1, %mm7		C 31b weight 33 un=2
	paddq	%mm7, %mm0		C un=2
	movd	%mm0, 4(%edx)		C un=2
	psrlq	$32, %mm0		C un=2
	paddq	%mm2, %mm1		C un=2
	paddq	%mm0, %mm1		C un=2
	movd	%mm1, 8(%edx)		C un=2
	psrlq	$32, %mm1		C un=2
	movd	%mm1, 12(%edx)		C un=2
	emms
	ret

C un = 3: rp[1..3] = up[0] * up[1..2], then advance both pointers by one limb
C and let L(am1) add up[1]*up[2].
L(un3):	movd	(%eax), %mm7		C un=3
	movd	4(%eax), %mm6		C un=3
	pmuludq	%mm7, %mm6		C un=3
	movd	8(%eax), %mm2		C un=3
	pmuludq	%mm7, %mm2		C un=3
	movd	%mm6, 4(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm2, %mm6		C un=3
	movd	%mm6, 8(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	movd	%mm6, 12(%edx)		C un=3
	lea	4(%edx), %edx		C un=3
	lea	4(%eax), %eax		C un=3
	jmp	L(am1)

C un = 4: rp[1..4] = up[0] * up[1..3], then advance both pointers by one limb
C and let L(am2) and L(am1) add the remaining two rows.
L(un4):	movd	(%eax), %mm7		C un=4
	movd	4(%eax), %mm6		C un=4
	pmuludq	%mm7, %mm6		C un=4
	movd	8(%eax), %mm0		C un=4
	pmuludq	%mm7, %mm0		C un=4
	movd	12(%eax), %mm1		C un=4
	pmuludq	%mm7, %mm1		C un=4
	movd	%mm6, 4(%edx)		C un=4
	psrlq	$32, %mm6		C un=4
	paddq	%mm0, %mm6		C un=4
	movd	%mm6, 8(%edx)		C un=4
	psrlq	$32, %mm6		C un=4
	paddq	%mm1, %mm6		C un=4
	movd	%mm6, 12(%edx)		C un=4
	psrlq	$32, %mm6		C un=4
	movd	%mm6, 16(%edx)		C un=4
	lea	4(%edx), %edx		C un=4
	lea	4(%eax), %eax		C un=4
	jmp	L(am2)

C un >= 5.  Save the three registers used below, load up[0] as the first
C multiplier and pick the first-row loop from the low two bits of un:
C   un mod 4 = 0  goes to L(3m)
C   un mod 4 = 3  goes to L(2m)
C   un mod 4 = 1  goes to L(0m)
C   un mod 4 = 2  falls into L(1m)
C so the digit in each label equals (un-1) mod 4.
L(big):	push	%esi
	push	%ebx
	push	%edi
	pxor	%mm6, %mm6
	movd	(%eax), %mm7		C
	lea	4(%eax), %esi		C init up, up++
	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
	lea	4(%edx), %edi		C init rp, rp++
	lea	4(%edx), %edx		C rp2++
	lea	-4(%ecx), %ebx		C loop count
	and	$3, %ecx
	jz	L(3m)
	cmp	$2, %ecx
	ja	L(2m)
	jb	L(0m)

C First-row loops "m 0" to "m 3".  Each multiplies successive up limbs by
C mm7 and stores the result limbs through edx without reading rp.  The four
C variants differ in where the 4-way unrolled loop is entered and in the
C store offsets; each leaves through a jmp into the matching L(0)..L(3)
C label inside the add-multiply chain, where its last limbs are stored.

L(1m):
	movd	(%eax), %mm4		C m 1
	lea	(%ebx), %ecx		C inner loop count m 1
	pmuludq	%mm7, %mm4		C m 1
	movd	4(%eax), %mm3		C m 1
	pmuludq	%mm7, %mm3		C m 1
	movd	8(%eax), %mm0		C m 1
	jmp	L(m01)			C m 1
	ALIGN(16)			C m 1
L(lpm1):
	pmuludq	%mm7, %mm4		C m 1
	paddq	%mm0, %mm6		C m 1
	movd	4(%eax), %mm3		C m 1
	movd	%mm6, -8(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	pmuludq	%mm7, %mm3		C m 1
	paddq	%mm1, %mm6		C m 1
	movd	8(%eax), %mm0		C m 1
	movd	%mm6, -4(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
L(m01):	pmuludq	%mm7, %mm0		C m 1
	paddq	%mm4, %mm6		C m 1
	movd	12(%eax), %mm1		C m 1
	movd	%mm6, (%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	pmuludq	%mm7, %mm1		C m 1
	paddq	%mm3, %mm6		C m 1
	movd	16(%eax), %mm4		C m 1
	movd	%mm6, 4(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	lea	16(%eax), %eax		C m 1
	lea	16(%edx), %edx		C m 1
	sub	$4, %ecx		C m 1
	ja	L(lpm1)			C m 1
	pmuludq	%mm7, %mm4		C m 1
	paddq	%mm0, %mm6		C m 1
	movd	%mm6, -8(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	paddq	%mm1, %mm6		C m 1
	jmp	L(0)

L(2m):
	movd	(%eax), %mm1		C m 2
	lea	(%ebx), %ecx		C inner loop count m 2
	pmuludq	%mm7, %mm1		C m 2
	movd	4(%eax), %mm4		C m 2
	pmuludq	%mm7, %mm4		C m 2
	movd	8(%eax), %mm3		C m 2
	jmp	L(m10)			C m 2
	ALIGN(16)			C m 2
L(lpm2):
	pmuludq	%mm7, %mm4		C m 2
	paddq	%mm0, %mm6		C m 2
	movd	8(%eax), %mm3		C m 2
	movd	%mm6, -4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
L(m10):	pmuludq	%mm7, %mm3		C m 2
	paddq	%mm1, %mm6		C m 2
	movd	12(%eax), %mm0		C m 2
	movd	%mm6, (%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	pmuludq	%mm7, %mm0		C m 2
	paddq	%mm4, %mm6		C m 2
	movd	16(%eax), %mm1		C m 2
	movd	%mm6, 4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	pmuludq	%mm7, %mm1		C m 2
	paddq	%mm3, %mm6		C m 2
	movd	20(%eax), %mm4		C m 2
	movd	%mm6, 8(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	lea	16(%eax), %eax		C m 2
	lea	16(%edx), %edx		C m 2
	sub	$4, %ecx		C m 2
	ja	L(lpm2)			C m 2
	pmuludq	%mm7, %mm4		C m 2
	paddq	%mm0, %mm6		C m 2
	movd	%mm6, -4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	paddq	%mm1, %mm6		C m 2
	jmp	L(1)

L(3m):
	movd	(%eax), %mm0		C m 3
	lea	(%ebx), %ecx		C inner loop count m 3
	pmuludq	%mm7, %mm0		C m 3
	movd	4(%eax), %mm1		C m 3
	pmuludq	%mm7, %mm1		C m 3
	movd	8(%eax), %mm4		C m 3
	jmp	L(lpm3)			C m 3
	ALIGN(16)			C m 3
L(lpm3):
	pmuludq	%mm7, %mm4		C m 3
	paddq	%mm0, %mm6		C m 3
	movd	12(%eax), %mm3		C m 3
	movd	%mm6, (%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm3		C m 3
	paddq	%mm1, %mm6		C m 3
	movd	16(%eax), %mm0		C m 3
	movd	%mm6, 4(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm0		C m 3
	paddq	%mm4, %mm6		C m 3
	movd	20(%eax), %mm1		C m 3
	movd	%mm6, 8(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm1		C m 3
	paddq	%mm3, %mm6		C m 3
	movd	24(%eax), %mm4		C m 3
	movd	%mm6, 12(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	lea	16(%eax), %eax		C m 3
	lea	16(%edx), %edx		C m 3
	sub	$4, %ecx		C m 3
	ja	L(lpm3)			C m 3
	pmuludq	%mm7, %mm4		C m 3
	paddq	%mm0, %mm6		C m 3
	movd	%mm6, (%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	paddq	%mm1, %mm6		C m 3
	jmp	L(2)

L(0m):
	movd	(%eax), %mm3		C m 0
	lea	(%ebx), %ecx		C inner loop count m 0
	pmuludq	%mm7, %mm3		C m 0
	movd	4(%eax), %mm0		C m 0
	pmuludq	%mm7, %mm0		C m 0
	movd	8(%eax), %mm1		C m 0
	jmp	L(m00)			C m 0
	ALIGN(16)			C m 0
L(lpm0):
	pmuludq	%mm7, %mm4		C m 0
	paddq	%mm0, %mm6		C m 0
	movd	(%eax), %mm3		C m 0
	movd	%mm6, -12(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	pmuludq	%mm7, %mm3		C m 0
	paddq	%mm1, %mm6		C m 0
	movd	4(%eax), %mm0		C m 0
	movd	%mm6, -8(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	pmuludq	%mm7, %mm0		C m 0
	paddq	%mm4, %mm6		C m 0
	movd	8(%eax), %mm1		C m 0
	movd	%mm6, -4(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
L(m00):	pmuludq	%mm7, %mm1		C m 0
	paddq	%mm3, %mm6		C m 0
	movd	12(%eax), %mm4		C m 0
	movd	%mm6, (%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	lea	16(%eax), %eax		C m 0
	lea	16(%edx), %edx		C m 0
	sub	$4, %ecx		C m 0
	ja	L(lpm0)			C m 0
	pmuludq	%mm7, %mm4		C m 0
	paddq	%mm0, %mm6		C m 0
	movd	%mm6, -12(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	paddq	%mm1, %mm6		C m 0
	jmp	L(3)

C Add-multiply passes "am 3", "am 2", "am 1", "am 0", laid out back to back
C so that each falls through into the next.  Every pass starts a new row:
C it advances edi by two limbs and esi by one, loads the new multiplier into
C mm7, clears the carry in mm6, and then adds mm7 times successive up limbs
C into the rp limbs at edx.  After the am 0 pass ebx is reduced by 4 and the
C chain restarts at L(outer) while ebx stays above zero.

L(outer):
	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 3
	mov	%edi, %edx		C rp2 = rp am 3
	lea	4(%esi), %esi		C up++ am 3
	lea	(%esi), %eax		C up2 = up am 3
	movd	(%eax), %mm0		C am 3
	lea	(%ebx), %ecx		C inner loop count am 3
	pxor	%mm6, %mm6		C am 3
	pmuludq	%mm7, %mm0		C am 3
	movd	4(%eax), %mm1		C am 3
	movd	(%edx), %mm4		C am 3
	pmuludq	%mm7, %mm1		C am 3
	movd	8(%eax), %mm2		C am 3
	paddq	%mm0, %mm4		C am 3
	movd	4(%edx), %mm5		C am 3
	jmp	L(lam3)			C am 3
	ALIGN(16)			C am 3
L(lam3):
	pmuludq	%mm7, %mm2		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	12(%eax), %mm3		C am 3
	paddq	%mm1, %mm5		C am 3
	movd	8(%edx), %mm4		C am 3
	movd	%mm6, (%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm3		C am 3
	paddq	%mm5, %mm6		C am 3
	movd	16(%eax), %mm0		C am 3
	paddq	%mm2, %mm4		C am 3
	movd	12(%edx), %mm5		C am 3
	movd	%mm6, 4(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm0		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	20(%eax), %mm1		C am 3
	paddq	%mm3, %mm5		C am 3
	movd	16(%edx), %mm4		C am 3
	movd	%mm6, 8(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm1		C am 3
	paddq	%mm5, %mm6		C am 3
	movd	24(%eax), %mm2		C am 3
	paddq	%mm0, %mm4		C am 3
	movd	20(%edx), %mm5		C am 3
	movd	%mm6, 12(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	lea	16(%eax), %eax		C am 3
	lea	16(%edx), %edx		C am 3
	sub	$4, %ecx		C am 3
	ja	L(lam3)			C am 3
	pmuludq	%mm7, %mm2		C am 3
	paddq	%mm4, %mm6		C am 3
	paddq	%mm1, %mm5		C am 3
	movd	8(%edx), %mm4		C am 3
	movd	%mm6, (%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	paddq	%mm5, %mm6		C am 3
	paddq	%mm2, %mm4		C am 3
C L(2) is also the exit target of the L(3m) first-row loop.
L(2):	movd	%mm6, 4(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	%mm6, 8(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	movd	%mm6, 12(%edx)		C am 3

	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 2
	mov	%edi, %edx		C rp2 = rp am 2
	lea	4(%esi), %esi		C up++ am 2
	lea	(%esi), %eax		C up2 = up am 2
	movd	(%eax), %mm1		C am 2
	lea	(%ebx), %ecx		C inner loop count am 2
	pxor	%mm6, %mm6		C am 2
	pmuludq	%mm7, %mm1		C am 2
	movd	4(%eax), %mm2		C am 2
	movd	(%edx), %mm5		C am 2
	pmuludq	%mm7, %mm2		C am 2
	movd	8(%eax), %mm3		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	jmp	L(am10)			C am 2
	ALIGN(16)			C am 2
L(lam2):
	pmuludq	%mm7, %mm2		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	8(%eax), %mm3		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	movd	%mm6, -4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
L(am10):
	pmuludq	%mm7, %mm3		C am 2
	paddq	%mm5, %mm6		C am 2
	movd	12(%eax), %mm0		C am 2
	paddq	%mm2, %mm4		C am 2
	movd	8(%edx), %mm5		C am 2
	movd	%mm6, (%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	pmuludq	%mm7, %mm0		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	16(%eax), %mm1		C am 2
	paddq	%mm3, %mm5		C am 2
	movd	12(%edx), %mm4		C am 2
	movd	%mm6, 4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	pmuludq	%mm7, %mm1		C am 2
	paddq	%mm5, %mm6		C am 2
	movd	20(%eax), %mm2		C am 2
	paddq	%mm0, %mm4		C am 2
	movd	16(%edx), %mm5		C am 2
	movd	%mm6, 8(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	lea	16(%eax), %eax		C am 2
	lea	16(%edx), %edx		C am 2
	sub	$4, %ecx		C am 2
	ja	L(lam2)			C am 2
	pmuludq	%mm7, %mm2		C am 2
	paddq	%mm4, %mm6		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	movd	%mm6, -4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	paddq	%mm5, %mm6		C am 2
	paddq	%mm2, %mm4		C am 2
C L(1) is also the exit target of the L(2m) first-row loop.
L(1):	movd	%mm6, (%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	%mm6, 4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	movd	%mm6, 8(%edx)		C am 2

	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 1
	mov	%edi, %edx		C rp2 = rp am 1
	lea	4(%esi), %esi		C up++ am 1
	lea	(%esi), %eax		C up2 = up am 1
	movd	(%eax), %mm2		C am 1
	lea	(%ebx), %ecx		C inner loop count am 1
	pxor	%mm6, %mm6		C am 1
	pmuludq	%mm7, %mm2		C am 1
	movd	4(%eax), %mm3		C am 1
	movd	(%edx), %mm4		C am 1
	pmuludq	%mm7, %mm3		C am 1
	movd	8(%eax), %mm0		C am 1
	paddq	%mm2, %mm4		C am 1
	movd	4(%edx), %mm5		C am 1
	jmp	L(am01)			C am 1
	ALIGN(16)			C am 1
L(lam1):
	pmuludq	%mm7, %mm2		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	4(%eax), %mm3		C am 1
	paddq	%mm1, %mm5		C am 1
	movd	(%edx), %mm4		C am 1
	movd	%mm6, -8(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	pmuludq	%mm7, %mm3		C am 1
	paddq	%mm5, %mm6		C am 1
	movd	8(%eax), %mm0		C am 1
	paddq	%mm2, %mm4		C am 1
	movd	4(%edx), %mm5		C am 1
	movd	%mm6, -4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
L(am01):
	pmuludq	%mm7, %mm0		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	12(%eax), %mm1		C am 1
	paddq	%mm3, %mm5		C am 1
	movd	8(%edx), %mm4		C am 1
	movd	%mm6, (%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	pmuludq	%mm7, %mm1		C am 1
	paddq	%mm5, %mm6		C am 1
	movd	16(%eax), %mm2		C am 1
	paddq	%mm0, %mm4		C am 1
	movd	12(%edx), %mm5		C am 1
	movd	%mm6, 4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	lea	16(%eax), %eax		C am 1
	lea	16(%edx), %edx		C am 1
	sub	$4, %ecx		C am 1
	ja	L(lam1)			C am 1
	pmuludq	%mm7, %mm2		C am 1
	paddq	%mm4, %mm6		C am 1
	paddq	%mm1, %mm5		C am 1
	movd	(%edx), %mm4		C am 1
	movd	%mm6, -8(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	paddq	%mm5, %mm6		C am 1
	paddq	%mm2, %mm4		C am 1
C L(0) is also the exit target of the L(1m) first-row loop.
L(0):	movd	%mm6, -4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	%mm6, (%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	movd	%mm6, 4(%edx)		C am 1

	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 0
	mov	%edi, %edx		C rp2 = rp am 0
	lea	4(%esi), %esi		C up++ am 0
	lea	(%esi), %eax		C up2 = up am 0
	movd	(%eax), %mm3		C am 0
	lea	(%ebx), %ecx		C inner loop count am 0
	pxor	%mm6, %mm6		C am 0
	pmuludq	%mm7, %mm3		C am 0
	movd	4(%eax), %mm0		C am 0
	movd	(%edx), %mm5		C am 0
	pmuludq	%mm7, %mm0		C am 0
	movd	8(%eax), %mm1		C am 0
	paddq	%mm3, %mm5		C am 0
	movd	4(%edx), %mm4		C am 0
	jmp	L(am00)			C am 0
	ALIGN(16)			C am 0
L(lam0):
	pmuludq	%mm7, %mm2		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	(%eax), %mm3		C am 0
	paddq	%mm1, %mm5		C am 0
	movd	-4(%edx), %mm4		C am 0
	movd	%mm6, -12(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	pmuludq	%mm7, %mm3		C am 0
	paddq	%mm5, %mm6		C am 0
	movd	4(%eax), %mm0		C am 0
	paddq	%mm2, %mm4		C am 0
	movd	(%edx), %mm5		C am 0
	movd	%mm6, -8(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	pmuludq	%mm7, %mm0		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	8(%eax), %mm1		C am 0
	paddq	%mm3, %mm5		C am 0
	movd	4(%edx), %mm4		C am 0
	movd	%mm6, -4(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
L(am00):
	pmuludq	%mm7, %mm1		C am 0
	paddq	%mm5, %mm6		C am 0
	movd	12(%eax), %mm2		C am 0
	paddq	%mm0, %mm4		C am 0
	movd	8(%edx), %mm5		C am 0
	movd	%mm6, (%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	lea	16(%eax), %eax		C am 0
	lea	16(%edx), %edx		C am 0
	sub	$4, %ecx		C am 0
	ja	L(lam0)			C am 0
	pmuludq	%mm7, %mm2		C am 0
	paddq	%mm4, %mm6		C am 0
	paddq	%mm1, %mm5		C am 0
	movd	-4(%edx), %mm4		C am 0
	movd	%mm6, -12(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	paddq	%mm5, %mm6		C am 0
	paddq	%mm2, %mm4		C am 0
C L(3) is also the exit target of the L(0m) first-row loop.
L(3):	movd	%mm6, -8(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	%mm6, -4(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	movd	%mm6, (%edx)		C am 0
	sub	$4, %ebx		C am 0
	ja	L(outer)		C am 0

C Leave the outer loop: hand the row pointers to eax/edx, which is what the
C tail below (also entered from L(un3) and L(un4)) works with, and restore
C the registers pushed at L(big).  The stack must be balanced again here
C because the diag code addresses the arguments relative to esp.
	mov	%edi, %edx
	mov	%esi, %eax
	pop	%edi
	pop	%ebx
	pop	%esi

C Tail: the last three rows, of three, two and one product, written out as
C straight-line code.  Each stage stores its top limb to rp and the next
C stage loads it again (see the FIXME notes).

L(am3):				C up[un-1..un-3] x up[un-4]
	lea	8(%edx), %edx		C rp2 += 2
	movd	(%eax), %mm7
	movd	4(%eax), %mm1
	movd	8(%eax), %mm2
	movd	12(%eax), %mm3
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	8(%edx), %mm6
	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm4
	paddq	%mm2, %mm5
	paddq	%mm3, %mm6
	movd	%mm4, (%edx)
	psrlq	$32, %mm4
	paddq	%mm5, %mm4
	movd	%mm4, 4(%edx)
	psrlq	$32, %mm4
	paddq	%mm6, %mm4
	movd	%mm4, 8(%edx)
	psrlq	$32, %mm4
	movd	%mm4, 12(%edx)		C FIXME feed through!
	lea	4(%eax), %eax

L(am2):				C up[un-1..un-2] x up[un-3]
	lea	8(%edx), %edx		C rp2 += 2
	movd	(%eax), %mm7
	movd	4(%eax), %mm1
	movd	8(%eax), %mm2
	movd	(%edx), %mm4
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm1
	pmuludq	%mm7, %mm2
	paddq	%mm1, %mm4
	paddq	%mm2, %mm5
	movd	%mm4, (%edx)
	psrlq	$32, %mm4
	paddq	%mm5, %mm4
	movd	%mm4, 4(%edx)
	psrlq	$32, %mm4
	movd	%mm4, 8(%edx)		C FIXME feed through!
	lea	4(%eax), %eax

L(am1):				C up[un-1] x up[un-2]
	lea	8(%edx), %edx		C rp2 += 2
	movd	(%eax), %mm7
	movd	4(%eax), %mm2
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm2
	paddq	%mm2, %mm4
	movd	%mm4, (%edx)
	psrlq	$32, %mm4
	movd	%mm4, 4(%edx)

C *** diag stuff, use elementary code for now
C
C Reload the original arguments, then walk rp once more.  mm2 carries the
C running sum between limbs.  rp[0] gets the low half of up[0]^2.  Every
C later limb up to rp[2*un-2] is loaded, doubled with a 64-bit shift so the
C bit shifted out of the low 32 is kept, and added to the matching half of a
C square plus the carry.  rp[2*un-1] is written without being read first.

	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C un

	movd	(%eax), %mm2
	pmuludq	%mm2, %mm2		C src[0]^2

	pcmpeqd	%mm7, %mm7
	psrlq	$32, %mm7		C mm7 = 0x00000000FFFFFFFF, low limb mask

	movd	4(%edx), %mm3		C dst[1]

	movd	%mm2, (%edx)
	psrlq	$32, %mm2

	psllq	$1, %mm3		C 2*dst[1]
	paddq	%mm3, %mm2
	movd	%mm2, 4(%edx)
	psrlq	$32, %mm2

	sub	$2, %ecx		C un-2 trips; first and last limb are done outside

C Each trip squares one source limb and updates two destination limbs.
L(diag):
	movd	4(%eax), %mm0		C src limb
	add	$4, %eax
	pmuludq	%mm0, %mm0
	movq	%mm7, %mm1
	pand	%mm0, %mm1		C diagonal low
	psrlq	$32, %mm0		C diagonal high

	movd	8(%edx), %mm3
	psllq	$1, %mm3		C 2*dst[i]
	paddq	%mm3, %mm1
	paddq	%mm1, %mm2
	movd	%mm2, 8(%edx)
	psrlq	$32, %mm2

	movd	12(%edx), %mm3
	psllq	$1, %mm3		C 2*dst[i+1]
	paddq	%mm3, %mm0
	paddq	%mm0, %mm2
	movd	%mm2, 12(%edx)
	add	$8, %edx
	psrlq	$32, %mm2

	sub	$1, %ecx
	jnz	L(diag)

C Last source limb: only its low half meets an accumulated limb.
	movd	4(%eax), %mm0		C src[size-1]
	pmuludq	%mm0, %mm0
	pand	%mm0, %mm7		C diagonal low
	psrlq	$32, %mm0		C diagonal high

	movd	8(%edx), %mm3		C dst[2*size-2]
	psllq	$1, %mm3
	paddq	%mm3, %mm7
	paddq	%mm7, %mm2
	movd	%mm2, 8(%edx)
	psrlq	$32, %mm2

	paddq	%mm0, %mm2
	movd	%mm2, 12(%edx)		C dst[2*size-1]

	emms
	ret

EPILOGUE()