//go:build !(purego || plugin)

// Origin: github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_amd64.s

#include "textflag.h"
#include "gfp_macros_amd64.s"
#define t1 R15

// func gfpSqr(res, in *gfP, n int)
//
// Repeated squaring of a four-limb (4 x 64-bit) value.
//
// Each iteration:
//   1. computes the 512-bit square of the four limbs at x_ptr,
//   2. runs four word-wise reduction steps of the form
//      Y = T[0] * np (low 64 bits); T = T + Y * p2; drop the lowest limb,
//   3. adds the upper 256 bits of the square to the reduced lower half,
//   4. hands the result plus its carry to gfpCarry and stores four limbs
//      at res with storeBlock,
//   5. makes res the input of the next iteration.
//
// NOTE(review): this has the shape of a Montgomery reduction, which presumes
// ·np holds -p^-1 mod 2^64 and ·p2 holds the modulus limbs; both symbols, as
// well as gfpCarry, storeBlock and the register aliases res_ptr, x_ptr,
// y_ptr, acc0-acc5 and t0, are defined outside this file section - confirm
// there.
//
// Precondition: n >= 1. The counter in BX is only tested after a squaring
// has been performed (DECQ/JNE at the bottom of each loop), so n == 0 would
// wrap around instead of returning immediately.
//
// Two code paths with the same structure are provided; ·supportADX selects
// the MULX/ADCX/ADOX variant, otherwise the MULQ/ADD/ADC variant is used.
//
// Register roles inside the loops:
//   BX                remaining iteration count
//   acc0..acc5        limbs 0..5 of the 512-bit square
//   y_ptr, x_ptr      reused as limbs 6 and 7 of the square (x_ptr is
//                     reloaded from res_ptr at the end of every iteration)
//   t0, t1            scratch / carry
TEXT ·gfpSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BX

	CMPB ·supportADX(SB), $0
	JE   gfpSqrLoop

	// ---------------------------------------------------------------
	// MULX / ADCX / ADOX path. ADCX uses only CF and ADOX only OF, so
	// two independent carry chains are kept alive at the same time.
	// ---------------------------------------------------------------
gfpSqrLoopAdx:
	XORQ acc0, acc0                     // acc0 = 0 and CF = OF = 0
	XORQ y_ptr, y_ptr                   // y_ptr = 0, used as a zero addend below

	// in[1:] * in[0]
	MOVQ (8*0)(x_ptr), DX
	MULXQ (8*1)(x_ptr), acc1, acc2

	MULXQ (8*2)(x_ptr), AX, acc3
	ADOXQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADOXQ AX, acc3
	ADOXQ y_ptr, acc4                   // fold the OF carry into acc4

	// in[2:] * in[1]
	MOVQ (8*1)(x_ptr), DX
	MULXQ (8*2)(x_ptr), AX, t1
	ADOXQ AX, acc3

	MULXQ (8*3)(x_ptr), AX, acc5
	ADCXQ t1, AX
	ADOXQ AX, acc4
	ADCXQ y_ptr, acc5                   // fold the CF carry into acc5

	// in[3] * in[2]
	MOVQ (8*2)(x_ptr), DX
	MULXQ (8*3)(x_ptr), AX, y_ptr       // y_ptr becomes limb 6 from here on
	ADOXQ AX, acc5
	ADOXQ acc0, y_ptr                   // acc0 is still zero: adds only OF

	XORQ t1, t1                         // t1 = 0, restart both carry chains
	// *2 : double the cross products, top carry ends up in t1
	ADOXQ acc1, acc1
	ADOXQ acc2, acc2
	ADOXQ acc3, acc3
	ADOXQ acc4, acc4
	ADOXQ acc5, acc5
	ADOXQ y_ptr, y_ptr
	ADOXQ acc0, t1

	// Missing products: add the diagonal terms in[i]*in[i] on the CF chain
	MOVQ (8*0)(x_ptr), DX
	MULXQ DX, acc0, t0
	ADCXQ t0, acc1

	MOVQ (8*1)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCXQ AX, acc2
	ADCXQ t0, acc3

	MOVQ (8*2)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCXQ AX, acc4
	ADCXQ t0, acc5

	MOVQ (8*3)(x_ptr), DX
	MULXQ DX, AX, x_ptr                 // x_ptr is reused as limb 7
	ADCXQ AX, y_ptr
	ADCXQ t1, x_ptr
	// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]

	// First reduction step
	MOVQ acc0, DX
	MULXQ ·np+0x00(SB), DX, AX          // DX = Y = low 64 bits of acc0 * np

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc0                      // only the carry (OF) is kept; acc0 is overwritten below

	MULXQ ·p2+0x08(SB), AX, t1
	ADCXQ t0, AX
	ADOXQ AX, acc1

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ t1, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x18(SB), AX, acc0        // acc0 now collects the new top limb
	ADCXQ t0, AX
	ADOXQ AX, acc3
	MOVQ $0, t0                         // MOVQ, not XORQ: CF and OF still hold live carries
	ADCXQ t0, acc0
	ADOXQ t0, acc0

	// Second reduction step
	MOVQ acc1, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc1

	MULXQ ·p2+0x08(SB), AX, t1
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ t1, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x18(SB), AX, acc1
	ADCXQ t0, AX
	ADOXQ AX, acc0
	MOVQ $0, t0                         // flags must survive, see first step
	ADCXQ t0, acc1
	ADOXQ t0, acc1

	// Third reduction step
	MOVQ acc2, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc2

	MULXQ ·p2+0x08(SB), AX, t1
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ t1, AX
	ADOXQ AX, acc0

	MULXQ ·p2+0x18(SB), AX, acc2
	ADCXQ t0, AX
	ADOXQ AX, acc1
	MOVQ $0, t0                         // flags must survive, see first step
	ADCXQ t0, acc2
	ADOXQ t0, acc2

	// Last reduction step
	MOVQ acc3, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc3

	MULXQ ·p2+0x08(SB), AX, t1
	ADCXQ t0, AX
	ADOXQ AX, acc0

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ t1, AX
	ADOXQ AX, acc1

	MULXQ ·p2+0x18(SB), AX, acc3
	ADCXQ t0, AX
	ADOXQ AX, acc2
	MOVQ $0, t0                         // leaves t0 = 0 for the final carry below
	ADCXQ t0, acc3
	ADOXQ t0, acc3

	XORQ t1, t1                         // t1 = 0, CF = 0
	// Add bits [511:256] of the sqr result
	ADCXQ acc4, acc0
	ADCXQ acc5, acc1
	ADCXQ y_ptr, acc2
	ADCXQ x_ptr, acc3
	ADCXQ t1, t0                        // t0 = carry out of the 256-bit addition

	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))

	MOVQ res_ptr, x_ptr                 // next round squares the value just stored
	DECQ BX
	JNE  gfpSqrLoopAdx

	RET

	// ---------------------------------------------------------------
	// MULQ / ADD / ADC path: same computation with a single carry flag.
	// ---------------------------------------------------------------
gfpSqrLoop:

	// in[1:] * in[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4

	// in[2:] * in[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5

	// in[3] * in[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr                      // y_ptr is reused as limb 6
	XORQ t1, t1

	// *2 : double the cross products, top carry ends up in t1
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1

	// Missing products: add the diagonal terms in[i]*in[i]
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr                      // x_ptr is reused as limb 7
	// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]

	// First reduction step
	MOVQ acc0, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                         // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0                       // only the carry matters; acc0 is free now
	ADCQ $0, DX
	MOVQ DX, t1                         // carry
	XORQ acc0, acc0

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc0                       // acc0 becomes the new top limb

	// Second reduction step
	MOVQ acc1, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                         // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1                       // acc1 is free now
	ADCQ $0, DX
	MOVQ DX, t1                         // carry
	XORQ acc1, acc1

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1

	// Third reduction step
	MOVQ acc2, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                         // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2                       // acc2 is free now
	ADCQ $0, DX
	MOVQ DX, t1                         // carry
	XORQ acc2, acc2

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ DX, acc2

	// Last reduction step
	MOVQ acc3, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                         // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3                       // acc3 is free now
	ADCQ $0, DX
	MOVQ DX, t1                         // carry
	XORQ acc3, acc3

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1                         // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ DX, acc3

	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADDQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0                         // t0 = carry out of the 256-bit addition

	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
	MOVQ res_ptr, x_ptr                 // next round squares the value just stored
	DECQ BX
	JNE  gfpSqrLoop

	RET