// github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_plugin_amd64.s

//go:build plugin && !purego

#include "textflag.h"
#include "gfp_macros_amd64.s"

// func gfpSqr(res, in *gfP, n int)
//
// gfpSqr squares a 4x64-bit value n times in a row. The first pass reads the
// four limbs at in; every later pass reads the limbs at res (x_ptr is
// re-pointed at res_ptr at the bottom of the loop). Each pass builds the
// 8-limb square, runs four word-wise reduction steps against ·np / ·p2,
// folds in the upper half, then hands the result to gfpCarry and storeBlock.
//
// Two equivalent code paths exist: an ADX/BMI2 path (MULX + ADCX/ADOX dual
// carry chains), taken when ·supportADX is non-zero, and a MULQ/ADD/ADC path.
//
// Notes:
//   - res_ptr, x_ptr, y_ptr, acc0-acc5, t0, gfpCarry and storeBlock are not
//     defined in this file; presumably they come from gfp_macros_amd64.s.
//   - y_ptr and x_ptr are reused as data limbs once they are no longer needed
//     as pointers / zero sources.
//   - The loop counter is only tested after a pass has completed (DECQ/JNE),
//     so n must be at least 1.
//   - BP holds the loop counter. The frame size is $8 rather than $0 so that
//     the assembler saves and restores the caller's BP; with a zero-size
//     frame BP would be returned clobbered (go vet: "frame pointer is
//     clobbered before saving").
TEXT ·gfpSqr(SB),NOSPLIT,$8-24
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr
	MOVQ n+16(FP), BP               // remaining number of squarings

	CMPB ·supportADX(SB), $0
	JE   gfpSqrLoop

gfpSqrLoopAdx:
	// Zero two registers; XORQ also clears CF and OF, so both the ADCX (CF)
	// and ADOX (OF) carry chains start from zero.
	XORQ acc0, acc0
	XORQ y_ptr, y_ptr

	// x[1:] * x[0]
	MOVQ (8*0)(x_ptr), DX
	MULXQ (8*1)(x_ptr), acc1, acc2

	MULXQ (8*2)(x_ptr), AX, acc3
	ADOXQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADOXQ AX, acc3
	ADOXQ y_ptr, acc4               // y_ptr == 0: folds the pending OF carry

	// x[2:] * x[1]
	MOVQ (8*1)(x_ptr), DX
	MULXQ (8*2)(x_ptr), AX, BX
	ADOXQ AX, acc3

	MULXQ (8*3)(x_ptr), AX, acc5
	ADCXQ BX, AX
	ADOXQ AX, acc4
	ADCXQ y_ptr, acc5               // y_ptr == 0: folds the pending CF carry

	// x[3] * x[2]
	MOVQ (8*2)(x_ptr), DX
	MULXQ (8*3)(x_ptr), AX, y_ptr   // y_ptr becomes a data limb from here on
	ADOXQ AX, acc5
	ADOXQ acc0, y_ptr               // acc0 == 0: folds the pending OF carry

	XORQ BX, BX                     // BX = 0 and CF = OF = 0
	// *2 (double the cross products on the OF chain)
	ADOXQ acc1, acc1
	ADOXQ acc2, acc2
	ADOXQ acc3, acc3
	ADOXQ acc4, acc4
	ADOXQ acc5, acc5
	ADOXQ y_ptr, y_ptr
	ADOXQ acc0, BX                  // acc0 == 0: BX = carry out of the doubling

	// Missing products: add the squares x[i]*x[i] on the CF chain.
	MOVQ (8*0)(x_ptr), DX
	MULXQ DX, acc0, t0
	ADCXQ t0, acc1

	MOVQ (8*1)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCXQ AX, acc2
	ADCXQ t0, acc3

	MOVQ (8*2)(x_ptr), DX
	MULXQ DX, AX, t0
	ADCXQ AX, acc4
	ADCXQ t0, acc5

	MOVQ (8*3)(x_ptr), DX           // last read through x_ptr in this pass
	MULXQ DX, AX, x_ptr             // x_ptr becomes the top limb
	ADCXQ AX, y_ptr
	ADCXQ BX, x_ptr
	// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]

	// First reduction step
	// Y = low 64 bits of acc0 * np[0] (left in DX; the high half in AX is unused).
	// Presumably np is chosen so that acc0 + Y*p2[0] has a zero low limb — confirm.
	MOVQ acc0, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc0                  // only the carry out (OF) is used; acc0 is overwritten below

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc1

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x18(SB), AX, acc0    // acc0 is reused as the new top limb
	ADCXQ t0, AX
	ADOXQ AX, acc3
	MOVQ $0, t0                     // MOVQ, not XORQ: CF and OF must survive
	ADCXQ t0, acc0
	ADOXQ t0, acc0

	// Second reduction step
	MOVQ acc1, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc1

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x18(SB), AX, acc1
	ADCXQ t0, AX
	ADOXQ AX, acc0
	MOVQ $0, t0                     // flags preserved on purpose
	ADCXQ t0, acc1
	ADOXQ t0, acc1

	// Third reduction step
	MOVQ acc2, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc2

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc0

	MULXQ ·p2+0x18(SB), AX, acc2
	ADCXQ t0, AX
	ADOXQ AX, acc1
	MOVQ $0, t0                     // flags preserved on purpose
	ADCXQ t0, acc2
	ADOXQ t0, acc2

	// Last reduction step
	MOVQ acc3, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc3

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc1

	MULXQ ·p2+0x18(SB), AX, acc3
	ADCXQ t0, AX
	ADOXQ AX, acc2
	MOVQ $0, t0                     // flags preserved on purpose; t0 stays 0 for the fold below
	ADCXQ t0, acc3
	ADOXQ t0, acc3

	XORQ BX, BX                     // BX = 0, CF = 0
	// Add bits [511:256] of the sqr result
	ADCXQ acc4, acc0
	ADCXQ acc5, acc1
	ADCXQ y_ptr, acc2
	ADCXQ x_ptr, acc3
	ADCXQ BX, t0                    // t0 = carry out of the 256-bit addition

	// gfpCarry is defined elsewhere; presumably it performs the final
	// conditional subtraction of the modulus using the carry in t0 — confirm.
	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0)
	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))

	MOVQ res_ptr, x_ptr             // the next pass squares the value just stored
	DECQ BP
	JNE  gfpSqrLoopAdx

	RET

gfpSqrLoop:
	// x[1:] * x[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4

	// x[2:] * x[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX                     // carry into the next limb

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5

	// x[3] * x[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr                  // y_ptr is used as a data limb
	XORQ BX, BX

	// *2 (double the cross products)
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, BX                     // BX = carry out of the doubling

	// Missing products: add the squares x[i]*x[i].
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX           // last read through x_ptr in this pass
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, BX
	MOVQ BX, x_ptr                  // x_ptr becomes the top limb
	// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]

	// First reduction step
	MOVQ acc0, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                     // Y = low 64 bits of acc0 * np[0]

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0                   // only the carry is used; acc0 is free now
	ADCQ $0, DX
	MOVQ DX, BX                     // carry
	XORQ acc0, acc0                 // acc0 will collect the new top limb

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc0

	// Second reduction step
	MOVQ acc1, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                     // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1                   // acc1 is free now
	ADCQ $0, DX
	MOVQ DX, BX                     // carry
	XORQ acc1, acc1

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1

	// Third reduction step
	MOVQ acc2, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                     // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2                   // acc2 is free now
	ADCQ $0, DX
	MOVQ DX, BX                     // carry
	XORQ acc2, acc2

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ DX, acc2

	// Last reduction step
	MOVQ acc3, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0                     // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3                   // acc3 is free now
	ADCQ $0, DX
	MOVQ DX, BX                     // carry
	XORQ acc3, acc3

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX                     // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ DX, acc3

	XORQ t0, t0
	// Add bits [511:256] of the sqr result
	ADDQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0                     // t0 = carry out of the 256-bit addition

	// gfpCarry is defined elsewhere; presumably it performs the final
	// conditional subtraction of the modulus using the carry in t0 — confirm.
	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0)
	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))

	MOVQ res_ptr, x_ptr             // the next pass squares the value just stored
	DECQ BP
	JNE  gfpSqrLoop

	RET