github.com/cloudflare/circl@v1.5.0/dh/csidh/fp511_amd64.s (about)

     1  //go:build amd64 && !purego
     2  // +build amd64,!purego
     3  
     4  #include "textflag.h"
     5  
     6  // mul512Amd64 multiplies the 512-bit value at b by the 64-bit value c.
     7  // Uses MULQ, or MULX when ·hasBMI2 is 1, to multiply 2 64-bit values.
     8  //
     9  // Result: a = (b * c) mod 2^512 (only the low eight 64-bit limbs are stored)
    10  //
    11  // Registers used: AX, DX, SI, DI, R10, R11
    12  //
    13  // func mul512Amd64(a, b *Fp, c uint64)
    14  TEXT ·mul512Amd64(SB), NOSPLIT, $0-24
    15      MOVQ    a+0(FP), DI    // result
    16      MOVQ    b+8(FP), SI    // multiplicand
    17  
    18      // Take the MULX path below when ·hasBMI2 == 1, otherwise fall through to MULQ
    19      CMPB    ·hasBMI2(SB), $1
    20      JE      mul512_mulx
    21  
    22      MOVQ c+16(FP), R10  // 64 bit multiplier, used by MULQ
    23      MOVQ R10, AX; MULQ  0(SI);                            MOVQ DX, R11; MOVQ AX,  0(DI) //x[0]; R11 carries the high word into the next limb
    24      MOVQ R10, AX; MULQ  8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX,  8(DI) //x[1]
    25      MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) //x[2]
    26      MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) //x[3]
    27      MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) //x[4]
    28      MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) //x[5]
    29      MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) //x[6]
    30      MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX;                            MOVQ AX, 56(DI) //x[7]; the high word in DX and the final carry are dropped (mod 2^512)
    31      RET
    32  
    33  // Optimized for CPUs with BMI2: the ADDQ/ADCQ carry chain relies on the MULXQ and MOVQ in between leaving CF untouched
    34  mul512_mulx:
    35      MOVQ     c+16(FP), DX                                  // 64 bit multiplier, used by MULX
    36      MULXQ    0(SI), AX, R10; MOVQ AX, 0(DI)                // x[0]; low word in AX, high word in R10
    37      MULXQ    8(SI), AX, R11; ADDQ R10, AX; MOVQ AX,  8(DI) // x[1]; R10 and R11 alternate as the pending high word
    38      MULXQ   16(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 16(DI) // x[2]
    39      MULXQ   24(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 24(DI) // x[3]
    40      MULXQ   32(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 32(DI) // x[4]
    41      MULXQ   40(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 40(DI) // x[5]
    42      MULXQ   48(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 48(DI) // x[6]
    43      MULXQ   56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // x[7]; the high word in R11 is dropped (mod 2^512)
    44      RET
    45  
    46  TEXT ·cswap512Amd64(SB),NOSPLIT,$0-17 // branch-free conditional swap of the 64 bytes at x and the 64 bytes at y, selected by choice
    47      MOVQ    x+0(FP), DI
    48      MOVQ    y+8(FP), SI
    49      MOVBLZX choice+16(FP), AX       // AX = choice byte, zero-extended; expected to be 0 or 1
    50  
    51      // Turn AX into a mask, so that either all bits are set or none:
    52      // NEGQ maps 0 to 0 and 1 to all ones. Other choice values give a mixed mask.
    53      NEGQ    AX
    54  
    55      // Fill X15. After this step the low 64 bits of X15 hold
    56      // whatever is in AX and the high 64 bits are just zeros.
    57      MOVQ    AX, X15 // NOTE(review): X15 is the fixed zero register under Go's ABIInternal; presumably fine in this ABI0 routine - confirm
    58  
    59      // Copy the lowest double word into every other lane, so that
    60      // X15 = lo32|lo32|lo32|lo32. As AX has either all bits set
    61      // or none, the result is that X15 also has either
    62      // all 128 bits set or none of them.
    63      PSHUFD $0, X15, X15
    64  
    65  #ifndef CSWAP_BLOCK
    66  #define CSWAP_BLOCK(idx)       \
    67      MOVOU   (idx*16)(DI), X0 \
    68      MOVOU   (idx*16)(SI), X1 \
    69      \ // X2 = mask & (X0 ^ X1)
    70      MOVO     X1, X2 \
    71      PXOR     X0, X2 \
    72      PAND    X15, X2 \
    73      \ // X0 ^= X2, X1 ^= X2: swaps the two 16-byte blocks when the mask is all ones, no-op when it is zero
    74      PXOR     X2, X0 \
    75      PXOR     X2, X1 \
    76      \
    77      MOVOU    X0, (idx*16)(DI) \
    78      MOVOU    X1, (idx*16)(SI)
    79  #endif
    80  
    81      CSWAP_BLOCK(0) // four 16-byte blocks cover bytes 0..63 of x and y
    82      CSWAP_BLOCK(1)
    83      CSWAP_BLOCK(2)
    84      CSWAP_BLOCK(3)
    85  
    86      RET
    87  
    88  // mulBmiAsm implements Montgomery multiplication interleaved with
    89  // Montgomery reduction. It uses MULX and ADCX/ADOX instructions.
    90  // Implementation specific to 511-bit prime 'p'. No final conditional
    91  // subtraction is done here: the stored result still needs reducing if > p.
    92  // func mulBmiAsm(res, x, y *fp)
    93  TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24
    94  
    95      MOVQ x+8(FP), DI // multiplicand
    96      MOVQ y+16(FP), SI // multiplier
    97  
    98      XORQ  R8,  R8 // clear the accumulator limbs: R8-R14, CX and (below) BP
    99      XORQ  R9,  R9
   100      XORQ R10, R10
   101      XORQ R11, R11
   102      XORQ R12, R12
   103      XORQ R13, R13
   104      XORQ R14, R14 // NOTE(review): R14 holds g under Go's ABIInternal; presumably fine in this ABI0 routine - confirm
   105      XORQ  CX,  CX
   106  
   107      MOVQ BP, 0(SP) // push: BP is Callee-save; it is kept in the 8-byte local frame while BP serves as an accumulator limb.
   108      XORQ BP, BP
   109  
   110  // Uses BMI2 (MULX) and ADCX/ADOX; AX, BX and DX are scratch inside the macro
   111  #ifdef MULS_MULX_512
   112  #undef MULS_MULX_512
   113  #endif
   114  #define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
   115      \ // Reduction step: DX = (r0 + lo(x[idx]*y[0])) * pNegInv mod 2^64, then r0..r8 += DX * p
   116      MOVQ  ( 0)(SI), DX      \
   117      MULXQ ( 8*idx)(DI), DX, AX  \
   118      ADDQ  r0, DX            \
   119      MOVQ ·pNegInv(SB), AX \
   120      MULXQ AX, DX, AX  \
   121      \ // XORQ clears CF and OF; low product words are added on the ADOXQ chain, high words on the ADCXQ chain
   122      XORQ  AX, AX; \
   123      MOVQ ·p+ 0(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r0; ADCXQ BX, r1 \
   124      MOVQ ·p+ 8(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r1; ADCXQ BX, r2 \
   125      MOVQ ·p+16(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r2; ADCXQ BX, r3 \
   126      MOVQ ·p+24(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r3; ADCXQ BX, r4 \
   127      MOVQ ·p+32(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r4; ADCXQ BX, r5 \
   128      MOVQ ·p+40(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r5; ADCXQ BX, r6 \
   129      MOVQ ·p+48(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r6; ADCXQ BX, r7 \
   130      MOVQ ·p+56(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r7; ADCXQ BX, r8 \
   131      MOVQ  $0, AX; ;;;;;;;;;;;;;;;;;;;;;;;  ADOXQ AX, r8; \
   132      \ // Multiplication step: r0..r8 += x[idx] * y, using the same pair of carry chains
   133      MOVQ (8*idx)(DI), DX \
   134      \
   135      XORQ  AX, AX \
   136      MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \
   137      MULXQ ( 8)(SI), AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \
   138      MULXQ (16)(SI), AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \
   139      MULXQ (24)(SI), AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \
   140      MULXQ (32)(SI), AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \
   141      MULXQ (40)(SI), AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \
   142      MULXQ (48)(SI), AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \
   143      MULXQ (56)(SI), AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \
   144      MOVQ  $0, AX          ; ADOXQ AX, r8;
   145  
   146      MULS_MULX_512(0,  R8,  R9, R10, R11, R12, R13, R14,  CX,  BP) // one round per limb of x; each round shifts the register window by one, passing the previous r0 as the next r8
   147      MULS_MULX_512(1,  R9, R10, R11, R12, R13, R14,  CX,  BP,  R8)
   148      MULS_MULX_512(2, R10, R11, R12, R13, R14,  CX,  BP,  R8,  R9)
   149      MULS_MULX_512(3, R11, R12, R13, R14,  CX,  BP,  R8,  R9, R10)
   150      MULS_MULX_512(4, R12, R13, R14,  CX,  BP,  R8,  R9, R10, R11)
   151      MULS_MULX_512(5, R13, R14,  CX,  BP,  R8,  R9, R10, R11, R12)
   152      MULS_MULX_512(6, R14,  CX,  BP,  R8,  R9, R10, R11, R12, R13)
   153      MULS_MULX_512(7,  CX,  BP,  R8,  R9, R10, R11, R12, R13, R14)
   154  #undef MULS_MULX_512
   155  
   156      MOVQ res+0(FP), DI // DI = res; the last round's r1..r8 (BP, R8..R14) are stored as limbs 0..7
   157      MOVQ  BP, ( 0)(DI)
   158      MOVQ  R8, ( 8)(DI)
   159      MOVQ  R9, (16)(DI)
   160      MOVQ R10, (24)(DI)
   161      MOVQ R11, (32)(DI)
   162      MOVQ R12, (40)(DI)
   163      MOVQ R13, (48)(DI)
   164      MOVQ R14, (56)(DI)
   165      MOVQ 0(SP), BP // pop: BP is Callee-save.
   166  
   167      // The value just stored through DI (res) is not reduced here; it still needs to be reduced if > p
   168      RET