github.com/cloudflare/circl@v1.5.0/ecc/fourq/fq_amd64.h (about)

     1  #include "fp_amd64.h"
     2  
     3  #define _fqAdd(z, x, y) \ // z = x + y: one _fpAdd per 16-byte half of the 32-byte operands
     4      _fpAdd(0+z, 0+x, 0+y) \ // halves at byte offset 0
     5      _fpAdd(16+z, 16+x, 16+y) // halves at byte offset 16
     6  
     7  #define _fqSub(z, x, y) \ // z = x - y: one _fpSub per 16-byte half of the 32-byte operands
     8      _fpSub(0+z, 0+x, 0+y) \ // halves at byte offset 0
     9      _fpSub(16+z, 16+x, 16+y) // halves at byte offset 16
    10  
    11  #define _fqMulBmi2(c, a, b) \ // c = a * b for a = a0 + a1*i, b = b0 + b1*i (i^2 = -1); x0 is the 128-bit half at 0+x, x1 the half at 16+x. Writes AX, DX, R8-R15 and the flags.
    12      MOVL $0, R15 \ // NOTE(review): R15 is overwritten by MULXQ before it is next read in this macro, so this clear looks redundant -- confirm
    13      \ // T0 = a0 * b0, R11:R10:R9:R8 <- (8+a:0+a) * (8+b:0+b); word lists are written high:low
    14      MOVQ 0+b, DX \ // MULXQ multiplies by DX and leaves the flags untouched, so ADDQ/ADCQ chains may span it
    15      MULXQ 0+a, R8, R9 \
    16      MULXQ 8+a, R10, AX \
    17      ADDQ R10, R9 \
    18      MOVQ 8+b, DX \
    19      MULXQ 8+a, R10, R11 \
    20      ADCQ AX, R10 \
    21      MULXQ 0+a, DX, AX \
    22      ADCQ $0, R11 \
    23      ADDQ DX, R9 \ // this carry is consumed by the two ADCQs interleaved into the next block
    24      \
    25      \ // T1 = a1 * b1, R15:R14:R13:R12 <- (24+a:16+a) * (24+b:16+b)
    26      MOVQ 16+b, DX \
    27      MULXQ 16+a, R12, R13 \
    28      ADCQ AX, R10 \ // still T0: add the high word of (0+a)*(8+b)
    29      MULXQ 24+a, R14, AX \
    30      ADCQ $0, R11 \ // T0 is now complete in R11:R10:R9:R8
    31      MOVQ 24+b, DX \
    32      ADDQ R14, R13 \
    33      MULXQ 24+a, R14, R15 \
    34      ADCQ AX, R14 \
    35      ADCQ $0, R15 \
    36      MULXQ 16+a, DX, AX \
    37      ADDQ DX, R13 \
    38      ADCQ AX, R14 \
    39      ADCQ $0, R15 \
    40      \
    41      \ // c0 = T0 - T1 = a0*b0 - a1*b1, kept as a 256-bit difference in R11:R10:R9:R8 (final borrow is not used)
    42      SUBQ R12, R8 \
    43      SBBQ R13, R9 \
    44      SBBQ R14, R10 \
    45      SBBQ R15, R11 \
    46      \ // Split at bit 127: R9:R8 keeps bits 0..126, R11:R10 receives the bits from 127 upward
    47      SHLQ $1, R10, R11 \
    48      SHLQ $1, R9, R10 \
    49      MOVQ 16+b, DX \
    50      BTRQ $63, R9 \
    51      \
    52      \ // T0 = a0 * b1, R15:R14:R13:R12 <- (8+a:0+a) * (24+b:16+b)
    53      MULXQ 0+a, R12, R13 \
    54      BTRQ $63, R11 \ // CF = top bit of the upper half (presumably the sign of T0 - T1 for inputs below 2^127 -- confirm); the bit is cleared
    55      SBBQ $0, R10 \ // subtract that bit from R11:R10
    56      SBBQ $0, R11 \
    57      MULXQ 8+a, R14, AX \
    58      ADDQ R14, R13 \
    59      MOVQ 24+b, DX \
    60      MULXQ 8+a, R14, R15 \
    61      ADCQ AX, R14 \
    62      ADCQ $0, R15 \
    63      MULXQ 0+a, DX, AX \
    64      ADDQ DX, R13 \
    65      ADCQ AX, R14 \
    66      ADCQ $0, R15 \
    67      \
    68      \ // Reducing c0: add the two halves, then clear bit 127 and add it back at bit 0 (2^127 = 1 mod 2^127-1); result in R11:R10
    69      ADDQ R8, R10 \
    70      ADCQ R9, R11 \
    71      BTRQ $63, R11 \
    72      ADCQ $0, R10 \
    73      ADCQ $0, R11 \
    74      \
    75      \ // T1 = a1 * b0, R11:R10:R9:R8 <- (24+a:16+a) * (8+b:0+b); the two c0 stores are interleaved here
    76      MOVQ 0+b, DX \
    77      MULXQ 16+a, R8, R9 \
    78      MOVQ R10, 0+c \ // store the low word of c0 before R10 is reused as a product register
    79      MULXQ 24+a, R10, AX \
    80      ADDQ R10, R9 \
    81      MOVQ 8+b, DX \
    82      MOVQ R11, 8+c \ // store the high word of c0; only 16+a and 24+a are read from memory after this point
    83      MULXQ 24+a, R10, R11 \
    84      ADCQ AX, R10 \
    85      ADCQ $0, R11 \
    86      MULXQ 16+a, DX, AX \
    87      ADDQ DX, R9 \
    88      ADCQ AX, R10 \
    89      ADCQ $0, R11 \
    90      \
    91      \ // c1 = T0 + T1 = a0*b1 + a1*b0, R11:R10:R9:R8 += R15:R14:R13:R12
    92      ADDQ R12, R8 \
    93      ADCQ R13, R9 \
    94      ADCQ R14, R10 \
    95      ADCQ R15, R11 \
    96      \
    97      \ // Reducing and storing c1: split at bit 127, add the halves plus the bit cleared from R11, then fold bit 127 once more
    98      SHLQ $1, R10, R11 \
    99      SHLQ $1, R9, R10 \
   100      BTRQ $63, R9 \
   101      BTRQ $63, R11 \ // this CF is the carry-in of the ADCQ on the next line
   102      ADCQ R10, R8 \
   103      ADCQ R11, R9 \
   104      BTRQ $63, R9 \
   105      ADCQ $0, R8 \
   106      ADCQ $0, R9 \
   107      MOVQ R8, 16+c \
   108      MOVQ R9, 24+c
   109  
   110  #define _fqMulLeg(c, a, b) \ // c = a * b built from four _fpMulLeg expansions (macro defined outside this section); first result -> 0+c, second -> 16+c. Writes R8-R15 and the flags, plus whatever _fpMulLeg writes.
   111      _fpMulLeg(R10, R9, R8, 0+a, 0+b) \ // presumably R10:R9:R8 = a0 * b0 as three words -- confirm against _fpMulLeg
   112      _fpMulLeg(R13,R12,R11,16+a,16+b) \ // presumably R13:R12:R11 = a1 * b1
   113      MOVQ  $0,R14 \
   114      SUBQ R11, R8 \ // R10:R9:R8 -= R13:R12:R11
   115      SBBQ R12, R9 \
   116      SBBQ R13,R10 \
   117      SBBQ  $0,R14 \ // R14 = 0, or all ones if the three-word subtraction borrowed
   118      SHLQ  $1,R10 \ // third word shifted left by one; it is added at word 0 below
   119      BTRQ $63, R9 \ // clear bit 127 of R9:R8, CF = its previous value
   120      ADCQ R10, R8 \ // R8 += shifted third word + old bit 127
   121      ADCQ R14, R9 \ // R9 += borrow mask + carry
   122      MOVQ R8, R14 \ // park the first result in R15:R14; relies on the next _fpMulLeg expansions leaving R14/R15 intact -- confirm
   123      MOVQ R9, R15 \
   124      _fpMulLeg(R10, R9, R8, 0+a,16+b) \ // presumably R10:R9:R8 = a0 * b1
   125      _fpMulLeg(R13,R12,R11,16+a, 0+b) \ // presumably R13:R12:R11 = a1 * b0
   126      ADDQ R11, R8 \ // R10:R9:R8 += R13:R12:R11
   127      ADCQ R12, R9 \
   128      ADCQ R13,R10 \
   129      SHLQ  $1,R10 \ // same folding as above, without a borrow mask
   130      BTRQ $63, R9 \
   131      ADCQ R10, R8 \
   132      ADCQ  $0, R9 \
   133      MOVQ R14, 0+c \ // c is written only after every read of a and b
   134      MOVQ R15, 8+c \
   135      MOVQ  R8,16+c \
   136      MOVQ  R9,24+c
   137  
   138  #define _fqSqrBmi2(c,a) \ // c = a^2 for a = a0 + a1*i (i^2 = -1): c0 = (a0+a1)*(a0-a1) -> 0+c, c1 = 2*a0*a1 -> 16+c. Writes AX, CX, DX, R8-R14 and the flags.
   139      \ // t1 = R11:R10 = a0 - a1, with a1 kept in CX:R14 (word lists are written high:low)
   140      MOVQ 0+a, R10 \
   141      MOVQ 16+a, R14 \
   142      SUBQ R14, R10 \
   143      MOVQ 8+a, R11 \
   144      MOVQ 24+a, CX \
   145      SBBQ CX, R11 \
   146      \ // Clear bit 127 and subtract it from the 128-bit value (adds 2^127 - 1 when the difference wrapped)
   147      BTRQ $63, R11 \
   148      SBBQ $0, R10 \
   149      SBBQ $0, R11 \ // must directly follow the SBBQ above: the ADDQ/ADCQ below overwrite CF, which would drop the borrow out of R10
   150      \
   151      \ // t0 = R9:R8 = a0 + a1
   152      MOVQ R10, DX \ // DX = low word of t1, the implicit MULXQ multiplicand
   153      MOVQ 0+a, R8 \
   154      ADDQ R14, R8 \
   155      MOVQ 8+a, R9 \
   156      ADCQ CX, R9 \
   157      \
   158      \ //  c0 = t0 * t1 = (a0 + a1)*(a0 - a1), CX:R14:R13:R12 <- R9:R8 * R11:R10
   159      MULXQ R8, R12, R13 \ // MULXQ leaves the flags untouched, so the carry chains below may span it
   160      MULXQ R9, R14, AX \
   161      MOVQ R11, DX \ // DX = high word of t1
   162      ADDQ R14, R13 \
   163      MULXQ R9, R14, CX \
   164      MOVQ 8+a, R9 \ // t0 is no longer needed in R9; reload the high word of a0 for t2
   165      ADCQ AX, R14 \
   166      ADCQ $0, CX \
   167      MULXQ R8, DX, AX \
   168      MOVQ 0+a, R8 \ // reload the low word of a0 for t2
   169      ADDQ DX, R13 \
   170      ADCQ AX, R14 \
   171      ADCQ $0, CX \
   172      \
   173      \ // t2 = R9:R8 = 2*a0
   174      ADDQ R8, R8 \
   175      ADCQ R9, R9 \
   176      \
   177      \ // Reducing and storing c0: split at bit 127, add the halves plus the bit cleared from CX, then fold bit 127 once more (2^127 = 1 mod 2^127-1)
   178      SHLQ $1, R14, CX \
   179      SHLQ $1, R13, R14 \
   180      BTRQ $63, R13 \
   181      BTRQ $63, CX \ // this CF is the carry-in of the ADCQ on the next line
   182      ADCQ R14, R12 \
   183      ADCQ CX, R13 \
   184      BTRQ $63, R13 \
   185      ADCQ $0, R12 \
   186      ADCQ $0, R13 \
   187      MOVQ R12, 0+c \ // only 16+a and 24+a are read from memory after the c0 stores
   188      MOVQ R13, 8+c \
   189      \
   190      \ //  c1 = 2a0 * a1, CX:R14:R11:R10 <- R9:R8 * (24+a:16+a)
   191      MOVQ 16+a, DX \
   192      MULXQ R8, R10, R11 \
   193      MULXQ R9, R14, AX \
   194      ADDQ R14, R11 \
   195      MOVQ 24+a, DX \
   196      MULXQ R9, R14, CX \
   197      ADCQ AX, R14 \
   198      ADCQ $0, CX \
   199      MULXQ R8, DX, AX \
   200      ADDQ DX, R11 \
   201      ADCQ AX, R14 \
   202      ADCQ $0, CX \
   203      \
   204      \ // Reduce and store c1, same folding as for c0
   205      SHLQ $1, R14, CX \
   206      SHLQ $1, R11, R14 \
   207      BTRQ $63, R11 \
   208      BTRQ $63, CX \
   209      ADCQ R14, R10 \
   210      ADCQ CX, R11 \
   211      BTRQ $63, R11 \
   212      ADCQ $0, R10 \
   213      ADCQ $0, R11 \
   214      MOVQ R10, 16+c \
   215      MOVQ R11, 24+c
   216  
   217  #define _fqSqrLeg(z, x) _fqMulLeg(z, x, x) // z = x * x: squaring done by passing x as both factors of _fqMulLeg