github.com/deroproject/derosuite@v2.1.6-1.0.20200307070847-0f2e589c7a2b+incompatible/crypto/edwards25519_fe_square_amd64.s (about)

     1  // Copyright (c) 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64
     6  
     7  #include "textflag.h"
     8  // FeSquare squares the five-limb, radix-2^51 value at xp and stores the five-limb result at outp; carries past the top limb are folded back into limb 0 times 19 (i.e. arithmetic modulo 2^255-19).
     9  // func FeSquare(outp *uint64, xp *uint64)
    10  TEXT ·FeSquare(SB),NOSPLIT,$0 // $0: no local stack frame; registers written: AX, BX, CX, DX, SI, DI, R8-R15
    11      MOVQ outp+0(FP), DI // DI = outp; the five result limbs are stored here at the very end
    12      MOVQ xp+8(FP), SI // SI = xp; input limbs x0..x4 are read from offsets 0, 8, 16, 24, 32
    13      // Each rN below is a 128-bit sum kept in a lo:hi register pair (rN0:rN1). Cross terms appear doubled; terms that wrap past limb 4 carry a factor of 19 (hence 38 = 2*19).
    14      // r0 = x0*x0 + x1*38*x4 + x2*38*x3
    15      MOVQ 0(SI), AX
    16      MULQ 0(SI) // DX:AX = x0*x0
    17      MOVQ AX, CX // r00
    18      MOVQ DX, R8 // r01
    19  
    20      MOVQ 8(SI), DX
    21      IMUL3Q $38, DX, AX // AX = 38*x1, low 64 bits only; assumes limbs are small enough that this cannot overflow (TODO confirm input bound with callers)
    22      MULQ 32(SI) // DX:AX = 38*x1*x4
    23      ADDQ AX, CX
    24      ADCQ DX, R8 // carry out of the low word propagates into r01
    25  
    26      MOVQ 16(SI), DX
    27      IMUL3Q $38, DX, AX // AX = 38*x2
    28      MULQ 24(SI) // DX:AX = 38*x2*x3
    29      ADDQ AX, CX
    30      ADCQ DX, R8
    31  
    32      // r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
    33      MOVQ 0(SI), AX
    34      SHLQ $1, AX // AX = 2*x0
    35      MULQ 8(SI) // DX:AX = 2*x0*x1
    36      MOVQ AX, R9  // r10
    37      MOVQ DX, R10 // r11
    38  
    39      MOVQ 16(SI), DX
    40      IMUL3Q $38, DX, AX // AX = 38*x2
    41      MULQ 32(SI) // DX:AX = 38*x2*x4
    42      ADDQ AX, R9
    43      ADCQ DX, R10
    44  
    45      MOVQ 24(SI), DX
    46      IMUL3Q $19, DX, AX // AX = 19*x3 (square term, so it is not doubled)
    47      MULQ 24(SI) // DX:AX = 19*x3*x3
    48      ADDQ AX, R9
    49      ADCQ DX, R10
    50  
    51      // r2 = x0*2*x2 + x1*x1 + x3*38*x4
    52      MOVQ 0(SI), AX
    53      SHLQ $1, AX // AX = 2*x0
    54      MULQ 16(SI) // DX:AX = 2*x0*x2
    55      MOVQ AX, R11 // r20
    56      MOVQ DX, R12 // r21
    57  
    58      MOVQ 8(SI), AX
    59      MULQ 8(SI) // DX:AX = x1*x1
    60      ADDQ AX, R11
    61      ADCQ DX, R12
    62  
    63      MOVQ 24(SI), DX
    64      IMUL3Q $38, DX, AX // AX = 38*x3
    65      MULQ 32(SI) // DX:AX = 38*x3*x4
    66      ADDQ AX, R11
    67      ADCQ DX, R12
    68  
    69      // r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
    70      MOVQ 0(SI), AX
    71      SHLQ $1, AX // AX = 2*x0
    72      MULQ 24(SI) // DX:AX = 2*x0*x3
    73      MOVQ AX, R13 // r30
    74      MOVQ DX, R14 // r31
    75  
    76      MOVQ 8(SI), AX
    77      SHLQ $1, AX // AX = 2*x1
    78      MULQ 16(SI) // DX:AX = 2*x1*x2
    79      ADDQ AX, R13
    80      ADCQ DX, R14
    81  
    82      MOVQ 32(SI), DX
    83      IMUL3Q $19, DX, AX // AX = 19*x4 (square term, so it is not doubled)
    84      MULQ 32(SI) // DX:AX = 19*x4*x4
    85      ADDQ AX, R13
    86      ADCQ DX, R14
    87  
    88      // r4 = x0*2*x4 + x1*2*x3 + x2*x2
    89      MOVQ 0(SI), AX
    90      SHLQ $1, AX // AX = 2*x0
    91      MULQ 32(SI) // DX:AX = 2*x0*x4
    92      MOVQ AX, R15 // r40
    93      MOVQ DX, BX  // r41
    94  
    95      MOVQ 8(SI), AX
    96      SHLQ $1, AX // AX = 2*x1
    97      MULQ 24(SI) // DX:AX = 2*x1*x3
    98      ADDQ AX, R15
    99      ADCQ DX, BX
   100  
   101      MOVQ 16(SI), AX
   102      MULQ 16(SI) // DX:AX = x2*x2
   103      ADDQ AX, R15
   104      ADCQ DX, BX
   105  
   106      // Reduce: bits 51 and up of each 128-bit rN become a carry into the next limb; the carry out of r4 re-enters r0 multiplied by 19.
   107      MOVQ $2251799813685247, AX // (1<<51) - 1, the 51-bit limb mask (mask51)
   108      SHLQ $13, CX, R8     // r01 = (r01 << 13) | (r00 >> 51), i.e. bits 51+ of r0
   109      ANDQ AX, CX          // r00 &= mask51
   110      SHLQ $13, R9, R10    // r11 = (r11 << 13) | (r10 >> 51), i.e. bits 51+ of r1
   111      ANDQ AX, R9          // r10 &= mask51
   112      ADDQ R8, R9          // r10 += r01
   113      SHLQ $13, R11, R12   // r21 = (r21 << 13) | (r20 >> 51), i.e. bits 51+ of r2
   114      ANDQ AX, R11         // r20 &= mask51
   115      ADDQ R10, R11        // r20 += r11
   116      SHLQ $13, R13, R14   // r31 = (r31 << 13) | (r30 >> 51), i.e. bits 51+ of r3
   117      ANDQ AX, R13         // r30 &= mask51
   118      ADDQ R12, R13        // r30 += r21
   119      SHLQ $13, R15, BX    // r41 = (r41 << 13) | (r40 >> 51), i.e. bits 51+ of r4
   120      ANDQ AX, R15         // r40 &= mask51
   121      ADDQ R14, R15        // r40 += r31
   122      IMUL3Q $19, BX, DX   // rdx <-- r41 * 19 (BX itself is left unchanged)
   123      ADDQ DX, CX          // r00 += r41 * 19
   124      // Second pass: the additions above can push a limb past 51 bits, so ripple the carries through the limbs once more.
   125      MOVQ CX, DX          // rdx <-- r00
   126      SHRQ $51, DX         // rdx <-- r00 >> 51
   127      ADDQ DX, R9          // r10 += r00 >> 51
   128      MOVQ R9, DX          // rdx <-- r10
   129      SHRQ $51, DX         // rdx <-- r10 >> 51
   130      ANDQ AX, CX          // r00 &= mask51
   131      ADDQ DX, R11         // r20 += r10 >> 51
   132      MOVQ R11, DX         // rdx <-- r20
   133      SHRQ $51, DX         // rdx <-- r20 >> 51
   134      ANDQ AX, R9          // r10 &= mask51
   135      ADDQ DX, R13         // r30 += r20 >> 51
   136      MOVQ R13, DX         // rdx <-- r30
   137      SHRQ $51, DX         // rdx <-- r30 >> 51
   138      ANDQ AX, R11         // r20 &= mask51
   139      ADDQ DX, R15         // r40 += r30 >> 51
   140      MOVQ R15, DX         // rdx <-- r40
   141      SHRQ $51, DX         // rdx <-- r40 >> 51
   142      ANDQ AX, R13         // r30 &= mask51
   143      IMUL3Q $19, DX, DX   // rdx <-- (r40 >> 51) * 19
   144      ADDQ DX, CX          // r00 += (r40 >> 51) *19; r00 is not masked again afterwards, so limb 0 may end slightly above 51 bits
   145      ANDQ AX, R15         // r40 &= mask51
   146      // Store the five result limbs to outp.
   147      MOVQ CX, 0(DI)
   148      MOVQ R9, 8(DI)
   149      MOVQ R11, 16(DI)
   150      MOVQ R13, 24(DI)
   151      MOVQ R15, 32(DI)
   152      RET
   152      RET