// Source path: github.com/deroproject/derosuite@v2.1.6-1.0.20200307070847-0f2e589c7a2b+incompatible/crypto/edwards25519_fe_square_amd64.s

// Copyright (c) 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64
// +build amd64

#include "textflag.h"

// func FeSquare(outp *uint64, xp *uint64)
//
// Squares a five-limb value and stores the five-limb result.
//
// Layout: xp and outp each address five consecutive 64-bit words
// (offsets 0, 8, 16, 24, 32). The reduction below masks each limb with
// (1<<51)-1, i.e. the representation is radix 2^51. Carries out of the top
// limb are folded back into limb 0 multiplied by 19 (5*51 = 255 bits, so this
// is arithmetic modulo 2^255-19).
//
// Input range: the code multiplies limbs by 38 in a 64-bit register before
// the widening MULQ, so limbs must be small enough for that not to wrap.
// NOTE(review): presumably callers keep limbs close to 51 bits - confirm.
//
// Output range: limbs 1..4 are masked to 51 bits. Limb 0 receives the final
// "+19*carry" after its last mask, so it may slightly exceed 51 bits.
//
// Aliasing: every load from xp happens before the first store to outp, so
// outp == xp is safe.
//
// Register roles (all treated as scratch):
//   SI = xp, DI = outp
//   CX:R8   = r0 (lo:hi)     R9:R10  = r1 (lo:hi)
//   R11:R12 = r2 (lo:hi)     R13:R14 = r3 (lo:hi)
//   R15:BX  = r4 (lo:hi)
//   AX, DX  = MULQ operands/results, later AX = 51-bit mask, DX = carry temp
//
// Frame: no locals ($0), 16 bytes of arguments (two pointers).
TEXT ·FeSquare(SB),NOSPLIT,$0-16
	MOVQ outp+0(FP), DI
	MOVQ xp+8(FP), SI

	// r0 = x0*x0 + x1*38*x4 + x2*38*x3
	MOVQ 0(SI), AX
	MULQ 0(SI)
	MOVQ AX, CX // r00
	MOVQ DX, R8 // r01

	MOVQ 8(SI), DX
	IMUL3Q $38, DX, AX
	MULQ 32(SI)
	ADDQ AX, CX
	ADCQ DX, R8

	MOVQ 16(SI), DX
	IMUL3Q $38, DX, AX
	MULQ 24(SI)
	ADDQ AX, CX
	ADCQ DX, R8

	// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
	MOVQ 0(SI), AX
	SHLQ $1, AX
	MULQ 8(SI)
	MOVQ AX, R9  // r10
	MOVQ DX, R10 // r11

	MOVQ 16(SI), DX
	IMUL3Q $38, DX, AX
	MULQ 32(SI)
	ADDQ AX, R9
	ADCQ DX, R10

	MOVQ 24(SI), DX
	IMUL3Q $19, DX, AX
	MULQ 24(SI)
	ADDQ AX, R9
	ADCQ DX, R10

	// r2 = x0*2*x2 + x1*x1 + x3*38*x4
	MOVQ 0(SI), AX
	SHLQ $1, AX
	MULQ 16(SI)
	MOVQ AX, R11 // r20
	MOVQ DX, R12 // r21

	MOVQ 8(SI), AX
	MULQ 8(SI)
	ADDQ AX, R11
	ADCQ DX, R12

	MOVQ 24(SI), DX
	IMUL3Q $38, DX, AX
	MULQ 32(SI)
	ADDQ AX, R11
	ADCQ DX, R12

	// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
	MOVQ 0(SI), AX
	SHLQ $1, AX
	MULQ 24(SI)
	MOVQ AX, R13 // r30
	MOVQ DX, R14 // r31

	MOVQ 8(SI), AX
	SHLQ $1, AX
	MULQ 16(SI)
	ADDQ AX, R13
	ADCQ DX, R14

	MOVQ 32(SI), DX
	IMUL3Q $19, DX, AX
	MULQ 32(SI)
	ADDQ AX, R13
	ADCQ DX, R14

	// r4 = x0*2*x4 + x1*2*x3 + x2*x2
	MOVQ 0(SI), AX
	SHLQ $1, AX
	MULQ 32(SI)
	MOVQ AX, R15 // r40
	MOVQ DX, BX  // r41

	MOVQ 8(SI), AX
	SHLQ $1, AX
	MULQ 24(SI)
	ADDQ AX, R15
	ADCQ DX, BX

	MOVQ 16(SI), AX
	MULQ 16(SI)
	ADDQ AX, R15
	ADCQ DX, BX

	// Reduce, pass 1: split each 128-bit rN into its low 51 bits (kept in the
	// low register) and everything above bit 51 (moved into the high register
	// by the double-precision shift), then add that carry to the next limb.
	// The carry out of r4 wraps around to r0 multiplied by 19.
	MOVQ $2251799813685247, AX // (1<<51) - 1
	SHLQ $13, CX, R8           // r01 = shld with r00
	ANDQ AX, CX                // r00 &= mask51
	SHLQ $13, R9, R10          // r11 = shld with r10
	ANDQ AX, R9                // r10 &= mask51
	ADDQ R8, R9                // r10 += r01
	SHLQ $13, R11, R12         // r21 = shld with r20
	ANDQ AX, R11               // r20 &= mask51
	ADDQ R10, R11              // r20 += r11
	SHLQ $13, R13, R14         // r31 = shld with r30
	ANDQ AX, R13               // r30 &= mask51
	ADDQ R12, R13              // r30 += r21
	SHLQ $13, R15, BX          // r41 = shld with r40
	ANDQ AX, R15               // r40 &= mask51
	ADDQ R14, R15              // r40 += r31
	IMUL3Q $19, BX, DX         // r41 = r41*19
	ADDQ DX, CX                // r00 += r41

	// Reduce, pass 2: a single 64-bit carry chain r0 -> r1 -> ... -> r4 -> r0.
	// Each mask is issued after the shifted copy has been taken, so the
	// interleaving below must not be reordered across the matching MOVQ/SHRQ.
	MOVQ CX, DX                // rdx <-- r00
	SHRQ $51, DX               // rdx <-- r00 >> 51
	ADDQ DX, R9                // r10 += r00 >> 51
	MOVQ R9, DX                // rdx <-- r10
	SHRQ $51, DX               // rdx <-- r10 >> 51
	ANDQ AX, CX                // r00 &= mask51
	ADDQ DX, R11               // r20 += r10 >> 51
	MOVQ R11, DX               // rdx <-- r20
	SHRQ $51, DX               // rdx <-- r20 >> 51
	ANDQ AX, R9                // r10 &= mask51
	ADDQ DX, R13               // r30 += r20 >> 51
	MOVQ R13, DX               // rdx <-- r30
	SHRQ $51, DX               // rdx <-- r30 >> 51
	ANDQ AX, R11               // r20 &= mask51
	ADDQ DX, R15               // r40 += r30 >> 51
	MOVQ R15, DX               // rdx <-- r40
	SHRQ $51, DX               // rdx <-- r40 >> 51
	ANDQ AX, R13               // r30 &= mask51
	IMUL3Q $19, DX, DX         // rdx <-- (r40 >> 51) * 19
	ADDQ DX, CX                // r00 += (r40 >> 51) * 19 (not re-masked)
	ANDQ AX, R15               // r40 &= mask51

	// Store the result only now, after all reads of xp, so outp may alias xp.
	MOVQ CX, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R11, 16(DI)
	MOVQ R13, 24(DI)
	MOVQ R15, 32(DI)
	RET