github.com/digdeepmining/go-atheios@v1.5.13-0.20180902133602-d5687a2e6f43/crypto/secp256k1/libsecp256k1/src/field_5x52_asm_impl.h (about)

     1  /**********************************************************************
     2   * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille               *
     3   * Distributed under the MIT software license, see the accompanying   *
     4   * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
     5   **********************************************************************/
     6  
     7  /**
     8   * Changelog:
     9   * - March 2013, Diederik Huys:    original version
    10   * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
    11   * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
    12   */
    13  
    14  #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
    15  #define _SECP256K1_FIELD_INNER5X52_IMPL_H_
    16  
    17  SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
    18  /**
         * Multiply the five 64-bit limbs at a by the five limbs at b and store
         * five result limbs at r, using a single x86-64 GCC inline-asm statement.
         *
         * Constants used by the assembly:
         *   M      = 0xfffffffffffff  (52-bit limb mask)
         *   R      = 0x1000003d10     (factor applied when folding high terms
         *                              of d/c back into the low limbs)
         *   R >> 4 = 0x1000003d1      (used for u0, which is pre-shifted by 4)
         * r[0]..r[3] are stored as (c & M); r[4] is stored without a final mask.
         * NOTE(review): presumably this is multiplication modulo the secp256k1
         * field prime in the 5x52 representation, with bounds on the input limb
         * magnitudes that are not visible here -- confirm against the callers.
         *
         * Aliasing: all of a is copied into registers before the first store,
         * but b is re-read from memory after r[0] and r[1] have been written,
         * so r must not overlap b (b is declared SECP256K1_RESTRICT).
         *
         * @param r  destination, five limbs written through %%rdi
         * @param a  first operand, five limbs read through %%rsi
         * @param b  second operand, five limbs read through %%rbx
         *
    19   * Registers: rdx:rax = multiplication accumulator
    20   *            r9:r8   = c
    21   *            r15:rcx = d
    22   *            r10-r14 = a0-a4
    23   *            rbx     = b
    24   *            rdi     = r
    25   *            rsi     = a on entry; scratch for t3/t4/u0 once a0-a4 are loaded
    26   */
          /* Memory slots the asm uses to spill t3 (tmp1), t4 (tmp2) and tx (tmp3). */
    27    uint64_t tmp1, tmp2, tmp3;
    28  __asm__ __volatile__(
            /* load a0..a4 into %%r10..%%r14; %%rsi is free for scratch use afterwards */
    29      "movq 0(%%rsi),%%r10\n"
    30      "movq 8(%%rsi),%%r11\n"
    31      "movq 16(%%rsi),%%r12\n"
    32      "movq 24(%%rsi),%%r13\n"
    33      "movq 32(%%rsi),%%r14\n"
    34  
    35      /* d = a3 * b0 (initialises %%r15:%%rcx) */
    36      "movq 0(%%rbx),%%rax\n"
    37      "mulq %%r13\n"
    38      "movq %%rax,%%rcx\n"
    39      "movq %%rdx,%%r15\n"
    40      /* d += a2 * b1 */
    41      "movq 8(%%rbx),%%rax\n"
    42      "mulq %%r12\n"
    43      "addq %%rax,%%rcx\n"
    44      "adcq %%rdx,%%r15\n"
    45      /* d += a1 * b2 */
    46      "movq 16(%%rbx),%%rax\n"
    47      "mulq %%r11\n"
    48      "addq %%rax,%%rcx\n"
    49      "adcq %%rdx,%%r15\n"
    50      /* d += a0 * b3 */
    51      "movq 24(%%rbx),%%rax\n"
    52      "mulq %%r10\n"
    53      "addq %%rax,%%rcx\n"
    54      "adcq %%rdx,%%r15\n"
    55      /* c = a4 * b4 (initialises %%r9:%%r8) */
    56      "movq 32(%%rbx),%%rax\n"
    57      "mulq %%r14\n"
    58      "movq %%rax,%%r8\n"
    59      "movq %%rdx,%%r9\n"
    60      /* d += (c & M) * R (%%rax still holds the low 64 bits of c) */
    61      "movq $0xfffffffffffff,%%rdx\n"
    62      "andq %%rdx,%%rax\n"
    63      "movq $0x1000003d10,%%rdx\n"
    64      "mulq %%rdx\n"
    65      "addq %%rax,%%rcx\n"
    66      "adcq %%rdx,%%r15\n"
    67      /* c >>= 52 (%%r8 only; %%r9 is not updated) */
    68      "shrdq $52,%%r9,%%r8\n"
    69      /* t3 (tmp1) = d & M */
    70      "movq %%rcx,%%rsi\n"
    71      "movq $0xfffffffffffff,%%rdx\n"
    72      "andq %%rdx,%%rsi\n"
    73      "movq %%rsi,%q1\n"
    74      /* d >>= 52 (high word cleared so it can accumulate carries again) */
    75      "shrdq $52,%%r15,%%rcx\n"
    76      "xorq %%r15,%%r15\n"
    77      /* d += a4 * b0 */
    78      "movq 0(%%rbx),%%rax\n"
    79      "mulq %%r14\n"
    80      "addq %%rax,%%rcx\n"
    81      "adcq %%rdx,%%r15\n"
    82      /* d += a3 * b1 */
    83      "movq 8(%%rbx),%%rax\n"
    84      "mulq %%r13\n"
    85      "addq %%rax,%%rcx\n"
    86      "adcq %%rdx,%%r15\n"
    87      /* d += a2 * b2 */
    88      "movq 16(%%rbx),%%rax\n"
    89      "mulq %%r12\n"
    90      "addq %%rax,%%rcx\n"
    91      "adcq %%rdx,%%r15\n"
    92      /* d += a1 * b3 */
    93      "movq 24(%%rbx),%%rax\n"
    94      "mulq %%r11\n"
    95      "addq %%rax,%%rcx\n"
    96      "adcq %%rdx,%%r15\n"
    97      /* d += a0 * b4 */
    98      "movq 32(%%rbx),%%rax\n"
    99      "mulq %%r10\n"
   100      "addq %%rax,%%rcx\n"
   101      "adcq %%rdx,%%r15\n"
   102      /* d += c * R (c is the single word left in %%r8 by the shift above) */
   103      "movq %%r8,%%rax\n"
   104      "movq $0x1000003d10,%%rdx\n"
   105      "mulq %%rdx\n"
   106      "addq %%rax,%%rcx\n"
   107      "adcq %%rdx,%%r15\n"
   108      /* t4 = d & M (%%rsi) */
   109      "movq %%rcx,%%rsi\n"
   110      "movq $0xfffffffffffff,%%rdx\n"
   111      "andq %%rdx,%%rsi\n"
   112      /* d >>= 52 */
   113      "shrdq $52,%%r15,%%rcx\n"
   114      "xorq %%r15,%%r15\n"
   115      /* tx = t4 >> 48 (tmp3): the top 4 bits of the 52-bit t4 */
   116      "movq %%rsi,%%rax\n"
   117      "shrq $48,%%rax\n"
   118      "movq %%rax,%q3\n"
   119      /* t4 &= (M >> 4) (tmp2): keep the low 48 bits of t4 */
   120      "movq $0xffffffffffff,%%rax\n"
   121      "andq %%rax,%%rsi\n"
   122      "movq %%rsi,%q2\n"
   123      /* c = a0 * b0 (restarts the c accumulator in %%r9:%%r8) */
   124      "movq 0(%%rbx),%%rax\n"
   125      "mulq %%r10\n"
   126      "movq %%rax,%%r8\n"
   127      "movq %%rdx,%%r9\n"
   128      /* d += a4 * b1 */
   129      "movq 8(%%rbx),%%rax\n"
   130      "mulq %%r14\n"
   131      "addq %%rax,%%rcx\n"
   132      "adcq %%rdx,%%r15\n"
   133      /* d += a3 * b2 */
   134      "movq 16(%%rbx),%%rax\n"
   135      "mulq %%r13\n"
   136      "addq %%rax,%%rcx\n"
   137      "adcq %%rdx,%%r15\n"
   138      /* d += a2 * b3 */
   139      "movq 24(%%rbx),%%rax\n"
   140      "mulq %%r12\n"
   141      "addq %%rax,%%rcx\n"
   142      "adcq %%rdx,%%r15\n"
   143      /* d += a1 * b4 */
   144      "movq 32(%%rbx),%%rax\n"
   145      "mulq %%r11\n"
   146      "addq %%rax,%%rcx\n"
   147      "adcq %%rdx,%%r15\n"
   148      /* u0 = d & M (%%rsi) */
   149      "movq %%rcx,%%rsi\n"
   150      "movq $0xfffffffffffff,%%rdx\n"
   151      "andq %%rdx,%%rsi\n"
   152      /* d >>= 52 */
   153      "shrdq $52,%%r15,%%rcx\n"
   154      "xorq %%r15,%%r15\n"
   155      /* u0 = (u0 << 4) | tx (%%rsi); tx is reloaded from tmp3 */
   156      "shlq $4,%%rsi\n"
   157      "movq %q3,%%rax\n"
   158      "orq %%rax,%%rsi\n"
   159      /* c += u0 * (R >> 4) */
   160      "movq $0x1000003d1,%%rax\n"
   161      "mulq %%rsi\n"
   162      "addq %%rax,%%r8\n"
   163      "adcq %%rdx,%%r9\n"
   164      /* r[0] = c & M (first store to r; b is still read after this point) */
   165      "movq %%r8,%%rax\n"
   166      "movq $0xfffffffffffff,%%rdx\n"
   167      "andq %%rdx,%%rax\n"
   168      "movq %%rax,0(%%rdi)\n"
   169      /* c >>= 52 */
   170      "shrdq $52,%%r9,%%r8\n"
   171      "xorq %%r9,%%r9\n"
   172      /* c += a1 * b0 */
   173      "movq 0(%%rbx),%%rax\n"
   174      "mulq %%r11\n"
   175      "addq %%rax,%%r8\n"
   176      "adcq %%rdx,%%r9\n"
   177      /* c += a0 * b1 */
   178      "movq 8(%%rbx),%%rax\n"
   179      "mulq %%r10\n"
   180      "addq %%rax,%%r8\n"
   181      "adcq %%rdx,%%r9\n"
   182      /* d += a4 * b2 */
   183      "movq 16(%%rbx),%%rax\n"
   184      "mulq %%r14\n"
   185      "addq %%rax,%%rcx\n"
   186      "adcq %%rdx,%%r15\n"
   187      /* d += a3 * b3 */
   188      "movq 24(%%rbx),%%rax\n"
   189      "mulq %%r13\n"
   190      "addq %%rax,%%rcx\n"
   191      "adcq %%rdx,%%r15\n"
   192      /* d += a2 * b4 */
   193      "movq 32(%%rbx),%%rax\n"
   194      "mulq %%r12\n"
   195      "addq %%rax,%%rcx\n"
   196      "adcq %%rdx,%%r15\n"
   197      /* c += (d & M) * R */
   198      "movq %%rcx,%%rax\n"
   199      "movq $0xfffffffffffff,%%rdx\n"
   200      "andq %%rdx,%%rax\n"
   201      "movq $0x1000003d10,%%rdx\n"
   202      "mulq %%rdx\n"
   203      "addq %%rax,%%r8\n"
   204      "adcq %%rdx,%%r9\n"
   205      /* d >>= 52 */
   206      "shrdq $52,%%r15,%%rcx\n"
   207      "xorq %%r15,%%r15\n"
   208      /* r[1] = c & M */
   209      "movq %%r8,%%rax\n"
   210      "movq $0xfffffffffffff,%%rdx\n"
   211      "andq %%rdx,%%rax\n"
   212      "movq %%rax,8(%%rdi)\n"
   213      /* c >>= 52 */
   214      "shrdq $52,%%r9,%%r8\n"
   215      "xorq %%r9,%%r9\n"
   216      /* c += a2 * b0 */
   217      "movq 0(%%rbx),%%rax\n"
   218      "mulq %%r12\n"
   219      "addq %%rax,%%r8\n"
   220      "adcq %%rdx,%%r9\n"
   221      /* c += a1 * b1 */
   222      "movq 8(%%rbx),%%rax\n"
   223      "mulq %%r11\n"
   224      "addq %%rax,%%r8\n"
   225      "adcq %%rdx,%%r9\n"
   226      /* c += a0 * b2 (last use of %%r10 = a0) */
   227      "movq 16(%%rbx),%%rax\n"
   228      "mulq %%r10\n"
   229      "addq %%rax,%%r8\n"
   230      "adcq %%rdx,%%r9\n"
   231      /* fetch t3 (%%r10, overwrites a0) from tmp1, t4 (%%rsi) from tmp2 */
   232      "movq %q2,%%rsi\n"
   233      "movq %q1,%%r10\n"
   234      /* d += a4 * b3 */
   235      "movq 24(%%rbx),%%rax\n"
   236      "mulq %%r14\n"
   237      "addq %%rax,%%rcx\n"
   238      "adcq %%rdx,%%r15\n"
   239      /* d += a3 * b4 (last read of b) */
   240      "movq 32(%%rbx),%%rax\n"
   241      "mulq %%r13\n"
   242      "addq %%rax,%%rcx\n"
   243      "adcq %%rdx,%%r15\n"
   244      /* c += (d & M) * R */
   245      "movq %%rcx,%%rax\n"
   246      "movq $0xfffffffffffff,%%rdx\n"
   247      "andq %%rdx,%%rax\n"
   248      "movq $0x1000003d10,%%rdx\n"
   249      "mulq %%rdx\n"
   250      "addq %%rax,%%r8\n"
   251      "adcq %%rdx,%%r9\n"
   252      /* d >>= 52 (%%rcx only; %%r15 is not cleared because d gets no more terms) */
   253      "shrdq $52,%%r15,%%rcx\n"
   254      /* r[2] = c & M */
   255      "movq %%r8,%%rax\n"
   256      "movq $0xfffffffffffff,%%rdx\n"
   257      "andq %%rdx,%%rax\n"
   258      "movq %%rax,16(%%rdi)\n"
   259      /* c >>= 52 */
   260      "shrdq $52,%%r9,%%r8\n"
   261      "xorq %%r9,%%r9\n"
   262      /* c += t3 (low word only; no carry is propagated into %%r9 here) */
   263      "addq %%r10,%%r8\n"
   264      /* c += d * R */
   265      "movq %%rcx,%%rax\n"
   266      "movq $0x1000003d10,%%rdx\n"
   267      "mulq %%rdx\n"
   268      "addq %%rax,%%r8\n"
   269      "adcq %%rdx,%%r9\n"
   270      /* r[3] = c & M */
   271      "movq %%r8,%%rax\n"
   272      "movq $0xfffffffffffff,%%rdx\n"
   273      "andq %%rdx,%%rax\n"
   274      "movq %%rax,24(%%rdi)\n"
   275      /* c >>= 52 (%%r8 only) */
   276      "shrdq $52,%%r9,%%r8\n"
   277      /* c += t4 (%%r8 only) */
   278      "addq %%rsi,%%r8\n"
   279      /* r[4] = c (stored unmasked) */
   280      "movq %%r8,32(%%rdi)\n"
        /* Outputs: %0 = a in %%rsi, read-write because %%rsi is reused as scratch;
         *          %1..%3 = tmp1..tmp3 memory slots (referenced above as %q1..%q3).
         * Inputs:  b in %%rbx, r in %%rdi.
         * Clobbers: every scratch register named above, the flags, and "memory"
         *          because a, b and r are accessed through pointers the operand
         *          list does not describe as memory operands. */
   281  : "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
   282  : "b"(b), "D"(r)
   283  : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
   284  );
   285  }
   286  
   287  SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
   288  /**
         * Square the five 64-bit limbs at a and store five result limbs at r,
         * using a single x86-64 GCC inline-asm statement.
         *
         * Each cross term ai*aj (i != j) is computed once with one factor
         * doubled: either via "leaq (x,x,1)" into %%rax, or by doubling a4
         * (%%r14) and later a0 (%%r10) in place. After those in-place doublings
         * the step comments below that mention a4 or a0 refer to the doubled
         * register contents.
         *
         * Constants used by the assembly:
         *   M      = 0xfffffffffffff  (52-bit limb mask, kept in %%r15)
         *   R      = 0x1000003d10     (factor applied when folding high terms
         *                              of d/c back into the low limbs)
         *   R >> 4 = 0x1000003d1      (used for u0, which is pre-shifted by 4)
         * r[0]..r[3] are stored as (c & M); r[4] is stored without a final mask.
         * NOTE(review): presumably this is squaring modulo the secp256k1 field
         * prime in the 5x52 representation, with bounds on the input limb
         * magnitudes that are not visible here -- confirm against the callers.
         *
         * All of a is copied into registers before the first store to r.
         *
         * @param r  destination, five limbs written through %%rdi
         * @param a  operand, five limbs read through %%rsi
         *
   289   * Registers: rdx:rax = multiplication accumulator
   290   *            r9:r8   = c
   291   *            rcx:rbx = d
   292   *            r10-r14 = a0-a4
   293   *            r15     = M (0xfffffffffffff)
   294   *            rdi     = r
   295   *            rsi     = a on entry; scratch for t3/t4/u0 once a0-a4 are loaded
   296   */
          /* Memory slots the asm uses to spill t3 (tmp1), t4 (tmp2) and tx (tmp3). */
   297    uint64_t tmp1, tmp2, tmp3;
   298  __asm__ __volatile__(
            /* load a0..a4 into %%r10..%%r14 and the limb mask M into %%r15 */
   299      "movq 0(%%rsi),%%r10\n"
   300      "movq 8(%%rsi),%%r11\n"
   301      "movq 16(%%rsi),%%r12\n"
   302      "movq 24(%%rsi),%%r13\n"
   303      "movq 32(%%rsi),%%r14\n"
   304      "movq $0xfffffffffffff,%%r15\n"
   305  
   306      /* d = (a0*2) * a3 (initialises %%rcx:%%rbx) */
   307      "leaq (%%r10,%%r10,1),%%rax\n"
   308      "mulq %%r13\n"
   309      "movq %%rax,%%rbx\n"
   310      "movq %%rdx,%%rcx\n"
   311      /* d += (a1*2) * a2 */
   312      "leaq (%%r11,%%r11,1),%%rax\n"
   313      "mulq %%r12\n"
   314      "addq %%rax,%%rbx\n"
   315      "adcq %%rdx,%%rcx\n"
   316      /* c = a4 * a4 (initialises %%r9:%%r8; a4 not yet doubled) */
   317      "movq %%r14,%%rax\n"
   318      "mulq %%r14\n"
   319      "movq %%rax,%%r8\n"
   320      "movq %%rdx,%%r9\n"
   321      /* d += (c & M) * R (%%rax still holds the low 64 bits of c) */
   322      "andq %%r15,%%rax\n"
   323      "movq $0x1000003d10,%%rdx\n"
   324      "mulq %%rdx\n"
   325      "addq %%rax,%%rbx\n"
   326      "adcq %%rdx,%%rcx\n"
   327      /* c >>= 52 (%%r8 only; %%r9 is not updated) */
   328      "shrdq $52,%%r9,%%r8\n"
   329      /* t3 (tmp1) = d & M */
   330      "movq %%rbx,%%rsi\n"
   331      "andq %%r15,%%rsi\n"
   332      "movq %%rsi,%q1\n"
   333      /* d >>= 52 (high word cleared so it can accumulate carries again) */
   334      "shrdq $52,%%rcx,%%rbx\n"
   335      "xorq %%rcx,%%rcx\n"
   336      /* a4 *= 2 (%%r14 holds 2*a4 from here on) */
   337      "addq %%r14,%%r14\n"
   338      /* d += a0 * a4 */
   339      "movq %%r10,%%rax\n"
   340      "mulq %%r14\n"
   341      "addq %%rax,%%rbx\n"
   342      "adcq %%rdx,%%rcx\n"
   343      /* d += (a1*2) * a3 */
   344      "leaq (%%r11,%%r11,1),%%rax\n"
   345      "mulq %%r13\n"
   346      "addq %%rax,%%rbx\n"
   347      "adcq %%rdx,%%rcx\n"
   348      /* d += a2 * a2 */
   349      "movq %%r12,%%rax\n"
   350      "mulq %%r12\n"
   351      "addq %%rax,%%rbx\n"
   352      "adcq %%rdx,%%rcx\n"
   353      /* d += c * R (c is the single word left in %%r8 by the shift above) */
   354      "movq %%r8,%%rax\n"
   355      "movq $0x1000003d10,%%rdx\n"
   356      "mulq %%rdx\n"
   357      "addq %%rax,%%rbx\n"
   358      "adcq %%rdx,%%rcx\n"
   359      /* t4 = d & M (%%rsi) */
   360      "movq %%rbx,%%rsi\n"
   361      "andq %%r15,%%rsi\n"
   362      /* d >>= 52 */
   363      "shrdq $52,%%rcx,%%rbx\n"
   364      "xorq %%rcx,%%rcx\n"
   365      /* tx = t4 >> 48 (tmp3): the top 4 bits of the 52-bit t4 */
   366      "movq %%rsi,%%rax\n"
   367      "shrq $48,%%rax\n"
   368      "movq %%rax,%q3\n"
   369      /* t4 &= (M >> 4) (tmp2): keep the low 48 bits of t4 */
   370      "movq $0xffffffffffff,%%rax\n"
   371      "andq %%rax,%%rsi\n"
   372      "movq %%rsi,%q2\n"
   373      /* c = a0 * a0 (restarts the c accumulator; a0 not yet doubled) */
   374      "movq %%r10,%%rax\n"
   375      "mulq %%r10\n"
   376      "movq %%rax,%%r8\n"
   377      "movq %%rdx,%%r9\n"
   378      /* d += a1 * a4 */
   379      "movq %%r11,%%rax\n"
   380      "mulq %%r14\n"
   381      "addq %%rax,%%rbx\n"
   382      "adcq %%rdx,%%rcx\n"
   383      /* d += (a2*2) * a3 */
   384      "leaq (%%r12,%%r12,1),%%rax\n"
   385      "mulq %%r13\n"
   386      "addq %%rax,%%rbx\n"
   387      "adcq %%rdx,%%rcx\n"
   388      /* u0 = d & M (%%rsi) */
   389      "movq %%rbx,%%rsi\n"
   390      "andq %%r15,%%rsi\n"
   391      /* d >>= 52 */
   392      "shrdq $52,%%rcx,%%rbx\n"
   393      "xorq %%rcx,%%rcx\n"
   394      /* u0 = (u0 << 4) | tx (%%rsi); tx is reloaded from tmp3 */
   395      "shlq $4,%%rsi\n"
   396      "movq %q3,%%rax\n"
   397      "orq %%rax,%%rsi\n"
   398      /* c += u0 * (R >> 4) */
   399      "movq $0x1000003d1,%%rax\n"
   400      "mulq %%rsi\n"
   401      "addq %%rax,%%r8\n"
   402      "adcq %%rdx,%%r9\n"
   403      /* r[0] = c & M (first store to r) */
   404      "movq %%r8,%%rax\n"
   405      "andq %%r15,%%rax\n"
   406      "movq %%rax,0(%%rdi)\n"
   407      /* c >>= 52 */
   408      "shrdq $52,%%r9,%%r8\n"
   409      "xorq %%r9,%%r9\n"
   410      /* a0 *= 2 (%%r10 holds 2*a0 from here on) */
   411      "addq %%r10,%%r10\n"
   412      /* c += a0 * a1 */
   413      "movq %%r10,%%rax\n"
   414      "mulq %%r11\n"
   415      "addq %%rax,%%r8\n"
   416      "adcq %%rdx,%%r9\n"
   417      /* d += a2 * a4 */
   418      "movq %%r12,%%rax\n"
   419      "mulq %%r14\n"
   420      "addq %%rax,%%rbx\n"
   421      "adcq %%rdx,%%rcx\n"
   422      /* d += a3 * a3 */
   423      "movq %%r13,%%rax\n"
   424      "mulq %%r13\n"
   425      "addq %%rax,%%rbx\n"
   426      "adcq %%rdx,%%rcx\n"
   427      /* c += (d & M) * R */
   428      "movq %%rbx,%%rax\n"
   429      "andq %%r15,%%rax\n"
   430      "movq $0x1000003d10,%%rdx\n"
   431      "mulq %%rdx\n"
   432      "addq %%rax,%%r8\n"
   433      "adcq %%rdx,%%r9\n"
   434      /* d >>= 52 */
   435      "shrdq $52,%%rcx,%%rbx\n"
   436      "xorq %%rcx,%%rcx\n"
   437      /* r[1] = c & M */
   438      "movq %%r8,%%rax\n"
   439      "andq %%r15,%%rax\n"
   440      "movq %%rax,8(%%rdi)\n"
   441      /* c >>= 52 */
   442      "shrdq $52,%%r9,%%r8\n"
   443      "xorq %%r9,%%r9\n"
   444      /* c += a0 * a2 (last use of %%r10) */
   445      "movq %%r10,%%rax\n"
   446      "mulq %%r12\n"
   447      "addq %%rax,%%r8\n"
   448      "adcq %%rdx,%%r9\n"
   449      /* fetch t3 (%%r10, overwrites a0) from tmp1, t4 (%%rsi) from tmp2 */
   450      "movq %q2,%%rsi\n"
   451      "movq %q1,%%r10\n"
   452      /* c += a1 * a1 */
   453      "movq %%r11,%%rax\n"
   454      "mulq %%r11\n"
   455      "addq %%rax,%%r8\n"
   456      "adcq %%rdx,%%r9\n"
   457      /* d += a3 * a4 */
   458      "movq %%r13,%%rax\n"
   459      "mulq %%r14\n"
   460      "addq %%rax,%%rbx\n"
   461      "adcq %%rdx,%%rcx\n"
   462      /* c += (d & M) * R */
   463      "movq %%rbx,%%rax\n"
   464      "andq %%r15,%%rax\n"
   465      "movq $0x1000003d10,%%rdx\n"
   466      "mulq %%rdx\n"
   467      "addq %%rax,%%r8\n"
   468      "adcq %%rdx,%%r9\n"
   469      /* d >>= 52 (%%rbx only; %%rcx is not cleared because d gets no more terms) */
   470      "shrdq $52,%%rcx,%%rbx\n"
   471      /* r[2] = c & M */
   472      "movq %%r8,%%rax\n"
   473      "andq %%r15,%%rax\n"
   474      "movq %%rax,16(%%rdi)\n"
   475      /* c >>= 52 */
   476      "shrdq $52,%%r9,%%r8\n"
   477      "xorq %%r9,%%r9\n"
   478      /* c += t3 (low word only; no carry is propagated into %%r9 here) */
   479      "addq %%r10,%%r8\n"
   480      /* c += d * R */
   481      "movq %%rbx,%%rax\n"
   482      "movq $0x1000003d10,%%rdx\n"
   483      "mulq %%rdx\n"
   484      "addq %%rax,%%r8\n"
   485      "adcq %%rdx,%%r9\n"
   486      /* r[3] = c & M */
   487      "movq %%r8,%%rax\n"
   488      "andq %%r15,%%rax\n"
   489      "movq %%rax,24(%%rdi)\n"
   490      /* c >>= 52 (%%r8 only) */
   491      "shrdq $52,%%r9,%%r8\n"
   492      /* c += t4 (%%r8 only) */
   493      "addq %%rsi,%%r8\n"
   494      /* r[4] = c (stored unmasked) */
   495      "movq %%r8,32(%%rdi)\n"
        /* Outputs: %0 = a in %%rsi, read-write because %%rsi is reused as scratch;
         *          %1..%3 = tmp1..tmp3 memory slots (referenced above as %q1..%q3).
         * Input:   r in %%rdi.
         * Clobbers: every scratch register named above (including %%rbx, which
         *          holds the low word of d here), the flags, and "memory" because
         *          a and r are accessed through pointers the operand list does
         *          not describe as memory operands. */
   496  : "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
   497  : "D"(r)
   498  : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
   499  );
   500  }
   501  
   502  #endif