golang.zx2c4.com/wireguard/windows@v0.5.4-0.20230123132234-dcc0eb72a04b/installer/fetcher/crypto.c (about)

     1  // SPDX-License-Identifier: GPL-2.0
     2  /*
     3   * Copyright (C) 2020-2022 Jason A. Donenfeld. All Rights Reserved.
     4   * Copyright (c) 2020, Google Inc.
     5   */
     6  
     7  #include "crypto.h"
     8  #include <stdint.h>
     9  #include <string.h>
    10  #include <winternl.h>
    11  #include <bcrypt.h>
    12  
    13  #if REG_DWORD == REG_DWORD_LITTLE_ENDIAN
    14  #define swap_le64(x) (x)
    15  #define swap_le32(x) (x)
    16  #elif REG_DWORD == REG_DWORD_BIG_ENDIAN
    17  #define swap_le64(x) __builtin_bswap64(x)
    18  #define swap_le32(x) __builtin_bswap32(x)
    19  #endif
    20  
    21  static void store_le64(uint8_t *dst, uint64_t src)
    22  {
    23  	src = swap_le64(src);
    24  	__builtin_memcpy(dst, &src, sizeof(src));
    25  }
    26  
    27  static uint64_t load_le64(const uint8_t *src)
    28  {
    29  	uint64_t dst;
    30  	__builtin_memcpy(&dst, src, sizeof(dst));
    31  	return swap_le64(dst);
    32  }
    33  
    34  static uint32_t load_le24(const uint8_t *in)
    35  {
    36  	uint32_t dst;
    37  	dst = (uint32_t)in[0];
    38  	dst |= ((uint32_t)in[1]) << 8;
    39  	dst |= ((uint32_t)in[2]) << 16;
    40  	return dst;
    41  }
    42  
    43  static uint32_t load_le32(const uint8_t *src)
    44  {
    45  	uint32_t dst;
    46  	__builtin_memcpy(&dst, src, sizeof(dst));
    47  	return swap_le32(dst);
    48  }
    49  
    50  static uint64_t ror64(uint64_t i, unsigned int s)
    51  {
    52  	return (i >> (s & 63)) | (i << ((-s) & 63));
    53  }
    54  
    55  static inline uint32_t value_barrier_u32(uint32_t a)
    56  {
    57  	__asm__("" : "+r"(a) : /* no inputs */);
    58  	return a;
    59  }
    60  
    61  static int memcmp_ct(const void *first, const void *second, size_t len)
    62  {
    63  	const uint8_t *a = first;
    64  	const uint8_t *b = second;
    65  	uint8_t diff = 0;
    66  
    67  	for (size_t i = 0; i < len; ++i) {
    68  		diff |= a[i] ^ b[i];
    69  		__asm__("" : "+r"(diff) : /* no inputs */);
    70  	}
    71  
    72  	return diff;
    73  }
    74  
    75  /*
    76   * The function fiat_25519_addcarryx_u26 is an addition with carry.
    77   * Postconditions:
    78   *   out1 = (arg1 + arg2 + arg3) mod 2^26
    79   *   out2 = ⌊(arg1 + arg2 + arg3) / 2^26⌋
    80   *
    81   * Input Bounds:
    82   *   arg1: [0x0 ~> 0x1]
    83   *   arg2: [0x0 ~> 0x3ffffff]
    84   *   arg3: [0x0 ~> 0x3ffffff]
    85   * Output Bounds:
    86   *   out1: [0x0 ~> 0x3ffffff]
    87   *   out2: [0x0 ~> 0x1]
    88   */
    89  static void fiat_25519_addcarryx_u26(uint32_t *out1, uint8_t *out2,
    90  				     uint8_t arg1, uint32_t arg2, uint32_t arg3)
    91  {
    92  	uint32_t x1 = ((arg1 + arg2) + arg3);
    93  	uint32_t x2 = (x1 & UINT32_C(0x3ffffff));
    94  	uint8_t x3 = (uint8_t)(x1 >> 26);
    95  	*out1 = x2;
    96  	*out2 = x3;
    97  }
    98  
    99  /*
   100   * The function fiat_25519_subborrowx_u26 is a subtraction with borrow.
   101   * Postconditions:
   102   *   out1 = (-arg1 + arg2 + -arg3) mod 2^26
   103   *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^26⌋
   104   *
   105   * Input Bounds:
   106   *   arg1: [0x0 ~> 0x1]
   107   *   arg2: [0x0 ~> 0x3ffffff]
   108   *   arg3: [0x0 ~> 0x3ffffff]
   109   * Output Bounds:
   110   *   out1: [0x0 ~> 0x3ffffff]
   111   *   out2: [0x0 ~> 0x1]
   112   */
   113  static void fiat_25519_subborrowx_u26(uint32_t *out1, uint8_t *out2,
   114  				      uint8_t arg1, uint32_t arg2,
   115  				      uint32_t arg3)
   116  {
   117  	int32_t x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3);
   118  	int8_t x2 = (int8_t)(x1 >> 26);
   119  	uint32_t x3 = (x1 & UINT32_C(0x3ffffff));
   120  	*out1 = x3;
   121  	*out2 = (uint8_t)(0x0 - x2);
   122  }
   123  
   124  /*
   125   * The function fiat_25519_addcarryx_u25 is an addition with carry.
   126   * Postconditions:
   127   *   out1 = (arg1 + arg2 + arg3) mod 2^25
   128   *   out2 = ⌊(arg1 + arg2 + arg3) / 2^25⌋
   129   *
   130   * Input Bounds:
   131   *   arg1: [0x0 ~> 0x1]
   132   *   arg2: [0x0 ~> 0x1ffffff]
   133   *   arg3: [0x0 ~> 0x1ffffff]
   134   * Output Bounds:
   135   *   out1: [0x0 ~> 0x1ffffff]
   136   *   out2: [0x0 ~> 0x1]
   137   */
   138  static void fiat_25519_addcarryx_u25(uint32_t *out1, uint8_t *out2,
   139  				     uint8_t arg1, uint32_t arg2, uint32_t arg3)
   140  {
   141  	uint32_t x1 = ((arg1 + arg2) + arg3);
   142  	uint32_t x2 = (x1 & UINT32_C(0x1ffffff));
   143  	uint8_t x3 = (uint8_t)(x1 >> 25);
   144  	*out1 = x2;
   145  	*out2 = x3;
   146  }
   147  
   148  /*
   149   * The function fiat_25519_subborrowx_u25 is a subtraction with borrow.
   150   * Postconditions:
   151   *   out1 = (-arg1 + arg2 + -arg3) mod 2^25
   152   *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^25⌋
   153   *
   154   * Input Bounds:
   155   *   arg1: [0x0 ~> 0x1]
   156   *   arg2: [0x0 ~> 0x1ffffff]
   157   *   arg3: [0x0 ~> 0x1ffffff]
   158   * Output Bounds:
   159   *   out1: [0x0 ~> 0x1ffffff]
   160   *   out2: [0x0 ~> 0x1]
   161   */
   162  static void fiat_25519_subborrowx_u25(uint32_t *out1, uint8_t *out2,
   163  				      uint8_t arg1, uint32_t arg2,
   164  				      uint32_t arg3)
   165  {
   166  	int32_t x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3);
   167  	int8_t x2 = (int8_t)(x1 >> 25);
   168  	uint32_t x3 = (x1 & UINT32_C(0x1ffffff));
   169  	*out1 = x3;
   170  	*out2 = (uint8_t)(0x0 - x2);
   171  }
   172  
   173  /*
   174   * The function fiat_25519_cmovznz_u32 is a single-word conditional move.
   175   * Postconditions:
   176   *   out1 = (if arg1 = 0 then arg2 else arg3)
   177   *
   178   * Input Bounds:
   179   *   arg1: [0x0 ~> 0x1]
   180   *   arg2: [0x0 ~> 0xffffffff]
   181   *   arg3: [0x0 ~> 0xffffffff]
   182   * Output Bounds:
   183   *   out1: [0x0 ~> 0xffffffff]
   184   */
   185  static void fiat_25519_cmovznz_u32(uint32_t *out1, uint8_t arg1, uint32_t arg2,
   186  				   uint32_t arg3)
   187  {
   188  	uint8_t x1 = (!(!arg1));
   189  	uint32_t x2 = ((int8_t)(0x0 - x1) & UINT32_C(0xffffffff));
   190  	// Note this line has been patched from the synthesized code to add value
   191  	// barriers.
   192  	//
   193  	// Clang recognizes this pattern as a select. While it usually transforms it
   194  	// to a cmov, it sometimes further transforms it into a branch, which we do
   195  	// not want.
   196  	uint32_t x3 = ((value_barrier_u32(x2) & arg3) |
   197  		       (value_barrier_u32(~x2) & arg2));
   198  	*out1 = x3;
   199  }
   200  
   201  /*
   202   * The function fiat_25519_carry_mul multiplies two field elements and reduces the result.
   203   * Postconditions:
   204   *   eval out1 mod m = (eval arg1 * eval arg2) mod m
   205   *
   206   * Input Bounds:
   207   *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   208   *   arg2: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   209   * Output Bounds:
   210   *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   211   */
   212  static void fiat_25519_carry_mul(uint32_t out1[10], const uint32_t arg1[10],
   213  				 const uint32_t arg2[10])
   214  {
   215  	uint64_t x1 = ((uint64_t)(arg1[9]) * ((arg2[9]) * UINT8_C(0x26)));
   216  	uint64_t x2 = ((uint64_t)(arg1[9]) * ((arg2[8]) * UINT8_C(0x13)));
   217  	uint64_t x3 = ((uint64_t)(arg1[9]) * ((arg2[7]) * UINT8_C(0x26)));
   218  	uint64_t x4 = ((uint64_t)(arg1[9]) * ((arg2[6]) * UINT8_C(0x13)));
   219  	uint64_t x5 = ((uint64_t)(arg1[9]) * ((arg2[5]) * UINT8_C(0x26)));
   220  	uint64_t x6 = ((uint64_t)(arg1[9]) * ((arg2[4]) * UINT8_C(0x13)));
   221  	uint64_t x7 = ((uint64_t)(arg1[9]) * ((arg2[3]) * UINT8_C(0x26)));
   222  	uint64_t x8 = ((uint64_t)(arg1[9]) * ((arg2[2]) * UINT8_C(0x13)));
   223  	uint64_t x9 = ((uint64_t)(arg1[9]) * ((arg2[1]) * UINT8_C(0x26)));
   224  	uint64_t x10 = ((uint64_t)(arg1[8]) * ((arg2[9]) * UINT8_C(0x13)));
   225  	uint64_t x11 = ((uint64_t)(arg1[8]) * ((arg2[8]) * UINT8_C(0x13)));
   226  	uint64_t x12 = ((uint64_t)(arg1[8]) * ((arg2[7]) * UINT8_C(0x13)));
   227  	uint64_t x13 = ((uint64_t)(arg1[8]) * ((arg2[6]) * UINT8_C(0x13)));
   228  	uint64_t x14 = ((uint64_t)(arg1[8]) * ((arg2[5]) * UINT8_C(0x13)));
   229  	uint64_t x15 = ((uint64_t)(arg1[8]) * ((arg2[4]) * UINT8_C(0x13)));
   230  	uint64_t x16 = ((uint64_t)(arg1[8]) * ((arg2[3]) * UINT8_C(0x13)));
   231  	uint64_t x17 = ((uint64_t)(arg1[8]) * ((arg2[2]) * UINT8_C(0x13)));
   232  	uint64_t x18 = ((uint64_t)(arg1[7]) * ((arg2[9]) * UINT8_C(0x26)));
   233  	uint64_t x19 = ((uint64_t)(arg1[7]) * ((arg2[8]) * UINT8_C(0x13)));
   234  	uint64_t x20 = ((uint64_t)(arg1[7]) * ((arg2[7]) * UINT8_C(0x26)));
   235  	uint64_t x21 = ((uint64_t)(arg1[7]) * ((arg2[6]) * UINT8_C(0x13)));
   236  	uint64_t x22 = ((uint64_t)(arg1[7]) * ((arg2[5]) * UINT8_C(0x26)));
   237  	uint64_t x23 = ((uint64_t)(arg1[7]) * ((arg2[4]) * UINT8_C(0x13)));
   238  	uint64_t x24 = ((uint64_t)(arg1[7]) * ((arg2[3]) * UINT8_C(0x26)));
   239  	uint64_t x25 = ((uint64_t)(arg1[6]) * ((arg2[9]) * UINT8_C(0x13)));
   240  	uint64_t x26 = ((uint64_t)(arg1[6]) * ((arg2[8]) * UINT8_C(0x13)));
   241  	uint64_t x27 = ((uint64_t)(arg1[6]) * ((arg2[7]) * UINT8_C(0x13)));
   242  	uint64_t x28 = ((uint64_t)(arg1[6]) * ((arg2[6]) * UINT8_C(0x13)));
   243  	uint64_t x29 = ((uint64_t)(arg1[6]) * ((arg2[5]) * UINT8_C(0x13)));
   244  	uint64_t x30 = ((uint64_t)(arg1[6]) * ((arg2[4]) * UINT8_C(0x13)));
   245  	uint64_t x31 = ((uint64_t)(arg1[5]) * ((arg2[9]) * UINT8_C(0x26)));
   246  	uint64_t x32 = ((uint64_t)(arg1[5]) * ((arg2[8]) * UINT8_C(0x13)));
   247  	uint64_t x33 = ((uint64_t)(arg1[5]) * ((arg2[7]) * UINT8_C(0x26)));
   248  	uint64_t x34 = ((uint64_t)(arg1[5]) * ((arg2[6]) * UINT8_C(0x13)));
   249  	uint64_t x35 = ((uint64_t)(arg1[5]) * ((arg2[5]) * UINT8_C(0x26)));
   250  	uint64_t x36 = ((uint64_t)(arg1[4]) * ((arg2[9]) * UINT8_C(0x13)));
   251  	uint64_t x37 = ((uint64_t)(arg1[4]) * ((arg2[8]) * UINT8_C(0x13)));
   252  	uint64_t x38 = ((uint64_t)(arg1[4]) * ((arg2[7]) * UINT8_C(0x13)));
   253  	uint64_t x39 = ((uint64_t)(arg1[4]) * ((arg2[6]) * UINT8_C(0x13)));
   254  	uint64_t x40 = ((uint64_t)(arg1[3]) * ((arg2[9]) * UINT8_C(0x26)));
   255  	uint64_t x41 = ((uint64_t)(arg1[3]) * ((arg2[8]) * UINT8_C(0x13)));
   256  	uint64_t x42 = ((uint64_t)(arg1[3]) * ((arg2[7]) * UINT8_C(0x26)));
   257  	uint64_t x43 = ((uint64_t)(arg1[2]) * ((arg2[9]) * UINT8_C(0x13)));
   258  	uint64_t x44 = ((uint64_t)(arg1[2]) * ((arg2[8]) * UINT8_C(0x13)));
   259  	uint64_t x45 = ((uint64_t)(arg1[1]) * ((arg2[9]) * UINT8_C(0x26)));
   260  	uint64_t x46 = ((uint64_t)(arg1[9]) * (arg2[0]));
   261  	uint64_t x47 = ((uint64_t)(arg1[8]) * (arg2[1]));
   262  	uint64_t x48 = ((uint64_t)(arg1[8]) * (arg2[0]));
   263  	uint64_t x49 = ((uint64_t)(arg1[7]) * (arg2[2]));
   264  	uint64_t x50 = ((uint64_t)(arg1[7]) * ((arg2[1]) * 0x2));
   265  	uint64_t x51 = ((uint64_t)(arg1[7]) * (arg2[0]));
   266  	uint64_t x52 = ((uint64_t)(arg1[6]) * (arg2[3]));
   267  	uint64_t x53 = ((uint64_t)(arg1[6]) * (arg2[2]));
   268  	uint64_t x54 = ((uint64_t)(arg1[6]) * (arg2[1]));
   269  	uint64_t x55 = ((uint64_t)(arg1[6]) * (arg2[0]));
   270  	uint64_t x56 = ((uint64_t)(arg1[5]) * (arg2[4]));
   271  	uint64_t x57 = ((uint64_t)(arg1[5]) * ((arg2[3]) * 0x2));
   272  	uint64_t x58 = ((uint64_t)(arg1[5]) * (arg2[2]));
   273  	uint64_t x59 = ((uint64_t)(arg1[5]) * ((arg2[1]) * 0x2));
   274  	uint64_t x60 = ((uint64_t)(arg1[5]) * (arg2[0]));
   275  	uint64_t x61 = ((uint64_t)(arg1[4]) * (arg2[5]));
   276  	uint64_t x62 = ((uint64_t)(arg1[4]) * (arg2[4]));
   277  	uint64_t x63 = ((uint64_t)(arg1[4]) * (arg2[3]));
   278  	uint64_t x64 = ((uint64_t)(arg1[4]) * (arg2[2]));
   279  	uint64_t x65 = ((uint64_t)(arg1[4]) * (arg2[1]));
   280  	uint64_t x66 = ((uint64_t)(arg1[4]) * (arg2[0]));
   281  	uint64_t x67 = ((uint64_t)(arg1[3]) * (arg2[6]));
   282  	uint64_t x68 = ((uint64_t)(arg1[3]) * ((arg2[5]) * 0x2));
   283  	uint64_t x69 = ((uint64_t)(arg1[3]) * (arg2[4]));
   284  	uint64_t x70 = ((uint64_t)(arg1[3]) * ((arg2[3]) * 0x2));
   285  	uint64_t x71 = ((uint64_t)(arg1[3]) * (arg2[2]));
   286  	uint64_t x72 = ((uint64_t)(arg1[3]) * ((arg2[1]) * 0x2));
   287  	uint64_t x73 = ((uint64_t)(arg1[3]) * (arg2[0]));
   288  	uint64_t x74 = ((uint64_t)(arg1[2]) * (arg2[7]));
   289  	uint64_t x75 = ((uint64_t)(arg1[2]) * (arg2[6]));
   290  	uint64_t x76 = ((uint64_t)(arg1[2]) * (arg2[5]));
   291  	uint64_t x77 = ((uint64_t)(arg1[2]) * (arg2[4]));
   292  	uint64_t x78 = ((uint64_t)(arg1[2]) * (arg2[3]));
   293  	uint64_t x79 = ((uint64_t)(arg1[2]) * (arg2[2]));
   294  	uint64_t x80 = ((uint64_t)(arg1[2]) * (arg2[1]));
   295  	uint64_t x81 = ((uint64_t)(arg1[2]) * (arg2[0]));
   296  	uint64_t x82 = ((uint64_t)(arg1[1]) * (arg2[8]));
   297  	uint64_t x83 = ((uint64_t)(arg1[1]) * ((arg2[7]) * 0x2));
   298  	uint64_t x84 = ((uint64_t)(arg1[1]) * (arg2[6]));
   299  	uint64_t x85 = ((uint64_t)(arg1[1]) * ((arg2[5]) * 0x2));
   300  	uint64_t x86 = ((uint64_t)(arg1[1]) * (arg2[4]));
   301  	uint64_t x87 = ((uint64_t)(arg1[1]) * ((arg2[3]) * 0x2));
   302  	uint64_t x88 = ((uint64_t)(arg1[1]) * (arg2[2]));
   303  	uint64_t x89 = ((uint64_t)(arg1[1]) * ((arg2[1]) * 0x2));
   304  	uint64_t x90 = ((uint64_t)(arg1[1]) * (arg2[0]));
   305  	uint64_t x91 = ((uint64_t)(arg1[0]) * (arg2[9]));
   306  	uint64_t x92 = ((uint64_t)(arg1[0]) * (arg2[8]));
   307  	uint64_t x93 = ((uint64_t)(arg1[0]) * (arg2[7]));
   308  	uint64_t x94 = ((uint64_t)(arg1[0]) * (arg2[6]));
   309  	uint64_t x95 = ((uint64_t)(arg1[0]) * (arg2[5]));
   310  	uint64_t x96 = ((uint64_t)(arg1[0]) * (arg2[4]));
   311  	uint64_t x97 = ((uint64_t)(arg1[0]) * (arg2[3]));
   312  	uint64_t x98 = ((uint64_t)(arg1[0]) * (arg2[2]));
   313  	uint64_t x99 = ((uint64_t)(arg1[0]) * (arg2[1]));
   314  	uint64_t x100 = ((uint64_t)(arg1[0]) * (arg2[0]));
   315  	uint64_t x101 =
   316  		(x100 +
   317  		 (x45 +
   318  		  (x44 + (x42 + (x39 + (x35 + (x30 + (x24 + (x17 + x9)))))))));
   319  	uint64_t x102 = (x101 >> 26);
   320  	uint32_t x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff));
   321  	uint64_t x104 =
   322  		(x91 +
   323  		 (x82 +
   324  		  (x74 + (x67 + (x61 + (x56 + (x52 + (x49 + (x47 + x46)))))))));
   325  	uint64_t x105 =
   326  		(x92 +
   327  		 (x83 +
   328  		  (x75 + (x68 + (x62 + (x57 + (x53 + (x50 + (x48 + x1)))))))));
   329  	uint64_t x106 =
   330  		(x93 +
   331  		 (x84 +
   332  		  (x76 + (x69 + (x63 + (x58 + (x54 + (x51 + (x10 + x2)))))))));
   333  	uint64_t x107 =
   334  		(x94 +
   335  		 (x85 +
   336  		  (x77 + (x70 + (x64 + (x59 + (x55 + (x18 + (x11 + x3)))))))));
   337  	uint64_t x108 =
   338  		(x95 +
   339  		 (x86 +
   340  		  (x78 + (x71 + (x65 + (x60 + (x25 + (x19 + (x12 + x4)))))))));
   341  	uint64_t x109 =
   342  		(x96 +
   343  		 (x87 +
   344  		  (x79 + (x72 + (x66 + (x31 + (x26 + (x20 + (x13 + x5)))))))));
   345  	uint64_t x110 =
   346  		(x97 +
   347  		 (x88 +
   348  		  (x80 + (x73 + (x36 + (x32 + (x27 + (x21 + (x14 + x6)))))))));
   349  	uint64_t x111 =
   350  		(x98 +
   351  		 (x89 +
   352  		  (x81 + (x40 + (x37 + (x33 + (x28 + (x22 + (x15 + x7)))))))));
   353  	uint64_t x112 =
   354  		(x99 +
   355  		 (x90 +
   356  		  (x43 + (x41 + (x38 + (x34 + (x29 + (x23 + (x16 + x8)))))))));
   357  	uint64_t x113 = (x102 + x112);
   358  	uint64_t x114 = (x113 >> 25);
   359  	uint32_t x115 = (uint32_t)(x113 & UINT32_C(0x1ffffff));
   360  	uint64_t x116 = (x114 + x111);
   361  	uint64_t x117 = (x116 >> 26);
   362  	uint32_t x118 = (uint32_t)(x116 & UINT32_C(0x3ffffff));
   363  	uint64_t x119 = (x117 + x110);
   364  	uint64_t x120 = (x119 >> 25);
   365  	uint32_t x121 = (uint32_t)(x119 & UINT32_C(0x1ffffff));
   366  	uint64_t x122 = (x120 + x109);
   367  	uint64_t x123 = (x122 >> 26);
   368  	uint32_t x124 = (uint32_t)(x122 & UINT32_C(0x3ffffff));
   369  	uint64_t x125 = (x123 + x108);
   370  	uint64_t x126 = (x125 >> 25);
   371  	uint32_t x127 = (uint32_t)(x125 & UINT32_C(0x1ffffff));
   372  	uint64_t x128 = (x126 + x107);
   373  	uint64_t x129 = (x128 >> 26);
   374  	uint32_t x130 = (uint32_t)(x128 & UINT32_C(0x3ffffff));
   375  	uint64_t x131 = (x129 + x106);
   376  	uint64_t x132 = (x131 >> 25);
   377  	uint32_t x133 = (uint32_t)(x131 & UINT32_C(0x1ffffff));
   378  	uint64_t x134 = (x132 + x105);
   379  	uint64_t x135 = (x134 >> 26);
   380  	uint32_t x136 = (uint32_t)(x134 & UINT32_C(0x3ffffff));
   381  	uint64_t x137 = (x135 + x104);
   382  	uint64_t x138 = (x137 >> 25);
   383  	uint32_t x139 = (uint32_t)(x137 & UINT32_C(0x1ffffff));
   384  	uint64_t x140 = (x138 * UINT8_C(0x13));
   385  	uint64_t x141 = (x103 + x140);
   386  	uint32_t x142 = (uint32_t)(x141 >> 26);
   387  	uint32_t x143 = (uint32_t)(x141 & UINT32_C(0x3ffffff));
   388  	uint32_t x144 = (x142 + x115);
   389  	uint8_t x145 = (uint8_t)(x144 >> 25);
   390  	uint32_t x146 = (x144 & UINT32_C(0x1ffffff));
   391  	uint32_t x147 = (x145 + x118);
   392  	out1[0] = x143;
   393  	out1[1] = x146;
   394  	out1[2] = x147;
   395  	out1[3] = x121;
   396  	out1[4] = x124;
   397  	out1[5] = x127;
   398  	out1[6] = x130;
   399  	out1[7] = x133;
   400  	out1[8] = x136;
   401  	out1[9] = x139;
   402  }
   403  
   404  /*
   405   * The function fiat_25519_carry_square squares a field element and reduces the result.
   406   * Postconditions:
   407   *   eval out1 mod m = (eval arg1 * eval arg1) mod m
   408   *
   409   * Input Bounds:
   410   *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   411   * Output Bounds:
   412   *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   413   */
   414  static void fiat_25519_carry_square(uint32_t out1[10], const uint32_t arg1[10])
   415  {
   416  	uint32_t x1 = ((arg1[9]) * UINT8_C(0x13));
   417  	uint32_t x2 = (x1 * 0x2);
   418  	uint32_t x3 = ((arg1[9]) * 0x2);
   419  	uint32_t x4 = ((arg1[8]) * UINT8_C(0x13));
   420  	uint64_t x5 = ((uint64_t)x4 * 0x2);
   421  	uint32_t x6 = ((arg1[8]) * 0x2);
   422  	uint32_t x7 = ((arg1[7]) * UINT8_C(0x13));
   423  	uint32_t x8 = (x7 * 0x2);
   424  	uint32_t x9 = ((arg1[7]) * 0x2);
   425  	uint32_t x10 = ((arg1[6]) * UINT8_C(0x13));
   426  	uint64_t x11 = ((uint64_t)x10 * 0x2);
   427  	uint32_t x12 = ((arg1[6]) * 0x2);
   428  	uint32_t x13 = ((arg1[5]) * UINT8_C(0x13));
   429  	uint32_t x14 = ((arg1[5]) * 0x2);
   430  	uint32_t x15 = ((arg1[4]) * 0x2);
   431  	uint32_t x16 = ((arg1[3]) * 0x2);
   432  	uint32_t x17 = ((arg1[2]) * 0x2);
   433  	uint32_t x18 = ((arg1[1]) * 0x2);
   434  	uint64_t x19 = ((uint64_t)(arg1[9]) * (x1 * 0x2));
   435  	uint64_t x20 = ((uint64_t)(arg1[8]) * x2);
   436  	uint64_t x21 = ((uint64_t)(arg1[8]) * x4);
   437  	uint64_t x22 = ((arg1[7]) * ((uint64_t)x2 * 0x2));
   438  	uint64_t x23 = ((arg1[7]) * x5);
   439  	uint64_t x24 = ((uint64_t)(arg1[7]) * (x7 * 0x2));
   440  	uint64_t x25 = ((uint64_t)(arg1[6]) * x2);
   441  	uint64_t x26 = ((arg1[6]) * x5);
   442  	uint64_t x27 = ((uint64_t)(arg1[6]) * x8);
   443  	uint64_t x28 = ((uint64_t)(arg1[6]) * x10);
   444  	uint64_t x29 = ((arg1[5]) * ((uint64_t)x2 * 0x2));
   445  	uint64_t x30 = ((arg1[5]) * x5);
   446  	uint64_t x31 = ((arg1[5]) * ((uint64_t)x8 * 0x2));
   447  	uint64_t x32 = ((arg1[5]) * x11);
   448  	uint64_t x33 = ((uint64_t)(arg1[5]) * (x13 * 0x2));
   449  	uint64_t x34 = ((uint64_t)(arg1[4]) * x2);
   450  	uint64_t x35 = ((arg1[4]) * x5);
   451  	uint64_t x36 = ((uint64_t)(arg1[4]) * x8);
   452  	uint64_t x37 = ((arg1[4]) * x11);
   453  	uint64_t x38 = ((uint64_t)(arg1[4]) * x14);
   454  	uint64_t x39 = ((uint64_t)(arg1[4]) * (arg1[4]));
   455  	uint64_t x40 = ((arg1[3]) * ((uint64_t)x2 * 0x2));
   456  	uint64_t x41 = ((arg1[3]) * x5);
   457  	uint64_t x42 = ((arg1[3]) * ((uint64_t)x8 * 0x2));
   458  	uint64_t x43 = ((uint64_t)(arg1[3]) * x12);
   459  	uint64_t x44 = ((uint64_t)(arg1[3]) * (x14 * 0x2));
   460  	uint64_t x45 = ((uint64_t)(arg1[3]) * x15);
   461  	uint64_t x46 = ((uint64_t)(arg1[3]) * ((arg1[3]) * 0x2));
   462  	uint64_t x47 = ((uint64_t)(arg1[2]) * x2);
   463  	uint64_t x48 = ((arg1[2]) * x5);
   464  	uint64_t x49 = ((uint64_t)(arg1[2]) * x9);
   465  	uint64_t x50 = ((uint64_t)(arg1[2]) * x12);
   466  	uint64_t x51 = ((uint64_t)(arg1[2]) * x14);
   467  	uint64_t x52 = ((uint64_t)(arg1[2]) * x15);
   468  	uint64_t x53 = ((uint64_t)(arg1[2]) * x16);
   469  	uint64_t x54 = ((uint64_t)(arg1[2]) * (arg1[2]));
   470  	uint64_t x55 = ((arg1[1]) * ((uint64_t)x2 * 0x2));
   471  	uint64_t x56 = ((uint64_t)(arg1[1]) * x6);
   472  	uint64_t x57 = ((uint64_t)(arg1[1]) * (x9 * 0x2));
   473  	uint64_t x58 = ((uint64_t)(arg1[1]) * x12);
   474  	uint64_t x59 = ((uint64_t)(arg1[1]) * (x14 * 0x2));
   475  	uint64_t x60 = ((uint64_t)(arg1[1]) * x15);
   476  	uint64_t x61 = ((uint64_t)(arg1[1]) * (x16 * 0x2));
   477  	uint64_t x62 = ((uint64_t)(arg1[1]) * x17);
   478  	uint64_t x63 = ((uint64_t)(arg1[1]) * ((arg1[1]) * 0x2));
   479  	uint64_t x64 = ((uint64_t)(arg1[0]) * x3);
   480  	uint64_t x65 = ((uint64_t)(arg1[0]) * x6);
   481  	uint64_t x66 = ((uint64_t)(arg1[0]) * x9);
   482  	uint64_t x67 = ((uint64_t)(arg1[0]) * x12);
   483  	uint64_t x68 = ((uint64_t)(arg1[0]) * x14);
   484  	uint64_t x69 = ((uint64_t)(arg1[0]) * x15);
   485  	uint64_t x70 = ((uint64_t)(arg1[0]) * x16);
   486  	uint64_t x71 = ((uint64_t)(arg1[0]) * x17);
   487  	uint64_t x72 = ((uint64_t)(arg1[0]) * x18);
   488  	uint64_t x73 = ((uint64_t)(arg1[0]) * (arg1[0]));
   489  	uint64_t x74 = (x73 + (x55 + (x48 + (x42 + (x37 + x33)))));
   490  	uint64_t x75 = (x74 >> 26);
   491  	uint32_t x76 = (uint32_t)(x74 & UINT32_C(0x3ffffff));
   492  	uint64_t x77 = (x64 + (x56 + (x49 + (x43 + x38))));
   493  	uint64_t x78 = (x65 + (x57 + (x50 + (x44 + (x39 + x19)))));
   494  	uint64_t x79 = (x66 + (x58 + (x51 + (x45 + x20))));
   495  	uint64_t x80 = (x67 + (x59 + (x52 + (x46 + (x22 + x21)))));
   496  	uint64_t x81 = (x68 + (x60 + (x53 + (x25 + x23))));
   497  	uint64_t x82 = (x69 + (x61 + (x54 + (x29 + (x26 + x24)))));
   498  	uint64_t x83 = (x70 + (x62 + (x34 + (x30 + x27))));
   499  	uint64_t x84 = (x71 + (x63 + (x40 + (x35 + (x31 + x28)))));
   500  	uint64_t x85 = (x72 + (x47 + (x41 + (x36 + x32))));
   501  	uint64_t x86 = (x75 + x85);
   502  	uint64_t x87 = (x86 >> 25);
   503  	uint32_t x88 = (uint32_t)(x86 & UINT32_C(0x1ffffff));
   504  	uint64_t x89 = (x87 + x84);
   505  	uint64_t x90 = (x89 >> 26);
   506  	uint32_t x91 = (uint32_t)(x89 & UINT32_C(0x3ffffff));
   507  	uint64_t x92 = (x90 + x83);
   508  	uint64_t x93 = (x92 >> 25);
   509  	uint32_t x94 = (uint32_t)(x92 & UINT32_C(0x1ffffff));
   510  	uint64_t x95 = (x93 + x82);
   511  	uint64_t x96 = (x95 >> 26);
   512  	uint32_t x97 = (uint32_t)(x95 & UINT32_C(0x3ffffff));
   513  	uint64_t x98 = (x96 + x81);
   514  	uint64_t x99 = (x98 >> 25);
   515  	uint32_t x100 = (uint32_t)(x98 & UINT32_C(0x1ffffff));
   516  	uint64_t x101 = (x99 + x80);
   517  	uint64_t x102 = (x101 >> 26);
   518  	uint32_t x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff));
   519  	uint64_t x104 = (x102 + x79);
   520  	uint64_t x105 = (x104 >> 25);
   521  	uint32_t x106 = (uint32_t)(x104 & UINT32_C(0x1ffffff));
   522  	uint64_t x107 = (x105 + x78);
   523  	uint64_t x108 = (x107 >> 26);
   524  	uint32_t x109 = (uint32_t)(x107 & UINT32_C(0x3ffffff));
   525  	uint64_t x110 = (x108 + x77);
   526  	uint64_t x111 = (x110 >> 25);
   527  	uint32_t x112 = (uint32_t)(x110 & UINT32_C(0x1ffffff));
   528  	uint64_t x113 = (x111 * UINT8_C(0x13));
   529  	uint64_t x114 = (x76 + x113);
   530  	uint32_t x115 = (uint32_t)(x114 >> 26);
   531  	uint32_t x116 = (uint32_t)(x114 & UINT32_C(0x3ffffff));
   532  	uint32_t x117 = (x115 + x88);
   533  	uint8_t x118 = (uint8_t)(x117 >> 25);
   534  	uint32_t x119 = (x117 & UINT32_C(0x1ffffff));
   535  	uint32_t x120 = (x118 + x91);
   536  	out1[0] = x116;
   537  	out1[1] = x119;
   538  	out1[2] = x120;
   539  	out1[3] = x94;
   540  	out1[4] = x97;
   541  	out1[5] = x100;
   542  	out1[6] = x103;
   543  	out1[7] = x106;
   544  	out1[8] = x109;
   545  	out1[9] = x112;
   546  }
   547  
   548  /*
   549   * The function fiat_25519_carry reduces a field element.
   550   * Postconditions:
   551   *   eval out1 mod m = eval arg1 mod m
   552   *
   553   * Input Bounds:
   554   *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   555   * Output Bounds:
   556   *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   557   */
   558  static void fiat_25519_carry(uint32_t out1[10], const uint32_t arg1[10])
   559  {
   560  	uint32_t x1 = (arg1[0]);
   561  	uint32_t x2 = ((x1 >> 26) + (arg1[1]));
   562  	uint32_t x3 = ((x2 >> 25) + (arg1[2]));
   563  	uint32_t x4 = ((x3 >> 26) + (arg1[3]));
   564  	uint32_t x5 = ((x4 >> 25) + (arg1[4]));
   565  	uint32_t x6 = ((x5 >> 26) + (arg1[5]));
   566  	uint32_t x7 = ((x6 >> 25) + (arg1[6]));
   567  	uint32_t x8 = ((x7 >> 26) + (arg1[7]));
   568  	uint32_t x9 = ((x8 >> 25) + (arg1[8]));
   569  	uint32_t x10 = ((x9 >> 26) + (arg1[9]));
   570  	uint32_t x11 =
   571  		((x1 & UINT32_C(0x3ffffff)) + ((x10 >> 25) * UINT8_C(0x13)));
   572  	uint32_t x12 = ((uint8_t)(x11 >> 26) + (x2 & UINT32_C(0x1ffffff)));
   573  	uint32_t x13 = (x11 & UINT32_C(0x3ffffff));
   574  	uint32_t x14 = (x12 & UINT32_C(0x1ffffff));
   575  	uint32_t x15 = ((uint8_t)(x12 >> 25) + (x3 & UINT32_C(0x3ffffff)));
   576  	uint32_t x16 = (x4 & UINT32_C(0x1ffffff));
   577  	uint32_t x17 = (x5 & UINT32_C(0x3ffffff));
   578  	uint32_t x18 = (x6 & UINT32_C(0x1ffffff));
   579  	uint32_t x19 = (x7 & UINT32_C(0x3ffffff));
   580  	uint32_t x20 = (x8 & UINT32_C(0x1ffffff));
   581  	uint32_t x21 = (x9 & UINT32_C(0x3ffffff));
   582  	uint32_t x22 = (x10 & UINT32_C(0x1ffffff));
   583  	out1[0] = x13;
   584  	out1[1] = x14;
   585  	out1[2] = x15;
   586  	out1[3] = x16;
   587  	out1[4] = x17;
   588  	out1[5] = x18;
   589  	out1[6] = x19;
   590  	out1[7] = x20;
   591  	out1[8] = x21;
   592  	out1[9] = x22;
   593  }
   594  
   595  /*
   596   * The function fiat_25519_add adds two field elements.
   597   * Postconditions:
   598   *   eval out1 mod m = (eval arg1 + eval arg2) mod m
   599   *
   600   * Input Bounds:
   601   *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   602   *   arg2: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   603   * Output Bounds:
   604   *   out1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   605   */
   606  static void fiat_25519_add(uint32_t out1[10], const uint32_t arg1[10],
   607  			   const uint32_t arg2[10])
   608  {
   609  	uint32_t x1 = ((arg1[0]) + (arg2[0]));
   610  	uint32_t x2 = ((arg1[1]) + (arg2[1]));
   611  	uint32_t x3 = ((arg1[2]) + (arg2[2]));
   612  	uint32_t x4 = ((arg1[3]) + (arg2[3]));
   613  	uint32_t x5 = ((arg1[4]) + (arg2[4]));
   614  	uint32_t x6 = ((arg1[5]) + (arg2[5]));
   615  	uint32_t x7 = ((arg1[6]) + (arg2[6]));
   616  	uint32_t x8 = ((arg1[7]) + (arg2[7]));
   617  	uint32_t x9 = ((arg1[8]) + (arg2[8]));
   618  	uint32_t x10 = ((arg1[9]) + (arg2[9]));
   619  	out1[0] = x1;
   620  	out1[1] = x2;
   621  	out1[2] = x3;
   622  	out1[3] = x4;
   623  	out1[4] = x5;
   624  	out1[5] = x6;
   625  	out1[6] = x7;
   626  	out1[7] = x8;
   627  	out1[8] = x9;
   628  	out1[9] = x10;
   629  }
   630  
   631  /*
   632   * The function fiat_25519_sub subtracts two field elements.
   633   * Postconditions:
   634   *   eval out1 mod m = (eval arg1 - eval arg2) mod m
   635   *
   636   * Input Bounds:
   637   *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   638   *   arg2: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   639   * Output Bounds:
   640   *   out1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   641   */
   642  static void fiat_25519_sub(uint32_t out1[10], const uint32_t arg1[10],
   643  			   const uint32_t arg2[10])
   644  {
   645  	uint32_t x1 = ((UINT32_C(0x7ffffda) + (arg1[0])) - (arg2[0]));
   646  	uint32_t x2 = ((UINT32_C(0x3fffffe) + (arg1[1])) - (arg2[1]));
   647  	uint32_t x3 = ((UINT32_C(0x7fffffe) + (arg1[2])) - (arg2[2]));
   648  	uint32_t x4 = ((UINT32_C(0x3fffffe) + (arg1[3])) - (arg2[3]));
   649  	uint32_t x5 = ((UINT32_C(0x7fffffe) + (arg1[4])) - (arg2[4]));
   650  	uint32_t x6 = ((UINT32_C(0x3fffffe) + (arg1[5])) - (arg2[5]));
   651  	uint32_t x7 = ((UINT32_C(0x7fffffe) + (arg1[6])) - (arg2[6]));
   652  	uint32_t x8 = ((UINT32_C(0x3fffffe) + (arg1[7])) - (arg2[7]));
   653  	uint32_t x9 = ((UINT32_C(0x7fffffe) + (arg1[8])) - (arg2[8]));
   654  	uint32_t x10 = ((UINT32_C(0x3fffffe) + (arg1[9])) - (arg2[9]));
   655  	out1[0] = x1;
   656  	out1[1] = x2;
   657  	out1[2] = x3;
   658  	out1[3] = x4;
   659  	out1[4] = x5;
   660  	out1[5] = x6;
   661  	out1[6] = x7;
   662  	out1[7] = x8;
   663  	out1[8] = x9;
   664  	out1[9] = x10;
   665  }
   666  
   667  /*
   668   * The function fiat_25519_opp negates a field element.
   669   * Postconditions:
   670   *   eval out1 mod m = -eval arg1 mod m
   671   *
   672   * Input Bounds:
   673   *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   674   * Output Bounds:
   675   *   out1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
   676   */
   677  static void fiat_25519_opp(uint32_t out1[10], const uint32_t arg1[10])
   678  {
   679  	uint32_t x1 = (UINT32_C(0x7ffffda) - (arg1[0]));
   680  	uint32_t x2 = (UINT32_C(0x3fffffe) - (arg1[1]));
   681  	uint32_t x3 = (UINT32_C(0x7fffffe) - (arg1[2]));
   682  	uint32_t x4 = (UINT32_C(0x3fffffe) - (arg1[3]));
   683  	uint32_t x5 = (UINT32_C(0x7fffffe) - (arg1[4]));
   684  	uint32_t x6 = (UINT32_C(0x3fffffe) - (arg1[5]));
   685  	uint32_t x7 = (UINT32_C(0x7fffffe) - (arg1[6]));
   686  	uint32_t x8 = (UINT32_C(0x3fffffe) - (arg1[7]));
   687  	uint32_t x9 = (UINT32_C(0x7fffffe) - (arg1[8]));
   688  	uint32_t x10 = (UINT32_C(0x3fffffe) - (arg1[9]));
   689  	out1[0] = x1;
   690  	out1[1] = x2;
   691  	out1[2] = x3;
   692  	out1[3] = x4;
   693  	out1[4] = x5;
   694  	out1[5] = x6;
   695  	out1[6] = x7;
   696  	out1[7] = x8;
   697  	out1[8] = x9;
   698  	out1[9] = x10;
   699  }
   700  
   701  /*
   702   * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order.
   703   * Postconditions:
   704   *   out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31]
   705   *
   706   * Input Bounds:
   707   *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   708   * Output Bounds:
   709   *   out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]]
   710   */
static void fiat_25519_to_bytes(uint8_t out1[32], const uint32_t arg1[10])
{
	/*
	 * Stage 1: borrow chain over the ten limbs, subtracting the constants
	 * 0x3ffffed (2^26 - 19) from limb 0 and then 0x1ffffff (2^25 - 1) /
	 * 0x3ffffff (2^26 - 1) alternately, i.e. the limbs of 2^255 - 19.
	 * The subborrowx helpers are defined elsewhere in this file;
	 * presumably each yields the 26- or 25-bit difference plus an outgoing
	 * borrow that feeds the next call -- confirm against their definitions.
	 */
	uint32_t x1;
	uint8_t x2;
	fiat_25519_subborrowx_u26(&x1, &x2, 0x0, (arg1[0]),
				  UINT32_C(0x3ffffed));
	uint32_t x3;
	uint8_t x4;
	fiat_25519_subborrowx_u25(&x3, &x4, x2, (arg1[1]), UINT32_C(0x1ffffff));
	uint32_t x5;
	uint8_t x6;
	fiat_25519_subborrowx_u26(&x5, &x6, x4, (arg1[2]), UINT32_C(0x3ffffff));
	uint32_t x7;
	uint8_t x8;
	fiat_25519_subborrowx_u25(&x7, &x8, x6, (arg1[3]), UINT32_C(0x1ffffff));
	uint32_t x9;
	uint8_t x10;
	fiat_25519_subborrowx_u26(&x9, &x10, x8, (arg1[4]),
				  UINT32_C(0x3ffffff));
	uint32_t x11;
	uint8_t x12;
	fiat_25519_subborrowx_u25(&x11, &x12, x10, (arg1[5]),
				  UINT32_C(0x1ffffff));
	uint32_t x13;
	uint8_t x14;
	fiat_25519_subborrowx_u26(&x13, &x14, x12, (arg1[6]),
				  UINT32_C(0x3ffffff));
	uint32_t x15;
	uint8_t x16;
	fiat_25519_subborrowx_u25(&x15, &x16, x14, (arg1[7]),
				  UINT32_C(0x1ffffff));
	uint32_t x17;
	uint8_t x18;
	fiat_25519_subborrowx_u26(&x17, &x18, x16, (arg1[8]),
				  UINT32_C(0x3ffffff));
	uint32_t x19;
	uint8_t x20;
	fiat_25519_subborrowx_u25(&x19, &x20, x18, (arg1[9]),
				  UINT32_C(0x1ffffff));
	/*
	 * Stage 2: x21 is a mask chosen from the final borrow x20 (presumably
	 * all-ones when x20 is nonzero and zero otherwise -- confirm against
	 * fiat_25519_cmovznz_u32). The same constants, ANDed with that mask,
	 * are added back through a carry chain, so the subtraction is undone
	 * exactly when it underflowed, without branching on the value.
	 */
	uint32_t x21;
	fiat_25519_cmovznz_u32(&x21, x20, 0x0, UINT32_C(0xffffffff));
	uint32_t x22;
	uint8_t x23;
	fiat_25519_addcarryx_u26(&x22, &x23, 0x0, x1,
				 (x21 & UINT32_C(0x3ffffed)));
	uint32_t x24;
	uint8_t x25;
	fiat_25519_addcarryx_u25(&x24, &x25, x23, x3,
				 (x21 & UINT32_C(0x1ffffff)));
	uint32_t x26;
	uint8_t x27;
	fiat_25519_addcarryx_u26(&x26, &x27, x25, x5,
				 (x21 & UINT32_C(0x3ffffff)));
	uint32_t x28;
	uint8_t x29;
	fiat_25519_addcarryx_u25(&x28, &x29, x27, x7,
				 (x21 & UINT32_C(0x1ffffff)));
	uint32_t x30;
	uint8_t x31;
	fiat_25519_addcarryx_u26(&x30, &x31, x29, x9,
				 (x21 & UINT32_C(0x3ffffff)));
	uint32_t x32;
	uint8_t x33;
	fiat_25519_addcarryx_u25(&x32, &x33, x31, x11,
				 (x21 & UINT32_C(0x1ffffff)));
	uint32_t x34;
	uint8_t x35;
	fiat_25519_addcarryx_u26(&x34, &x35, x33, x13,
				 (x21 & UINT32_C(0x3ffffff)));
	uint32_t x36;
	uint8_t x37;
	fiat_25519_addcarryx_u25(&x36, &x37, x35, x15,
				 (x21 & UINT32_C(0x1ffffff)));
	uint32_t x38;
	uint8_t x39;
	fiat_25519_addcarryx_u26(&x38, &x39, x37, x17,
				 (x21 & UINT32_C(0x3ffffff)));
	uint32_t x40;
	uint8_t x41;
	fiat_25519_addcarryx_u25(&x40, &x41, x39, x19,
				 (x21 & UINT32_C(0x1ffffff)));
	/*
	 * Stage 3: pack the limbs into bytes. The limbs start at bit offsets
	 * 0, 26, 51, 77, 102, 128, 153, 179, 204 and 230; each shift below is
	 * that offset modulo 8, which lines the limb up with the byte it
	 * starts in (limbs 0 and 5 start on a byte boundary and need none).
	 */
	uint32_t x42 = (x40 << 6);
	uint32_t x43 = (x38 << 4);
	uint32_t x44 = (x36 << 3);
	uint32_t x45 = (x34 * (uint32_t)0x2);
	uint32_t x46 = (x30 << 6);
	uint32_t x47 = (x28 << 5);
	uint32_t x48 = (x26 << 3);
	uint32_t x49 = (x24 << 2);
	/*
	 * Low half (limbs 0..4 -> bytes 0..15): peel off three whole bytes,
	 * then add the leftover high bits to the next pre-shifted limb.
	 */
	uint32_t x50 = (x22 >> 8);
	uint8_t x51 = (uint8_t)(x22 & UINT8_C(0xff));
	uint32_t x52 = (x50 >> 8);
	uint8_t x53 = (uint8_t)(x50 & UINT8_C(0xff));
	uint8_t x54 = (uint8_t)(x52 >> 8);
	uint8_t x55 = (uint8_t)(x52 & UINT8_C(0xff));
	uint32_t x56 = (x54 + x49);
	uint32_t x57 = (x56 >> 8);
	uint8_t x58 = (uint8_t)(x56 & UINT8_C(0xff));
	uint32_t x59 = (x57 >> 8);
	uint8_t x60 = (uint8_t)(x57 & UINT8_C(0xff));
	uint8_t x61 = (uint8_t)(x59 >> 8);
	uint8_t x62 = (uint8_t)(x59 & UINT8_C(0xff));
	uint32_t x63 = (x61 + x48);
	uint32_t x64 = (x63 >> 8);
	uint8_t x65 = (uint8_t)(x63 & UINT8_C(0xff));
	uint32_t x66 = (x64 >> 8);
	uint8_t x67 = (uint8_t)(x64 & UINT8_C(0xff));
	uint8_t x68 = (uint8_t)(x66 >> 8);
	uint8_t x69 = (uint8_t)(x66 & UINT8_C(0xff));
	uint32_t x70 = (x68 + x47);
	uint32_t x71 = (x70 >> 8);
	uint8_t x72 = (uint8_t)(x70 & UINT8_C(0xff));
	uint32_t x73 = (x71 >> 8);
	uint8_t x74 = (uint8_t)(x71 & UINT8_C(0xff));
	uint8_t x75 = (uint8_t)(x73 >> 8);
	uint8_t x76 = (uint8_t)(x73 & UINT8_C(0xff));
	uint32_t x77 = (x75 + x46);
	uint32_t x78 = (x77 >> 8);
	uint8_t x79 = (uint8_t)(x77 & UINT8_C(0xff));
	uint32_t x80 = (x78 >> 8);
	uint8_t x81 = (uint8_t)(x78 & UINT8_C(0xff));
	uint8_t x82 = (uint8_t)(x80 >> 8);
	uint8_t x83 = (uint8_t)(x80 & UINT8_C(0xff));
	uint8_t x84 = (uint8_t)(x82 & UINT8_C(0xff));
	/*
	 * High half (limbs 5..9 -> bytes 16..31): limb 5 starts exactly at
	 * bit 128, so this chain restarts with no bits carried in from x84.
	 */
	uint32_t x85 = (x32 >> 8);
	uint8_t x86 = (uint8_t)(x32 & UINT8_C(0xff));
	uint32_t x87 = (x85 >> 8);
	uint8_t x88 = (uint8_t)(x85 & UINT8_C(0xff));
	uint8_t x89 = (uint8_t)(x87 >> 8);
	uint8_t x90 = (uint8_t)(x87 & UINT8_C(0xff));
	uint32_t x91 = (x89 + x45);
	uint32_t x92 = (x91 >> 8);
	uint8_t x93 = (uint8_t)(x91 & UINT8_C(0xff));
	uint32_t x94 = (x92 >> 8);
	uint8_t x95 = (uint8_t)(x92 & UINT8_C(0xff));
	uint8_t x96 = (uint8_t)(x94 >> 8);
	uint8_t x97 = (uint8_t)(x94 & UINT8_C(0xff));
	uint32_t x98 = (x96 + x44);
	uint32_t x99 = (x98 >> 8);
	uint8_t x100 = (uint8_t)(x98 & UINT8_C(0xff));
	uint32_t x101 = (x99 >> 8);
	uint8_t x102 = (uint8_t)(x99 & UINT8_C(0xff));
	uint8_t x103 = (uint8_t)(x101 >> 8);
	uint8_t x104 = (uint8_t)(x101 & UINT8_C(0xff));
	uint32_t x105 = (x103 + x43);
	uint32_t x106 = (x105 >> 8);
	uint8_t x107 = (uint8_t)(x105 & UINT8_C(0xff));
	uint32_t x108 = (x106 >> 8);
	uint8_t x109 = (uint8_t)(x106 & UINT8_C(0xff));
	uint8_t x110 = (uint8_t)(x108 >> 8);
	uint8_t x111 = (uint8_t)(x108 & UINT8_C(0xff));
	uint32_t x112 = (x110 + x42);
	uint32_t x113 = (x112 >> 8);
	uint8_t x114 = (uint8_t)(x112 & UINT8_C(0xff));
	uint32_t x115 = (x113 >> 8);
	uint8_t x116 = (uint8_t)(x113 & UINT8_C(0xff));
	uint8_t x117 = (uint8_t)(x115 >> 8);
	uint8_t x118 = (uint8_t)(x115 & UINT8_C(0xff));
	/* Emit the 32 bytes, least significant first. */
	out1[0] = x51;
	out1[1] = x53;
	out1[2] = x55;
	out1[3] = x58;
	out1[4] = x60;
	out1[5] = x62;
	out1[6] = x65;
	out1[7] = x67;
	out1[8] = x69;
	out1[9] = x72;
	out1[10] = x74;
	out1[11] = x76;
	out1[12] = x79;
	out1[13] = x81;
	out1[14] = x83;
	out1[15] = x84;
	out1[16] = x86;
	out1[17] = x88;
	out1[18] = x90;
	out1[19] = x93;
	out1[20] = x95;
	out1[21] = x97;
	out1[22] = x100;
	out1[23] = x102;
	out1[24] = x104;
	out1[25] = x107;
	out1[26] = x109;
	out1[27] = x111;
	out1[28] = x114;
	out1[29] = x116;
	out1[30] = x118;
	out1[31] = x117;
}
   902  
   903  /*
   904   * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order.
   905   * Postconditions:
   906   *   eval out1 mod m = bytes_eval arg1 mod m
   907   *
   908   * Input Bounds:
   909   *   arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]]
   910   * Output Bounds:
   911   *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
   912   */
   913  static void fiat_25519_from_bytes(uint32_t out1[10], const uint8_t arg1[32])
   914  {
   915  	uint32_t x1 = ((uint32_t)(arg1[31]) << 18);
   916  	uint32_t x2 = ((uint32_t)(arg1[30]) << 10);
   917  	uint32_t x3 = ((uint32_t)(arg1[29]) << 2);
   918  	uint32_t x4 = ((uint32_t)(arg1[28]) << 20);
   919  	uint32_t x5 = ((uint32_t)(arg1[27]) << 12);
   920  	uint32_t x6 = ((uint32_t)(arg1[26]) << 4);
   921  	uint32_t x7 = ((uint32_t)(arg1[25]) << 21);
   922  	uint32_t x8 = ((uint32_t)(arg1[24]) << 13);
   923  	uint32_t x9 = ((uint32_t)(arg1[23]) << 5);
   924  	uint32_t x10 = ((uint32_t)(arg1[22]) << 23);
   925  	uint32_t x11 = ((uint32_t)(arg1[21]) << 15);
   926  	uint32_t x12 = ((uint32_t)(arg1[20]) << 7);
   927  	uint32_t x13 = ((uint32_t)(arg1[19]) << 24);
   928  	uint32_t x14 = ((uint32_t)(arg1[18]) << 16);
   929  	uint32_t x15 = ((uint32_t)(arg1[17]) << 8);
   930  	uint8_t x16 = (arg1[16]);
   931  	uint32_t x17 = ((uint32_t)(arg1[15]) << 18);
   932  	uint32_t x18 = ((uint32_t)(arg1[14]) << 10);
   933  	uint32_t x19 = ((uint32_t)(arg1[13]) << 2);
   934  	uint32_t x20 = ((uint32_t)(arg1[12]) << 19);
   935  	uint32_t x21 = ((uint32_t)(arg1[11]) << 11);
   936  	uint32_t x22 = ((uint32_t)(arg1[10]) << 3);
   937  	uint32_t x23 = ((uint32_t)(arg1[9]) << 21);
   938  	uint32_t x24 = ((uint32_t)(arg1[8]) << 13);
   939  	uint32_t x25 = ((uint32_t)(arg1[7]) << 5);
   940  	uint32_t x26 = ((uint32_t)(arg1[6]) << 22);
   941  	uint32_t x27 = ((uint32_t)(arg1[5]) << 14);
   942  	uint32_t x28 = ((uint32_t)(arg1[4]) << 6);
   943  	uint32_t x29 = ((uint32_t)(arg1[3]) << 24);
   944  	uint32_t x30 = ((uint32_t)(arg1[2]) << 16);
   945  	uint32_t x31 = ((uint32_t)(arg1[1]) << 8);
   946  	uint8_t x32 = (arg1[0]);
   947  	uint32_t x33 = (x32 + (x31 + (x30 + x29)));
   948  	uint8_t x34 = (uint8_t)(x33 >> 26);
   949  	uint32_t x35 = (x33 & UINT32_C(0x3ffffff));
   950  	uint32_t x36 = (x3 + (x2 + x1));
   951  	uint32_t x37 = (x6 + (x5 + x4));
   952  	uint32_t x38 = (x9 + (x8 + x7));
   953  	uint32_t x39 = (x12 + (x11 + x10));
   954  	uint32_t x40 = (x16 + (x15 + (x14 + x13)));
   955  	uint32_t x41 = (x19 + (x18 + x17));
   956  	uint32_t x42 = (x22 + (x21 + x20));
   957  	uint32_t x43 = (x25 + (x24 + x23));
   958  	uint32_t x44 = (x28 + (x27 + x26));
   959  	uint32_t x45 = (x34 + x44);
   960  	uint8_t x46 = (uint8_t)(x45 >> 25);
   961  	uint32_t x47 = (x45 & UINT32_C(0x1ffffff));
   962  	uint32_t x48 = (x46 + x43);
   963  	uint8_t x49 = (uint8_t)(x48 >> 26);
   964  	uint32_t x50 = (x48 & UINT32_C(0x3ffffff));
   965  	uint32_t x51 = (x49 + x42);
   966  	uint8_t x52 = (uint8_t)(x51 >> 25);
   967  	uint32_t x53 = (x51 & UINT32_C(0x1ffffff));
   968  	uint32_t x54 = (x52 + x41);
   969  	uint32_t x55 = (x54 & UINT32_C(0x3ffffff));
   970  	uint8_t x56 = (uint8_t)(x40 >> 25);
   971  	uint32_t x57 = (x40 & UINT32_C(0x1ffffff));
   972  	uint32_t x58 = (x56 + x39);
   973  	uint8_t x59 = (uint8_t)(x58 >> 26);
   974  	uint32_t x60 = (x58 & UINT32_C(0x3ffffff));
   975  	uint32_t x61 = (x59 + x38);
   976  	uint8_t x62 = (uint8_t)(x61 >> 25);
   977  	uint32_t x63 = (x61 & UINT32_C(0x1ffffff));
   978  	uint32_t x64 = (x62 + x37);
   979  	uint8_t x65 = (uint8_t)(x64 >> 26);
   980  	uint32_t x66 = (x64 & UINT32_C(0x3ffffff));
   981  	uint32_t x67 = (x65 + x36);
   982  	out1[0] = x35;
   983  	out1[1] = x47;
   984  	out1[2] = x50;
   985  	out1[3] = x53;
   986  	out1[4] = x55;
   987  	out1[5] = x57;
   988  	out1[6] = x60;
   989  	out1[7] = x63;
   990  	out1[8] = x66;
   991  	out1[9] = x67;
   992  }
   993  
   994  // Definitions
   995  
   996  // fe means field element. Here the field is \Z/(2^255-19). An element t,
   997  // entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
   998  // t[3]+2^102 t[4]+...+2^230 t[9].
   999  // fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
  1000  // Multiplication and carrying produce fe from fe_loose.
typedef struct fe {
	// Ten limbs, least significant first, in the alternating 26/25-bit
	// radix (weights 1, 2^26, 2^51, 2^77, ..., 2^230).
	uint32_t v[10];
} fe;
  1004  
  1005  // fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc.
  1006  // Addition and subtraction produce fe_loose from (fe, fe).
typedef struct fe_loose {
	// Same ten-limb layout as |fe|; only the permitted limb bounds are wider.
	uint32_t v[10];
} fe_loose;
  1010  
  1011  // ge means group element.
  1012  //
  1013  // Here the group is the set of pairs (x,y) of field elements (see fe.h)
  1014  // satisfying -x^2 + y^2 = 1 + d x^2y^2
  1015  // where d = -121665/121666.
  1016  //
  1017  // Representations:
  1018  //   ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z
  1019  //   ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT
  1020  //   ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T
  1021  //   ge_precomp (Duif): (y+x,y-x,2dxy)
  1022  
// Projective point (X:Y:Z) with x = X/Z and y = Y/Z.
typedef struct {
	fe X;
	fe Y;
	fe Z;
} ge_p2;
  1028  
// Extended point (X:Y:Z:T) with x = X/Z, y = Y/Z and XY = ZT.
typedef struct {
	fe X;
	fe Y;
	fe Z;
	fe T;
} ge_p3;
  1035  
// Completed point ((X:Z),(Y:T)) with x = X/Z and y = Y/T. All four
// coordinates are loose, as they are produced directly by fe_add/fe_sub.
typedef struct {
	fe_loose X;
	fe_loose Y;
	fe_loose Z;
	fe_loose T;
} ge_p1p1;
  1042  
// Precomputed affine point stored as (y+x, y-x, 2dxy).
typedef struct {
	fe_loose yplusx; // y + x
	fe_loose yminusx; // y - x
	fe_loose xy2d; // 2*d*x*y
} ge_precomp;
  1048  
// Cached form of an extended point, as filled in by x25519_ge_p3_to_cached
// and consumed by x25519_ge_add / x25519_ge_sub.
typedef struct {
	fe_loose YplusX; // Y + X
	fe_loose YminusX; // Y - X
	fe_loose Z; // copy of Z
	fe_loose T2d; // T multiplied by the constant d2
} ge_cached;
  1055  
  1056  // Constants.
  1057  
// The curve constant d of the group equation (d = -121665/121666), in fe
// limb form. x25519_ge_frombytes_vartime multiplies y^2 by it to form dy^2.
static const fe d = { { 56195235, 13857412, 51736253, 6949390, 114729, 24766616,
			60832955, 30306712, 48412415, 21499315 } };
  1060  
// NOTE(review): by its name, a square root of -1 in the field -- confirm the
// value against a reference. x25519_ge_frombytes_vartime multiplies the
// candidate x by it when v*x^2 equals -u rather than u.
static const fe sqrtm1 = { { 34513072, 25610706, 9377949, 3500415, 12389472,
			     33281959, 41962654, 31548777, 326685, 11406482 } };
  1063  
// NOTE(review): by its name, 2*d reduced modulo the field prime -- confirm
// against a reference. x25519_ge_p3_to_cached multiplies T by it to form T2d.
static const fe d2 = { { 45281625, 27714825, 36363642, 13898781, 229458,
			 15978800, 54557047, 27058993, 29715967, 9444199 } };
  1066  
// Bi[i] = (2*i+1)*B
// That is, the eight odd multiples B, 3B, ..., 15B of the base point B, each
// stored as a ge_precomp triple in the order { yplusx, yminusx, xy2d }.
static const ge_precomp Bi[8] = {
	{
		{ { 25967493, 19198397, 29566455, 3660896, 54414519, 4014786,
		    27544626, 21800161, 61029707, 2047604

		} },
		{ { 54563134, 934261, 64385954, 3049989, 66381436, 9406985,
		    12720692, 5043384, 19500929, 18085054

		} },
		{ { 58370664, 4489569, 9688441, 18769238, 10184608, 21191052,
		    29287918, 11864899, 42594502, 29115885 } },
	},
	{
		{ { 15636272, 23865875, 24204772, 25642034, 616976, 16869170,
		    27787599, 18782243, 28944399, 32004408 } },
		{ { 16568933, 4717097, 55552716, 32452109, 15682895, 21747389,
		    16354576, 21778470, 7689661, 11199574 } },
		{ { 30464137, 27578307, 55329429, 17883566, 23220364, 15915852,
		    7512774, 10017326, 49359771, 23634074 } },
	},
	{
		{ { 10861363, 11473154, 27284546, 1981175, 37044515, 12577860,
		    32867885, 14515107, 51670560, 10819379 } },
		{ { 4708026, 6336745, 20377586, 9066809, 55836755, 6594695,
		    41455196, 12483687, 54440373, 5581305 } },
		{ { 19563141, 16186464, 37722007, 4097518, 10237984, 29206317,
		    28542349, 13850243, 43430843, 17738489 } },
	},
	{
		{ { 5153727, 9909285, 1723747, 30776558, 30523604, 5516873,
		    19480852, 5230134, 43156425, 18378665 } },
		{ { 36839857, 30090922, 7665485, 10083793, 28475525, 1649722,
		    20654025, 16520125, 30598449, 7715701 } },
		{ { 28881826, 14381568, 9657904, 3680757, 46927229, 7843315,
		    35708204, 1370707, 29794553, 32145132 } },
	},
	{
		{ { 44589871, 26862249, 14201701, 24808930, 43598457, 8844725,
		    18474211, 32192982, 54046167, 13821876 } },
		{ { 60653668, 25714560, 3374701, 28813570, 40010246, 22982724,
		    31655027, 26342105, 18853321, 19333481 } },
		{ { 4566811, 20590564, 38133974, 21313742, 59506191, 30723862,
		    58594505, 23123294, 2207752, 30344648 } },
	},
	{
		{ { 41954014, 29368610, 29681143, 7868801, 60254203, 24130566,
		    54671499, 32891431, 35997400, 17421995 } },
		{ { 25576264, 30851218, 7349803, 21739588, 16472781, 9300885,
		    3844789, 15725684, 171356, 6466918 } },
		{ { 23103977, 13316479, 9739013, 17404951, 817874, 18515490,
		    8965338, 19466374, 36393951, 16193876 } },
	},
	{
		{ { 33587053, 3180712, 64714734, 14003686, 50205390, 17283591,
		    17238397, 4729455, 49034351, 9256799 } },
		{ { 41926547, 29380300, 32336397, 5036987, 45872047, 11360616,
		    22616405, 9761698, 47281666, 630304 } },
		{ { 53388152, 2639452, 42871404, 26147950, 9494426, 27780403,
		    60554312, 17593437, 64659607, 19263131 } },
	},
	{
		{ { 63957664, 28508356, 9282713, 6866145, 35201802, 32691408,
		    48168288, 15033783, 25105118, 25659556 } },
		{ { 42782475, 15950225, 35307649, 18961608, 55446126, 28463506,
		    1573891, 30928545, 2198789, 17749813 } },
		{ { 64009494, 10324966, 64867251, 7453182, 61661885, 30818928,
		    53296841, 17317989, 34647629, 21263748 } },
	},
};
  1138  
  1139  static void fe_frombytes_strict(fe *h, const uint8_t s[32])
  1140  {
  1141  	// |fiat_25519_from_bytes| requires the top-most bit be clear.
  1142  	fiat_25519_from_bytes(h->v, s);
  1143  }
  1144  
  1145  static void fe_frombytes(fe *h, const uint8_t s[32])
  1146  {
  1147  	uint8_t s_copy[32];
  1148  	memcpy(s_copy, s, 32);
  1149  	s_copy[31] &= 0x7f;
  1150  	fe_frombytes_strict(h, s_copy);
  1151  }
  1152  
  1153  static void fe_tobytes(uint8_t s[32], const fe *f)
  1154  {
  1155  	fiat_25519_to_bytes(s, f->v);
  1156  }
  1157  
  1158  // h = 0
  1159  static void fe_0(fe *h)
  1160  {
  1161  	memset(h, 0, sizeof(fe));
  1162  }
  1163  
  1164  // h = 1
  1165  static void fe_1(fe *h)
  1166  {
  1167  	memset(h, 0, sizeof(fe));
  1168  	h->v[0] = 1;
  1169  }
  1170  
  1171  // h = f + g
  1172  // Can overlap h with f or g.
  1173  static void fe_add(fe_loose *h, const fe *f, const fe *g)
  1174  {
  1175  	fiat_25519_add(h->v, f->v, g->v);
  1176  }
  1177  
  1178  // h = f - g
  1179  // Can overlap h with f or g.
  1180  static void fe_sub(fe_loose *h, const fe *f, const fe *g)
  1181  {
  1182  	fiat_25519_sub(h->v, f->v, g->v);
  1183  }
  1184  
  1185  static void fe_carry(fe *h, const fe_loose *f)
  1186  {
  1187  	fiat_25519_carry(h->v, f->v);
  1188  }
  1189  
  1190  static void fe_mul_impl(uint32_t out[10], const uint32_t in1[10],
  1191  			const uint32_t in2[10])
  1192  {
  1193  	fiat_25519_carry_mul(out, in1, in2);
  1194  }
  1195  
  1196  static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g)
  1197  {
  1198  	fe_mul_impl(h->v, f->v, g->v);
  1199  }
  1200  
  1201  static void fe_mul_ttt(fe *h, const fe *f, const fe *g)
  1202  {
  1203  	fe_mul_impl(h->v, f->v, g->v);
  1204  }
  1205  
  1206  static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g)
  1207  {
  1208  	fe_mul_impl(h->v, f->v, g->v);
  1209  }
  1210  
  1211  static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g)
  1212  {
  1213  	fe_mul_impl(h->v, f->v, g->v);
  1214  }
  1215  
  1216  static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g)
  1217  {
  1218  	fe_mul_impl(h->v, f->v, g->v);
  1219  }
  1220  
  1221  static void fe_sq_tl(fe *h, const fe_loose *f)
  1222  {
  1223  	fiat_25519_carry_square(h->v, f->v);
  1224  }
  1225  
  1226  static void fe_sq_tt(fe *h, const fe *f)
  1227  {
  1228  	fiat_25519_carry_square(h->v, f->v);
  1229  }
  1230  
  1231  // h = -f
  1232  static void fe_neg(fe_loose *h, const fe *f)
  1233  {
  1234  	fiat_25519_opp(h->v, f->v);
  1235  }
  1236  
  1237  // h = f
  1238  static void fe_copy(fe *h, const fe *f)
  1239  {
  1240  	memmove(h, f, sizeof(fe));
  1241  }
  1242  
  1243  static void fe_copy_lt(fe_loose *h, const fe *f)
  1244  {
  1245  	memmove(h, f, sizeof(fe));
  1246  }
  1247  
  1248  static void fe_loose_invert(fe *out, const fe_loose *z)
  1249  {
  1250  	fe t0;
  1251  	fe t1;
  1252  	fe t2;
  1253  	fe t3;
  1254  	int i;
  1255  
  1256  	fe_sq_tl(&t0, z);
  1257  	fe_sq_tt(&t1, &t0);
  1258  	for (i = 1; i < 2; ++i) {
  1259  		fe_sq_tt(&t1, &t1);
  1260  	}
  1261  	fe_mul_tlt(&t1, z, &t1);
  1262  	fe_mul_ttt(&t0, &t0, &t1);
  1263  	fe_sq_tt(&t2, &t0);
  1264  	fe_mul_ttt(&t1, &t1, &t2);
  1265  	fe_sq_tt(&t2, &t1);
  1266  	for (i = 1; i < 5; ++i) {
  1267  		fe_sq_tt(&t2, &t2);
  1268  	}
  1269  	fe_mul_ttt(&t1, &t2, &t1);
  1270  	fe_sq_tt(&t2, &t1);
  1271  	for (i = 1; i < 10; ++i) {
  1272  		fe_sq_tt(&t2, &t2);
  1273  	}
  1274  	fe_mul_ttt(&t2, &t2, &t1);
  1275  	fe_sq_tt(&t3, &t2);
  1276  	for (i = 1; i < 20; ++i) {
  1277  		fe_sq_tt(&t3, &t3);
  1278  	}
  1279  	fe_mul_ttt(&t2, &t3, &t2);
  1280  	fe_sq_tt(&t2, &t2);
  1281  	for (i = 1; i < 10; ++i) {
  1282  		fe_sq_tt(&t2, &t2);
  1283  	}
  1284  	fe_mul_ttt(&t1, &t2, &t1);
  1285  	fe_sq_tt(&t2, &t1);
  1286  	for (i = 1; i < 50; ++i) {
  1287  		fe_sq_tt(&t2, &t2);
  1288  	}
  1289  	fe_mul_ttt(&t2, &t2, &t1);
  1290  	fe_sq_tt(&t3, &t2);
  1291  	for (i = 1; i < 100; ++i) {
  1292  		fe_sq_tt(&t3, &t3);
  1293  	}
  1294  	fe_mul_ttt(&t2, &t3, &t2);
  1295  	fe_sq_tt(&t2, &t2);
  1296  	for (i = 1; i < 50; ++i) {
  1297  		fe_sq_tt(&t2, &t2);
  1298  	}
  1299  	fe_mul_ttt(&t1, &t2, &t1);
  1300  	fe_sq_tt(&t1, &t1);
  1301  	for (i = 1; i < 5; ++i) {
  1302  		fe_sq_tt(&t1, &t1);
  1303  	}
  1304  	fe_mul_ttt(out, &t1, &t0);
  1305  }
  1306  
  1307  static void fe_invert(fe *out, const fe *z)
  1308  {
  1309  	fe_loose l;
  1310  	fe_copy_lt(&l, z);
  1311  	fe_loose_invert(out, &l);
  1312  }
  1313  
  1314  // return 0 if f == 0
  1315  // return 1 if f != 0
  1316  static int fe_isnonzero(const fe_loose *f)
  1317  {
  1318  	fe tight;
  1319  	fe_carry(&tight, f);
  1320  	uint8_t s[32];
  1321  	fe_tobytes(s, &tight);
  1322  
  1323  	static const uint8_t zero[32] = { 0 };
  1324  	return memcmp_ct(s, zero, sizeof(zero)) != 0;
  1325  }
  1326  
  1327  // return 1 if f is in {1,3,5,...,q-2}
  1328  // return 0 if f is in {0,2,4,...,q-1}
  1329  static int fe_isnegative(const fe *f)
  1330  {
  1331  	uint8_t s[32];
  1332  	fe_tobytes(s, f);
  1333  	return s[0] & 1;
  1334  }
  1335  
  1336  static void fe_sq2_tt(fe *h, const fe *f)
  1337  {
  1338  	// h = f^2
  1339  	fe_sq_tt(h, f);
  1340  
  1341  	// h = h + h
  1342  	fe_loose tmp;
  1343  	fe_add(&tmp, h, h);
  1344  	fe_carry(h, &tmp);
  1345  }
  1346  
  1347  static void fe_pow22523(fe *out, const fe *z)
  1348  {
  1349  	fe t0;
  1350  	fe t1;
  1351  	fe t2;
  1352  	int i;
  1353  
  1354  	fe_sq_tt(&t0, z);
  1355  	fe_sq_tt(&t1, &t0);
  1356  	for (i = 1; i < 2; ++i) {
  1357  		fe_sq_tt(&t1, &t1);
  1358  	}
  1359  	fe_mul_ttt(&t1, z, &t1);
  1360  	fe_mul_ttt(&t0, &t0, &t1);
  1361  	fe_sq_tt(&t0, &t0);
  1362  	fe_mul_ttt(&t0, &t1, &t0);
  1363  	fe_sq_tt(&t1, &t0);
  1364  	for (i = 1; i < 5; ++i) {
  1365  		fe_sq_tt(&t1, &t1);
  1366  	}
  1367  	fe_mul_ttt(&t0, &t1, &t0);
  1368  	fe_sq_tt(&t1, &t0);
  1369  	for (i = 1; i < 10; ++i) {
  1370  		fe_sq_tt(&t1, &t1);
  1371  	}
  1372  	fe_mul_ttt(&t1, &t1, &t0);
  1373  	fe_sq_tt(&t2, &t1);
  1374  	for (i = 1; i < 20; ++i) {
  1375  		fe_sq_tt(&t2, &t2);
  1376  	}
  1377  	fe_mul_ttt(&t1, &t2, &t1);
  1378  	fe_sq_tt(&t1, &t1);
  1379  	for (i = 1; i < 10; ++i) {
  1380  		fe_sq_tt(&t1, &t1);
  1381  	}
  1382  	fe_mul_ttt(&t0, &t1, &t0);
  1383  	fe_sq_tt(&t1, &t0);
  1384  	for (i = 1; i < 50; ++i) {
  1385  		fe_sq_tt(&t1, &t1);
  1386  	}
  1387  	fe_mul_ttt(&t1, &t1, &t0);
  1388  	fe_sq_tt(&t2, &t1);
  1389  	for (i = 1; i < 100; ++i) {
  1390  		fe_sq_tt(&t2, &t2);
  1391  	}
  1392  	fe_mul_ttt(&t1, &t2, &t1);
  1393  	fe_sq_tt(&t1, &t1);
  1394  	for (i = 1; i < 50; ++i) {
  1395  		fe_sq_tt(&t1, &t1);
  1396  	}
  1397  	fe_mul_ttt(&t0, &t1, &t0);
  1398  	fe_sq_tt(&t0, &t0);
  1399  	for (i = 1; i < 2; ++i) {
  1400  		fe_sq_tt(&t0, &t0);
  1401  	}
  1402  	fe_mul_ttt(out, &t0, z);
  1403  }
  1404  
  1405  static void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h)
  1406  {
  1407  	fe recip;
  1408  	fe x;
  1409  	fe y;
  1410  
  1411  	fe_invert(&recip, &h->Z);
  1412  	fe_mul_ttt(&x, &h->X, &recip);
  1413  	fe_mul_ttt(&y, &h->Y, &recip);
  1414  	fe_tobytes(s, &y);
  1415  	s[31] ^= fe_isnegative(&x) << 7;
  1416  }
  1417  
  1418  static int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32])
  1419  {
  1420  	fe u;
  1421  	fe_loose v;
  1422  	fe v3;
  1423  	fe vxx;
  1424  	fe_loose check;
  1425  
  1426  	fe_frombytes(&h->Y, s);
  1427  	fe_1(&h->Z);
  1428  	fe_sq_tt(&v3, &h->Y);
  1429  	fe_mul_ttt(&vxx, &v3, &d);
  1430  	fe_sub(&v, &v3, &h->Z); // u = y^2-1
  1431  	fe_carry(&u, &v);
  1432  	fe_add(&v, &vxx, &h->Z); // v = dy^2+1
  1433  
  1434  	fe_sq_tl(&v3, &v);
  1435  	fe_mul_ttl(&v3, &v3, &v); // v3 = v^3
  1436  	fe_sq_tt(&h->X, &v3);
  1437  	fe_mul_ttl(&h->X, &h->X, &v);
  1438  	fe_mul_ttt(&h->X, &h->X, &u); // x = uv^7
  1439  
  1440  	fe_pow22523(&h->X, &h->X); // x = (uv^7)^((q-5)/8)
  1441  	fe_mul_ttt(&h->X, &h->X, &v3);
  1442  	fe_mul_ttt(&h->X, &h->X, &u); // x = uv^3(uv^7)^((q-5)/8)
  1443  
  1444  	fe_sq_tt(&vxx, &h->X);
  1445  	fe_mul_ttl(&vxx, &vxx, &v);
  1446  	fe_sub(&check, &vxx, &u);
  1447  	if (fe_isnonzero(&check)) {
  1448  		fe_add(&check, &vxx, &u);
  1449  		if (fe_isnonzero(&check)) {
  1450  			return 0;
  1451  		}
  1452  		fe_mul_ttt(&h->X, &h->X, &sqrtm1);
  1453  	}
  1454  
  1455  	if (fe_isnegative(&h->X) != (s[31] >> 7)) {
  1456  		fe_loose t;
  1457  		fe_neg(&t, &h->X);
  1458  		fe_carry(&h->X, &t);
  1459  	}
  1460  
  1461  	fe_mul_ttt(&h->T, &h->X, &h->Y);
  1462  	return 1;
  1463  }
  1464  
  1465  static void ge_p2_0(ge_p2 *h)
  1466  {
  1467  	fe_0(&h->X);
  1468  	fe_1(&h->Y);
  1469  	fe_1(&h->Z);
  1470  }
  1471  
  1472  // r = p
  1473  static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p)
  1474  {
  1475  	fe_copy(&r->X, &p->X);
  1476  	fe_copy(&r->Y, &p->Y);
  1477  	fe_copy(&r->Z, &p->Z);
  1478  }
  1479  
  1480  // r = p
  1481  static void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p)
  1482  {
  1483  	fe_add(&r->YplusX, &p->Y, &p->X);
  1484  	fe_sub(&r->YminusX, &p->Y, &p->X);
  1485  	fe_copy_lt(&r->Z, &p->Z);
  1486  	fe_mul_ltt(&r->T2d, &p->T, &d2);
  1487  }
  1488  
  1489  // r = p
  1490  static void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p)
  1491  {
  1492  	fe_mul_tll(&r->X, &p->X, &p->T);
  1493  	fe_mul_tll(&r->Y, &p->Y, &p->Z);
  1494  	fe_mul_tll(&r->Z, &p->Z, &p->T);
  1495  }
  1496  
  1497  // r = p
  1498  static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p)
  1499  {
  1500  	fe_mul_tll(&r->X, &p->X, &p->T);
  1501  	fe_mul_tll(&r->Y, &p->Y, &p->Z);
  1502  	fe_mul_tll(&r->Z, &p->Z, &p->T);
  1503  	fe_mul_tll(&r->T, &p->X, &p->Y);
  1504  }
  1505  
  1506  // r = 2 * p
  1507  static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p)
  1508  {
  1509  	fe trX, trZ, trT;
  1510  	fe t0;
  1511  
  1512  	fe_sq_tt(&trX, &p->X);
  1513  	fe_sq_tt(&trZ, &p->Y);
  1514  	fe_sq2_tt(&trT, &p->Z);
  1515  	fe_add(&r->Y, &p->X, &p->Y);
  1516  	fe_sq_tl(&t0, &r->Y);
  1517  
  1518  	fe_add(&r->Y, &trZ, &trX);
  1519  	fe_sub(&r->Z, &trZ, &trX);
  1520  	fe_carry(&trZ, &r->Y);
  1521  	fe_sub(&r->X, &t0, &trZ);
  1522  	fe_carry(&trZ, &r->Z);
  1523  	fe_sub(&r->T, &trT, &trZ);
  1524  }
  1525  
  1526  // r = 2 * p
  1527  static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p)
  1528  {
  1529  	ge_p2 q;
  1530  	ge_p3_to_p2(&q, p);
  1531  	ge_p2_dbl(r, &q);
  1532  }
  1533  
  1534  // r = p + q
  1535  static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
  1536  {
  1537  	fe trY, trZ, trT;
  1538  
  1539  	fe_add(&r->X, &p->Y, &p->X);
  1540  	fe_sub(&r->Y, &p->Y, &p->X);
  1541  	fe_mul_tll(&trZ, &r->X, &q->yplusx);
  1542  	fe_mul_tll(&trY, &r->Y, &q->yminusx);
  1543  	fe_mul_tlt(&trT, &q->xy2d, &p->T);
  1544  	fe_add(&r->T, &p->Z, &p->Z);
  1545  	fe_sub(&r->X, &trZ, &trY);
  1546  	fe_add(&r->Y, &trZ, &trY);
  1547  	fe_carry(&trZ, &r->T);
  1548  	fe_add(&r->Z, &trZ, &trT);
  1549  	fe_sub(&r->T, &trZ, &trT);
  1550  }
  1551  
  1552  // r = p - q
  1553  static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
  1554  {
  1555  	fe trY, trZ, trT;
  1556  
  1557  	fe_add(&r->X, &p->Y, &p->X);
  1558  	fe_sub(&r->Y, &p->Y, &p->X);
  1559  	fe_mul_tll(&trZ, &r->X, &q->yminusx);
  1560  	fe_mul_tll(&trY, &r->Y, &q->yplusx);
  1561  	fe_mul_tlt(&trT, &q->xy2d, &p->T);
  1562  	fe_add(&r->T, &p->Z, &p->Z);
  1563  	fe_sub(&r->X, &trZ, &trY);
  1564  	fe_add(&r->Y, &trZ, &trY);
  1565  	fe_carry(&trZ, &r->T);
  1566  	fe_sub(&r->Z, &trZ, &trT);
  1567  	fe_add(&r->T, &trZ, &trT);
  1568  }
  1569  
  1570  // r = p + q
  1571  static void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
  1572  {
  1573  	fe trX, trY, trZ, trT;
  1574  
  1575  	fe_add(&r->X, &p->Y, &p->X);
  1576  	fe_sub(&r->Y, &p->Y, &p->X);
  1577  	fe_mul_tll(&trZ, &r->X, &q->YplusX);
  1578  	fe_mul_tll(&trY, &r->Y, &q->YminusX);
  1579  	fe_mul_tlt(&trT, &q->T2d, &p->T);
  1580  	fe_mul_ttl(&trX, &p->Z, &q->Z);
  1581  	fe_add(&r->T, &trX, &trX);
  1582  	fe_sub(&r->X, &trZ, &trY);
  1583  	fe_add(&r->Y, &trZ, &trY);
  1584  	fe_carry(&trZ, &r->T);
  1585  	fe_add(&r->Z, &trZ, &trT);
  1586  	fe_sub(&r->T, &trZ, &trT);
  1587  }
  1588  
  1589  // r = p - q
  1590  static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
  1591  {
  1592  	fe trX, trY, trZ, trT;
  1593  
  1594  	fe_add(&r->X, &p->Y, &p->X);
  1595  	fe_sub(&r->Y, &p->Y, &p->X);
  1596  	fe_mul_tll(&trZ, &r->X, &q->YminusX);
  1597  	fe_mul_tll(&trY, &r->Y, &q->YplusX);
  1598  	fe_mul_tlt(&trT, &q->T2d, &p->T);
  1599  	fe_mul_ttl(&trX, &p->Z, &q->Z);
  1600  	fe_add(&r->T, &trX, &trX);
  1601  	fe_sub(&r->X, &trZ, &trY);
  1602  	fe_add(&r->Y, &trZ, &trY);
  1603  	fe_carry(&trZ, &r->T);
  1604  	fe_sub(&r->Z, &trZ, &trT);
  1605  	fe_add(&r->T, &trZ, &trT);
  1606  }
  1607  
  1608  static void slide(signed char *r, const uint8_t *a)
  1609  {
  1610  	int i;
  1611  	int b;
  1612  	int k;
  1613  
  1614  	for (i = 0; i < 256; ++i) {
  1615  		r[i] = 1 & (a[i >> 3] >> (i & 7));
  1616  	}
  1617  
  1618  	for (i = 0; i < 256; ++i) {
  1619  		if (r[i]) {
  1620  			for (b = 1; b <= 6 && i + b < 256; ++b) {
  1621  				if (r[i + b]) {
  1622  					if (r[i] + (r[i + b] << b) <= 15) {
  1623  						r[i] += r[i + b] << b;
  1624  						r[i + b] = 0;
  1625  					} else if (r[i] - (r[i + b] << b) >=
  1626  						   -15) {
  1627  						r[i] -= r[i + b] << b;
  1628  						for (k = i + b; k < 256; ++k) {
  1629  							if (!r[k]) {
  1630  								r[k] = 1;
  1631  								break;
  1632  							}
  1633  							r[k] = 0;
  1634  						}
  1635  					} else {
  1636  						break;
  1637  					}
  1638  				}
  1639  			}
  1640  		}
  1641  	}
  1642  }
  1643  
  1644  // r = a * A + b * B
  1645  // where a = a[0]+256*a[1]+...+256^31 a[31].
  1646  // and b = b[0]+256*b[1]+...+256^31 b[31].
  1647  // B is the Ed25519 base point (x,4/5) with x positive.
  1648  static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
  1649  					 const ge_p3 *A, const uint8_t *b)
  1650  {
  1651  	signed char aslide[256];
  1652  	signed char bslide[256];
  1653  	ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A
  1654  	ge_p1p1 t;
  1655  	ge_p3 u;
  1656  	ge_p3 A2;
  1657  	int i;
  1658  
  1659  	slide(aslide, a);
  1660  	slide(bslide, b);
  1661  
  1662  	x25519_ge_p3_to_cached(&Ai[0], A);
  1663  	ge_p3_dbl(&t, A);
  1664  	x25519_ge_p1p1_to_p3(&A2, &t);
  1665  	x25519_ge_add(&t, &A2, &Ai[0]);
  1666  	x25519_ge_p1p1_to_p3(&u, &t);
  1667  	x25519_ge_p3_to_cached(&Ai[1], &u);
  1668  	x25519_ge_add(&t, &A2, &Ai[1]);
  1669  	x25519_ge_p1p1_to_p3(&u, &t);
  1670  	x25519_ge_p3_to_cached(&Ai[2], &u);
  1671  	x25519_ge_add(&t, &A2, &Ai[2]);
  1672  	x25519_ge_p1p1_to_p3(&u, &t);
  1673  	x25519_ge_p3_to_cached(&Ai[3], &u);
  1674  	x25519_ge_add(&t, &A2, &Ai[3]);
  1675  	x25519_ge_p1p1_to_p3(&u, &t);
  1676  	x25519_ge_p3_to_cached(&Ai[4], &u);
  1677  	x25519_ge_add(&t, &A2, &Ai[4]);
  1678  	x25519_ge_p1p1_to_p3(&u, &t);
  1679  	x25519_ge_p3_to_cached(&Ai[5], &u);
  1680  	x25519_ge_add(&t, &A2, &Ai[5]);
  1681  	x25519_ge_p1p1_to_p3(&u, &t);
  1682  	x25519_ge_p3_to_cached(&Ai[6], &u);
  1683  	x25519_ge_add(&t, &A2, &Ai[6]);
  1684  	x25519_ge_p1p1_to_p3(&u, &t);
  1685  	x25519_ge_p3_to_cached(&Ai[7], &u);
  1686  
  1687  	ge_p2_0(r);
  1688  
  1689  	for (i = 255; i >= 0; --i) {
  1690  		if (aslide[i] || bslide[i]) {
  1691  			break;
  1692  		}
  1693  	}
  1694  
  1695  	for (; i >= 0; --i) {
  1696  		ge_p2_dbl(&t, r);
  1697  
  1698  		if (aslide[i] > 0) {
  1699  			x25519_ge_p1p1_to_p3(&u, &t);
  1700  			x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
  1701  		} else if (aslide[i] < 0) {
  1702  			x25519_ge_p1p1_to_p3(&u, &t);
  1703  			x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
  1704  		}
  1705  
  1706  		if (bslide[i] > 0) {
  1707  			x25519_ge_p1p1_to_p3(&u, &t);
  1708  			ge_madd(&t, &u, &Bi[bslide[i] / 2]);
  1709  		} else if (bslide[i] < 0) {
  1710  			x25519_ge_p1p1_to_p3(&u, &t);
  1711  			ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
  1712  		}
  1713  
  1714  		x25519_ge_p1p1_to_p2(r, &t);
  1715  	}
  1716  }
  1717  
  1718  // int64_lshift21 returns |a << 21| but is defined when shifting bits into the
  1719  // sign bit. This works around a language flaw in C.
  1720  static inline int64_t int64_lshift21(int64_t a)
  1721  {
  1722  	return (int64_t)((uint64_t)a << 21);
  1723  }
  1724  
  1725  // The set of scalars is \Z/l
  1726  // where l = 2^252 + 27742317777372353535851937790883648493.
  1727  
  1728  // Input:
  1729  //   s[0]+256*s[1]+...+256^63*s[63] = s
  1730  //
  1731  // Output:
  1732  //   s[0]+256*s[1]+...+256^31*s[31] = s mod l
  1733  //   where l = 2^252 + 27742317777372353535851937790883648493.
  1734  //   Overwrites s in place.
  1735  static void x25519_sc_reduce(uint8_t s[64])
  1736  {
  1737  	int64_t s0 = 2097151 & load_le24(s);
  1738  	int64_t s1 = 2097151 & (load_le32(s + 2) >> 5);
  1739  	int64_t s2 = 2097151 & (load_le24(s + 5) >> 2);
  1740  	int64_t s3 = 2097151 & (load_le32(s + 7) >> 7);
  1741  	int64_t s4 = 2097151 & (load_le32(s + 10) >> 4);
  1742  	int64_t s5 = 2097151 & (load_le24(s + 13) >> 1);
  1743  	int64_t s6 = 2097151 & (load_le32(s + 15) >> 6);
  1744  	int64_t s7 = 2097151 & (load_le24(s + 18) >> 3);
  1745  	int64_t s8 = 2097151 & load_le24(s + 21);
  1746  	int64_t s9 = 2097151 & (load_le32(s + 23) >> 5);
  1747  	int64_t s10 = 2097151 & (load_le24(s + 26) >> 2);
  1748  	int64_t s11 = 2097151 & (load_le32(s + 28) >> 7);
  1749  	int64_t s12 = 2097151 & (load_le32(s + 31) >> 4);
  1750  	int64_t s13 = 2097151 & (load_le24(s + 34) >> 1);
  1751  	int64_t s14 = 2097151 & (load_le32(s + 36) >> 6);
  1752  	int64_t s15 = 2097151 & (load_le24(s + 39) >> 3);
  1753  	int64_t s16 = 2097151 & load_le24(s + 42);
  1754  	int64_t s17 = 2097151 & (load_le32(s + 44) >> 5);
  1755  	int64_t s18 = 2097151 & (load_le24(s + 47) >> 2);
  1756  	int64_t s19 = 2097151 & (load_le32(s + 49) >> 7);
  1757  	int64_t s20 = 2097151 & (load_le32(s + 52) >> 4);
  1758  	int64_t s21 = 2097151 & (load_le24(s + 55) >> 1);
  1759  	int64_t s22 = 2097151 & (load_le32(s + 57) >> 6);
  1760  	int64_t s23 = (load_le32(s + 60) >> 3);
  1761  	int64_t carry0;
  1762  	int64_t carry1;
  1763  	int64_t carry2;
  1764  	int64_t carry3;
  1765  	int64_t carry4;
  1766  	int64_t carry5;
  1767  	int64_t carry6;
  1768  	int64_t carry7;
  1769  	int64_t carry8;
  1770  	int64_t carry9;
  1771  	int64_t carry10;
  1772  	int64_t carry11;
  1773  	int64_t carry12;
  1774  	int64_t carry13;
  1775  	int64_t carry14;
  1776  	int64_t carry15;
  1777  	int64_t carry16;
  1778  
  1779  	s11 += s23 * 666643;
  1780  	s12 += s23 * 470296;
  1781  	s13 += s23 * 654183;
  1782  	s14 -= s23 * 997805;
  1783  	s15 += s23 * 136657;
  1784  	s16 -= s23 * 683901;
  1785  	s23 = 0;
  1786  
  1787  	s10 += s22 * 666643;
  1788  	s11 += s22 * 470296;
  1789  	s12 += s22 * 654183;
  1790  	s13 -= s22 * 997805;
  1791  	s14 += s22 * 136657;
  1792  	s15 -= s22 * 683901;
  1793  	s22 = 0;
  1794  
  1795  	s9 += s21 * 666643;
  1796  	s10 += s21 * 470296;
  1797  	s11 += s21 * 654183;
  1798  	s12 -= s21 * 997805;
  1799  	s13 += s21 * 136657;
  1800  	s14 -= s21 * 683901;
  1801  	s21 = 0;
  1802  
  1803  	s8 += s20 * 666643;
  1804  	s9 += s20 * 470296;
  1805  	s10 += s20 * 654183;
  1806  	s11 -= s20 * 997805;
  1807  	s12 += s20 * 136657;
  1808  	s13 -= s20 * 683901;
  1809  	s20 = 0;
  1810  
  1811  	s7 += s19 * 666643;
  1812  	s8 += s19 * 470296;
  1813  	s9 += s19 * 654183;
  1814  	s10 -= s19 * 997805;
  1815  	s11 += s19 * 136657;
  1816  	s12 -= s19 * 683901;
  1817  	s19 = 0;
  1818  
  1819  	s6 += s18 * 666643;
  1820  	s7 += s18 * 470296;
  1821  	s8 += s18 * 654183;
  1822  	s9 -= s18 * 997805;
  1823  	s10 += s18 * 136657;
  1824  	s11 -= s18 * 683901;
  1825  	s18 = 0;
  1826  
  1827  	carry6 = (s6 + (1 << 20)) >> 21;
  1828  	s7 += carry6;
  1829  	s6 -= int64_lshift21(carry6);
  1830  	carry8 = (s8 + (1 << 20)) >> 21;
  1831  	s9 += carry8;
  1832  	s8 -= int64_lshift21(carry8);
  1833  	carry10 = (s10 + (1 << 20)) >> 21;
  1834  	s11 += carry10;
  1835  	s10 -= int64_lshift21(carry10);
  1836  	carry12 = (s12 + (1 << 20)) >> 21;
  1837  	s13 += carry12;
  1838  	s12 -= int64_lshift21(carry12);
  1839  	carry14 = (s14 + (1 << 20)) >> 21;
  1840  	s15 += carry14;
  1841  	s14 -= int64_lshift21(carry14);
  1842  	carry16 = (s16 + (1 << 20)) >> 21;
  1843  	s17 += carry16;
  1844  	s16 -= int64_lshift21(carry16);
  1845  
  1846  	carry7 = (s7 + (1 << 20)) >> 21;
  1847  	s8 += carry7;
  1848  	s7 -= int64_lshift21(carry7);
  1849  	carry9 = (s9 + (1 << 20)) >> 21;
  1850  	s10 += carry9;
  1851  	s9 -= int64_lshift21(carry9);
  1852  	carry11 = (s11 + (1 << 20)) >> 21;
  1853  	s12 += carry11;
  1854  	s11 -= int64_lshift21(carry11);
  1855  	carry13 = (s13 + (1 << 20)) >> 21;
  1856  	s14 += carry13;
  1857  	s13 -= int64_lshift21(carry13);
  1858  	carry15 = (s15 + (1 << 20)) >> 21;
  1859  	s16 += carry15;
  1860  	s15 -= int64_lshift21(carry15);
  1861  
  1862  	s5 += s17 * 666643;
  1863  	s6 += s17 * 470296;
  1864  	s7 += s17 * 654183;
  1865  	s8 -= s17 * 997805;
  1866  	s9 += s17 * 136657;
  1867  	s10 -= s17 * 683901;
  1868  	s17 = 0;
  1869  
  1870  	s4 += s16 * 666643;
  1871  	s5 += s16 * 470296;
  1872  	s6 += s16 * 654183;
  1873  	s7 -= s16 * 997805;
  1874  	s8 += s16 * 136657;
  1875  	s9 -= s16 * 683901;
  1876  	s16 = 0;
  1877  
  1878  	s3 += s15 * 666643;
  1879  	s4 += s15 * 470296;
  1880  	s5 += s15 * 654183;
  1881  	s6 -= s15 * 997805;
  1882  	s7 += s15 * 136657;
  1883  	s8 -= s15 * 683901;
  1884  	s15 = 0;
  1885  
  1886  	s2 += s14 * 666643;
  1887  	s3 += s14 * 470296;
  1888  	s4 += s14 * 654183;
  1889  	s5 -= s14 * 997805;
  1890  	s6 += s14 * 136657;
  1891  	s7 -= s14 * 683901;
  1892  	s14 = 0;
  1893  
  1894  	s1 += s13 * 666643;
  1895  	s2 += s13 * 470296;
  1896  	s3 += s13 * 654183;
  1897  	s4 -= s13 * 997805;
  1898  	s5 += s13 * 136657;
  1899  	s6 -= s13 * 683901;
  1900  	s13 = 0;
  1901  
  1902  	s0 += s12 * 666643;
  1903  	s1 += s12 * 470296;
  1904  	s2 += s12 * 654183;
  1905  	s3 -= s12 * 997805;
  1906  	s4 += s12 * 136657;
  1907  	s5 -= s12 * 683901;
  1908  	s12 = 0;
  1909  
  1910  	carry0 = (s0 + (1 << 20)) >> 21;
  1911  	s1 += carry0;
  1912  	s0 -= int64_lshift21(carry0);
  1913  	carry2 = (s2 + (1 << 20)) >> 21;
  1914  	s3 += carry2;
  1915  	s2 -= int64_lshift21(carry2);
  1916  	carry4 = (s4 + (1 << 20)) >> 21;
  1917  	s5 += carry4;
  1918  	s4 -= int64_lshift21(carry4);
  1919  	carry6 = (s6 + (1 << 20)) >> 21;
  1920  	s7 += carry6;
  1921  	s6 -= int64_lshift21(carry6);
  1922  	carry8 = (s8 + (1 << 20)) >> 21;
  1923  	s9 += carry8;
  1924  	s8 -= int64_lshift21(carry8);
  1925  	carry10 = (s10 + (1 << 20)) >> 21;
  1926  	s11 += carry10;
  1927  	s10 -= int64_lshift21(carry10);
  1928  
  1929  	carry1 = (s1 + (1 << 20)) >> 21;
  1930  	s2 += carry1;
  1931  	s1 -= int64_lshift21(carry1);
  1932  	carry3 = (s3 + (1 << 20)) >> 21;
  1933  	s4 += carry3;
  1934  	s3 -= int64_lshift21(carry3);
  1935  	carry5 = (s5 + (1 << 20)) >> 21;
  1936  	s6 += carry5;
  1937  	s5 -= int64_lshift21(carry5);
  1938  	carry7 = (s7 + (1 << 20)) >> 21;
  1939  	s8 += carry7;
  1940  	s7 -= int64_lshift21(carry7);
  1941  	carry9 = (s9 + (1 << 20)) >> 21;
  1942  	s10 += carry9;
  1943  	s9 -= int64_lshift21(carry9);
  1944  	carry11 = (s11 + (1 << 20)) >> 21;
  1945  	s12 += carry11;
  1946  	s11 -= int64_lshift21(carry11);
  1947  
  1948  	s0 += s12 * 666643;
  1949  	s1 += s12 * 470296;
  1950  	s2 += s12 * 654183;
  1951  	s3 -= s12 * 997805;
  1952  	s4 += s12 * 136657;
  1953  	s5 -= s12 * 683901;
  1954  	s12 = 0;
  1955  
  1956  	carry0 = s0 >> 21;
  1957  	s1 += carry0;
  1958  	s0 -= int64_lshift21(carry0);
  1959  	carry1 = s1 >> 21;
  1960  	s2 += carry1;
  1961  	s1 -= int64_lshift21(carry1);
  1962  	carry2 = s2 >> 21;
  1963  	s3 += carry2;
  1964  	s2 -= int64_lshift21(carry2);
  1965  	carry3 = s3 >> 21;
  1966  	s4 += carry3;
  1967  	s3 -= int64_lshift21(carry3);
  1968  	carry4 = s4 >> 21;
  1969  	s5 += carry4;
  1970  	s4 -= int64_lshift21(carry4);
  1971  	carry5 = s5 >> 21;
  1972  	s6 += carry5;
  1973  	s5 -= int64_lshift21(carry5);
  1974  	carry6 = s6 >> 21;
  1975  	s7 += carry6;
  1976  	s6 -= int64_lshift21(carry6);
  1977  	carry7 = s7 >> 21;
  1978  	s8 += carry7;
  1979  	s7 -= int64_lshift21(carry7);
  1980  	carry8 = s8 >> 21;
  1981  	s9 += carry8;
  1982  	s8 -= int64_lshift21(carry8);
  1983  	carry9 = s9 >> 21;
  1984  	s10 += carry9;
  1985  	s9 -= int64_lshift21(carry9);
  1986  	carry10 = s10 >> 21;
  1987  	s11 += carry10;
  1988  	s10 -= int64_lshift21(carry10);
  1989  	carry11 = s11 >> 21;
  1990  	s12 += carry11;
  1991  	s11 -= int64_lshift21(carry11);
  1992  
  1993  	s0 += s12 * 666643;
  1994  	s1 += s12 * 470296;
  1995  	s2 += s12 * 654183;
  1996  	s3 -= s12 * 997805;
  1997  	s4 += s12 * 136657;
  1998  	s5 -= s12 * 683901;
  1999  	s12 = 0;
  2000  
  2001  	carry0 = s0 >> 21;
  2002  	s1 += carry0;
  2003  	s0 -= int64_lshift21(carry0);
  2004  	carry1 = s1 >> 21;
  2005  	s2 += carry1;
  2006  	s1 -= int64_lshift21(carry1);
  2007  	carry2 = s2 >> 21;
  2008  	s3 += carry2;
  2009  	s2 -= int64_lshift21(carry2);
  2010  	carry3 = s3 >> 21;
  2011  	s4 += carry3;
  2012  	s3 -= int64_lshift21(carry3);
  2013  	carry4 = s4 >> 21;
  2014  	s5 += carry4;
  2015  	s4 -= int64_lshift21(carry4);
  2016  	carry5 = s5 >> 21;
  2017  	s6 += carry5;
  2018  	s5 -= int64_lshift21(carry5);
  2019  	carry6 = s6 >> 21;
  2020  	s7 += carry6;
  2021  	s6 -= int64_lshift21(carry6);
  2022  	carry7 = s7 >> 21;
  2023  	s8 += carry7;
  2024  	s7 -= int64_lshift21(carry7);
  2025  	carry8 = s8 >> 21;
  2026  	s9 += carry8;
  2027  	s8 -= int64_lshift21(carry8);
  2028  	carry9 = s9 >> 21;
  2029  	s10 += carry9;
  2030  	s9 -= int64_lshift21(carry9);
  2031  	carry10 = s10 >> 21;
  2032  	s11 += carry10;
  2033  	s10 -= int64_lshift21(carry10);
  2034  
  2035  	s[0] = s0 >> 0;
  2036  	s[1] = s0 >> 8;
  2037  	s[2] = (s0 >> 16) | (s1 << 5);
  2038  	s[3] = s1 >> 3;
  2039  	s[4] = s1 >> 11;
  2040  	s[5] = (s1 >> 19) | (s2 << 2);
  2041  	s[6] = s2 >> 6;
  2042  	s[7] = (s2 >> 14) | (s3 << 7);
  2043  	s[8] = s3 >> 1;
  2044  	s[9] = s3 >> 9;
  2045  	s[10] = (s3 >> 17) | (s4 << 4);
  2046  	s[11] = s4 >> 4;
  2047  	s[12] = s4 >> 12;
  2048  	s[13] = (s4 >> 20) | (s5 << 1);
  2049  	s[14] = s5 >> 7;
  2050  	s[15] = (s5 >> 15) | (s6 << 6);
  2051  	s[16] = s6 >> 2;
  2052  	s[17] = s6 >> 10;
  2053  	s[18] = (s6 >> 18) | (s7 << 3);
  2054  	s[19] = s7 >> 5;
  2055  	s[20] = s7 >> 13;
  2056  	s[21] = s8 >> 0;
  2057  	s[22] = s8 >> 8;
  2058  	s[23] = (s8 >> 16) | (s9 << 5);
  2059  	s[24] = s9 >> 3;
  2060  	s[25] = s9 >> 11;
  2061  	s[26] = (s9 >> 19) | (s10 << 2);
  2062  	s[27] = s10 >> 6;
  2063  	s[28] = (s10 >> 14) | (s11 << 7);
  2064  	s[29] = s11 >> 1;
  2065  	s[30] = s11 >> 9;
  2066  	s[31] = s11 >> 17;
  2067  }
  2068  
  2069  bool ed25519_verify(const uint8_t signature[64], const uint8_t public_key[32],
  2070  		    const void *message, size_t message_size)
  2071  {
  2072  	ge_p3 A;
  2073  	if ((signature[63] & 224) != 0 ||
  2074  	    !x25519_ge_frombytes_vartime(&A, public_key))
  2075  		return false;
  2076  
  2077  	fe_loose t;
  2078  	fe_neg(&t, &A.X);
  2079  	fe_carry(&A.X, &t);
  2080  	fe_neg(&t, &A.T);
  2081  	fe_carry(&A.T, &t);
  2082  
  2083  	uint8_t pkcopy[32];
  2084  	memcpy(pkcopy, public_key, 32);
  2085  	uint8_t rcopy[32];
  2086  	memcpy(rcopy, signature, 32);
  2087  	union {
  2088  		uint64_t u64[4];
  2089  		uint8_t u8[32];
  2090  	} scopy;
  2091  	memcpy(&scopy.u8[0], signature + 32, 32);
  2092  
  2093  	// https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
  2094  	// the range [0, order) in order to prevent signature malleability.
  2095  
  2096  	// kOrder is the order of Curve25519 in little-endian form.
  2097  	static const uint64_t kOrder[4] = {
  2098  		UINT64_C(0x5812631a5cf5d3ed),
  2099  		UINT64_C(0x14def9dea2f79cd6),
  2100  		0,
  2101  		UINT64_C(0x1000000000000000),
  2102  	};
  2103  	for (size_t i = 3;; --i) {
  2104  		uint64_t le = swap_le64(scopy.u64[i]);
  2105  		if (le > kOrder[i]) {
  2106  			return false;
  2107  		} else if (le < kOrder[i]) {
  2108  			break;
  2109  		} else if (i == 0) {
  2110  			return false;
  2111  		}
  2112  	}
  2113  
  2114  	uint8_t h[64];
  2115  	BCRYPT_ALG_HANDLE alg, hash;
  2116  	if (!NT_SUCCESS(BCryptOpenAlgorithmProvider(&alg, BCRYPT_SHA512_ALGORITHM, NULL, 0)) ||
  2117  	    !NT_SUCCESS(BCryptCreateHash(alg, &hash, NULL, 0, NULL, 0, 0)) ||
  2118  	    !NT_SUCCESS(BCryptHashData(hash, (PUCHAR)signature, 32, 0)) ||
  2119  	    !NT_SUCCESS(BCryptHashData(hash, (PUCHAR)public_key, 32, 0)) ||
  2120  	    !NT_SUCCESS(BCryptHashData(hash, (PUCHAR)message, message_size, 0)) ||
  2121  	    !NT_SUCCESS(BCryptFinishHash(hash, h, 64, 0)) ||
  2122  	    !NT_SUCCESS(BCryptDestroyHash(hash)) ||
  2123  	    !NT_SUCCESS(BCryptCloseAlgorithmProvider(alg, 0)))
  2124  		return false;
  2125  
  2126  	x25519_sc_reduce(h);
  2127  
  2128  	ge_p2 R;
  2129  	ge_double_scalarmult_vartime(&R, h, &A, scopy.u8);
  2130  
  2131  	uint8_t rcheck[32];
  2132  	x25519_ge_tobytes(rcheck, &R);
  2133  
  2134  	return memcmp_ct(rcheck, rcopy, sizeof(rcheck)) == 0;
  2135  }
  2136  
  2137  static const uint64_t blake2b_iv[8] = {
  2138  	0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
  2139  	0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
  2140  	0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
  2141  };
  2142  
  2143  static const uint8_t blake2b_sigma[12][16] = {
  2144  	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  2145  	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
  2146  	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
  2147  	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
  2148  	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
  2149  	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
  2150  	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
  2151  	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
  2152  	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
  2153  	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
  2154  	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  2155  	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
  2156  };
  2157  
  2158  #define G(r, i, a, b, c, d)                                                    \
  2159  	do {                                                                   \
  2160  		a = a + b + m[blake2b_sigma[r][2 * i + 0]];                    \
  2161  		d = ror64(d ^ a, 32);                                          \
  2162  		c = c + d;                                                     \
  2163  		b = ror64(b ^ c, 24);                                          \
  2164  		a = a + b + m[blake2b_sigma[r][2 * i + 1]];                    \
  2165  		d = ror64(d ^ a, 16);                                          \
  2166  		c = c + d;                                                     \
  2167  		b = ror64(b ^ c, 63);                                          \
  2168  	} while (0)
  2169  
  2170  #define ROUND(r)                                                               \
  2171  	do {                                                                   \
  2172  		G(r, 0, v[0], v[4], v[8], v[12]);                              \
  2173  		G(r, 1, v[1], v[5], v[9], v[13]);                              \
  2174  		G(r, 2, v[2], v[6], v[10], v[14]);                             \
  2175  		G(r, 3, v[3], v[7], v[11], v[15]);                             \
  2176  		G(r, 4, v[0], v[5], v[10], v[15]);                             \
  2177  		G(r, 5, v[1], v[6], v[11], v[12]);                             \
  2178  		G(r, 6, v[2], v[7], v[8], v[13]);                              \
  2179  		G(r, 7, v[3], v[4], v[9], v[14]);                              \
  2180  	} while (0)
  2181  
  2182  static void blake2b256_compress(struct blake2b256_state *state,
  2183  				const uint8_t block[128])
  2184  {
  2185  	uint64_t m[16];
  2186  	uint64_t v[16];
  2187  
  2188  	for (int i = 0; i < 16; ++i)
  2189  		m[i] = load_le64(block + i * sizeof(m[i]));
  2190  
  2191  	for (int i = 0; i < 8; ++i)
  2192  		v[i] = state->h[i];
  2193  
  2194  	memcpy(v + 8, blake2b_iv, sizeof(blake2b_iv));
  2195  	v[12] ^= state->t[0];
  2196  	v[13] ^= state->t[1];
  2197  	v[14] ^= state->f[0];
  2198  	v[15] ^= state->f[1];
  2199  
  2200  	for (int i = 0; i < 12; ++i)
  2201  		ROUND(i);
  2202  	for (int i = 0; i < 8; ++i)
  2203  		state->h[i] = state->h[i] ^ v[i] ^ v[i + 8];
  2204  }
  2205  
  2206  void blake2b256_init(struct blake2b256_state *state)
  2207  {
  2208  	memset(state, 0, sizeof(*state));
  2209  	memcpy(state->h, blake2b_iv, sizeof(state->h));
  2210  	state->h[0] ^= 0x01010000 | 32;
  2211  }
  2212  
  2213  void blake2b256_update(struct blake2b256_state *state, const uint8_t *in,
  2214  		       unsigned int inlen)
  2215  {
  2216  	const size_t left = state->buflen;
  2217  	const size_t fill = 128 - left;
  2218  
  2219  	if (!inlen)
  2220  		return;
  2221  
  2222  	if (inlen > fill) {
  2223  		state->buflen = 0;
  2224  		memcpy(state->buf + left, in, fill);
  2225  		state->t[0] += 128;
  2226  		state->t[1] += (state->t[0] < 128);
  2227  		blake2b256_compress(state, state->buf);
  2228  		in += fill;
  2229  		inlen -= fill;
  2230  		while (inlen > 128) {
  2231  			state->t[0] += 128;
  2232  			state->t[1] += (state->t[0] < 128);
  2233  			blake2b256_compress(state, in);
  2234  			in += 128;
  2235  			inlen -= 128;
  2236  		}
  2237  	}
  2238  	memcpy(state->buf + state->buflen, in, inlen);
  2239  	state->buflen += inlen;
  2240  }
  2241  
  2242  void blake2b256_final(struct blake2b256_state *state, uint8_t out[32])
  2243  {
  2244  	state->t[0] += state->buflen;
  2245  	state->t[1] += (state->t[0] < state->buflen);
  2246  	state->f[0] = (uint64_t)-1;
  2247  	memset(state->buf + state->buflen, 0, 128 - state->buflen);
  2248  	blake2b256_compress(state, state->buf);
  2249  
  2250  	for (int i = 0; i < 4; ++i)
  2251  		store_le64(out + i * sizeof(state->h[i]), state->h[i]);
  2252  }