github.com/luckypickle/go-ethereum-vet@v1.14.2/crypto/secp256k1/libsecp256k1/src/asm/field_10x26_arm.s (about)

     1  @ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
     2  /**********************************************************************
     3   * Copyright (c) 2014 Wladimir J. van der Laan                        *
     4   * Distributed under the MIT software license, see the accompanying   *
     5   * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
     6   **********************************************************************/
     7  /*
     8  ARM implementation of field_10x26 inner loops.
     9  
    10  Note:
    11  
    12  - To avoid unnecessary loads and make use of available registers, two
    13    'passes' are always interleaved: each odd pass accumulates c' and d',
    14    which are added to c and d respectively in the following even pass.
    15  
    16  */
    17  
    18  	.syntax unified
    19  	.arch armv7-a			@ the code below uses movw and ubfx
    20  	@ eabi attributes - see readelf -A
    21  	.eabi_attribute 8, 1  @ Tag_ARM_ISA_use = yes
    22  	.eabi_attribute 9, 0  @ Tag_Thumb_ISA_use = no
    23  	.eabi_attribute 10, 0 @ Tag_FP_arch = none
    24  	.eabi_attribute 24, 1 @ Tag_ABI_align_needed = 8-byte
    25  	.eabi_attribute 25, 1 @ Tag_ABI_align_preserved = 8-byte, except leaf SP
    26  	.eabi_attribute 30, 2 @ Tag_ABI_optimization_goals = Aggressive Speed
    27  	.eabi_attribute 34, 1 @ Tag_CPU_unaligned_access = v6
    28  	.text
    29  
    30  	@ Field constants
        	@ M = 0x3ffffff is the 26-bit limb mask; "bic x, y, field_not_M"
        	@ computes x = y & M.
        	@ field_R0 / field_R1 are the multipliers used whenever a limb taken
        	@ from the high accumulator d is folded into the low accumulator c.
        	@ NOTE(review): presumably the two 26-bit limbs of (2^256 - p) << 4 for
        	@ the secp256k1 prime p - confirm against the C implementation.
    31  	.set field_R0, 0x3d10
    32  	.set field_R1, 0x400
    33  	.set field_not_M, 0xfc000000	@ ~M = ~0x3ffffff
    34  
    35  	.align	2
    36  	.global vet_secp256k1_fe_mul_inner
    37  	.type	vet_secp256k1_fe_mul_inner, %function
        	@ vet_secp256k1_fe_mul_inner
        	@
        	@ Multiplies the ten 32-bit words a[0..9] by the ten words b[0..9] and
        	@ stores ten result words in r[0..9].
        	@
        	@ The partial products a[i]*b[j] are accumulated in 64-bit register
        	@ pairs: c collects the low columns and d the high columns. After each
        	@ column the low 26 bits of d (u_k = d & M) are folded into c through
        	@ the multipliers field_R0 / field_R1, and the low 26 bits of c become
        	@ the next intermediate limb t_k, which is parked on the stack.
        	@
        	@ Output limbs: r[0], r[1], r[3..8] are stored masked to 26 bits, r[9]
        	@ is stored masked to 22 bits, and r[2] is stored as the unmasked sum
        	@ d + t2.
        	@
        	@ Straight-line leaf code: no branches and no calls. r4-r11 and lr are
        	@ saved on entry and restored on exit; r0-r3, r12 and the condition
        	@ flags are overwritten.
    38  	@ Arguments:
    39  	@  r0  r      Restrict: can overlap with a, not with b
    40  	@  r1  a
    41  	@  r2  b
    42  	@ Stack (total 4+10*4 = 44)
    43  	@  sp + #0        saved 'r' pointer
    44  	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
    45  vet_secp256k1_fe_mul_inner:
    46  	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}	@ save r4-r11 and lr
    47  	sub	sp, sp, #48			@ frame=44 + alignment
    48  	str     r0, [sp, #0]			@ save result address, we need it only at the end
    49  
    50  	/******************************************
    51  	 * Main computation code.
    52  	 ******************************************
    53  
    54  	Allocation:
    55  	    r0,r14,r7,r8   scratch
    56  	    r1       a (pointer)
    57  	    r2       b (pointer)
    58  	    r3:r4    c
    59  	    r5:r6    d
    60  	    r11:r12  c'
    61  	    r9:r10   d'
    62  
    63  	Note: do not write to r[] here, it may overlap with a[]
    64  	*/
    65  
    66  	/* A - interleaved with B */
    67  	ldr	r7, [r1, #0*4]			@ a[0]
    68  	ldr	r8, [r2, #9*4]			@ b[9]
    69  	ldr	r0, [r1, #1*4]			@ a[1]
    70  	umull	r5, r6, r7, r8			@ d = a[0] * b[9]
    71  	ldr	r14, [r2, #8*4]			@ b[8]
    72  	umull	r9, r10, r0, r8			@ d' = a[1] * b[9]
    73  	ldr	r7, [r1, #2*4]			@ a[2]
    74  	umlal	r5, r6, r0, r14			@ d += a[1] * b[8]
    75  	ldr	r8, [r2, #7*4] 			@ b[7]
    76  	umlal	r9, r10, r7, r14		@ d' += a[2] * b[8]
    77  	ldr	r0, [r1, #3*4]   		@ a[3]
    78  	umlal	r5, r6, r7, r8   		@ d += a[2] * b[7]
    79  	ldr	r14, [r2, #6*4]   		@ b[6]
    80  	umlal	r9, r10, r0, r8  		@ d' += a[3] * b[7]
    81  	ldr	r7, [r1, #4*4]   		@ a[4]
    82  	umlal	r5, r6, r0, r14   		@ d += a[3] * b[6]
    83  	ldr	r8, [r2, #5*4]   		@ b[5]
    84  	umlal	r9, r10, r7, r14  		@ d' += a[4] * b[6]
    85  	ldr	r0, [r1, #5*4]   		@ a[5]
    86  	umlal	r5, r6, r7, r8   		@ d += a[4] * b[5]
    87  	ldr	r14, [r2, #4*4]   		@ b[4]
    88  	umlal	r9, r10, r0, r8  		@ d' += a[5] * b[5]
    89  	ldr	r7, [r1, #6*4]   		@ a[6]
    90  	umlal	r5, r6, r0, r14   		@ d += a[5] * b[4]
    91  	ldr	r8, [r2, #3*4]   		@ b[3]
    92  	umlal	r9, r10, r7, r14  		@ d' += a[6] * b[4]
    93  	ldr	r0, [r1, #7*4]   		@ a[7]
    94  	umlal	r5, r6, r7, r8   		@ d += a[6] * b[3]
    95  	ldr	r14, [r2, #2*4]   		@ b[2]
    96  	umlal	r9, r10, r0, r8  		@ d' += a[7] * b[3]
    97  	ldr	r7, [r1, #8*4]   		@ a[8]
    98  	umlal	r5, r6, r0, r14   		@ d += a[7] * b[2]
    99  	ldr	r8, [r2, #1*4]   		@ b[1]
   100  	umlal	r9, r10, r7, r14  		@ d' += a[8] * b[2]
   101  	ldr	r0, [r1, #9*4]   		@ a[9]
   102  	umlal	r5, r6, r7, r8   		@ d += a[8] * b[1]
   103  	ldr	r14, [r2, #0*4]   		@ b[0]
   104  	umlal	r9, r10, r0, r8  		@ d' += a[9] * b[1]
   105  	ldr	r7, [r1, #0*4]   		@ a[0]
   106  	umlal	r5, r6, r0, r14   		@ d += a[9] * b[0]
   107  	@ r7,r14 used in B
   108  
   109  	bic	r0, r5, field_not_M 		@ t9 = d & M
   110  	str     r0, [sp, #4 + 4*9]
   111  	mov	r5, r5, lsr #26     		@ d >>= 26 
   112  	orr	r5, r5, r6, asl #6
   113  	mov     r6, r6, lsr #26
   114  
   115  	/* B */
   116  	umull	r3, r4, r7, r14   		@ c = a[0] * b[0]
   117  	adds	r5, r5, r9       		@ d += d'
   118  	adc	r6, r6, r10
   119  
   120  	bic	r0, r5, field_not_M 		@ u0 = d & M
   121  	mov	r5, r5, lsr #26     		@ d >>= 26
   122  	orr	r5, r5, r6, asl #6
   123  	mov     r6, r6, lsr #26
   124  	movw    r14, field_R0			@ c += u0 * R0
   125  	umlal   r3, r4, r0, r14
   126  
   127  	bic	r14, r3, field_not_M 		@ t0 = c & M
   128  	str	r14, [sp, #4 + 0*4]
   129  	mov	r3, r3, lsr #26     		@ c >>= 26
   130  	orr	r3, r3, r4, asl #6
   131  	mov     r4, r4, lsr #26
   132  	mov     r14, field_R1			@ c += u0 * R1
   133  	umlal   r3, r4, r0, r14
   134  
   135  	/* C - interleaved with D */
   136  	ldr	r7, [r1, #0*4]   		@ a[0]
   137  	ldr	r8, [r2, #2*4]   		@ b[2]
   138  	ldr	r14, [r2, #1*4]   		@ b[1]
   139  	umull	r11, r12, r7, r8   		@ c' = a[0] * b[2]
   140  	ldr	r0, [r1, #1*4]   		@ a[1]
   141  	umlal   r3, r4, r7, r14   		@ c += a[0] * b[1]
   142  	ldr	r8, [r2, #0*4]   		@ b[0]
   143  	umlal   r11, r12, r0, r14   		@ c' += a[1] * b[1]
   144  	ldr	r7, [r1, #2*4]   		@ a[2]
   145  	umlal   r3, r4, r0, r8   		@ c += a[1] * b[0]
   146  	ldr	r14, [r2, #9*4]   		@ b[9]
   147  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[0]
   148  	ldr	r0, [r1, #3*4]   		@ a[3]
   149  	umlal	r5, r6, r7, r14   		@ d += a[2] * b[9]
   150  	ldr	r8, [r2, #8*4]   		@ b[8]
   151  	umull	r9, r10, r0, r14   		@ d' = a[3] * b[9]
   152  	ldr	r7, [r1, #4*4]   		@ a[4]
   153  	umlal	r5, r6, r0, r8   		@ d += a[3] * b[8]
   154  	ldr	r14, [r2, #7*4]   		@ b[7]
   155  	umlal	r9, r10, r7, r8   		@ d' += a[4] * b[8]
   156  	ldr	r0, [r1, #5*4]   		@ a[5]
   157  	umlal	r5, r6, r7, r14   		@ d += a[4] * b[7]
   158  	ldr	r8, [r2, #6*4]   		@ b[6]
   159  	umlal	r9, r10, r0, r14   		@ d' += a[5] * b[7]
   160  	ldr	r7, [r1, #6*4]   		@ a[6]
   161  	umlal	r5, r6, r0, r8   		@ d += a[5] * b[6]
   162  	ldr	r14, [r2, #5*4]   		@ b[5]
   163  	umlal	r9, r10, r7, r8   		@ d' += a[6] * b[6]
   164  	ldr	r0, [r1, #7*4]   		@ a[7]
   165  	umlal	r5, r6, r7, r14   		@ d += a[6] * b[5]
   166  	ldr	r8, [r2, #4*4]   		@ b[4]
   167  	umlal	r9, r10, r0, r14   		@ d' += a[7] * b[5]
   168  	ldr	r7, [r1, #8*4]   		@ a[8]
   169  	umlal	r5, r6, r0, r8   		@ d += a[7] * b[4]
   170  	ldr	r14, [r2, #3*4]   		@ b[3]
   171  	umlal	r9, r10, r7, r8   		@ d' += a[8] * b[4]
   172  	ldr	r0, [r1, #9*4]   		@ a[9]
   173  	umlal	r5, r6, r7, r14   		@ d += a[8] * b[3]
   174  	ldr	r8, [r2, #2*4]   		@ b[2]
   175  	umlal	r9, r10, r0, r14   		@ d' += a[9] * b[3]
   176  	umlal	r5, r6, r0, r8   		@ d += a[9] * b[2]
   177  
   178  	bic	r0, r5, field_not_M 		@ u1 = d & M
   179  	mov	r5, r5, lsr #26     		@ d >>= 26
   180  	orr	r5, r5, r6, asl #6
   181  	mov     r6, r6, lsr #26
   182  	movw    r14, field_R0			@ c += u1 * R0
   183  	umlal   r3, r4, r0, r14
   184  
   185  	bic	r14, r3, field_not_M 		@ t1 = c & M
   186  	str	r14, [sp, #4 + 1*4]
   187  	mov	r3, r3, lsr #26     		@ c >>= 26
   188  	orr	r3, r3, r4, asl #6
   189  	mov     r4, r4, lsr #26
   190  	mov     r14, field_R1			@ c += u1 * R1
   191  	umlal   r3, r4, r0, r14
   192  
   193  	/* D */
   194  	adds	r3, r3, r11			@ c += c'
   195  	adc	r4, r4, r12
   196  	adds	r5, r5, r9			@ d += d'
   197  	adc	r6, r6, r10
   198  
   199  	bic	r0, r5, field_not_M 		@ u2 = d & M
   200  	mov	r5, r5, lsr #26     		@ d >>= 26
   201  	orr	r5, r5, r6, asl #6
   202  	mov     r6, r6, lsr #26
   203  	movw    r14, field_R0			@ c += u2 * R0
   204  	umlal   r3, r4, r0, r14
   205  
   206  	bic	r14, r3, field_not_M 		@ t2 = c & M
   207  	str	r14, [sp, #4 + 2*4]
   208  	mov	r3, r3, lsr #26     		@ c >>= 26
   209  	orr	r3, r3, r4, asl #6
   210  	mov     r4, r4, lsr #26
   211  	mov     r14, field_R1			@ c += u2 * R1
   212  	umlal   r3, r4, r0, r14
   213  
   214  	/* E - interleaved with F */
   215  	ldr	r7, [r1, #0*4]   		@ a[0]
   216  	ldr	r8, [r2, #4*4]   		@ b[4]
   217  	umull	r11, r12, r7, r8   		@ c' = a[0] * b[4]
   218  	ldr	r8, [r2, #3*4]   		@ b[3]
   219  	umlal   r3, r4, r7, r8   		@ c += a[0] * b[3]
   220  	ldr	r7, [r1, #1*4]   		@ a[1]
   221  	umlal   r11, r12, r7, r8   		@ c' += a[1] * b[3]
   222  	ldr	r8, [r2, #2*4]   		@ b[2]
   223  	umlal   r3, r4, r7, r8   		@ c += a[1] * b[2]
   224  	ldr	r7, [r1, #2*4]   		@ a[2]
   225  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[2]
   226  	ldr	r8, [r2, #1*4]   		@ b[1]
   227  	umlal   r3, r4, r7, r8   		@ c += a[2] * b[1]
   228  	ldr	r7, [r1, #3*4]   		@ a[3]
   229  	umlal   r11, r12, r7, r8   		@ c' += a[3] * b[1]
   230  	ldr	r8, [r2, #0*4]   		@ b[0]
   231  	umlal   r3, r4, r7, r8   		@ c += a[3] * b[0]
   232  	ldr	r7, [r1, #4*4]   		@ a[4]
   233  	umlal   r11, r12, r7, r8   		@ c' += a[4] * b[0]
   234  	ldr	r8, [r2, #9*4]   		@ b[9]
   235  	umlal	r5, r6, r7, r8   		@ d += a[4] * b[9]
   236  	ldr	r7, [r1, #5*4]   		@ a[5]
   237  	umull	r9, r10, r7, r8   		@ d' = a[5] * b[9]
   238  	ldr	r8, [r2, #8*4]   		@ b[8]
   239  	umlal	r5, r6, r7, r8   		@ d += a[5] * b[8]
   240  	ldr	r7, [r1, #6*4]   		@ a[6]
   241  	umlal	r9, r10, r7, r8   		@ d' += a[6] * b[8]
   242  	ldr	r8, [r2, #7*4]   		@ b[7]
   243  	umlal	r5, r6, r7, r8   		@ d += a[6] * b[7]
   244  	ldr	r7, [r1, #7*4]   		@ a[7]
   245  	umlal	r9, r10, r7, r8   		@ d' += a[7] * b[7]
   246  	ldr	r8, [r2, #6*4]   		@ b[6]
   247  	umlal	r5, r6, r7, r8   		@ d += a[7] * b[6]
   248  	ldr	r7, [r1, #8*4]   		@ a[8]
   249  	umlal	r9, r10, r7, r8   		@ d' += a[8] * b[6]
   250  	ldr	r8, [r2, #5*4]   		@ b[5]
   251  	umlal	r5, r6, r7, r8   		@ d += a[8] * b[5]
   252  	ldr	r7, [r1, #9*4]   		@ a[9]
   253  	umlal	r9, r10, r7, r8   		@ d' += a[9] * b[5]
   254  	ldr	r8, [r2, #4*4]   		@ b[4]
   255  	umlal	r5, r6, r7, r8   		@ d += a[9] * b[4]
   256  
   257  	bic	r0, r5, field_not_M 		@ u3 = d & M
   258  	mov	r5, r5, lsr #26     		@ d >>= 26
   259  	orr	r5, r5, r6, asl #6
   260  	mov     r6, r6, lsr #26
   261  	movw    r14, field_R0			@ c += u3 * R0
   262  	umlal   r3, r4, r0, r14
   263  
   264  	bic	r14, r3, field_not_M 		@ t3 = c & M
   265  	str	r14, [sp, #4 + 3*4]
   266  	mov	r3, r3, lsr #26     		@ c >>= 26
   267  	orr	r3, r3, r4, asl #6
   268  	mov     r4, r4, lsr #26
   269  	mov     r14, field_R1			@ c += u3 * R1
   270  	umlal   r3, r4, r0, r14
   271  
   272  	/* F */
   273  	adds	r3, r3, r11			@ c += c'
   274  	adc	r4, r4, r12
   275  	adds	r5, r5, r9			@ d += d'
   276  	adc	r6, r6, r10
   277  
   278  	bic	r0, r5, field_not_M 		@ u4 = d & M
   279  	mov	r5, r5, lsr #26     		@ d >>= 26
   280  	orr	r5, r5, r6, asl #6
   281  	mov     r6, r6, lsr #26
   282  	movw    r14, field_R0			@ c += u4 * R0
   283  	umlal   r3, r4, r0, r14
   284  
   285  	bic	r14, r3, field_not_M 		@ t4 = c & M
   286  	str	r14, [sp, #4 + 4*4]
   287  	mov	r3, r3, lsr #26     		@ c >>= 26
   288  	orr	r3, r3, r4, asl #6
   289  	mov     r4, r4, lsr #26
   290  	mov     r14, field_R1			@ c += u4 * R1
   291  	umlal   r3, r4, r0, r14
   292  
   293  	/* G - interleaved with H */
   294  	ldr	r7, [r1, #0*4]   		@ a[0]
   295  	ldr	r8, [r2, #6*4]   		@ b[6]
   296  	ldr	r14, [r2, #5*4]   		@ b[5]
   297  	umull	r11, r12, r7, r8   		@ c' = a[0] * b[6]
   298  	ldr	r0, [r1, #1*4]   		@ a[1]
   299  	umlal   r3, r4, r7, r14   		@ c += a[0] * b[5]
   300  	ldr	r8, [r2, #4*4]   		@ b[4]
   301  	umlal   r11, r12, r0, r14   		@ c' += a[1] * b[5]
   302  	ldr	r7, [r1, #2*4]   		@ a[2]
   303  	umlal   r3, r4, r0, r8   		@ c += a[1] * b[4]
   304  	ldr	r14, [r2, #3*4]   		@ b[3]
   305  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[4]
   306  	ldr	r0, [r1, #3*4]   		@ a[3]
   307  	umlal   r3, r4, r7, r14   		@ c += a[2] * b[3]
   308  	ldr	r8, [r2, #2*4]   		@ b[2]
   309  	umlal   r11, r12, r0, r14   		@ c' += a[3] * b[3]
   310  	ldr	r7, [r1, #4*4]   		@ a[4]
   311  	umlal   r3, r4, r0, r8   		@ c += a[3] * b[2]
   312  	ldr	r14, [r2, #1*4]   		@ b[1]
   313  	umlal   r11, r12, r7, r8   		@ c' += a[4] * b[2]
   314  	ldr	r0, [r1, #5*4]   		@ a[5]
   315  	umlal   r3, r4, r7, r14   		@ c += a[4] * b[1]
   316  	ldr	r8, [r2, #0*4]   		@ b[0]
   317  	umlal   r11, r12, r0, r14   		@ c' += a[5] * b[1]
   318  	ldr	r7, [r1, #6*4]   		@ a[6]
   319  	umlal   r3, r4, r0, r8   		@ c += a[5] * b[0]
   320  	ldr	r14, [r2, #9*4]   		@ b[9]
   321  	umlal   r11, r12, r7, r8   		@ c' += a[6] * b[0]
   322  	ldr	r0, [r1, #7*4]   		@ a[7]
   323  	umlal	r5, r6, r7, r14   		@ d += a[6] * b[9]
   324  	ldr	r8, [r2, #8*4]   		@ b[8]
   325  	umull	r9, r10, r0, r14   		@ d' = a[7] * b[9]
   326  	ldr	r7, [r1, #8*4]   		@ a[8]
   327  	umlal	r5, r6, r0, r8   		@ d += a[7] * b[8]
   328  	ldr	r14, [r2, #7*4]   		@ b[7]
   329  	umlal	r9, r10, r7, r8   		@ d' += a[8] * b[8]
   330  	ldr	r0, [r1, #9*4]   		@ a[9]
   331  	umlal	r5, r6, r7, r14   		@ d += a[8] * b[7]
   332  	ldr	r8, [r2, #6*4]   		@ b[6]
   333  	umlal	r9, r10, r0, r14   		@ d' += a[9] * b[7]
   334  	umlal	r5, r6, r0, r8   		@ d += a[9] * b[6]
   335  
   336  	bic	r0, r5, field_not_M 		@ u5 = d & M
   337  	mov	r5, r5, lsr #26     		@ d >>= 26
   338  	orr	r5, r5, r6, asl #6
   339  	mov     r6, r6, lsr #26
   340  	movw    r14, field_R0			@ c += u5 * R0
   341  	umlal   r3, r4, r0, r14
   342  
   343  	bic	r14, r3, field_not_M 		@ t5 = c & M
   344  	str	r14, [sp, #4 + 5*4]
   345  	mov	r3, r3, lsr #26     		@ c >>= 26
   346  	orr	r3, r3, r4, asl #6
   347  	mov     r4, r4, lsr #26
   348  	mov     r14, field_R1			@ c += u5 * R1
   349  	umlal   r3, r4, r0, r14
   350  
   351  	/* H */
   352  	adds	r3, r3, r11			@ c += c'
   353  	adc	r4, r4, r12
   354  	adds	r5, r5, r9			@ d += d'
   355  	adc	r6, r6, r10
   356  
   357  	bic	r0, r5, field_not_M 		@ u6 = d & M
   358  	mov	r5, r5, lsr #26     		@ d >>= 26
   359  	orr	r5, r5, r6, asl #6
   360  	mov     r6, r6, lsr #26
   361  	movw    r14, field_R0			@ c += u6 * R0
   362  	umlal   r3, r4, r0, r14
   363  
   364  	bic	r14, r3, field_not_M 		@ t6 = c & M
   365  	str	r14, [sp, #4 + 6*4]
   366  	mov	r3, r3, lsr #26     		@ c >>= 26
   367  	orr	r3, r3, r4, asl #6
   368  	mov     r4, r4, lsr #26
   369  	mov     r14, field_R1			@ c += u6 * R1
   370  	umlal   r3, r4, r0, r14
   371  
   372  	/* I - interleaved with J */
   373  	ldr	r8, [r2, #8*4]   		@ b[8]
   374  	ldr	r7, [r1, #0*4]   		@ a[0]
   375  	ldr	r14, [r2, #7*4]   		@ b[7]
   376  	umull   r11, r12, r7, r8   		@ c' = a[0] * b[8]
   377  	ldr	r0, [r1, #1*4]   		@ a[1]
   378  	umlal   r3, r4, r7, r14   		@ c += a[0] * b[7]
   379  	ldr	r8, [r2, #6*4]   		@ b[6]
   380  	umlal   r11, r12, r0, r14   		@ c' += a[1] * b[7]
   381  	ldr	r7, [r1, #2*4]   		@ a[2]
   382  	umlal   r3, r4, r0, r8   		@ c += a[1] * b[6]
   383  	ldr	r14, [r2, #5*4]   		@ b[5]
   384  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[6]
   385  	ldr	r0, [r1, #3*4]   		@ a[3]
   386  	umlal   r3, r4, r7, r14   		@ c += a[2] * b[5]
   387  	ldr	r8, [r2, #4*4]   		@ b[4]
   388  	umlal   r11, r12, r0, r14   		@ c' += a[3] * b[5]
   389  	ldr	r7, [r1, #4*4]   		@ a[4]
   390  	umlal   r3, r4, r0, r8   		@ c += a[3] * b[4]
   391  	ldr	r14, [r2, #3*4]   		@ b[3]
   392  	umlal   r11, r12, r7, r8   		@ c' += a[4] * b[4]
   393  	ldr	r0, [r1, #5*4]   		@ a[5]
   394  	umlal   r3, r4, r7, r14   		@ c += a[4] * b[3]
   395  	ldr	r8, [r2, #2*4]   		@ b[2]
   396  	umlal   r11, r12, r0, r14   		@ c' += a[5] * b[3]
   397  	ldr	r7, [r1, #6*4]   		@ a[6]
   398  	umlal   r3, r4, r0, r8   		@ c += a[5] * b[2]
   399  	ldr	r14, [r2, #1*4]   		@ b[1]
   400  	umlal   r11, r12, r7, r8   		@ c' += a[6] * b[2]
   401  	ldr	r0, [r1, #7*4]   		@ a[7]
   402  	umlal   r3, r4, r7, r14   		@ c += a[6] * b[1]
   403  	ldr	r8, [r2, #0*4]   		@ b[0]
   404  	umlal   r11, r12, r0, r14   		@ c' += a[7] * b[1]
   405  	ldr	r7, [r1, #8*4]   		@ a[8]
   406  	umlal   r3, r4, r0, r8   		@ c += a[7] * b[0]
   407  	ldr	r14, [r2, #9*4]   		@ b[9]
   408  	umlal   r11, r12, r7, r8   		@ c' += a[8] * b[0]
   409  	ldr	r0, [r1, #9*4]   		@ a[9]
   410  	umlal	r5, r6, r7, r14   		@ d += a[8] * b[9]
   411  	ldr	r8, [r2, #8*4]   		@ b[8]
   412  	umull	r9, r10, r0, r14  		@ d' = a[9] * b[9]
   413  	umlal	r5, r6, r0, r8   		@ d += a[9] * b[8]
   414  
   415  	bic	r0, r5, field_not_M 		@ u7 = d & M
   416  	mov	r5, r5, lsr #26     		@ d >>= 26
   417  	orr	r5, r5, r6, asl #6
   418  	mov     r6, r6, lsr #26
   419  	movw    r14, field_R0			@ c += u7 * R0
   420  	umlal   r3, r4, r0, r14
   421  
   422  	bic	r14, r3, field_not_M 		@ t7 = c & M
   423  	str	r14, [sp, #4 + 7*4]
   424  	mov	r3, r3, lsr #26     		@ c >>= 26
   425  	orr	r3, r3, r4, asl #6
   426  	mov     r4, r4, lsr #26
   427  	mov     r14, field_R1			@ c += u7 * R1
   428  	umlal   r3, r4, r0, r14
   429  
   430  	/* J */
   431  	adds	r3, r3, r11			@ c += c'
   432  	adc	r4, r4, r12
   433  	adds	r5, r5, r9			@ d += d'
   434  	adc	r6, r6, r10
   435  
   436  	bic	r0, r5, field_not_M 		@ u8 = d & M
   437  	str	r0, [sp, #4 + 8*4]
   438  	mov	r5, r5, lsr #26     		@ d >>= 26
   439  	orr	r5, r5, r6, asl #6
   440  	mov     r6, r6, lsr #26
   441  	movw    r14, field_R0			@ c += u8 * R0
   442  	umlal   r3, r4, r0, r14
   443  
   444  	/******************************************
   445  	 * compute and write back result
   446  	 ******************************************
   447  	Allocation:
   448  	    r0    r
   449  	    r3:r4 c
   450  	    r5:r6 d
   451  	    r7    t0
   452  	    r8    t1
   453  	    r9    t2
   454  	    r11   u8
   455  	    r12   t9
   456  	    r1,r2,r10,r14 scratch
   457  
   458  	Note: do not read from a[] after here, it may overlap with r[]
   459  	*/
   460  	ldr	r0, [sp, #0]			@ r0 = saved 'r' pointer
   461  	add	r1, sp, #4 + 3*4		@ r[3..7] = t3..7, r11=u8, r12=t9
   462  	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}
   463  	add	r1, r0, #3*4
   464  	stmia	r1, {r2,r7,r8,r9,r10}
   465  
   466  	bic	r2, r3, field_not_M 		@ r[8] = c & M
   467  	str	r2, [r0, #8*4]
   468  	mov	r3, r3, lsr #26     		@ c >>= 26
   469  	orr	r3, r3, r4, asl #6
   470  	mov     r4, r4, lsr #26
   471  	mov     r14, field_R1			@ c += u8 * R1
   472  	umlal   r3, r4, r11, r14
   473  	movw    r14, field_R0			@ c += d * R0
   474  	umlal   r3, r4, r5, r14
   475  	adds	r3, r3, r12			@ c += t9
   476  	adc	r4, r4, #0
   477  
   478  	add	r1, sp, #4 + 0*4		@ r7,r8,r9 = t0,t1,t2
   479  	ldmia	r1, {r7,r8,r9}
   480  
   481  	ubfx	r2, r3, #0, #22     		@ r[9] = c & (M >> 4)
   482  	str	r2, [r0, #9*4]
   483  	mov	r3, r3, lsr #22     		@ c >>= 22
   484  	orr	r3, r3, r4, asl #10
   485  	mov     r4, r4, lsr #22
   486  	movw    r14, field_R1 << 4   		@ c += d * (R1 << 4)
   487  	umlal   r3, r4, r5, r14
   488  
   489  	movw    r14, field_R0 >> 4   		@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
   490  	umull	r5, r6, r3, r14			@ d = c.lo * (R0 >> 4)
   491  	adds	r5, r5, r7	    		@ d.lo += t0
   492  	mla	r6, r14, r4, r6			@ d.hi += c.hi * (R0 >> 4)
   493  	adc	r6, r6, 0	     		@ d.hi += carry
   494  
   495  	bic	r2, r5, field_not_M 		@ r[0] = d & M
   496  	str	r2, [r0, #0*4]
   497  
   498  	mov	r5, r5, lsr #26     		@ d >>= 26
   499  	orr	r5, r5, r6, asl #6
   500  	mov     r6, r6, lsr #26
   501  	
   502  	movw    r14, field_R1 >> 4   		@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
   503  	umull	r1, r2, r3, r14       		@ tmp = c.lo * (R1 >> 4)
   504  	adds	r5, r5, r8	    		@ d.lo += t1
   505  	adc	r6, r6, #0	    		@ d.hi += carry
   506  	adds	r5, r5, r1	    		@ d.lo += tmp.lo
   507  	mla	r2, r14, r4, r2      		@ tmp.hi += c.hi * (R1 >> 4)
   508  	adc	r6, r6, r2	   		@ d.hi += carry + tmp.hi
   509  
   510  	bic	r2, r5, field_not_M 		@ r[1] = d & M
   511  	str	r2, [r0, #1*4]
   512  	mov	r5, r5, lsr #26     		@ d >>= 26 (ignore hi)
   513  	orr	r5, r5, r6, asl #6
   514  
   515  	add	r5, r5, r9	  		@ d += t2
   516  	str	r5, [r0, #2*4]      		@ r[2] = d
   517  
   518  	add	sp, sp, #48			@ release the 48-byte frame
   519  	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}	@ restore r4-r11; saved lr goes to pc (return)
   520  	.size	vet_secp256k1_fe_mul_inner, .-vet_secp256k1_fe_mul_inner
   521  
   522  	.align	2
   523  	.global vet_secp256k1_fe_sqr_inner
   524  	.type	vet_secp256k1_fe_sqr_inner, %function
        	@ vet_secp256k1_fe_sqr_inner
        	@
        	@ Squares the ten 32-bit words a[0..9] and stores ten result words in
        	@ r[0..9].
        	@
        	@ Each cross product a[i]*a[j] with i != j is computed once with one
        	@ operand doubled ("asl #1"); each square a[i]*a[i] is computed once.
        	@ The products are accumulated in 64-bit register pairs: c collects the
        	@ low columns and d the high columns. After each column the low 26 bits
        	@ of d (u_k = d & M) are folded into c through the multipliers
        	@ field_R0 / field_R1, and the low 26 bits of c become the next
        	@ intermediate limb t_k, which is parked on the stack.
        	@
        	@ Output limbs: r[0], r[1], r[3..8] are stored masked to 26 bits, r[9]
        	@ is stored masked to 22 bits, and r[2] is stored as the unmasked sum
        	@ d + t2.
        	@
        	@ Straight-line leaf code: no branches and no calls. r4-r11 and lr are
        	@ saved on entry and restored on exit; r0-r3, r12 and the condition
        	@ flags are overwritten.
   525  	@ Arguments:
   526  	@  r0  r	 Can overlap with a
   527  	@  r1  a
   528  	@ Stack (total 4+10*4 = 44)
   529  	@  sp + #0        saved 'r' pointer
   530  	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
   531  vet_secp256k1_fe_sqr_inner:
   532  	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}	@ save r4-r11 and lr
   533  	sub	sp, sp, #48			@ frame=44 + alignment
   534  	str     r0, [sp, #0]			@ save result address, we need it only at the end
   535  	/******************************************
   536  	 * Main computation code.
   537  	 ******************************************
   538  
   539  	Allocation:
   540  	    r0,r14,r2,r7,r8   scratch
   541  	    r1       a (pointer)
   542  	    r3:r4    c
   543  	    r5:r6    d
   544  	    r11:r12  c'
   545  	    r9:r10   d'
   546  
   547  	Note: do not write to r[] here, it may overlap with a[]
   548  	*/
   549  	/* A interleaved with B */
   550  	ldr	r0, [r1, #1*4]			@ a[1]*2
   551  	ldr	r7, [r1, #0*4]			@ a[0]
   552  	mov	r0, r0, asl #1
   553  	ldr	r14, [r1, #9*4]			@ a[9]
   554  	umull	r3, r4, r7, r7			@ c = a[0] * a[0]
   555  	ldr	r8, [r1, #8*4]			@ a[8]
   556  	mov	r7, r7, asl #1
   557  	umull	r5, r6, r7, r14			@ d = a[0]*2 * a[9]
   558  	ldr	r7, [r1, #2*4]			@ a[2]*2
   559  	umull	r9, r10, r0, r14		@ d' = a[1]*2 * a[9]
   560  	ldr	r14, [r1, #7*4]			@ a[7]
   561  	umlal	r5, r6, r0, r8			@ d += a[1]*2 * a[8]
   562  	mov	r7, r7, asl #1
   563  	ldr	r0, [r1, #3*4]			@ a[3]*2
   564  	umlal	r9, r10, r7, r8			@ d' += a[2]*2 * a[8]
   565  	ldr	r8, [r1, #6*4]			@ a[6]
   566  	umlal	r5, r6, r7, r14			@ d += a[2]*2 * a[7]
   567  	mov	r0, r0, asl #1
   568  	ldr	r7, [r1, #4*4]			@ a[4]*2
   569  	umlal	r9, r10, r0, r14		@ d' += a[3]*2 * a[7]
   570  	ldr	r14, [r1, #5*4]			@ a[5]
   571  	mov	r7, r7, asl #1
   572  	umlal	r5, r6, r0, r8			@ d += a[3]*2 * a[6]
   573  	umlal	r9, r10, r7, r8			@ d' += a[4]*2 * a[6]
   574  	umlal	r5, r6, r7, r14			@ d += a[4]*2 * a[5]
   575  	umlal	r9, r10, r14, r14		@ d' += a[5] * a[5]
   576  
   577  	bic	r0, r5, field_not_M 		@ t9 = d & M
   578  	str     r0, [sp, #4 + 9*4]
   579  	mov	r5, r5, lsr #26     		@ d >>= 26 
   580  	orr	r5, r5, r6, asl #6
   581  	mov     r6, r6, lsr #26
   582  
   583  	/* B */
   584  	adds	r5, r5, r9			@ d += d'
   585  	adc	r6, r6, r10
   586  
   587  	bic	r0, r5, field_not_M 		@ u0 = d & M
   588  	mov	r5, r5, lsr #26     		@ d >>= 26
   589  	orr	r5, r5, r6, asl #6
   590  	mov     r6, r6, lsr #26
   591  	movw    r14, field_R0			@ c += u0 * R0
   592  	umlal   r3, r4, r0, r14
   593  	bic	r14, r3, field_not_M 		@ t0 = c & M
   594  	str	r14, [sp, #4 + 0*4]
   595  	mov	r3, r3, lsr #26     		@ c >>= 26
   596  	orr	r3, r3, r4, asl #6
   597  	mov     r4, r4, lsr #26
   598  	mov     r14, field_R1			@ c += u0 * R1
   599  	umlal   r3, r4, r0, r14
   600  
   601  	/* C interleaved with D */
   602  	ldr	r0, [r1, #0*4]			@ a[0]*2
   603  	ldr	r14, [r1, #1*4]			@ a[1]
   604  	mov	r0, r0, asl #1
   605  	ldr	r8, [r1, #2*4]			@ a[2]
   606  	umlal	r3, r4, r0, r14			@ c += a[0]*2 * a[1]
   607  	mov	r7, r8, asl #1                  @ a[2]*2
   608  	umull	r11, r12, r14, r14		@ c' = a[1] * a[1]
   609  	ldr	r14, [r1, #9*4]			@ a[9]
   610  	umlal	r11, r12, r0, r8		@ c' += a[0]*2 * a[2]
   611  	ldr	r0, [r1, #3*4]			@ a[3]*2
   612  	ldr	r8, [r1, #8*4]			@ a[8]
   613  	umlal	r5, r6, r7, r14			@ d += a[2]*2 * a[9]
   614  	mov	r0, r0, asl #1
   615  	ldr	r7, [r1, #4*4]			@ a[4]*2
   616  	umull	r9, r10, r0, r14		@ d' = a[3]*2 * a[9]
   617  	ldr	r14, [r1, #7*4]			@ a[7]
   618  	umlal	r5, r6, r0, r8			@ d += a[3]*2 * a[8]
   619  	mov	r7, r7, asl #1
   620  	ldr	r0, [r1, #5*4]			@ a[5]*2
   621  	umlal	r9, r10, r7, r8			@ d' += a[4]*2 * a[8]
   622  	ldr	r8, [r1, #6*4]			@ a[6]
   623  	mov	r0, r0, asl #1
   624  	umlal	r5, r6, r7, r14			@ d += a[4]*2 * a[7]
   625  	umlal	r9, r10, r0, r14		@ d' += a[5]*2 * a[7]
   626  	umlal	r5, r6, r0, r8			@ d += a[5]*2 * a[6]
   627  	umlal	r9, r10, r8, r8			@ d' += a[6] * a[6]
   628  
   629  	bic	r0, r5, field_not_M 		@ u1 = d & M
   630  	mov	r5, r5, lsr #26     		@ d >>= 26
   631  	orr	r5, r5, r6, asl #6
   632  	mov     r6, r6, lsr #26
   633  	movw    r14, field_R0			@ c += u1 * R0
   634  	umlal   r3, r4, r0, r14
   635  	bic	r14, r3, field_not_M 		@ t1 = c & M
   636  	str	r14, [sp, #4 + 1*4]
   637  	mov	r3, r3, lsr #26     		@ c >>= 26
   638  	orr	r3, r3, r4, asl #6
   639  	mov     r4, r4, lsr #26
   640  	mov     r14, field_R1			@ c += u1 * R1
   641  	umlal   r3, r4, r0, r14
   642  
   643  	/* D */
   644  	adds	r3, r3, r11			@ c += c'
   645  	adc	r4, r4, r12
   646  	adds	r5, r5, r9			@ d += d'
   647  	adc	r6, r6, r10
   648  
   649  	bic	r0, r5, field_not_M 		@ u2 = d & M
   650  	mov	r5, r5, lsr #26     		@ d >>= 26
   651  	orr	r5, r5, r6, asl #6
   652  	mov     r6, r6, lsr #26
   653  	movw    r14, field_R0			@ c += u2 * R0
   654  	umlal   r3, r4, r0, r14
   655  	bic	r14, r3, field_not_M 		@ t2 = c & M
   656  	str	r14, [sp, #4 + 2*4]
   657  	mov	r3, r3, lsr #26     		@ c >>= 26
   658  	orr	r3, r3, r4, asl #6
   659  	mov     r4, r4, lsr #26
   660  	mov     r14, field_R1			@ c += u2 * R1
   661  	umlal   r3, r4, r0, r14
   662  
   663  	/* E interleaved with F */
   664  	ldr	r7, [r1, #0*4]			@ a[0]*2
   665  	ldr	r0, [r1, #1*4]			@ a[1]*2
   666  	ldr	r14, [r1, #2*4]			@ a[2]
   667  	mov	r7, r7, asl #1
   668  	ldr	r8, [r1, #3*4]			@ a[3]
   669  	ldr	r2, [r1, #4*4]			@ a[4]
   670  	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[3]
   671  	mov	r0, r0, asl #1
   672  	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[4]
   673  	mov	r2, r2, asl #1			@ a[4]*2
   674  	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[3]
   675  	ldr	r8, [r1, #9*4]			@ a[9]
   676  	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[2]
   677  	ldr	r0, [r1, #5*4]			@ a[5]*2
   678  	umlal	r11, r12, r14, r14		@ c' += a[2] * a[2]
   679  	ldr	r14, [r1, #8*4]			@ a[8]
   680  	mov	r0, r0, asl #1
   681  	umlal	r5, r6, r2, r8			@ d += a[4]*2 * a[9]
   682  	ldr	r7, [r1, #6*4]			@ a[6]*2
   683  	umull	r9, r10, r0, r8			@ d' = a[5]*2 * a[9]
   684  	mov	r7, r7, asl #1
   685  	ldr	r8, [r1, #7*4]			@ a[7]
   686  	umlal	r5, r6, r0, r14			@ d += a[5]*2 * a[8]
   687  	umlal	r9, r10, r7, r14		@ d' += a[6]*2 * a[8]
   688  	umlal	r5, r6, r7, r8			@ d += a[6]*2 * a[7]
   689  	umlal	r9, r10, r8, r8			@ d' += a[7] * a[7]
   690  
   691  	bic	r0, r5, field_not_M 		@ u3 = d & M
   692  	mov	r5, r5, lsr #26     		@ d >>= 26
   693  	orr	r5, r5, r6, asl #6
   694  	mov     r6, r6, lsr #26
   695  	movw    r14, field_R0			@ c += u3 * R0
   696  	umlal   r3, r4, r0, r14
   697  	bic	r14, r3, field_not_M 		@ t3 = c & M
   698  	str	r14, [sp, #4 + 3*4]
   699  	mov	r3, r3, lsr #26     		@ c >>= 26
   700  	orr	r3, r3, r4, asl #6
   701  	mov     r4, r4, lsr #26
   702  	mov     r14, field_R1			@ c += u3 * R1
   703  	umlal   r3, r4, r0, r14
   704  
   705  	/* F */
   706  	adds	r3, r3, r11			@ c += c'
   707  	adc	r4, r4, r12
   708  	adds	r5, r5, r9			@ d += d'
   709  	adc	r6, r6, r10
   710  
   711  	bic	r0, r5, field_not_M 		@ u4 = d & M
   712  	mov	r5, r5, lsr #26     		@ d >>= 26
   713  	orr	r5, r5, r6, asl #6
   714  	mov     r6, r6, lsr #26
   715  	movw    r14, field_R0			@ c += u4 * R0
   716  	umlal   r3, r4, r0, r14
   717  	bic	r14, r3, field_not_M 		@ t4 = c & M
   718  	str	r14, [sp, #4 + 4*4]
   719  	mov	r3, r3, lsr #26     		@ c >>= 26
   720  	orr	r3, r3, r4, asl #6
   721  	mov     r4, r4, lsr #26
   722  	mov     r14, field_R1			@ c += u4 * R1
   723  	umlal   r3, r4, r0, r14
   724  
   725  	/* G interleaved with H */
   726  	ldr	r7, [r1, #0*4]			@ a[0]*2
   727  	ldr	r0, [r1, #1*4]			@ a[1]*2
   728  	mov	r7, r7, asl #1
   729  	ldr	r8, [r1, #5*4]			@ a[5]
   730  	ldr	r2, [r1, #6*4]			@ a[6]
   731  	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[5]
   732  	ldr	r14, [r1, #4*4]			@ a[4]
   733  	mov	r0, r0, asl #1
   734  	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[6]
   735  	ldr	r7, [r1, #2*4]			@ a[2]*2
   736  	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[5]
   737  	mov	r7, r7, asl #1
   738  	ldr	r8, [r1, #3*4]			@ a[3]
   739  	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[4]
   740  	mov	r0, r2, asl #1			@ a[6]*2
   741  	umlal	r11, r12, r7, r14		@ c' += a[2]*2 * a[4]
   742  	ldr	r14, [r1, #9*4]			@ a[9]
   743  	umlal	r3, r4, r7, r8			@ c += a[2]*2 * a[3]
   744  	ldr	r7, [r1, #7*4]			@ a[7]*2
   745  	umlal	r11, r12, r8, r8		@ c' += a[3] * a[3]
   746  	mov	r7, r7, asl #1
   747  	ldr	r8, [r1, #8*4]			@ a[8]
   748  	umlal	r5, r6, r0, r14			@ d += a[6]*2 * a[9]
   749  	umull	r9, r10, r7, r14		@ d' = a[7]*2 * a[9]
   750  	umlal	r5, r6, r7, r8			@ d += a[7]*2 * a[8]
   751  	umlal	r9, r10, r8, r8			@ d' += a[8] * a[8]
   752  
   753  	bic	r0, r5, field_not_M 		@ u5 = d & M
   754  	mov	r5, r5, lsr #26     		@ d >>= 26
   755  	orr	r5, r5, r6, asl #6
   756  	mov     r6, r6, lsr #26
   757  	movw    r14, field_R0			@ c += u5 * R0
   758  	umlal   r3, r4, r0, r14
   759  	bic	r14, r3, field_not_M 		@ t5 = c & M
   760  	str	r14, [sp, #4 + 5*4]
   761  	mov	r3, r3, lsr #26     		@ c >>= 26
   762  	orr	r3, r3, r4, asl #6
   763  	mov     r4, r4, lsr #26
   764  	mov     r14, field_R1			@ c += u5 * R1
   765  	umlal   r3, r4, r0, r14
   766  
   767  	/* H */
   768  	adds	r3, r3, r11			@ c += c'
   769  	adc	r4, r4, r12
   770  	adds	r5, r5, r9			@ d += d'
   771  	adc	r6, r6, r10
   772  
   773  	bic	r0, r5, field_not_M 		@ u6 = d & M
   774  	mov	r5, r5, lsr #26     		@ d >>= 26
   775  	orr	r5, r5, r6, asl #6
   776  	mov     r6, r6, lsr #26
   777  	movw    r14, field_R0			@ c += u6 * R0
   778  	umlal   r3, r4, r0, r14
   779  	bic	r14, r3, field_not_M 		@ t6 = c & M
   780  	str	r14, [sp, #4 + 6*4]
   781  	mov	r3, r3, lsr #26     		@ c >>= 26
   782  	orr	r3, r3, r4, asl #6
   783  	mov     r4, r4, lsr #26
   784  	mov     r14, field_R1			@ c += u6 * R1
   785  	umlal   r3, r4, r0, r14
   786  
   787  	/* I interleaved with J */
   788  	ldr	r7, [r1, #0*4]			@ a[0]*2
   789  	ldr	r0, [r1, #1*4]			@ a[1]*2
   790  	mov	r7, r7, asl #1
   791  	ldr	r8, [r1, #7*4]			@ a[7]
   792  	ldr	r2, [r1, #8*4]			@ a[8]
   793  	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[7]
   794  	ldr	r14, [r1, #6*4]			@ a[6]
   795  	mov	r0, r0, asl #1
   796  	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[8]
   797  	ldr	r7, [r1, #2*4]			@ a[2]*2
   798  	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[7]
   799  	ldr	r8, [r1, #5*4]			@ a[5]
   800  	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[6]
   801  	ldr	r0, [r1, #3*4]			@ a[3]*2
   802  	mov	r7, r7, asl #1
   803  	umlal	r11, r12, r7, r14		@ c' += a[2]*2 * a[6]
   804  	ldr	r14, [r1, #4*4]			@ a[4]
   805  	mov	r0, r0, asl #1
   806  	umlal	r3, r4, r7, r8			@ c += a[2]*2 * a[5]
   807  	mov	r2, r2, asl #1			@ a[8]*2
   808  	umlal	r11, r12, r0, r8		@ c' += a[3]*2 * a[5]
   809  	umlal	r3, r4, r0, r14			@ c += a[3]*2 * a[4]
   810  	umlal	r11, r12, r14, r14		@ c' += a[4] * a[4]
   811  	ldr	r8, [r1, #9*4]			@ a[9]
   812  	umlal	r5, r6, r2, r8			@ d += a[8]*2 * a[9]
   813  	@ r8 will be used in J
   814  
   815  	bic	r0, r5, field_not_M 		@ u7 = d & M
   816  	mov	r5, r5, lsr #26     		@ d >>= 26
   817  	orr	r5, r5, r6, asl #6
   818  	mov     r6, r6, lsr #26
   819  	movw    r14, field_R0			@ c += u7 * R0
   820  	umlal   r3, r4, r0, r14
   821  	bic	r14, r3, field_not_M 		@ t7 = c & M
   822  	str	r14, [sp, #4 + 7*4]
   823  	mov	r3, r3, lsr #26     		@ c >>= 26
   824  	orr	r3, r3, r4, asl #6
   825  	mov     r4, r4, lsr #26
   826  	mov     r14, field_R1			@ c += u7 * R1
   827  	umlal   r3, r4, r0, r14
   828  
   829  	/* J */
   830  	adds	r3, r3, r11			@ c += c'
   831  	adc	r4, r4, r12
   832  	umlal	r5, r6, r8, r8			@ d += a[9] * a[9]
   833  
   834  	bic	r0, r5, field_not_M 		@ u8 = d & M
   835  	str	r0, [sp, #4 + 8*4]
   836  	mov	r5, r5, lsr #26     		@ d >>= 26
   837  	orr	r5, r5, r6, asl #6
   838  	mov     r6, r6, lsr #26
   839  	movw    r14, field_R0			@ c += u8 * R0
   840  	umlal   r3, r4, r0, r14
   841  
   842  	/******************************************
   843  	 * compute and write back result
   844  	 ******************************************
   845  	Allocation:
   846  	    r0    r
   847  	    r3:r4 c
   848  	    r5:r6 d
   849  	    r7    t0
   850  	    r8    t1
   851  	    r9    t2
   852  	    r11   u8
   853  	    r12   t9
   854  	    r1,r2,r10,r14 scratch
   855  
   856  	Note: do not read from a[] after here, it may overlap with r[]
   857  	*/
   858  	ldr	r0, [sp, #0]			@ r0 = saved 'r' pointer
   859  	add	r1, sp, #4 + 3*4		@ r[3..7] = t3..7, r11=u8, r12=t9
   860  	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}
   861  	add	r1, r0, #3*4
   862  	stmia	r1, {r2,r7,r8,r9,r10}
   863  
   864  	bic	r2, r3, field_not_M 		@ r[8] = c & M
   865  	str	r2, [r0, #8*4]
   866  	mov	r3, r3, lsr #26     		@ c >>= 26
   867  	orr	r3, r3, r4, asl #6
   868  	mov     r4, r4, lsr #26
   869  	mov     r14, field_R1			@ c += u8 * R1
   870  	umlal   r3, r4, r11, r14
   871  	movw    r14, field_R0			@ c += d * R0
   872  	umlal   r3, r4, r5, r14
   873  	adds	r3, r3, r12			@ c += t9
   874  	adc	r4, r4, #0
   875  
   876  	add	r1, sp, #4 + 0*4		@ r7,r8,r9 = t0,t1,t2
   877  	ldmia	r1, {r7,r8,r9}
   878  
   879  	ubfx	r2, r3, #0, #22     		@ r[9] = c & (M >> 4)
   880  	str	r2, [r0, #9*4]
   881  	mov	r3, r3, lsr #22     		@ c >>= 22
   882  	orr	r3, r3, r4, asl #10
   883  	mov     r4, r4, lsr #22
   884  	movw    r14, field_R1 << 4   		@ c += d * (R1 << 4)
   885  	umlal   r3, r4, r5, r14
   886  
   887  	movw    r14, field_R0 >> 4   		@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
   888  	umull	r5, r6, r3, r14			@ d = c.lo * (R0 >> 4)
   889  	adds	r5, r5, r7	    		@ d.lo += t0
   890  	mla	r6, r14, r4, r6			@ d.hi += c.hi * (R0 >> 4)
   891  	adc	r6, r6, 0	     		@ d.hi += carry
   892  
   893  	bic	r2, r5, field_not_M 		@ r[0] = d & M
   894  	str	r2, [r0, #0*4]
   895  
   896  	mov	r5, r5, lsr #26     		@ d >>= 26
   897  	orr	r5, r5, r6, asl #6
   898  	mov     r6, r6, lsr #26
   899  	
   900  	movw    r14, field_R1 >> 4   		@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
   901  	umull	r1, r2, r3, r14       		@ tmp = c.lo * (R1 >> 4)
   902  	adds	r5, r5, r8	    		@ d.lo += t1
   903  	adc	r6, r6, #0	    		@ d.hi += carry
   904  	adds	r5, r5, r1	    		@ d.lo += tmp.lo
   905  	mla	r2, r14, r4, r2      		@ tmp.hi += c.hi * (R1 >> 4)
   906  	adc	r6, r6, r2	   		@ d.hi += carry + tmp.hi
   907  
   908  	bic	r2, r5, field_not_M 		@ r[1] = d & M
   909  	str	r2, [r0, #1*4]
   910  	mov	r5, r5, lsr #26     		@ d >>= 26 (ignore hi)
   911  	orr	r5, r5, r6, asl #6
   912  
   913  	add	r5, r5, r9	  		@ d += t2
   914  	str	r5, [r0, #2*4]      		@ r[2] = d
   915  
   916  	add	sp, sp, #48			@ release the 48-byte frame
   917  	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}	@ restore r4-r11; saved lr goes to pc (return)
   918  	.size	vet_secp256k1_fe_sqr_inner, .-vet_secp256k1_fe_sqr_inner
   919