github.com/ethereum/go-ethereum@v1.16.1/crypto/secp256k1/libsecp256k1/src/asm/field_10x26_arm.s (about)

     1  @ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
     2  /***********************************************************************
     3   * Copyright (c) 2014 Wladimir J. van der Laan                         *
     4   * Distributed under the MIT software license, see the accompanying    *
     5   * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
     6   ***********************************************************************/
     7  /*
     8  ARM implementation of field_10x26 inner loops.
     9  
    10  Note:
    11  
    12  - To avoid unnecessary loads and make use of the available registers, the
    13    'passes' are interleaved in pairs: each odd pass also accumulates c' and d',
    14    which are then added to c and d respectively in the following even pass.
    15  
    16  */
    17  
    18  	.syntax unified
    19  	@ eabi attributes - see readelf -A
    20  	.eabi_attribute 24, 1 @ Tag_ABI_align_needed = 8-byte
    21  	.eabi_attribute 25, 1 @ Tag_ABI_align_preserved = 8-byte, except leaf SP
    22  	.text
    23  
    24  	@ Field constants
        	@ M = 0x3ffffff is the 26-bit limb mask. The code below applies it as
        	@ "bic x, y, field_not_M", i.e. x = y & M.
        	@ field_R0 and field_R1 are the multipliers used to fold a limb u taken
        	@ from the high accumulator d back into the low accumulator c: u * R0 is
        	@ added at the current limb position and u * R1 at the next one.
        	@ NOTE(review): presumably R0/R1 are the two low 26-bit limbs of
        	@ 2^260 mod p - confirm against the C implementation.
    25  	.set field_R0, 0x3d10
    26  	.set field_R1, 0x400
    27  	.set field_not_M, 0xfc000000	@ ~M = ~0x3ffffff
    28  
    29  	.align	2
    30  	.global secp256k1_fe_mul_inner
    31  	.type	secp256k1_fe_mul_inner, %function
    32  	.hidden secp256k1_fe_mul_inner
        	@ secp256k1_fe_mul_inner: multiply two field elements, each held as ten
        	@ 26-bit limbs (a[0..9], b[0..9]), and store the ten-limb result in
        	@ r[0..9]. The product limbs above r[9] are folded back into the low
        	@ limbs with field_R0 / field_R1 as the passes proceed.
        	@
        	@ The code is straight-line (no branches). r4-r11 are saved and
        	@ restored; r0-r3, r12 and the flags are clobbered.
    33  	@ Arguments:
    34  	@  r0  r      Restrict: can overlap with a, not with b
    35  	@  r1  a
    36  	@  r2  b
    37  	@ Stack (total 4+10*4 = 44)
    38  	@  sp + #0        saved 'r' pointer
    39  	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
        	@ (48 bytes are reserved below; the last 4 are unused padding)
    40  secp256k1_fe_mul_inner:
    41  	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}	@ save callee-saved registers and the return address
    42  	sub	sp, sp, #48			@ frame=44 + alignment
    43  	str     r0, [sp, #0]			@ save result address, we need it only at the end
    44  
    45  	/******************************************
    46  	 * Main computation code.
    47  	 ******************************************
    48  
    49  	Allocation:
    50  	    r0,r14,r7,r8   scratch
    51  	    r1       a (pointer)
    52  	    r2       b (pointer)
    53  	    r3:r4    c
    54  	    r5:r6    d
    55  	    r11:r12  c'
    56  	    r9:r10   d'
    57  
    58  	Note: do not write to r[] here, it may overlap with a[]
    59  	*/
    60  
    61  	/* A - interleaved with B */
    62  	ldr	r7, [r1, #0*4]			@ a[0]
    63  	ldr	r8, [r2, #9*4]			@ b[9]
    64  	ldr	r0, [r1, #1*4]			@ a[1]
    65  	umull	r5, r6, r7, r8			@ d = a[0] * b[9]
    66  	ldr	r14, [r2, #8*4]			@ b[8]
    67  	umull	r9, r10, r0, r8			@ d' = a[1] * b[9]
    68  	ldr	r7, [r1, #2*4]			@ a[2]
    69  	umlal	r5, r6, r0, r14			@ d += a[1] * b[8]
    70  	ldr	r8, [r2, #7*4] 			@ b[7]
    71  	umlal	r9, r10, r7, r14		@ d' += a[2] * b[8]
    72  	ldr	r0, [r1, #3*4]   		@ a[3]
    73  	umlal	r5, r6, r7, r8   		@ d += a[2] * b[7]
    74  	ldr	r14, [r2, #6*4]   		@ b[6]
    75  	umlal	r9, r10, r0, r8  		@ d' += a[3] * b[7]
    76  	ldr	r7, [r1, #4*4]   		@ a[4]
    77  	umlal	r5, r6, r0, r14   		@ d += a[3] * b[6]
    78  	ldr	r8, [r2, #5*4]   		@ b[5]
    79  	umlal	r9, r10, r7, r14  		@ d' += a[4] * b[6]
    80  	ldr	r0, [r1, #5*4]   		@ a[5]
    81  	umlal	r5, r6, r7, r8   		@ d += a[4] * b[5]
    82  	ldr	r14, [r2, #4*4]   		@ b[4]
    83  	umlal	r9, r10, r0, r8  		@ d' += a[5] * b[5]
    84  	ldr	r7, [r1, #6*4]   		@ a[6]
    85  	umlal	r5, r6, r0, r14   		@ d += a[5] * b[4]
    86  	ldr	r8, [r2, #3*4]   		@ b[3]
    87  	umlal	r9, r10, r7, r14  		@ d' += a[6] * b[4]
    88  	ldr	r0, [r1, #7*4]   		@ a[7]
    89  	umlal	r5, r6, r7, r8   		@ d += a[6] * b[3]
    90  	ldr	r14, [r2, #2*4]   		@ b[2]
    91  	umlal	r9, r10, r0, r8  		@ d' += a[7] * b[3]
    92  	ldr	r7, [r1, #8*4]   		@ a[8]
    93  	umlal	r5, r6, r0, r14   		@ d += a[7] * b[2]
    94  	ldr	r8, [r2, #1*4]   		@ b[1]
    95  	umlal	r9, r10, r7, r14  		@ d' += a[8] * b[2]
    96  	ldr	r0, [r1, #9*4]   		@ a[9]
    97  	umlal	r5, r6, r7, r8   		@ d += a[8] * b[1]
    98  	ldr	r14, [r2, #0*4]   		@ b[0]
    99  	umlal	r9, r10, r0, r8  		@ d' += a[9] * b[1]
   100  	ldr	r7, [r1, #0*4]   		@ a[0]
   101  	umlal	r5, r6, r0, r14   		@ d += a[9] * b[0]
   102  	@ r7,r14 used in B
   103  
   104  	bic	r0, r5, field_not_M 		@ t9 = d & M
   105  	str     r0, [sp, #4 + 4*9]		@ spill t9 to its stack slot
   106  	mov	r5, r5, lsr #26     		@ d >>= 26 
   107  	orr	r5, r5, r6, asl #6		@ (64-bit shift: the low 26 bits of d.hi move into the top of d.lo)
   108  	mov     r6, r6, lsr #26
   109  
   110  	/* B */
   111  	umull	r3, r4, r7, r14   		@ c = a[0] * b[0]
   112  	adds	r5, r5, r9       		@ d += d'
   113  	adc	r6, r6, r10
   114  
        	@ Reduction step, repeated after every pass below: take the low 26 bits
        	@ of d as u, shift d right by 26, add u * R0 to c, store the low 26 bits
        	@ of c as the next t limb, shift c right by 26, then add u * R1 to c.
        	@ R0 is loaded with movw (16-bit immediate); R1 = 0x400 fits an ordinary
        	@ mov immediate.
   115  	bic	r0, r5, field_not_M 		@ u0 = d & M
   116  	mov	r5, r5, lsr #26     		@ d >>= 26
   117  	orr	r5, r5, r6, asl #6
   118  	mov     r6, r6, lsr #26
   119  	movw    r14, field_R0			@ c += u0 * R0
   120  	umlal   r3, r4, r0, r14
   121  
   122  	bic	r14, r3, field_not_M 		@ t0 = c & M
   123  	str	r14, [sp, #4 + 0*4]
   124  	mov	r3, r3, lsr #26     		@ c >>= 26
   125  	orr	r3, r3, r4, asl #6
   126  	mov     r4, r4, lsr #26
   127  	mov     r14, field_R1			@ c += u0 * R1
   128  	umlal   r3, r4, r0, r14
   129  
   130  	/* C - interleaved with D */
   131  	ldr	r7, [r1, #0*4]   		@ a[0]
   132  	ldr	r8, [r2, #2*4]   		@ b[2]
   133  	ldr	r14, [r2, #1*4]   		@ b[1]
   134  	umull	r11, r12, r7, r8   		@ c' = a[0] * b[2]
   135  	ldr	r0, [r1, #1*4]   		@ a[1]
   136  	umlal   r3, r4, r7, r14   		@ c += a[0] * b[1]
   137  	ldr	r8, [r2, #0*4]   		@ b[0]
   138  	umlal   r11, r12, r0, r14   		@ c' += a[1] * b[1]
   139  	ldr	r7, [r1, #2*4]   		@ a[2]
   140  	umlal   r3, r4, r0, r8   		@ c += a[1] * b[0]
   141  	ldr	r14, [r2, #9*4]   		@ b[9]
   142  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[0]
   143  	ldr	r0, [r1, #3*4]   		@ a[3]
   144  	umlal	r5, r6, r7, r14   		@ d += a[2] * b[9]
   145  	ldr	r8, [r2, #8*4]   		@ b[8]
   146  	umull	r9, r10, r0, r14   		@ d' = a[3] * b[9]
   147  	ldr	r7, [r1, #4*4]   		@ a[4]
   148  	umlal	r5, r6, r0, r8   		@ d += a[3] * b[8]
   149  	ldr	r14, [r2, #7*4]   		@ b[7]
   150  	umlal	r9, r10, r7, r8   		@ d' += a[4] * b[8]
   151  	ldr	r0, [r1, #5*4]   		@ a[5]
   152  	umlal	r5, r6, r7, r14   		@ d += a[4] * b[7]
   153  	ldr	r8, [r2, #6*4]   		@ b[6]
   154  	umlal	r9, r10, r0, r14   		@ d' += a[5] * b[7]
   155  	ldr	r7, [r1, #6*4]   		@ a[6]
   156  	umlal	r5, r6, r0, r8   		@ d += a[5] * b[6]
   157  	ldr	r14, [r2, #5*4]   		@ b[5]
   158  	umlal	r9, r10, r7, r8   		@ d' += a[6] * b[6]
   159  	ldr	r0, [r1, #7*4]   		@ a[7]
   160  	umlal	r5, r6, r7, r14   		@ d += a[6] * b[5]
   161  	ldr	r8, [r2, #4*4]   		@ b[4]
   162  	umlal	r9, r10, r0, r14   		@ d' += a[7] * b[5]
   163  	ldr	r7, [r1, #8*4]   		@ a[8]
   164  	umlal	r5, r6, r0, r8   		@ d += a[7] * b[4]
   165  	ldr	r14, [r2, #3*4]   		@ b[3]
   166  	umlal	r9, r10, r7, r8   		@ d' += a[8] * b[4]
   167  	ldr	r0, [r1, #9*4]   		@ a[9]
   168  	umlal	r5, r6, r7, r14   		@ d += a[8] * b[3]
   169  	ldr	r8, [r2, #2*4]   		@ b[2]
   170  	umlal	r9, r10, r0, r14   		@ d' += a[9] * b[3]
   171  	umlal	r5, r6, r0, r8   		@ d += a[9] * b[2]
   172  
   173  	bic	r0, r5, field_not_M 		@ u1 = d & M
   174  	mov	r5, r5, lsr #26     		@ d >>= 26
   175  	orr	r5, r5, r6, asl #6
   176  	mov     r6, r6, lsr #26
   177  	movw    r14, field_R0			@ c += u1 * R0
   178  	umlal   r3, r4, r0, r14
   179  
   180  	bic	r14, r3, field_not_M 		@ t1 = c & M
   181  	str	r14, [sp, #4 + 1*4]
   182  	mov	r3, r3, lsr #26     		@ c >>= 26
   183  	orr	r3, r3, r4, asl #6
   184  	mov     r4, r4, lsr #26
   185  	mov     r14, field_R1			@ c += u1 * R1
   186  	umlal   r3, r4, r0, r14
   187  
   188  	/* D */
   189  	adds	r3, r3, r11			@ c += c'
   190  	adc	r4, r4, r12
   191  	adds	r5, r5, r9			@ d += d'
   192  	adc	r6, r6, r10
   193  
   194  	bic	r0, r5, field_not_M 		@ u2 = d & M
   195  	mov	r5, r5, lsr #26     		@ d >>= 26
   196  	orr	r5, r5, r6, asl #6
   197  	mov     r6, r6, lsr #26
   198  	movw    r14, field_R0			@ c += u2 * R0
   199  	umlal   r3, r4, r0, r14
   200  
   201  	bic	r14, r3, field_not_M 		@ t2 = c & M
   202  	str	r14, [sp, #4 + 2*4]
   203  	mov	r3, r3, lsr #26     		@ c >>= 26
   204  	orr	r3, r3, r4, asl #6
   205  	mov     r4, r4, lsr #26
   206  	mov     r14, field_R1			@ c += u2 * R1
   207  	umlal   r3, r4, r0, r14
   208  
   209  	/* E - interleaved with F */
   210  	ldr	r7, [r1, #0*4]   		@ a[0]
   211  	ldr	r8, [r2, #4*4]   		@ b[4]
   212  	umull	r11, r12, r7, r8   		@ c' = a[0] * b[4]
   213  	ldr	r8, [r2, #3*4]   		@ b[3]
   214  	umlal   r3, r4, r7, r8   		@ c += a[0] * b[3]
   215  	ldr	r7, [r1, #1*4]   		@ a[1]
   216  	umlal   r11, r12, r7, r8   		@ c' += a[1] * b[3]
   217  	ldr	r8, [r2, #2*4]   		@ b[2]
   218  	umlal   r3, r4, r7, r8   		@ c += a[1] * b[2]
   219  	ldr	r7, [r1, #2*4]   		@ a[2]
   220  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[2]
   221  	ldr	r8, [r2, #1*4]   		@ b[1]
   222  	umlal   r3, r4, r7, r8   		@ c += a[2] * b[1]
   223  	ldr	r7, [r1, #3*4]   		@ a[3]
   224  	umlal   r11, r12, r7, r8   		@ c' += a[3] * b[1]
   225  	ldr	r8, [r2, #0*4]   		@ b[0]
   226  	umlal   r3, r4, r7, r8   		@ c += a[3] * b[0]
   227  	ldr	r7, [r1, #4*4]   		@ a[4]
   228  	umlal   r11, r12, r7, r8   		@ c' += a[4] * b[0]
   229  	ldr	r8, [r2, #9*4]   		@ b[9]
   230  	umlal	r5, r6, r7, r8   		@ d += a[4] * b[9]
   231  	ldr	r7, [r1, #5*4]   		@ a[5]
   232  	umull	r9, r10, r7, r8   		@ d' = a[5] * b[9]
   233  	ldr	r8, [r2, #8*4]   		@ b[8]
   234  	umlal	r5, r6, r7, r8   		@ d += a[5] * b[8]
   235  	ldr	r7, [r1, #6*4]   		@ a[6]
   236  	umlal	r9, r10, r7, r8   		@ d' += a[6] * b[8]
   237  	ldr	r8, [r2, #7*4]   		@ b[7]
   238  	umlal	r5, r6, r7, r8   		@ d += a[6] * b[7]
   239  	ldr	r7, [r1, #7*4]   		@ a[7]
   240  	umlal	r9, r10, r7, r8   		@ d' += a[7] * b[7]
   241  	ldr	r8, [r2, #6*4]   		@ b[6]
   242  	umlal	r5, r6, r7, r8   		@ d += a[7] * b[6]
   243  	ldr	r7, [r1, #8*4]   		@ a[8]
   244  	umlal	r9, r10, r7, r8   		@ d' += a[8] * b[6]
   245  	ldr	r8, [r2, #5*4]   		@ b[5]
   246  	umlal	r5, r6, r7, r8   		@ d += a[8] * b[5]
   247  	ldr	r7, [r1, #9*4]   		@ a[9]
   248  	umlal	r9, r10, r7, r8   		@ d' += a[9] * b[5]
   249  	ldr	r8, [r2, #4*4]   		@ b[4]
   250  	umlal	r5, r6, r7, r8   		@ d += a[9] * b[4]
   251  
   252  	bic	r0, r5, field_not_M 		@ u3 = d & M
   253  	mov	r5, r5, lsr #26     		@ d >>= 26
   254  	orr	r5, r5, r6, asl #6
   255  	mov     r6, r6, lsr #26
   256  	movw    r14, field_R0			@ c += u3 * R0
   257  	umlal   r3, r4, r0, r14
   258  
   259  	bic	r14, r3, field_not_M 		@ t3 = c & M
   260  	str	r14, [sp, #4 + 3*4]
   261  	mov	r3, r3, lsr #26     		@ c >>= 26
   262  	orr	r3, r3, r4, asl #6
   263  	mov     r4, r4, lsr #26
   264  	mov     r14, field_R1			@ c += u3 * R1
   265  	umlal   r3, r4, r0, r14
   266  
   267  	/* F */
   268  	adds	r3, r3, r11			@ c += c'
   269  	adc	r4, r4, r12
   270  	adds	r5, r5, r9			@ d += d'
   271  	adc	r6, r6, r10
   272  
   273  	bic	r0, r5, field_not_M 		@ u4 = d & M
   274  	mov	r5, r5, lsr #26     		@ d >>= 26
   275  	orr	r5, r5, r6, asl #6
   276  	mov     r6, r6, lsr #26
   277  	movw    r14, field_R0			@ c += u4 * R0
   278  	umlal   r3, r4, r0, r14
   279  
   280  	bic	r14, r3, field_not_M 		@ t4 = c & M
   281  	str	r14, [sp, #4 + 4*4]
   282  	mov	r3, r3, lsr #26     		@ c >>= 26
   283  	orr	r3, r3, r4, asl #6
   284  	mov     r4, r4, lsr #26
   285  	mov     r14, field_R1			@ c += u4 * R1
   286  	umlal   r3, r4, r0, r14
   287  
   288  	/* G - interleaved with H */
   289  	ldr	r7, [r1, #0*4]   		@ a[0]
   290  	ldr	r8, [r2, #6*4]   		@ b[6]
   291  	ldr	r14, [r2, #5*4]   		@ b[5]
   292  	umull	r11, r12, r7, r8   		@ c' = a[0] * b[6]
   293  	ldr	r0, [r1, #1*4]   		@ a[1]
   294  	umlal   r3, r4, r7, r14   		@ c += a[0] * b[5]
   295  	ldr	r8, [r2, #4*4]   		@ b[4]
   296  	umlal   r11, r12, r0, r14   		@ c' += a[1] * b[5]
   297  	ldr	r7, [r1, #2*4]   		@ a[2]
   298  	umlal   r3, r4, r0, r8   		@ c += a[1] * b[4]
   299  	ldr	r14, [r2, #3*4]   		@ b[3]
   300  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[4]
   301  	ldr	r0, [r1, #3*4]   		@ a[3]
   302  	umlal   r3, r4, r7, r14   		@ c += a[2] * b[3]
   303  	ldr	r8, [r2, #2*4]   		@ b[2]
   304  	umlal   r11, r12, r0, r14   		@ c' += a[3] * b[3]
   305  	ldr	r7, [r1, #4*4]   		@ a[4]
   306  	umlal   r3, r4, r0, r8   		@ c += a[3] * b[2]
   307  	ldr	r14, [r2, #1*4]   		@ b[1]
   308  	umlal   r11, r12, r7, r8   		@ c' += a[4] * b[2]
   309  	ldr	r0, [r1, #5*4]   		@ a[5]
   310  	umlal   r3, r4, r7, r14   		@ c += a[4] * b[1]
   311  	ldr	r8, [r2, #0*4]   		@ b[0]
   312  	umlal   r11, r12, r0, r14   		@ c' += a[5] * b[1]
   313  	ldr	r7, [r1, #6*4]   		@ a[6]
   314  	umlal   r3, r4, r0, r8   		@ c += a[5] * b[0]
   315  	ldr	r14, [r2, #9*4]   		@ b[9]
   316  	umlal   r11, r12, r7, r8   		@ c' += a[6] * b[0]
   317  	ldr	r0, [r1, #7*4]   		@ a[7]
   318  	umlal	r5, r6, r7, r14   		@ d += a[6] * b[9]
   319  	ldr	r8, [r2, #8*4]   		@ b[8]
   320  	umull	r9, r10, r0, r14   		@ d' = a[7] * b[9]
   321  	ldr	r7, [r1, #8*4]   		@ a[8]
   322  	umlal	r5, r6, r0, r8   		@ d += a[7] * b[8]
   323  	ldr	r14, [r2, #7*4]   		@ b[7]
   324  	umlal	r9, r10, r7, r8   		@ d' += a[8] * b[8]
   325  	ldr	r0, [r1, #9*4]   		@ a[9]
   326  	umlal	r5, r6, r7, r14   		@ d += a[8] * b[7]
   327  	ldr	r8, [r2, #6*4]   		@ b[6]
   328  	umlal	r9, r10, r0, r14   		@ d' += a[9] * b[7]
   329  	umlal	r5, r6, r0, r8   		@ d += a[9] * b[6]
   330  
   331  	bic	r0, r5, field_not_M 		@ u5 = d & M
   332  	mov	r5, r5, lsr #26     		@ d >>= 26
   333  	orr	r5, r5, r6, asl #6
   334  	mov     r6, r6, lsr #26
   335  	movw    r14, field_R0			@ c += u5 * R0
   336  	umlal   r3, r4, r0, r14
   337  
   338  	bic	r14, r3, field_not_M 		@ t5 = c & M
   339  	str	r14, [sp, #4 + 5*4]
   340  	mov	r3, r3, lsr #26     		@ c >>= 26
   341  	orr	r3, r3, r4, asl #6
   342  	mov     r4, r4, lsr #26
   343  	mov     r14, field_R1			@ c += u5 * R1
   344  	umlal   r3, r4, r0, r14
   345  
   346  	/* H */
   347  	adds	r3, r3, r11			@ c += c'
   348  	adc	r4, r4, r12
   349  	adds	r5, r5, r9			@ d += d'
   350  	adc	r6, r6, r10
   351  
   352  	bic	r0, r5, field_not_M 		@ u6 = d & M
   353  	mov	r5, r5, lsr #26     		@ d >>= 26
   354  	orr	r5, r5, r6, asl #6
   355  	mov     r6, r6, lsr #26
   356  	movw    r14, field_R0			@ c += u6 * R0
   357  	umlal   r3, r4, r0, r14
   358  
   359  	bic	r14, r3, field_not_M 		@ t6 = c & M
   360  	str	r14, [sp, #4 + 6*4]
   361  	mov	r3, r3, lsr #26     		@ c >>= 26
   362  	orr	r3, r3, r4, asl #6
   363  	mov     r4, r4, lsr #26
   364  	mov     r14, field_R1			@ c += u6 * R1
   365  	umlal   r3, r4, r0, r14
   366  
   367  	/* I - interleaved with J */
   368  	ldr	r8, [r2, #8*4]   		@ b[8]
   369  	ldr	r7, [r1, #0*4]   		@ a[0]
   370  	ldr	r14, [r2, #7*4]   		@ b[7]
   371  	umull   r11, r12, r7, r8   		@ c' = a[0] * b[8]
   372  	ldr	r0, [r1, #1*4]   		@ a[1]
   373  	umlal   r3, r4, r7, r14   		@ c += a[0] * b[7]
   374  	ldr	r8, [r2, #6*4]   		@ b[6]
   375  	umlal   r11, r12, r0, r14   		@ c' += a[1] * b[7]
   376  	ldr	r7, [r1, #2*4]   		@ a[2]
   377  	umlal   r3, r4, r0, r8   		@ c += a[1] * b[6]
   378  	ldr	r14, [r2, #5*4]   		@ b[5]
   379  	umlal   r11, r12, r7, r8   		@ c' += a[2] * b[6]
   380  	ldr	r0, [r1, #3*4]   		@ a[3]
   381  	umlal   r3, r4, r7, r14   		@ c += a[2] * b[5]
   382  	ldr	r8, [r2, #4*4]   		@ b[4]
   383  	umlal   r11, r12, r0, r14   		@ c' += a[3] * b[5]
   384  	ldr	r7, [r1, #4*4]   		@ a[4]
   385  	umlal   r3, r4, r0, r8   		@ c += a[3] * b[4]
   386  	ldr	r14, [r2, #3*4]   		@ b[3]
   387  	umlal   r11, r12, r7, r8   		@ c' += a[4] * b[4]
   388  	ldr	r0, [r1, #5*4]   		@ a[5]
   389  	umlal   r3, r4, r7, r14   		@ c += a[4] * b[3]
   390  	ldr	r8, [r2, #2*4]   		@ b[2]
   391  	umlal   r11, r12, r0, r14   		@ c' += a[5] * b[3]
   392  	ldr	r7, [r1, #6*4]   		@ a[6]
   393  	umlal   r3, r4, r0, r8   		@ c += a[5] * b[2]
   394  	ldr	r14, [r2, #1*4]   		@ b[1]
   395  	umlal   r11, r12, r7, r8   		@ c' += a[6] * b[2]
   396  	ldr	r0, [r1, #7*4]   		@ a[7]
   397  	umlal   r3, r4, r7, r14   		@ c += a[6] * b[1]
   398  	ldr	r8, [r2, #0*4]   		@ b[0]
   399  	umlal   r11, r12, r0, r14   		@ c' += a[7] * b[1]
   400  	ldr	r7, [r1, #8*4]   		@ a[8]
   401  	umlal   r3, r4, r0, r8   		@ c += a[7] * b[0]
   402  	ldr	r14, [r2, #9*4]   		@ b[9]
   403  	umlal   r11, r12, r7, r8   		@ c' += a[8] * b[0]
   404  	ldr	r0, [r1, #9*4]   		@ a[9]
   405  	umlal	r5, r6, r7, r14   		@ d += a[8] * b[9]
   406  	ldr	r8, [r2, #8*4]   		@ b[8]
   407  	umull	r9, r10, r0, r14  		@ d' = a[9] * b[9]
   408  	umlal	r5, r6, r0, r8   		@ d += a[9] * b[8]
   409  
   410  	bic	r0, r5, field_not_M 		@ u7 = d & M
   411  	mov	r5, r5, lsr #26     		@ d >>= 26
   412  	orr	r5, r5, r6, asl #6
   413  	mov     r6, r6, lsr #26
   414  	movw    r14, field_R0			@ c += u7 * R0
   415  	umlal   r3, r4, r0, r14
   416  
   417  	bic	r14, r3, field_not_M 		@ t7 = c & M
   418  	str	r14, [sp, #4 + 7*4]
   419  	mov	r3, r3, lsr #26     		@ c >>= 26
   420  	orr	r3, r3, r4, asl #6
   421  	mov     r4, r4, lsr #26
   422  	mov     r14, field_R1			@ c += u7 * R1
   423  	umlal   r3, r4, r0, r14
   424  
   425  	/* J */
   426  	adds	r3, r3, r11			@ c += c'
   427  	adc	r4, r4, r12
   428  	adds	r5, r5, r9			@ d += d'
   429  	adc	r6, r6, r10
   430  
   431  	bic	r0, r5, field_not_M 		@ u8 = d & M
   432  	str	r0, [sp, #4 + 8*4]		@ u8 is spilled: its R1 term is added during write-back
   433  	mov	r5, r5, lsr #26     		@ d >>= 26
   434  	orr	r5, r5, r6, asl #6
   435  	mov     r6, r6, lsr #26
   436  	movw    r14, field_R0			@ c += u8 * R0
   437  	umlal   r3, r4, r0, r14
   438  
   439  	/******************************************
   440  	 * compute and write back result
   441  	 ******************************************
   442  	Allocation:
   443  	    r0    r
   444  	    r3:r4 c
   445  	    r5:r6 d
   446  	    r7    t0
   447  	    r8    t1
   448  	    r9    t2
   449  	    r11   u8
   450  	    r12   t9
   451  	    r1,r2,r10,r14 scratch
   452  
   453  	Note: do not read from a[] after here, it may overlap with r[]
   454  	*/
   455  	ldr	r0, [sp, #0]			@ reload the saved result pointer
   456  	add	r1, sp, #4 + 3*4		@ r[3..7] = t3..7, r11=u8, r12=t9
   457  	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}	@ r2,r7,r8,r9,r10 = t3..t7 from the stack slots
   458  	add	r1, r0, #3*4
   459  	stmia	r1, {r2,r7,r8,r9,r10}		@ store them to r[3..7]
   460  
   461  	bic	r2, r3, field_not_M 		@ r[8] = c & M
   462  	str	r2, [r0, #8*4]
   463  	mov	r3, r3, lsr #26     		@ c >>= 26
   464  	orr	r3, r3, r4, asl #6
   465  	mov     r4, r4, lsr #26
   466  	mov     r14, field_R1			@ c += u8 * R1
   467  	umlal   r3, r4, r11, r14
   468  	movw    r14, field_R0			@ c += d * R0
   469  	umlal   r3, r4, r5, r14
   470  	adds	r3, r3, r12			@ c += t9
   471  	adc	r4, r4, #0
   472  
   473  	add	r1, sp, #4 + 0*4		@ r7,r8,r9 = t0,t1,t2
   474  	ldmia	r1, {r7,r8,r9}
   475  
   476  	ubfx	r2, r3, #0, #22     		@ r[9] = c & (M >> 4)
   477  	str	r2, [r0, #9*4]
   478  	mov	r3, r3, lsr #22     		@ c >>= 22
   479  	orr	r3, r3, r4, asl #10
   480  	mov     r4, r4, lsr #22
   481  	movw    r14, field_R1 << 4   		@ c += d * (R1 << 4)
   482  	umlal   r3, r4, r5, r14
   483  
   484  	movw    r14, field_R0 >> 4   		@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
   485  	umull	r5, r6, r3, r14			@ d = c.lo * (R0 >> 4)
   486  	adds	r5, r5, r7	    		@ d.lo += t0
   487  	mla	r6, r14, r4, r6			@ d.hi += c.hi * (R0 >> 4); mla does not set flags, so the carry from adds is kept
   488  	adc	r6, r6, 0	     		@ d.hi += carry
   489  
   490  	bic	r2, r5, field_not_M 		@ r[0] = d & M
   491  	str	r2, [r0, #0*4]
   492  
   493  	mov	r5, r5, lsr #26     		@ d >>= 26
   494  	orr	r5, r5, r6, asl #6
   495  	mov     r6, r6, lsr #26
   496  	
   497  	movw    r14, field_R1 >> 4   		@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
   498  	umull	r1, r2, r3, r14       		@ tmp = c.lo * (R1 >> 4)
   499  	adds	r5, r5, r8	    		@ d.lo += t1
   500  	adc	r6, r6, #0	    		@ d.hi += carry
   501  	adds	r5, r5, r1	    		@ d.lo += tmp.lo
   502  	mla	r2, r14, r4, r2      		@ tmp.hi += c.hi * (R1 >> 4)
   503  	adc	r6, r6, r2	   		@ d.hi += carry + tmp.hi
   504  
   505  	bic	r2, r5, field_not_M 		@ r[1] = d & M
   506  	str	r2, [r0, #1*4]
   507  	mov	r5, r5, lsr #26     		@ d >>= 26 (ignore hi)
   508  	orr	r5, r5, r6, asl #6
   509  
   510  	add	r5, r5, r9	  		@ d += t2
   511  	str	r5, [r0, #2*4]      		@ r[2] = d (stored without masking)
   512  
   513  	add	sp, sp, #48			@ release the stack frame
   514  	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}	@ restore registers; loading pc returns to the caller
   515  	.size	secp256k1_fe_mul_inner, .-secp256k1_fe_mul_inner
   516  
   517  	.align	2
   518  	.global secp256k1_fe_sqr_inner
   519  	.type	secp256k1_fe_sqr_inner, %function
   520  	.hidden secp256k1_fe_sqr_inner
        	@ secp256k1_fe_sqr_inner: square a field element held as ten 26-bit
        	@ limbs (a[0..9]) and store the ten-limb result in r[0..9].
        	@ Products a[i]*a[j] with i != j occur twice in the square, so one
        	@ operand is doubled (asl #1) and each such product is computed once;
        	@ the a[i]*a[i] terms are added undoubled.
        	@
        	@ The code is straight-line (no branches). r4-r11 are saved and
        	@ restored; r0-r3, r12 and the flags are clobbered.
   521  	@ Arguments:
   522  	@  r0  r	 Can overlap with a
   523  	@  r1  a
   524  	@ Stack (total 4+10*4 = 44)
   525  	@  sp + #0        saved 'r' pointer
   526  	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
        	@ (48 bytes are reserved below; the last 4 are unused padding)
   527  secp256k1_fe_sqr_inner:
   528  	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}	@ save callee-saved registers and the return address
   529  	sub	sp, sp, #48			@ frame=44 + alignment
   530  	str     r0, [sp, #0]			@ save result address, we need it only at the end
   531  	/******************************************
   532  	 * Main computation code.
   533  	 ******************************************
   534  
   535  	Allocation:
   536  	    r0,r14,r2,r7,r8   scratch
   537  	    r1       a (pointer)
   538  	    r3:r4    c
   539  	    r5:r6    d
   540  	    r11:r12  c'
   541  	    r9:r10   d'
   542  
   543  	Note: do not write to r[] here, it may overlap with a[]
   544  	*/
   545  	/* A interleaved with B */
   546  	ldr	r0, [r1, #1*4]			@ a[1]*2
   547  	ldr	r7, [r1, #0*4]			@ a[0]
   548  	mov	r0, r0, asl #1
   549  	ldr	r14, [r1, #9*4]			@ a[9]
   550  	umull	r3, r4, r7, r7			@ c = a[0] * a[0]
   551  	ldr	r8, [r1, #8*4]			@ a[8]
   552  	mov	r7, r7, asl #1
   553  	umull	r5, r6, r7, r14			@ d = a[0]*2 * a[9]
   554  	ldr	r7, [r1, #2*4]			@ a[2]*2
   555  	umull	r9, r10, r0, r14		@ d' = a[1]*2 * a[9]
   556  	ldr	r14, [r1, #7*4]			@ a[7]
   557  	umlal	r5, r6, r0, r8			@ d += a[1]*2 * a[8]
   558  	mov	r7, r7, asl #1
   559  	ldr	r0, [r1, #3*4]			@ a[3]*2
   560  	umlal	r9, r10, r7, r8			@ d' += a[2]*2 * a[8]
   561  	ldr	r8, [r1, #6*4]			@ a[6]
   562  	umlal	r5, r6, r7, r14			@ d += a[2]*2 * a[7]
   563  	mov	r0, r0, asl #1
   564  	ldr	r7, [r1, #4*4]			@ a[4]*2
   565  	umlal	r9, r10, r0, r14		@ d' += a[3]*2 * a[7]
   566  	ldr	r14, [r1, #5*4]			@ a[5]
   567  	mov	r7, r7, asl #1
   568  	umlal	r5, r6, r0, r8			@ d += a[3]*2 * a[6]
   569  	umlal	r9, r10, r7, r8			@ d' += a[4]*2 * a[6]
   570  	umlal	r5, r6, r7, r14			@ d += a[4]*2 * a[5]
   571  	umlal	r9, r10, r14, r14		@ d' += a[5] * a[5]
   572  
   573  	bic	r0, r5, field_not_M 		@ t9 = d & M
   574  	str     r0, [sp, #4 + 9*4]		@ spill t9 to its stack slot
   575  	mov	r5, r5, lsr #26     		@ d >>= 26 
   576  	orr	r5, r5, r6, asl #6		@ (64-bit shift: the low 26 bits of d.hi move into the top of d.lo)
   577  	mov     r6, r6, lsr #26
   578  
   579  	/* B */
   580  	adds	r5, r5, r9			@ d += d'
   581  	adc	r6, r6, r10
   582  
        	@ Reduction step, repeated after every pass below: take the low 26 bits
        	@ of d as u, shift d right by 26, add u * R0 to c, store the low 26 bits
        	@ of c as the next t limb, shift c right by 26, then add u * R1 to c.
   583  	bic	r0, r5, field_not_M 		@ u0 = d & M
   584  	mov	r5, r5, lsr #26     		@ d >>= 26
   585  	orr	r5, r5, r6, asl #6
   586  	mov     r6, r6, lsr #26
   587  	movw    r14, field_R0			@ c += u0 * R0
   588  	umlal   r3, r4, r0, r14
   589  	bic	r14, r3, field_not_M 		@ t0 = c & M
   590  	str	r14, [sp, #4 + 0*4]
   591  	mov	r3, r3, lsr #26     		@ c >>= 26
   592  	orr	r3, r3, r4, asl #6
   593  	mov     r4, r4, lsr #26
   594  	mov     r14, field_R1			@ c += u0 * R1
   595  	umlal   r3, r4, r0, r14
   596  
   597  	/* C interleaved with D */
   598  	ldr	r0, [r1, #0*4]			@ a[0]*2
   599  	ldr	r14, [r1, #1*4]			@ a[1]
   600  	mov	r0, r0, asl #1
   601  	ldr	r8, [r1, #2*4]			@ a[2]
   602  	umlal	r3, r4, r0, r14			@ c += a[0]*2 * a[1]
   603  	mov	r7, r8, asl #1                  @ a[2]*2
   604  	umull	r11, r12, r14, r14		@ c' = a[1] * a[1]
   605  	ldr	r14, [r1, #9*4]			@ a[9]
   606  	umlal	r11, r12, r0, r8		@ c' += a[0]*2 * a[2]
   607  	ldr	r0, [r1, #3*4]			@ a[3]*2
   608  	ldr	r8, [r1, #8*4]			@ a[8]
   609  	umlal	r5, r6, r7, r14			@ d += a[2]*2 * a[9]
   610  	mov	r0, r0, asl #1
   611  	ldr	r7, [r1, #4*4]			@ a[4]*2
   612  	umull	r9, r10, r0, r14		@ d' = a[3]*2 * a[9]
   613  	ldr	r14, [r1, #7*4]			@ a[7]
   614  	umlal	r5, r6, r0, r8			@ d += a[3]*2 * a[8]
   615  	mov	r7, r7, asl #1
   616  	ldr	r0, [r1, #5*4]			@ a[5]*2
   617  	umlal	r9, r10, r7, r8			@ d' += a[4]*2 * a[8]
   618  	ldr	r8, [r1, #6*4]			@ a[6]
   619  	mov	r0, r0, asl #1
   620  	umlal	r5, r6, r7, r14			@ d += a[4]*2 * a[7]
   621  	umlal	r9, r10, r0, r14		@ d' += a[5]*2 * a[7]
   622  	umlal	r5, r6, r0, r8			@ d += a[5]*2 * a[6]
   623  	umlal	r9, r10, r8, r8			@ d' += a[6] * a[6]
   624  
   625  	bic	r0, r5, field_not_M 		@ u1 = d & M
   626  	mov	r5, r5, lsr #26     		@ d >>= 26
   627  	orr	r5, r5, r6, asl #6
   628  	mov     r6, r6, lsr #26
   629  	movw    r14, field_R0			@ c += u1 * R0
   630  	umlal   r3, r4, r0, r14
   631  	bic	r14, r3, field_not_M 		@ t1 = c & M
   632  	str	r14, [sp, #4 + 1*4]
   633  	mov	r3, r3, lsr #26     		@ c >>= 26
   634  	orr	r3, r3, r4, asl #6
   635  	mov     r4, r4, lsr #26
   636  	mov     r14, field_R1			@ c += u1 * R1
   637  	umlal   r3, r4, r0, r14
   638  
   639  	/* D */
   640  	adds	r3, r3, r11			@ c += c'
   641  	adc	r4, r4, r12
   642  	adds	r5, r5, r9			@ d += d'
   643  	adc	r6, r6, r10
   644  
   645  	bic	r0, r5, field_not_M 		@ u2 = d & M
   646  	mov	r5, r5, lsr #26     		@ d >>= 26
   647  	orr	r5, r5, r6, asl #6
   648  	mov     r6, r6, lsr #26
   649  	movw    r14, field_R0			@ c += u2 * R0
   650  	umlal   r3, r4, r0, r14
   651  	bic	r14, r3, field_not_M 		@ t2 = c & M
   652  	str	r14, [sp, #4 + 2*4]
   653  	mov	r3, r3, lsr #26     		@ c >>= 26
   654  	orr	r3, r3, r4, asl #6
   655  	mov     r4, r4, lsr #26
   656  	mov     r14, field_R1			@ c += u2 * R1
   657  	umlal   r3, r4, r0, r14
   658  
   659  	/* E interleaved with F */
   660  	ldr	r7, [r1, #0*4]			@ a[0]*2
   661  	ldr	r0, [r1, #1*4]			@ a[1]*2
   662  	ldr	r14, [r1, #2*4]			@ a[2]
   663  	mov	r7, r7, asl #1
   664  	ldr	r8, [r1, #3*4]			@ a[3]
   665  	ldr	r2, [r1, #4*4]			@ a[4]
   666  	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[3]
   667  	mov	r0, r0, asl #1
   668  	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[4]
   669  	mov	r2, r2, asl #1			@ a[4]*2
   670  	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[3]
   671  	ldr	r8, [r1, #9*4]			@ a[9]
   672  	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[2]
   673  	ldr	r0, [r1, #5*4]			@ a[5]*2
   674  	umlal	r11, r12, r14, r14		@ c' += a[2] * a[2]
   675  	ldr	r14, [r1, #8*4]			@ a[8]
   676  	mov	r0, r0, asl #1
   677  	umlal	r5, r6, r2, r8			@ d += a[4]*2 * a[9]
   678  	ldr	r7, [r1, #6*4]			@ a[6]*2
   679  	umull	r9, r10, r0, r8			@ d' = a[5]*2 * a[9]
   680  	mov	r7, r7, asl #1
   681  	ldr	r8, [r1, #7*4]			@ a[7]
   682  	umlal	r5, r6, r0, r14			@ d += a[5]*2 * a[8]
   683  	umlal	r9, r10, r7, r14		@ d' += a[6]*2 * a[8]
   684  	umlal	r5, r6, r7, r8			@ d += a[6]*2 * a[7]
   685  	umlal	r9, r10, r8, r8			@ d' += a[7] * a[7]
   686  
   687  	bic	r0, r5, field_not_M 		@ u3 = d & M
   688  	mov	r5, r5, lsr #26     		@ d >>= 26
   689  	orr	r5, r5, r6, asl #6
   690  	mov     r6, r6, lsr #26
   691  	movw    r14, field_R0			@ c += u3 * R0
   692  	umlal   r3, r4, r0, r14
   693  	bic	r14, r3, field_not_M 		@ t3 = c & M
   694  	str	r14, [sp, #4 + 3*4]
   695  	mov	r3, r3, lsr #26     		@ c >>= 26
   696  	orr	r3, r3, r4, asl #6
   697  	mov     r4, r4, lsr #26
   698  	mov     r14, field_R1			@ c += u3 * R1
   699  	umlal   r3, r4, r0, r14
   700  
   701  	/* F */
   702  	adds	r3, r3, r11			@ c += c'
   703  	adc	r4, r4, r12
   704  	adds	r5, r5, r9			@ d += d'
   705  	adc	r6, r6, r10
   706  
   707  	bic	r0, r5, field_not_M 		@ u4 = d & M
   708  	mov	r5, r5, lsr #26     		@ d >>= 26
   709  	orr	r5, r5, r6, asl #6
   710  	mov     r6, r6, lsr #26
   711  	movw    r14, field_R0			@ c += u4 * R0
   712  	umlal   r3, r4, r0, r14
   713  	bic	r14, r3, field_not_M 		@ t4 = c & M
   714  	str	r14, [sp, #4 + 4*4]
   715  	mov	r3, r3, lsr #26     		@ c >>= 26
   716  	orr	r3, r3, r4, asl #6
   717  	mov     r4, r4, lsr #26
   718  	mov     r14, field_R1			@ c += u4 * R1
   719  	umlal   r3, r4, r0, r14
   720  
   721  	/* G interleaved with H */
   722  	ldr	r7, [r1, #0*4]			@ a[0]*2
   723  	ldr	r0, [r1, #1*4]			@ a[1]*2
   724  	mov	r7, r7, asl #1
   725  	ldr	r8, [r1, #5*4]			@ a[5]
   726  	ldr	r2, [r1, #6*4]			@ a[6]
   727  	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[5]
   728  	ldr	r14, [r1, #4*4]			@ a[4]
   729  	mov	r0, r0, asl #1
   730  	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[6]
   731  	ldr	r7, [r1, #2*4]			@ a[2]*2
   732  	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[5]
   733  	mov	r7, r7, asl #1
   734  	ldr	r8, [r1, #3*4]			@ a[3]
   735  	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[4]
   736  	mov	r0, r2, asl #1			@ a[6]*2
   737  	umlal	r11, r12, r7, r14		@ c' += a[2]*2 * a[4]
   738  	ldr	r14, [r1, #9*4]			@ a[9]
   739  	umlal	r3, r4, r7, r8			@ c += a[2]*2 * a[3]
   740  	ldr	r7, [r1, #7*4]			@ a[7]*2
   741  	umlal	r11, r12, r8, r8		@ c' += a[3] * a[3]
   742  	mov	r7, r7, asl #1
   743  	ldr	r8, [r1, #8*4]			@ a[8]
   744  	umlal	r5, r6, r0, r14			@ d += a[6]*2 * a[9]
   745  	umull	r9, r10, r7, r14		@ d' = a[7]*2 * a[9]
   746  	umlal	r5, r6, r7, r8			@ d += a[7]*2 * a[8]
   747  	umlal	r9, r10, r8, r8			@ d' += a[8] * a[8]
   748  
   749  	bic	r0, r5, field_not_M 		@ u5 = d & M
   750  	mov	r5, r5, lsr #26     		@ d >>= 26
   751  	orr	r5, r5, r6, asl #6
   752  	mov     r6, r6, lsr #26
   753  	movw    r14, field_R0			@ c += u5 * R0
   754  	umlal   r3, r4, r0, r14
   755  	bic	r14, r3, field_not_M 		@ t5 = c & M
   756  	str	r14, [sp, #4 + 5*4]
   757  	mov	r3, r3, lsr #26     		@ c >>= 26
   758  	orr	r3, r3, r4, asl #6
   759  	mov     r4, r4, lsr #26
   760  	mov     r14, field_R1			@ c += u5 * R1
   761  	umlal   r3, r4, r0, r14
   762  
   763  	/* H */
   764  	adds	r3, r3, r11			@ c += c'
   765  	adc	r4, r4, r12
   766  	adds	r5, r5, r9			@ d += d'
   767  	adc	r6, r6, r10
   768  
   769  	bic	r0, r5, field_not_M 		@ u6 = d & M
   770  	mov	r5, r5, lsr #26     		@ d >>= 26
   771  	orr	r5, r5, r6, asl #6
   772  	mov     r6, r6, lsr #26
   773  	movw    r14, field_R0			@ c += u6 * R0
   774  	umlal   r3, r4, r0, r14
   775  	bic	r14, r3, field_not_M 		@ t6 = c & M
   776  	str	r14, [sp, #4 + 6*4]
   777  	mov	r3, r3, lsr #26     		@ c >>= 26
   778  	orr	r3, r3, r4, asl #6
   779  	mov     r4, r4, lsr #26
   780  	mov     r14, field_R1			@ c += u6 * R1
   781  	umlal   r3, r4, r0, r14
   782  
   783  	/* I interleaved with J */
   784  	ldr	r7, [r1, #0*4]			@ a[0]*2
   785  	ldr	r0, [r1, #1*4]			@ a[1]*2
   786  	mov	r7, r7, asl #1
   787  	ldr	r8, [r1, #7*4]			@ a[7]
   788  	ldr	r2, [r1, #8*4]			@ a[8]
   789  	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[7]
   790  	ldr	r14, [r1, #6*4]			@ a[6]
   791  	mov	r0, r0, asl #1
   792  	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[8]
   793  	ldr	r7, [r1, #2*4]			@ a[2]*2
   794  	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[7]
   795  	ldr	r8, [r1, #5*4]			@ a[5]
   796  	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[6]
   797  	ldr	r0, [r1, #3*4]			@ a[3]*2
   798  	mov	r7, r7, asl #1
   799  	umlal	r11, r12, r7, r14		@ c' += a[2]*2 * a[6]
   800  	ldr	r14, [r1, #4*4]			@ a[4]
   801  	mov	r0, r0, asl #1
   802  	umlal	r3, r4, r7, r8			@ c += a[2]*2 * a[5]
   803  	mov	r2, r2, asl #1			@ a[8]*2
   804  	umlal	r11, r12, r0, r8		@ c' += a[3]*2 * a[5]
   805  	umlal	r3, r4, r0, r14			@ c += a[3]*2 * a[4]
   806  	umlal	r11, r12, r14, r14		@ c' += a[4] * a[4]
   807  	ldr	r8, [r1, #9*4]			@ a[9]
   808  	umlal	r5, r6, r2, r8			@ d += a[8]*2 * a[9]
   809  	@ r8 will be used in J
   810  
   811  	bic	r0, r5, field_not_M 		@ u7 = d & M
   812  	mov	r5, r5, lsr #26     		@ d >>= 26
   813  	orr	r5, r5, r6, asl #6
   814  	mov     r6, r6, lsr #26
   815  	movw    r14, field_R0			@ c += u7 * R0
   816  	umlal   r3, r4, r0, r14
   817  	bic	r14, r3, field_not_M 		@ t7 = c & M
   818  	str	r14, [sp, #4 + 7*4]
   819  	mov	r3, r3, lsr #26     		@ c >>= 26
   820  	orr	r3, r3, r4, asl #6
   821  	mov     r4, r4, lsr #26
   822  	mov     r14, field_R1			@ c += u7 * R1
   823  	umlal   r3, r4, r0, r14
   824  
   825  	/* J */
   826  	adds	r3, r3, r11			@ c += c'
   827  	adc	r4, r4, r12
   828  	umlal	r5, r6, r8, r8			@ d += a[9] * a[9]
   829  
   830  	bic	r0, r5, field_not_M 		@ u8 = d & M
   831  	str	r0, [sp, #4 + 8*4]		@ u8 is spilled: its R1 term is added during write-back
   832  	mov	r5, r5, lsr #26     		@ d >>= 26
   833  	orr	r5, r5, r6, asl #6
   834  	mov     r6, r6, lsr #26
   835  	movw    r14, field_R0			@ c += u8 * R0
   836  	umlal   r3, r4, r0, r14
   837  
   838  	/******************************************
   839  	 * compute and write back result
   840  	 ******************************************
   841  	Allocation:
   842  	    r0    r
   843  	    r3:r4 c
   844  	    r5:r6 d
   845  	    r7    t0
   846  	    r8    t1
   847  	    r9    t2
   848  	    r11   u8
   849  	    r12   t9
   850  	    r1,r2,r10,r14 scratch
   851  
   852  	Note: do not read from a[] after here, it may overlap with r[]
   853  	*/
   854  	ldr	r0, [sp, #0]			@ reload the saved result pointer
   855  	add	r1, sp, #4 + 3*4		@ r[3..7] = t3..7, r11=u8, r12=t9
   856  	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}	@ r2,r7,r8,r9,r10 = t3..t7 from the stack slots
   857  	add	r1, r0, #3*4
   858  	stmia	r1, {r2,r7,r8,r9,r10}		@ store them to r[3..7]
   859  
   860  	bic	r2, r3, field_not_M 		@ r[8] = c & M
   861  	str	r2, [r0, #8*4]
   862  	mov	r3, r3, lsr #26     		@ c >>= 26
   863  	orr	r3, r3, r4, asl #6
   864  	mov     r4, r4, lsr #26
   865  	mov     r14, field_R1			@ c += u8 * R1
   866  	umlal   r3, r4, r11, r14
   867  	movw    r14, field_R0			@ c += d * R0
   868  	umlal   r3, r4, r5, r14
   869  	adds	r3, r3, r12			@ c += t9
   870  	adc	r4, r4, #0
   871  
   872  	add	r1, sp, #4 + 0*4		@ r7,r8,r9 = t0,t1,t2
   873  	ldmia	r1, {r7,r8,r9}
   874  
   875  	ubfx	r2, r3, #0, #22     		@ r[9] = c & (M >> 4)
   876  	str	r2, [r0, #9*4]
   877  	mov	r3, r3, lsr #22     		@ c >>= 22
   878  	orr	r3, r3, r4, asl #10
   879  	mov     r4, r4, lsr #22
   880  	movw    r14, field_R1 << 4   		@ c += d * (R1 << 4)
   881  	umlal   r3, r4, r5, r14
   882  
   883  	movw    r14, field_R0 >> 4   		@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
   884  	umull	r5, r6, r3, r14			@ d = c.lo * (R0 >> 4)
   885  	adds	r5, r5, r7	    		@ d.lo += t0
   886  	mla	r6, r14, r4, r6			@ d.hi += c.hi * (R0 >> 4); mla does not set flags, so the carry from adds is kept
   887  	adc	r6, r6, 0	     		@ d.hi += carry
   888  
   889  	bic	r2, r5, field_not_M 		@ r[0] = d & M
   890  	str	r2, [r0, #0*4]
   891  
   892  	mov	r5, r5, lsr #26     		@ d >>= 26
   893  	orr	r5, r5, r6, asl #6
   894  	mov     r6, r6, lsr #26
   895  	
   896  	movw    r14, field_R1 >> 4   		@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
   897  	umull	r1, r2, r3, r14       		@ tmp = c.lo * (R1 >> 4)
   898  	adds	r5, r5, r8	    		@ d.lo += t1
   899  	adc	r6, r6, #0	    		@ d.hi += carry
   900  	adds	r5, r5, r1	    		@ d.lo += tmp.lo
   901  	mla	r2, r14, r4, r2      		@ tmp.hi += c.hi * (R1 >> 4)
   902  	adc	r6, r6, r2	   		@ d.hi += carry + tmp.hi
   903  
   904  	bic	r2, r5, field_not_M 		@ r[1] = d & M
   905  	str	r2, [r0, #1*4]
   906  	mov	r5, r5, lsr #26     		@ d >>= 26 (ignore hi)
   907  	orr	r5, r5, r6, asl #6
   908  
   909  	add	r5, r5, r9	  		@ d += t2
   910  	str	r5, [r0, #2*4]      		@ r[2] = d (stored without masking)
   911  
   912  	add	sp, sp, #48			@ release the stack frame
   913  	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}	@ restore registers; loading pc returns to the caller
   914  	.size	secp256k1_fe_sqr_inner, .-secp256k1_fe_sqr_inner
   915  
   916  	.section .note.GNU-stack,"",%progbits	@ empty note section: marks this object as not needing an executable stack