github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/poly1305/sum_arm.s (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build arm,!gccgo,!appengine,!nacl
     6  
     7  #include "textflag.h"
     8  
     9  // This code was translated into a form compatible with 5a from the public
    10  // domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
    11  
    12  DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff // Five masks, one per 26-bit limb of r; poly1305_init_ext_armv6 ANDs them into the key limbs. Limb 0.
    13  DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03 // mask for limb 1
    14  DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff // mask for limb 2
    15  DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff // mask for limb 3
    16  DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff // mask for limb 4
    17  GLOBL ·poly1305_init_constants_armv6<>(SB), RODATA, $20 // 20 bytes = 5 words; RODATA (textflag.h) is the named form of the flag value 8
    18  
    19  // Warning: the linker may use R11 to synthesize certain instructions. Please
    20  // take care and verify that no synthetic instructions use it.
    21  
    22  TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0 // In: R0 = 64-byte state, R1 = 32-byte key. Clobbers R0-R3, R8, R9, g, R11, R12; R4-R7 are saved and restored.
    23  	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
    24  	// might look like it's only 60 bytes of space but the final four bytes
    25  	// will be written by another function.) We need to skip over four
    26  	// bytes of stack because that's saving the value of 'g'.
    27  	ADD       $4, R13, R8 // R8 = R13+4; the increment-before store below therefore starts at R13+8
    28  	MOVM.IB   [R4-R7], (R8) // save R4-R7 at 8..20(R13)
    29  	MOVM.IA.W (R1), [R2-R5] // R2-R5 = first 16 bytes of the key (r); R1 += 16
    30  	MOVW      $·poly1305_init_constants_armv6<>(SB), R7 // R7 = address of the five limb masks
    31  	MOVW      R2, R8 // limb 0: bits 0..25 (upper bits removed by the mask below)
    32  	MOVW      R2>>26, R9 // limb 1: bits 26..51, completed by the ORR below
    33  	MOVW      R3>>20, g // limb 2: bits 52..77, completed by the ORR below
    34  	MOVW      R4>>14, R11 // limb 3: bits 78..103, completed by the ORR below
    35  	MOVW      R5>>8, R12 // limb 4: bits 104..127
    36  	ORR       R3<<6, R9, R9
    37  	ORR       R4<<12, g, g
    38  	ORR       R5<<18, R11, R11
    39  	MOVM.IA   (R7), [R2-R6] // R2-R6 = the five masks
    40  	AND       R8, R2, R2 // R2-R6 = masked limbs of r
    41  	AND       R9, R3, R3
    42  	AND       g, R4, R4
    43  	AND       R11, R5, R5
    44  	AND       R12, R6, R6
    45  	MOVM.IA.W [R2-R6], (R0) // state+0..19 = limbs of r; R0 += 20
    46  	EOR       R2, R2, R2 // zero R2-R6
    47  	EOR       R3, R3, R3
    48  	EOR       R4, R4, R4
    49  	EOR       R5, R5, R5
    50  	EOR       R6, R6, R6
    51  	MOVM.IA.W [R2-R6], (R0) // state+20..39 = 0 (the accumulator limbs updated by poly1305_blocks_armv6); R0 += 20
    52  	MOVM.IA.W (R1), [R2-R5] // R2-R5 = last 16 bytes of the key; R6 is still zero
    53  	MOVM.IA   [R2-R6], (R0) // state+40..55 = second key half, state+56 = 0 (the flag word read by poly1305_blocks_armv6)
    54  	ADD       $20, R13, R0 // R0 = address of the highest saved register
    55  	MOVM.DA   (R0), [R4-R7] // restore R4-R7 from 8..20(R13)
    56  	RET
    57  // MOVW_UNALIGNED copies the four bytes at offset(Rsrc) to offset(Rdst) one byte at a time through Rtmp; Rsrc and Rdst are not modified.
    58  #define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
    59  	MOVBU (offset+0)(Rsrc), Rtmp; \
    60  	MOVBU Rtmp, (offset+0)(Rdst); \
    61  	MOVBU (offset+1)(Rsrc), Rtmp; \
    62  	MOVBU Rtmp, (offset+1)(Rdst); \
    63  	MOVBU (offset+2)(Rsrc), Rtmp; \
    64  	MOVBU Rtmp, (offset+2)(Rdst); \
    65  	MOVBU (offset+3)(Rsrc), Rtmp; \
    66  	MOVBU Rtmp, (offset+3)(Rdst)
    67  
    68  TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0 // In: R0 = state, R1 = input, R2 = length in bytes. Absorbs whole 16-byte blocks into the accumulator at state+20..39. Clobbers R0-R3, R9, g, R11, R12.
    69  	// Needs 24 bytes of stack for saved registers and then 88 bytes of
    70  	// scratch space after that. We assume that 24 bytes at (R13) have
    71  	// already been used: four bytes for the link register saved in the
    72  	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
    73  	// in that function and 16 bytes of scratch space used around
    74  	// poly1305_finish_ext_armv6_skip1.
    75  	ADD     $24, R13, R12 // increment-before store below starts at R13+28
    76  	MOVM.IB [R4-R8, R14], (R12) // save R4-R8 and LR at 28..48(R13)
    77  	MOVW    R0, 88(R13) // 88(R13) = state pointer
    78  	MOVW    R1, 92(R13) // 92(R13) = input pointer (updated every iteration)
    79  	MOVW    R2, 96(R13) // 96(R13) = remaining length (updated every iteration)
    80  	MOVW    R1, R14 // R14 = input pointer
    81  	MOVW    R2, R12 // R12 = remaining length
    82  	MOVW    56(R0), R8 // R8 = flag word at state+56
    83  	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
    84  	EOR     R6, R6, R6 // R6 = 0 (EOR without .S leaves the flags from TST intact)
    85  	MOVW.EQ $(1<<24), R6 // flag == 0: R6 = 1<<24, i.e. bit 128 of the block expressed in limb 4
    86  	MOVW    R6, 84(R13) // 84(R13) = value ORed into limb 4 of every block
    87  	ADD     $116, R13, g // g = address of the local copy of r
    88  	MOVM.IA (R0), [R0-R9] // R0-R4 = limbs of r (state+0..19), R5-R9 = accumulator limbs h0..h4 (state+20..39)
    89  	MOVM.IA [R0-R4], (g) // 116..135(R13) = limbs of r
    90  	CMP     $16, R12
    91  	BLO     poly1305_blocks_armv6_done // fewer than 16 bytes: nothing to absorb
    92  
    93  poly1305_blocks_armv6_mainloop:
    94  	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
    95  	BEQ     poly1305_blocks_armv6_mainloop_aligned // input pointer is word aligned
    96  	ADD     $100, R13, g // unaligned input: copy the 16-byte block bytewise to 100..115(R13)
    97  	MOVW_UNALIGNED(R14, g, R0, 0)
    98  	MOVW_UNALIGNED(R14, g, R0, 4)
    99  	MOVW_UNALIGNED(R14, g, R0, 8)
   100  	MOVW_UNALIGNED(R14, g, R0, 12)
   101  	MOVM.IA (g), [R0-R3] // R0-R3 = the block, loaded from the aligned copy
   102  	ADD     $16, R14 // advance the input pointer past the block
   103  	B       poly1305_blocks_armv6_mainloop_loaded
   104  
   105  poly1305_blocks_armv6_mainloop_aligned:
   106  	MOVM.IA.W (R14), [R0-R3] // R0-R3 = the block; R14 += 16
   107  
   108  poly1305_blocks_armv6_mainloop_loaded:
   109  	MOVW    R0>>26, g // split the 128-bit block into 26-bit limbs: R0, g, R11, R12, R4
   110  	MOVW    R1>>20, R11
   111  	MOVW    R2>>14, R12
   112  	MOVW    R14, 92(R13) // spill the advanced input pointer; R14 is reused below
   113  	MOVW    R3>>8, R4 // limb 4 = bits 104..127
   114  	ORR     R1<<6, g, g
   115  	ORR     R2<<12, R11, R11
   116  	ORR     R3<<18, R12, R12
   117  	BIC     $0xfc000000, R0, R0 // keep the low 26 bits of each limb
   118  	BIC     $0xfc000000, g, g
   119  	MOVW    84(R13), R3 // R3 = high-bit value computed at entry (1<<24 or 0)
   120  	BIC     $0xfc000000, R11, R11
   121  	BIC     $0xfc000000, R12, R12
   122  	ADD     R0, R5, R5 // h += block, limb by limb (R5-R9 = h0..h4)
   123  	ADD     g, R6, R6
   124  	ORR     R3, R4, R4 // add the high bit to limb 4 of the block
   125  	ADD     R11, R7, R7
   126  	ADD     $116, R13, R14 // R14 = address of the local copy of r
   127  	ADD     R12, R8, R8
   128  	ADD     R4, R9, R9
   129  	MOVM.IA (R14), [R0-R4] // R0-R4 = r0..r4
   130  	MULLU   R4, R5, (R11, g) // (R11:g) = d4 = r4*h0 + r3*h1 + r2*h2 + r1*h3 + r0*h4
   131  	MULLU   R3, R5, (R14, R12) // (R14:R12) = d3 = r3*h0 + r2*h1 + r1*h2 + r0*h3 + 5*r4*h4
   132  	MULALU  R3, R6, (R11, g)
   133  	MULALU  R2, R6, (R14, R12)
   134  	MULALU  R2, R7, (R11, g)
   135  	MULALU  R1, R7, (R14, R12)
   136  	ADD     R4<<2, R4, R4 // R4 = 5*r4
   137  	ADD     R3<<2, R3, R3 // R3 = 5*r3
   138  	MULALU  R1, R8, (R11, g)
   139  	MULALU  R0, R8, (R14, R12)
   140  	MULALU  R0, R9, (R11, g)
   141  	MULALU  R4, R9, (R14, R12)
   142  	MOVW    g, 76(R13) // spill d4: low word at 76, high word at 80
   143  	MOVW    R11, 80(R13)
   144  	MOVW    R12, 68(R13) // spill d3: low word at 68, high word at 72
   145  	MOVW    R14, 72(R13)
   146  	MULLU   R2, R5, (R11, g) // (R11:g) = d2 = r2*h0 + r1*h1 + r0*h2 + 5*r4*h3 + 5*r3*h4
   147  	MULLU   R1, R5, (R14, R12) // (R14:R12) = d1 = r1*h0 + r0*h1 + 5*r4*h2 + 5*r3*h3 + 5*r2*h4
   148  	MULALU  R1, R6, (R11, g)
   149  	MULALU  R0, R6, (R14, R12)
   150  	MULALU  R0, R7, (R11, g)
   151  	MULALU  R4, R7, (R14, R12)
   152  	ADD     R2<<2, R2, R2 // R2 = 5*r2
   153  	ADD     R1<<2, R1, R1 // R1 = 5*r1
   154  	MULALU  R4, R8, (R11, g)
   155  	MULALU  R3, R8, (R14, R12)
   156  	MULALU  R3, R9, (R11, g)
   157  	MULALU  R2, R9, (R14, R12)
   158  	MOVW    g, 60(R13) // spill d2: low word at 60, high word at 64
   159  	MOVW    R11, 64(R13)
   160  	MOVW    R12, 52(R13) // spill d1: low word at 52, high word at 56
   161  	MOVW    R14, 56(R13)
   162  	MULLU   R0, R5, (R11, g) // (R11:g) = d0 = r0*h0 + 5*r4*h1 + 5*r3*h2 + 5*r2*h3 + 5*r1*h4
   163  	MULALU  R4, R6, (R11, g)
   164  	MULALU  R3, R7, (R11, g)
   165  	MULALU  R2, R8, (R11, g)
   166  	MULALU  R1, R9, (R11, g)
   167  	ADD     $52, R13, R0
   168  	MOVM.IA (R0), [R0-R7] // reload R1:R0 = d1, R3:R2 = d2, R5:R4 = d3, R7:R6 = d4
   169  	MOVW    g>>26, R12 // R12 = d0 >> 26 (with the ORR below)
   170  	MOVW    R4>>26, R14 // R14 = d3 >> 26 (with the ORR below)
   171  	ORR     R11<<6, R12, R12
   172  	ORR     R5<<6, R14, R14
   173  	BIC     $0xfc000000, g, g // g = low 26 bits of d0
   174  	BIC     $0xfc000000, R4, R4 // R4 = low 26 bits of d3
   175  	ADD.S   R12, R0, R0 // d1 += carry out of d0 (64-bit add)
   176  	ADC     $0, R1, R1
   177  	ADD.S   R14, R6, R6 // d4 += carry out of d3 (64-bit add)
   178  	ADC     $0, R7, R7
   179  	MOVW    R0>>26, R12 // R12 = d1 >> 26 (with the ORR below)
   180  	MOVW    R6>>26, R14 // R14 = d4 >> 26 (with the ORR below)
   181  	ORR     R1<<6, R12, R12
   182  	ORR     R7<<6, R14, R14
   183  	BIC     $0xfc000000, R0, R0 // R0 = low 26 bits of d1
   184  	BIC     $0xfc000000, R6, R6 // R6 = low 26 bits of d4
   185  	ADD     R14<<2, R14, R14 // carry out of the top limb wraps to limb 0 multiplied by 5
   186  	ADD.S   R12, R2, R2 // d2 += carry out of d1 (64-bit add)
   187  	ADC     $0, R3, R3
   188  	ADD     R14, g, g // limb 0 += 5 * carry out of d4
   189  	MOVW    R2>>26, R12 // R12 = d2 >> 26 (with the ORR below)
   190  	MOVW    g>>26, R14 // R14 = carry out of limb 0
   191  	ORR     R3<<6, R12, R12
   192  	BIC     $0xfc000000, g, R5 // R5 = new h0
   193  	BIC     $0xfc000000, R2, R7 // R7 = new h2
   194  	ADD     R12, R4, R4 // limb 3 += carry out of d2
   195  	ADD     R14, R0, R0 // limb 1 += carry out of limb 0
   196  	MOVW    R4>>26, R12 // R12 = carry out of limb 3
   197  	BIC     $0xfc000000, R4, R8 // R8 = new h3
   198  	ADD     R12, R6, R9 // R9 = new h4
   199  	MOVW    96(R13), R12 // R12 = remaining length, including the block just absorbed
   200  	MOVW    92(R13), R14 // R14 = input pointer for the next block
   201  	MOVW    R0, R6 // R6 = new h1
   202  	CMP     $32, R12 // flags for the BHS below; the SUB and MOVW in between do not change them
   203  	SUB     $16, R12, R12
   204  	MOVW    R12, 96(R13)
   205  	BHS     poly1305_blocks_armv6_mainloop // loop while the length before the SUB was >= 32, i.e. another whole block remains
   206  
   207  poly1305_blocks_armv6_done:
   208  	MOVW    88(R13), R12 // R12 = state pointer
   209  	MOVW    R5, 20(R12) // write h0..h4 back to state+20..39
   210  	MOVW    R6, 24(R12)
   211  	MOVW    R7, 28(R12)
   212  	MOVW    R8, 32(R12)
   213  	MOVW    R9, 36(R12)
   214  	ADD     $48, R13, R0 // R0 = address of the highest saved register
   215  	MOVM.DA (R0), [R4-R8, R14] // restore R4-R8 and LR from 28..48(R13)
   216  	RET
   217  // MOVHUP_UNALIGNED copies two bytes from (Rsrc) to (Rdst) one byte at a time through Rtmp, post-incrementing both pointers (each ends up advanced by 2).
   218  #define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
   219  	MOVBU.P 1(Rsrc), Rtmp; \
   220  	MOVBU.P Rtmp, 1(Rdst); \
   221  	MOVBU.P 1(Rsrc), Rtmp; \
   222  	MOVBU.P Rtmp, 1(Rdst)
   223  // MOVWP_UNALIGNED copies four bytes from (Rsrc) to (Rdst) as two MOVHUP_UNALIGNED steps; both pointers end up advanced by 4.
   224  #define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
   225  	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
   226  	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
   227  
   228  // func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
   229  TEXT ·poly1305_auth_armv6(SB), $196-16 // Writes the 16-byte tag for the mlen bytes at m, keyed by key, to out.
   230  	// The value 196, just above, is the sum of 64 (the size of the context
   231  	// structure) and 132 (the amount of stack needed).
   232  	//
   233  	// At this point, the stack pointer (R13) has been moved down. It
   234  	// points to the saved link register and there's 196 bytes of free
   235  	// space above it.
   236  	//
   237  	// The stack for this function looks like:
   238  	//
   239  	// +---------------------
   240  	// |
   241  	// | 64 bytes of context structure
   242  	// |
   243  	// +---------------------
   244  	// |
   245  	// | 112 bytes for poly1305_blocks_armv6
   246  	// |
   247  	// +---------------------
   248  	// | 16 bytes of final block, built by the tail-copy code
   249  	// | that ends at poly1305_finish_ext_armv6_skip1
   250  	// +---------------------
   251  	// | four bytes of saved 'g'
   252  	// +---------------------
   253  	// | lr, saved by prelude    <- R13 points here
   254  	// +---------------------
   255  	MOVW g, 4(R13) // g is used as a scratch register below and restored just before RET
   256  
   257  	MOVW out+0(FP), R4
   258  	MOVW m+4(FP), R5
   259  	MOVW mlen+8(FP), R6
   260  	MOVW key+12(FP), R7
   261  
   262  	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
   263  	MOVW R7, R1 // R0 = context structure, R1 = key
   264  
   265  	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
   266  	// that's ok because none of the other values have been written yet.
   267  	BL    poly1305_init_ext_armv6<>(SB)
   268  	BIC.S $15, R6, R2 // R2 = mlen rounded down to a multiple of 16
   269  	BEQ   poly1305_auth_armv6_noblocks // no whole block in the message
   270  	ADD   $136, R13, R0 // R0 = context structure
   271  	MOVW  R5, R1 // R1 = start of the message
   272  	ADD   R2, R5, R5 // R5 = first byte after the whole blocks
   273  	SUB   R2, R6, R6 // R6 = number of trailing bytes (mlen mod 16)
   274  	BL    poly1305_blocks_armv6<>(SB)
   275  
   276  poly1305_auth_armv6_noblocks:
   277  	ADD  $136, R13, R0 // R0 = context structure
   278  	MOVW R5, R1 // R1 = pointer to the trailing bytes
   279  	MOVW R6, R2 // R2 = number of trailing bytes
   280  	MOVW R4, R3 // R3 = out
   281  
   282  	MOVW  R0, R5 // keep copies in R5-R8: R5 = context, R6 = tail pointer, R7 = tail length, R8 = out
   283  	MOVW  R1, R6
   284  	MOVW  R2, R7
   285  	MOVW  R3, R8
   286  	AND.S R2, R2, R2 // set flags on the tail length
   287  	BEQ   poly1305_finish_ext_armv6_noremaining // no trailing bytes: go straight to the final reduction
   288  	EOR   R0, R0 // R0 = 0
   289  	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
   290  	MOVW  R0, (R9) // zero the 16-byte final block
   291  	MOVW  R0, 4(R9)
   292  	MOVW  R0, 8(R9)
   293  	MOVW  R0, 12(R9)
   294  	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
   295  	BEQ   poly1305_finish_ext_armv6_aligned // word-aligned tail: use word/halfword copies
   296  	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
   297  	BEQ   poly1305_finish_ext_armv6_skip8
   298  	MOVWP_UNALIGNED(R1, R9, g)
   299  	MOVWP_UNALIGNED(R1, R9, g)
   300  
   301  poly1305_finish_ext_armv6_skip8:
   302  	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
   303  	BEQ  poly1305_finish_ext_armv6_skip4
   304  	MOVWP_UNALIGNED(R1, R9, g)
   305  
   306  poly1305_finish_ext_armv6_skip4:
   307  	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
   308  	BEQ  poly1305_finish_ext_armv6_skip2
   309  	MOVHUP_UNALIGNED(R1, R9, g)
   310  	B    poly1305_finish_ext_armv6_skip2 // the last odd byte is handled by code shared with the aligned path
   311  
   312  poly1305_finish_ext_armv6_aligned:
   313  	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
   314  	BEQ       poly1305_finish_ext_armv6_skip8_aligned
   315  	MOVM.IA.W (R1), [g-R11] // copy 8 bytes through g and R11; R1 += 8
   316  	MOVM.IA.W [g-R11], (R9) // R9 += 8
   317  
   318  poly1305_finish_ext_armv6_skip8_aligned:
   319  	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
   320  	BEQ    poly1305_finish_ext_armv6_skip4_aligned
   321  	MOVW.P 4(R1), g // copy 4 bytes; both pointers post-increment
   322  	MOVW.P g, 4(R9)
   323  
   324  poly1305_finish_ext_armv6_skip4_aligned:
   325  	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
   326  	BEQ     poly1305_finish_ext_armv6_skip2
   327  	MOVHU.P 2(R1), g // copy 2 bytes; both pointers post-increment
   328  	MOVH.P  g, 2(R9)
   329  
   330  poly1305_finish_ext_armv6_skip2:
   331  	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
   332  	BEQ     poly1305_finish_ext_armv6_skip1
   333  	MOVBU.P 1(R1), g // copy the last odd byte; both pointers post-increment
   334  	MOVBU.P g, 1(R9)
   335  
   336  poly1305_finish_ext_armv6_skip1:
   337  	MOVW  $1, R11
   338  	MOVBU R11, 0(R9) // append a single 1 byte right after the copied tail; the rest of the block is already zero
   339  	MOVW  R11, 56(R5) // set the flag at context+56 so poly1305_blocks_armv6 does not OR in its 1<<24 high bit
   340  	MOVW  R5, R0 // R0 = context structure
   341  	ADD   $8, R13, R1 // R1 = the padded final block
   342  	MOVW  $16, R2 // R2 = one 16-byte block
   343  	BL    poly1305_blocks_armv6<>(SB)
   344  
   345  poly1305_finish_ext_armv6_noremaining:
   346  	MOVW      20(R5), R0 // R0-R4 = accumulator limbs h0..h4
   347  	MOVW      24(R5), R1
   348  	MOVW      28(R5), R2
   349  	MOVW      32(R5), R3
   350  	MOVW      36(R5), R4
   351  	MOVW      R4>>26, R12 // full carry pass: the carry out of h4 wraps to h0 multiplied by 5
   352  	BIC       $0xfc000000, R4, R4
   353  	ADD       R12<<2, R12, R12
   354  	ADD       R12, R0, R0
   355  	MOVW      R0>>26, R12 // then ripple the carry h0 -> h1 -> h2 -> h3 -> h4
   356  	BIC       $0xfc000000, R0, R0
   357  	ADD       R12, R1, R1
   358  	MOVW      R1>>26, R12
   359  	BIC       $0xfc000000, R1, R1
   360  	ADD       R12, R2, R2
   361  	MOVW      R2>>26, R12
   362  	BIC       $0xfc000000, R2, R2
   363  	ADD       R12, R3, R3
   364  	MOVW      R3>>26, R12
   365  	BIC       $0xfc000000, R3, R3
   366  	ADD       R12, R4, R4
   367  	ADD       $5, R0, R6 // second candidate in R6, R7, g, R11, R9: h + 5 with carries rippled, minus 2^26 in the top limb
   368  	MOVW      R6>>26, R12
   369  	BIC       $0xfc000000, R6, R6
   370  	ADD       R12, R1, R7
   371  	MOVW      R7>>26, R12
   372  	BIC       $0xfc000000, R7, R7
   373  	ADD       R12, R2, g
   374  	MOVW      g>>26, R12
   375  	BIC       $0xfc000000, g, g
   376  	ADD       R12, R3, R11
   377  	MOVW      $-(1<<26), R12
   378  	ADD       R11>>26, R12, R12 // R12 = carry out of limb 3 - 2^26
   379  	BIC       $0xfc000000, R11, R11
   380  	ADD       R12, R4, R9 // R9 = top limb of the candidate; its sign bit is set when the subtraction borrowed
   381  	MOVW      R9>>31, R12 // R12 = sign bit of R9 (0 or 1)
   382  	SUB       $1, R12 // R12 = 0 if R9 is negative, all ones otherwise
   383  	AND       R12, R6, R6 // keep the candidate limbs only when R9 is non-negative
   384  	AND       R12, R7, R7
   385  	AND       R12, g, g
   386  	AND       R12, R11, R11
   387  	AND       R12, R9, R9
   388  	MVN       R12, R12 // invert the mask
   389  	AND       R12, R0, R0 // keep the original limbs only when R9 is negative
   390  	AND       R12, R1, R1
   391  	AND       R12, R2, R2
   392  	AND       R12, R3, R3
   393  	AND       R12, R4, R4
   394  	ORR       R6, R0, R0 // merge: exactly one of the two masked values is non-zero, so no branch is needed
   395  	ORR       R7, R1, R1
   396  	ORR       g, R2, R2
   397  	ORR       R11, R3, R3
   398  	ORR       R9, R4, R4
   399  	ORR       R1<<26, R0, R0 // repack the five 26-bit limbs into four 32-bit words R0-R3
   400  	MOVW      R1>>6, R1
   401  	ORR       R2<<20, R1, R1
   402  	MOVW      R2>>12, R2
   403  	ORR       R3<<14, R2, R2
   404  	MOVW      R3>>18, R3
   405  	ORR       R4<<8, R3, R3
   406  	MOVW      40(R5), R6 // R6, R7, g, R11 = the four words at context+40..55 (second half of the key)
   407  	MOVW      44(R5), R7
   408  	MOVW      48(R5), g
   409  	MOVW      52(R5), R11
   410  	ADD.S     R6, R0, R0 // 128-bit add with carry propagated through the four words
   411  	ADC.S     R7, R1, R1
   412  	ADC.S     g, R2, R2
   413  	ADC.S     R11, R3, R3
   414  	MOVM.IA   [R0-R3], (R8) // write the 16-byte result to out
   415  	MOVW      R5, R12 // R12 = context structure
   416  	EOR       R0, R0, R0 // zero R0-R7
   417  	EOR       R1, R1, R1
   418  	EOR       R2, R2, R2
   419  	EOR       R3, R3, R3
   420  	EOR       R4, R4, R4
   421  	EOR       R5, R5, R5
   422  	EOR       R6, R6, R6
   423  	EOR       R7, R7, R7
   424  	MOVM.IA.W [R0-R7], (R12) // overwrite the first 32 bytes of the context with zeros; R12 += 32
   425  	MOVM.IA   [R0-R7], (R12) // overwrite the remaining 32 bytes
   426  	MOVW      4(R13), g // restore g saved at entry
   427  	RET