github.com/afumu/libc@v0.0.6/musl/src/string/aarch64/memcpy.S

/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14
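
/* G_l/G_h, H_l/H_h and tmp1 alias registers (count, dst, src, srcend and
   E_l) that are no longer live at the point they are written, so the whole
   routine runs in caller-clobbered registers without spilling.  */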

/* This implementation of memcpy uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

.global memcpy
.type memcpy,%function
memcpy:
	add     srcend, src, count
	add     dstend, dstin, count
	cmp     count, 128
	b.hi    .Lcopy_long
	cmp     count, 32
	b.hi    .Lcopy32_128

	/* Small copies: 0..32 bytes.  */
	cmp     count, 16
	b.lo    .Lcopy16
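	/* Copy the first 16 and the last 16 bytes; for 16..31 bytes the two
	   pairs overlap in the middle, which is harmless.  */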
	ldp     A_l, A_h, [src]
	ldp     D_l, D_h, [srcend, -16]
	stp     A_l, A_h, [dstin]
	stp     D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
.Lcopy16:
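	/* count is at most 15 here, so bit 3 of count distinguishes 8..15
	   from 0..7.  */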
	tbz     count, 3, .Lcopy8
	ldr     A_l, [src]
	ldr     A_h, [srcend, -8]
	str     A_l, [dstin]
	str     A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
.Lcopy8:
	tbz     count, 2, .Lcopy4
	ldr     A_lw, [src]
	ldr     B_lw, [srcend, -4]
	str     A_lw, [dstin]
	str     B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
.Lcopy4:
	cbz     count, .Lcopy0
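	/* Store the first byte, the byte at count/2 and the last byte.  For
	   count = 1, 2 or 3 these three stores cover every byte, overlapping
	   when count < 3.  */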
	lsr     tmp1, count, 1
	ldrb    A_lw, [src]
	ldrb    C_lw, [srcend, -1]
	ldrb    B_lw, [src, tmp1]
	strb    A_lw, [dstin]
	strb    B_lw, [dstin, tmp1]
	strb    C_lw, [dstend, -1]
.Lcopy0:
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
.Lcopy32_128:
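	/* Load the first 32 and the last 32 bytes up front.  For 33..64 bytes
	   the four stores below cover the whole buffer, overlapping in the
	   middle; larger copies branch to .Lcopy128.  */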
	ldp     A_l, A_h, [src]
	ldp     B_l, B_h, [src, 16]
	ldp     C_l, C_h, [srcend, -32]
	ldp     D_l, D_h, [srcend, -16]
	cmp     count, 64
	b.hi    .Lcopy128
	stp     A_l, A_h, [dstin]
	stp     B_l, B_h, [dstin, 16]
	stp     C_l, C_h, [dstend, -32]
	stp     D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
.Lcopy128:
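	/* A, B (first 32 bytes) and C, D (last 32 bytes) are already loaded.
	   65..96 bytes: the first 64 bytes (A, B, E, F) plus the last 32
	   (C, D) cover everything.  97..128 bytes: additionally copy the 32
	   bytes ending 32 bytes before the end (G, H).  */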
	ldp     E_l, E_h, [src, 32]
	ldp     F_l, F_h, [src, 48]
	cmp     count, 96
	b.ls    .Lcopy96
	ldp     G_l, G_h, [srcend, -64]
	ldp     H_l, H_h, [srcend, -48]
	stp     G_l, G_h, [dstend, -64]
	stp     H_l, H_h, [dstend, -48]
.Lcopy96:
	stp     A_l, A_h, [dstin]
	stp     B_l, B_h, [dstin, 16]
	stp     E_l, E_h, [dstin, 32]
	stp     F_l, F_h, [dstin, 48]
	stp     C_l, C_h, [dstend, -32]
	stp     D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
.Lcopy_long:

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
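	/* dst is rounded down to a 16-byte boundary and src is biased by the
	   same amount, so dst+offset and src+offset stay paired.  The
	   unconditional store of D covers the first 16 bytes at dstin; the
	   aligned stores that follow start at dst + 16, which is at most
	   dstin + 16, so up to 15 bytes may be written twice.  */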

	ldp     D_l, D_h, [src]
	and     tmp1, dstin, 15
	bic     dst, dstin, 15
	sub     src, src, tmp1
	add     count, count, tmp1      /* Count is now 16 too large.  */
	ldp     A_l, A_h, [src, 16]
	stp     D_l, D_h, [dstin]
	ldp     B_l, B_h, [src, 32]
	ldp     C_l, C_h, [src, 48]
	ldp     D_l, D_h, [src, 64]!
	subs    count, count, 128 + 16  /* Test and readjust count.  */
	b.ls    .Lcopy64_from_end

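	/* The loop stores the 64 bytes loaded on the previous iteration while
	   loading the next 64 (software pipelining), advancing dst and src via
	   the writeback forms of the last stp/ldp.  */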
.Lloop64:
	stp     A_l, A_h, [dst, 16]
	ldp     A_l, A_h, [src, 16]
	stp     B_l, B_h, [dst, 32]
	ldp     B_l, B_h, [src, 32]
	stp     C_l, C_h, [dst, 48]
	ldp     C_l, C_h, [src, 48]
	stp     D_l, D_h, [dst, 64]!
	ldp     D_l, D_h, [src, 64]!
	subs    count, count, 64
	b.hi    .Lloop64

	/* Write the last iteration and copy 64 bytes from the end.  */
.Lcopy64_from_end:
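	/* Drain the pipeline: store the 64 bytes already loaded (A-D), then
	   copy the final 64 source bytes to the end of the destination.  These
	   last stores may overlap bytes the loop already wrote, which is
	   harmless.  */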
	ldp     E_l, E_h, [srcend, -64]
	stp     A_l, A_h, [dst, 16]
	ldp     A_l, A_h, [srcend, -48]
	stp     B_l, B_h, [dst, 32]
	ldp     B_l, B_h, [srcend, -32]
	stp     C_l, C_h, [dst, 48]
	ldp     C_l, C_h, [srcend, -16]
	stp     D_l, D_h, [dst, 64]
	stp     E_l, E_h, [dstend, -64]
	stp     A_l, A_h, [dstend, -48]
	stp     B_l, B_h, [dstend, -32]
	stp     C_l, C_h, [dstend, -16]
	ret

.size memcpy,.-memcpy