github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_arm.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Inferno's libkern/memmove-arm.s
     5  // https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-arm.s
     6  //
     7  //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     8  //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     9  //         Portions Copyright 2009 The Go Authors. All rights reserved.
    10  //
    11  // Permission is hereby granted, free of charge, to any person obtaining a copy
    12  // of this software and associated documentation files (the "Software"), to deal
    13  // in the Software without restriction, including without limitation the rights
    14  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    15  // copies of the Software, and to permit persons to whom the Software is
    16  // furnished to do so, subject to the following conditions:
    17  //
    18  // The above copyright notice and this permission notice shall be included in
    19  // all copies or substantial portions of the Software.
    20  //
    21  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    22  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    23  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    24  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    25  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    26  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    27  // THE SOFTWARE.
    28  
    29  //go:build pcz && arm
    30  
    31  #include "textflag.h"
    32  
    33  // TS (R0) and TE (R8) sit inside the register ranges used by the bulk register moves, so whichever one a loop overwrites is spilled to the stack first.
    34  #define TS R0 /* destination pointer ("to"); advanced by the forward loops */
    35  #define TE R8 /* destination end pointer (to+n); stepped down by the backward loops */
    36  
    37  // Warning: the linker will use R11 to synthesize certain instructions. Please
    38  // take care and double check with objdump.
    39  #define FROM R11 /* source pointer; becomes from+n for backward copies */
    40  #define N R12 /* byte count; only read before TMP is first written */
    41  #define TMP R12 /* N and TMP don't overlap */
    42  #define TMP1 R5 /* word scratch for the 4-byte loops; same register as RSHIFT */
    43  
    44  #define RSHIFT R5 /* right-shift count used by the unaligned-source loops */
    45  #define LSHIFT R6 /* left-shift count used by the unaligned-source loops */
    46  #define OFFSET R7 /* byte correction applied to FROM after the unaligned-source loops */
    47  
    48  #define BR0 R0 /* shared with TS */
    49  #define BW0 R1 /* BRn/BWn: words read/written by the backward unaligned loop */
    50  #define BR1 R1 /* same register as BW0 */
    51  #define BW1 R2
    52  #define BR2 R2 /* same register as BW1 */
    53  #define BW2 R3
    54  #define BR3 R3 /* same register as BW2 */
    55  #define BW3 R4
    56  
    57  #define FW0 R1 /* FRn/FWn: words read/written by the forward unaligned loop */
    58  #define FR0 R2
    59  #define FW1 R2 /* same register as FR0 */
    60  #define FR1 R3
    61  #define FW2 R3 /* same register as FR1 */
    62  #define FR2 R4
    63  #define FW3 R4 /* same register as FR2 */
    64  #define FR3 R8 /* shared with TE */
    65  
    66  // See memmove Go doc for important implementation constraints.
    67  // Move copies n bytes from "from" to "to": forward when to <= from (unsigned compare), otherwise backward starting at the end pointers.
    68  // func Move(to, from unsafe.Pointer, n uintptr)
    69  TEXT ·Move(SB), NOSPLIT, $4-12
    70  _memmove:				/* the 4-byte frame is the single spill slot used for TS or TE */
    71  	MOVW to+0(FP), TS
    72  	MOVW from+4(FP), FROM
    73  	MOVW n+8(FP), N
    74  
    75  	ADD N, TS, TE /* to end pointer */
    76  
    77  	CMP FROM, TS /* to <= from (unsigned)? */
    78  	BLS _forward /* yes: copy forward; otherwise fall through and copy backward */
    79  
    80  _back:
    81  	ADD N, FROM /* from end pointer */
    82  	CMP $4, N /* need at least 4 bytes to copy */
    83  	BLT _b1tail /* fewer than 4 bytes: byte loop only */
    84  
    85  _b4align:				/* align destination on 4 */
    86  	AND.S $3, TE, TMP /* TMP = TE & 3; Z set once TE is word aligned (N is no longer needed) */
    87  	BEQ _b4aligned
    88  
    89  	MOVBU.W -1(FROM), TMP /* pre-indexed: step FROM down by 1, then load the byte */
    90  	MOVBU.W TMP, -1(TE)	/* pre-indexed: step TE down by 1, then store the byte */
    91  	B _b4align
    92  
    93  _b4aligned:				/* is source now aligned? */
    94  	AND.S $3, FROM, TMP
    95  	BNE _bunaligned /* no: TMP = FROM & 3 picks the shift amounts there */
    96  
    97  	ADD $31, TS, TMP /* do 32-byte chunks if possible */
    98  	MOVW TS, savedts-4(SP) /* TS is R0, which the block moves below overwrite */
    99  _b32loop:
   100  	CMP TMP, TE
   101  	BLS _b4tail /* TE <= TS+31: fewer than 32 bytes left */
   102  
   103  	MOVM.DB.W (FROM), [R0-R7] /* load the 8 words below FROM; FROM -= 32 */
   104  	MOVM.DB.W [R0-R7], (TE) /* store them below TE; TE -= 32 */
   105  	B _b32loop
   106  
   107  _b4tail:				/* do remaining words if possible */
   108  	MOVW savedts-4(SP), TS /* restore TS overwritten by the 32-byte loop */
   109  	ADD $3, TS, TMP
   110  _b4loop:
   111  	CMP TMP, TE
   112  	BLS _b1tail /* TE <= TS+3: fewer than 4 bytes left */
   113  
   114  	MOVW.W -4(FROM), TMP1 /* pre-indexed */
   115  	MOVW.W TMP1, -4(TE)	/* pre-indexed */
   116  	B _b4loop
   117  
   118  _b1tail:				/* remaining bytes */
   119  	CMP TE, TS
   120  	BEQ _return /* TE has come down to TS: nothing left */
   121  
   122  	MOVBU.W -1(FROM), TMP /* pre-indexed */
   123  	MOVBU.W TMP, -1(TE)	/* pre-indexed */
   124  	B _b1tail
   125  
   126  _forward:
   127  	CMP $4, N /* need at least 4 bytes to copy */
   128  	BLT _f1tail /* fewer than 4 bytes: byte loop only */
   129  
   130  _f4align:				/* align destination on 4 */
   131  	AND.S $3, TS, TMP /* TMP = TS & 3; Z set once TS is word aligned (N is no longer needed) */
   132  	BEQ _f4aligned
   133  
   134  	MOVBU.P 1(FROM), TMP /* implicit write back */
   135  	MOVBU.P TMP, 1(TS)	/* implicit write back */
   136  	B _f4align
   137  
   138  _f4aligned:				/* is source now aligned? */
   139  	AND.S $3, FROM, TMP
   140  	BNE _funaligned /* no: TMP = FROM & 3 picks the shift amounts there */
   141  
   142  	SUB $31, TE, TMP /* do 32-byte chunks if possible */
   143  	MOVW TE, savedte-4(SP) /* TE is R8, which the block moves below overwrite */
   144  _f32loop:
   145  	CMP TMP, TS
   146  	BHS _f4tail /* TS >= TE-31: fewer than 32 bytes left */
   147  
   148  	MOVM.IA.W (FROM), [R1-R8] /* load 8 words at FROM; FROM += 32 */
   149  	MOVM.IA.W [R1-R8], (TS) /* store them at TS; TS += 32 */
   150  	B _f32loop
   151  
   152  _f4tail:
   153  	MOVW savedte-4(SP), TE /* restore TE overwritten by the 32-byte loop */
   154  	SUB $3, TE, TMP /* do remaining words if possible */
   155  _f4loop:
   156  	CMP TMP, TS
   157  	BHS _f1tail /* TS >= TE-3: fewer than 4 bytes left */
   158  
   159  	MOVW.P 4(FROM), TMP1 /* implicit write back */
   160  	MOVW.P TMP1, 4(TS)	/* implicit write back */
   161  	B _f4loop
   162  
   163  _f1tail:				/* remaining bytes */
   164  	CMP TS, TE
   165  	BEQ _return /* TS has reached TE: nothing left */
   166  
   167  	MOVBU.P 1(FROM), TMP /* implicit write back */
   168  	MOVBU.P TMP, 1(TS)	/* implicit write back */
   169  	B _f1tail
   170  
   171  _return:
   172  	MOVW to+0(FP), R0 /* reload "to" into R0; NOTE(review): presumably a memmove-style result for assembly callers, the Go signature returns nothing - confirm */
   173  	RET
   174  
   175  _bunaligned:				/* backward copy, TE word aligned, TMP = FROM & 3 is 1, 2 or 3 */
   176  	CMP $2, TMP /* is TMP < 2 ? */
   177  
   178  	MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */
   179  	MOVW.LT $24, LSHIFT /* LT group: TMP == 1 */
   180  	MOVW.LT $1, OFFSET
   181  
   182  	MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */
   183  	MOVW.EQ $16, LSHIFT /* EQ group: TMP == 2 */
   184  	MOVW.EQ $2, OFFSET
   185  
   186  	MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */
   187  	MOVW.GT $8, LSHIFT /* GT group: TMP == 3 */
   188  	MOVW.GT $3, OFFSET
   189  
   190  	ADD $16, TS, TMP /* do 16-byte chunks if possible */
   191  	CMP TMP, TE
   192  	BLS _b1tail /* TE <= TS+16: FROM is still unmodified, finish bytewise */
   193  
   194  	BIC $3, FROM /* align source */
   195  	MOVW TS, savedts-4(SP) /* BR0 shares R0 with TS */
   196  	MOVW (FROM), BR0 /* prime first block register */
   197  
   198  _bu16loop:
   199  	CMP TMP, TE
   200  	BLS _bu1tail /* TE <= TS+16: leave the rest to the byte loop */
   201  
   202  	MOVW BR0<<LSHIFT, BW3 /* start the top output word from the lowest word already loaded */
   203  	MOVM.DB.W (FROM), [BR0-BR3] /* load the 4 words below FROM; FROM -= 16 */
   204  	ORR BR3>>RSHIFT, BW3
   205  
   206  	MOVW BR3<<LSHIFT, BW2 /* BW2 is BR3's register (R3): shifted in place after BR3 was used above */
   207  	ORR BR2>>RSHIFT, BW2
   208  
   209  	MOVW BR2<<LSHIFT, BW1 /* BW1 is BR2's register (R2) */
   210  	ORR BR1>>RSHIFT, BW1
   211  
   212  	MOVW BR1<<LSHIFT, BW0 /* BW0 is BR1's register (R1); BR0 stays intact for the next iteration */
   213  	ORR BR0>>RSHIFT, BW0
   214  
   215  	MOVM.DB.W [BW0-BW3], (TE) /* store the 4 merged words below TE; TE -= 16 */
   216  	B _bu16loop
   217  
   218  _bu1tail:
   219  	MOVW savedts-4(SP), TS /* restore TS overwritten through BR0 */
   220  	ADD OFFSET, FROM /* undo the BIC: back to the true, unaligned source position */
   221  	B _b1tail
   222  
   223  _funaligned:				/* forward copy, TS word aligned, TMP = FROM & 3 is 1, 2 or 3 */
   224  	CMP $2, TMP
   225  
   226  	MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */
   227  	MOVW.LT $24, LSHIFT /* LT group: TMP == 1 */
   228  	MOVW.LT $3, OFFSET
   229  
   230  	MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */
   231  	MOVW.EQ $16, LSHIFT /* EQ group: TMP == 2 */
   232  	MOVW.EQ $2, OFFSET
   233  
   234  	MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */
   235  	MOVW.GT $8, LSHIFT /* GT group: TMP == 3 */
   236  	MOVW.GT $1, OFFSET
   237  
   238  	SUB $16, TE, TMP /* do 16-byte chunks if possible */
   239  	CMP TMP, TS
   240  	BHS _f1tail /* TS >= TE-16: FROM is still unmodified, finish bytewise */
   241  
   242  	BIC $3, FROM /* align source */
   243  	MOVW TE, savedte-4(SP) /* FR3 shares R8 with TE */
   244  	MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */
   245  
   246  _fu16loop:
   247  	CMP TMP, TS
   248  	BHS _fu1tail /* TS >= TE-16: leave the rest to the byte loop */
   249  
   250  	MOVW FR3>>RSHIFT, FW0 /* start the first output word from the highest word already loaded */
   251  	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3] /* load 4 words at FROM; FROM += 16 */
   252  	ORR FR0<<LSHIFT, FW0
   253  
   254  	MOVW FR0>>RSHIFT, FW1 /* FW1 is FR0's register (R2): shifted in place after FR0 was used above */
   255  	ORR FR1<<LSHIFT, FW1
   256  
   257  	MOVW FR1>>RSHIFT, FW2 /* FW2 is FR1's register (R3) */
   258  	ORR FR2<<LSHIFT, FW2
   259  
   260  	MOVW FR2>>RSHIFT, FW3 /* FW3 is FR2's register (R4); FR3 stays intact for the next iteration */
   261  	ORR FR3<<LSHIFT, FW3
   262  
   263  	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS) /* store the 4 merged words at TS; TS += 16 */
   264  	B _fu16loop
   265  
   266  _fu1tail:
   267  	MOVW savedte-4(SP), TE /* restore TE overwritten through FR3 */
   268  	SUB OFFSET, FROM /* step back from the word-aligned read pointer to the next unread source byte */
   269  	B _f1tail