github.com/primecitizens/pcz/std@v0.2.1/core/mem/clear_ppc64x.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2014 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && (ppc64 || ppc64le)
     9  
    10  #include "textflag.h"
    11  
    12  // See memclrNoHeapPointers Go doc for important implementation constraints.
    13  // Clear writes zero to each of the n bytes starting at ptr.
    14  // func Clear(ptr unsafe.Pointer, n uintptr)
    15  TEXT ·Clear<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-16
    16  	// R3 = ptr, advanced past each span as it is cleared
    17  	// R4 = n, tracked as the number of bytes left to clear
    18  	// R0 is the zero source for scalar stores and the zero index register; it is relied on to hold 0.
    19  	// Determine if there are doublewords to clear
    20  check:
    21  	ANDCC $7, R4, R5  // R5: leftover bytes to clear
    22  	SRD   $3, R4, R6  // R6: double words to clear
    23  	CMP   R6, $0, CR1 // CR1[EQ] set if no double words
    24  
    25  	BC    12, 6, nozerolarge // branch if CR1[EQ]: n < 8, only single bytes
    26  	CMP   R4, $512
    27  	BLT   under512           // special case for < 512
    28  	ANDCC $127, R3, R8       // check for 128 alignment of address
    29  	BEQ   zero512setup       // already 128 byte aligned: go straight to the dcbz setup
    30  
    31  	ANDCC $7, R3, R15
    32  	BEQ   zero512xsetup // at least 8 byte aligned
    33  
    34  	// zero 1, 2 and 4 bytes as needed to bring ptr up to 8 byte alignment
    35  
    36  	ANDCC $1, R3, R15 // check for byte alignment
    37  	BEQ   byte2
    38  	MOVB  R0, 0(R3)   // zero 1 byte
    39  	ADD   $1, R3      // bump ptr by 1
    40  	ADD   $-1, R4     // dec count by 1
    41  
    42  byte2:
    43  	ANDCC $2, R3, R15 // check for 2 byte alignment
    44  	BEQ   byte4
    45  	MOVH  R0, 0(R3)   // zero 2 bytes
    46  	ADD   $2, R3      // bump ptr by 2
    47  	ADD   $-2, R4     // dec count by 2
    48  
    49  byte4:
    50  	ANDCC $4, R3, R15   // check for 4 byte alignment
    51  	BEQ   zero512xsetup
    52  	MOVW  R0, 0(R3)     // zero 4 bytes
    53  	ADD   $4, R3        // bump ptr by 4
    54  	ADD   $-4, R4       // dec count by 4
    55  	BR    zero512xsetup // ptr should now be 8 byte aligned
    56  
    57  under512:
    58  	SRDCC $3, R6, R7  // R7 = n / 64: any 64 byte chunks?
    59  	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
    60  	BEQ   lt64gt8     // no 64 byte chunks (tests the SRDCC result above)
    61  
    62  	// Prepare to clear 64 bytes at a time.
    63  
    64  zero64setup:
    65  	DCBTST (R3)             // prepare data cache
    66  	MOVD   R7, CTR          // number of 64 byte chunks
    67  	MOVD   $16, R8          // R8, R16, R17: offsets of the 2nd, 3rd and 4th 16 byte store
    68  	MOVD   $32, R16
    69  	MOVD   $48, R17
    70  
    71  zero64:
    72  	STXVD2X VS32, (R3+R0)   // store 16 bytes
    73  	STXVD2X VS32, (R3+R8)
    74  	STXVD2X VS32, (R3+R16)
    75  	STXVD2X VS32, (R3+R17)
    76  	ADD     $64, R3
    77  	ADD     $-64, R4
    78  	BDNZ    zero64          // dec ctr, br zero64 if ctr not 0
    79  	SRDCC   $3, R4, R6 // remaining doublewords
    80  	BEQ     nozerolarge     // fewer than 8 bytes left
    81  
    82  lt64gt8: // fewer than 64 bytes left; VS32 has been cleared on every path that reaches here
    83  	CMP R4, $32
    84  	BLT lt32gt8
    85  	MOVD $16, R8
    86  	STXVD2X VS32, (R3+R0) // zero 32 bytes with two 16 byte stores
    87  	STXVD2X VS32, (R3+R8)
    88  	ADD $-32, R4
    89  	ADD $32, R3
    90  lt32gt8:
    91  	CMP R4, $16
    92  	BLT lt16gt8
    93  	STXVD2X VS32, (R3+R0) // zero 16 bytes
    94  	ADD $16, R3
    95  	ADD $-16, R4
    96  lt16gt8:
    97  	CMP R4, $8
    98  	BLT nozerolarge
    99  	MOVD R0, 0(R3) // zero 8 bytes
   100  	ADD $8, R3
   101  	ADD $-8, R4
   102  
   103  nozerolarge:
   104  	ANDCC $7, R4, R5 // any remaining bytes
   105  	BC    4, 1, LR   // ble lr: return when no tail bytes remain
   106  
   107  zerotail:
   108  	MOVD R5, CTR // set up to clear tail bytes
   109  
   110  zerotailloop:
   111  	MOVB R0, 0(R3)           // clear single bytes
   112  	ADD  $1, R3
   113  	BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
   114  	RET
   115  
   116  zero512xsetup:  // 512 chunk with extra needed
   117  	ANDCC $8, R3, R11    // ptr is 8 byte aligned here; bit 3 set means 8 more bytes reach 16 byte alignment
   118  	BEQ   zero512setup16
   119  	MOVD  R0, 0(R3)      // clear 8 bytes
   120  	ADD   $8, R3         // update ptr to next 8
   121  	ADD   $-8, R4        // dec count by 8
   122  
   123  zero512setup16:
   124  	ANDCC $127, R3, R14 // R14 = offset of ptr within its 128 byte block
   125  	BEQ   zero512setup  // handle 128 byte alignment
   126  	MOVD  $128, R15
   127  	SUB   R14, R15, R14 // R14 = 128 - R14: increment to 128 alignment
   128  	SRD   $4, R14, R15  // number of 16 byte chunks
   129  
   130  zero512presetup:
   131  	MOVD   R15, CTR         // loop counter of 16 bytes
   132  	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
   133  
   134  zero512preloop:  // clear up to 128 alignment
   135  	STXVD2X VS32, (R3+R0)         // clear 16 bytes
   136  	ADD     $16, R3               // update ptr
   137  	ADD     $-16, R4              // dec count
   138  	BDNZ    zero512preloop
   139  
   140  zero512setup:  // setup for dcbz loop
   141  	CMP  R4, $512   // check if at least 512
   142  	BLT  remain
   143  	SRD  $9, R4, R8 // loop count for 512 chunks
   144  	MOVD R8, CTR    // set up counter
   145  	MOVD $128, R9   // index regs for 128 bytes
   146  	MOVD $256, R10
   147  	MOVD $384, R11
   148  	PCALIGN $32
   149  
   150  zero512: // each DCBZ is treated as clearing one 128 byte block
   151  	DCBZ (R3+R0)        // clear first chunk
   152  	DCBZ (R3+R9)        // clear second chunk
   153  	DCBZ (R3+R10)       // clear third chunk
   154  	DCBZ (R3+R11)       // clear fourth chunk
   155  	ADD  $512, R3
   156  	BDNZ zero512
   157  	ANDCC $511, R4 // R4 = bytes left after the 512 byte chunks
   158  
   159  remain:
   160  	CMP  R4, $128  // check if 128 byte chunks left
   161  	BLT  smaller
   162  	DCBZ (R3+R0)   // clear 128
   163  	ADD  $128, R3
   164  	ADD  $-128, R4
   165  	BR   remain
   166  
   167  smaller:
   168  	ANDCC $127, R4, R7 // find leftovers
   169  	BEQ   done
   170  	CMP   R7, $64      // at least 64 left: do 64 at a time
   171  	XXLXOR VS32, VS32, VS32 // clear VS32 (V0) for the vector stores that follow
   172  	BLT   lt64gt8    // less than 64 (tests the CMP above)
   173  	SRD   $6, R7, R7   // set up counter for 64
   174  	BR    zero64setup
   175  
   176  done:
   177  	RET