// Source: github.com/primecitizens/pcz/std@v0.2.1/core/mem/equal_ppc64x.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && (ppc64 || ppc64le)
     9  
#include "textflag.h"

// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095

// TODO: At writing, ISEL and BC do not support CR bit type arguments,
// define them here for readability.
// Each CR field is 4 bits wide: bit 0 = LT, bit 1 = GT, bit 2 = EQ,
// bit 3 = SO. So CRnXX = 4*n + bit.
#define CR0LT 4*0+0
#define CR0EQ 4*0+2
#define CR1LT 4*1+0
#define CR6LT 4*6+0

// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
// BC 4,bi,(LR) branches to LR when CR bit bi is CLEAR (branch-if-false);
// BC 12,bi,(LR) branches to LR when CR bit bi is SET (branch-if-true).
#define BGELR_CR6 BC  4, CR6LT, (LR)
#define BEQLR     BC 12, CR0EQ, (LR)
    26  
// Equal(a, b unsafe.Pointer, size uintptr) bool
//
// ABIInternal thunk: the arguments already sit in the registers
// memeqbody<> expects, so this is a plain tail call. The result is
// produced in R3 by memeqbody.
TEXT ·Equal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// R3 = a
	// R4 = b
	// R5 = size
	BR memeqbody<>(SB)
    33  
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Closure-calling-convention variant: the compare length is not passed
// as an argument; the compiler stores it in the closure context
// (pointed to by R11) at offset 8. Identical pointers are trivially
// equal, so that case returns 1 without even loading the size.
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
	// R3 = a
	// R4 = b
	CMP R3, R4
	BEQ eq // same pointer => equal, skip the body entirely
	MOVD 8(R11), R5    // compiler stores size at offset 8 in the closure
	BR memeqbody<>(SB)
eq:
	MOVD $1, R3
	RET
    45  
// Do an efficient memequal for ppc64.
//
// Strategy by length class:
//   0-16   : two (possibly overlapping) GPR loads + compares
//   17-32  : two overlapping 16B VSX compares
//   33-64  : two sequential 16B compares, then reuse check17_32 for
//            the overlapped tail
//   >64    : 64B-per-iteration vector loop, then one overlapped 64B
//            tail compare ending exactly at &sX[len]
//
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value (1 if equal, 0 if not)
//
// Register roles inside the body:
//   R8/R4   = current s1/s2 pointers (R3 is freed up for the result)
//   R9/R10  = &s1[len] / &s2[len], used for overlapped tail loads
//   R11     = constant 1, the "equal" value selected by ISEL
//   CR6     = "all bytes equal" result of VCMPEQUBCC
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD R3, R8 // Move s1 into R8
	ADD R5, R3, R9 // &s1[len(s1)]
	ADD R5, R4, R10 // &s2[len(s2)]
	MOVD $1, R11 // R11 = 1 (the "equal" result for ISEL)
	CMP R5, $16 // Use GPR checks for len <= 16
	BLE check0_16
	MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
	CMP R5, $32 // Use overlapping VSX loads for len <= 32
	BLE check17_32 // Do a pair of overlapping VSR compares
	CMP R5, $64
	BLE check33_64 // Hybrid check + overlap compare.

setup64:
	SRD $6, R5, R6 // number of 64 byte chunks to compare
	MOVD R6, CTR
	MOVD $16, R14 // index for VSX loads and stores
	MOVD $32, R15
	MOVD $48, R16
	ANDCC $0x3F, R5, R5 // len%64==0? (CR0 is consulted after the loop)

	PCALIGN $32 // align the hot loop
loop64:
	// Four 16B vector compares per iteration. VCMPEQUBCC sets CR6
	// "all equal"; on any mismatch BGELR_CR6 returns with R3 = 0
	// (preset above).
	LXVD2X (R8+R0), V0
	LXVD2X (R4+R0), V1
	VCMPEQUBCC V0, V1, V2 // compare, setting CR6
	BGELR_CR6
	LXVD2X (R8+R14), V0
	LXVD2X (R4+R14), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6
	LXVD2X (R8+R15), V0
	LXVD2X (R4+R15), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6
	LXVD2X (R8+R16), V0
	LXVD2X (R4+R16), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6
	ADD $64,R8 // bump up to next 64
	ADD $64,R4
	BDNZ loop64

	ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
	BEQLR // return if no tail.

	// Compare the final 1-63 bytes as one full 64B chunk ending at
	// &sX[len]; it overlaps data already verified equal above, so the
	// overlap cannot produce a false mismatch.
	ADD $-64, R9, R8
	ADD $-64, R10, R4
	LXVD2X (R8+R0), V0
	LXVD2X (R4+R0), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6
	LXVD2X (R8+R14), V0
	LXVD2X (R4+R14), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6
	LXVD2X (R8+R15), V0
	LXVD2X (R4+R15), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6
	LXVD2X (R8+R16), V0
	LXVD2X (R4+R16), V1
	VCMPEQUBCC V0, V1, V2
	ISEL $CR6LT, R11, R0, R3 // R3 = (all equal) ? 1 : 0
	RET

check33_64:
	// Bytes 0-15
	LXVD2X (R8+R0), V0
	LXVD2X (R4+R0), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6 // mismatch => return R3 = 0
	ADD $16, R8
	ADD $16, R4

	// Bytes 16-31
	LXVD2X (R8+R0), V0
	LXVD2X (R4+R0), V1
	VCMPEQUBCC V0, V1, V2
	BGELR_CR6

	// A little tricky, but point R4,R8 to &sx[len-32],
	// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
	ADD $-32, R9, R8
	ADD $-32, R10, R4
	// Fallthrough

check17_32:
	// First 16 bytes; stash the boolean outcome in R5 instead of
	// branching so both compares can be merged with ISEL below.
	LXVD2X (R8+R0), V0
	LXVD2X (R4+R0), V1
	VCMPEQUBCC V0, V1, V2
	ISEL $CR6LT, R11, R0, R5 // R5 = first halves equal ? 1 : 0

	// Load sX[len(sX)-16:len(sX)] and compare.
	ADD $-16, R9
	ADD $-16, R10
	LXVD2X (R9+R0), V0
	LXVD2X (R10+R0), V1
	VCMPEQUBCC V0, V1, V2
	ISEL $CR6LT, R5, R0, R3 // R3 = second halves equal ? R5 : 0
	RET

check0_16:
	CMP R5, $8
	BLT check0_7
	// Load sX[0:7] and compare.
	MOVD (R8), R6
	MOVD (R4), R7
	CMP R6, R7
	ISEL $CR0EQ, R11, R0, R5 // R5 = first words equal ? 1 : 0
	// Load sX[len(sX)-8:len(sX)] and compare (overlaps when len < 16).
	MOVD -8(R9), R6
	MOVD -8(R10), R7
	CMP R6, R7
	ISEL $CR0EQ, R5, R0, R3 // R3 = last words equal ? R5 : 0
	RET

check0_7:
	CMP R5,$0
	MOVD $1, R3
	BEQLR // return if len == 0

	// Check < 8B loads with a single compare, but select the load address
	// such that it cannot cross a page boundary. Load a few bytes from the
	// lower address if that does not cross the lower page. Or, load a few
	// extra bytes from the higher addresses. And align those values
	// consistently in register as either address may have differing
	// alignment requirements.
	ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
	ANDCC $PAGE_OFFSET, R4, R9
	SUBC R5, $8, R12 // 8-len
	SLD $3, R12, R14 // (8-len)*8 = shift count in bits
	CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
	CMPU R9, R12, CR0
	SUB R12, R8, R6 // compute lower load address
	SUB R12, R4, R9
	ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
	ISEL $CR0LT, R4, R9, R4 // Similar for s2
	MOVD (R8), R15 // 8B load, guaranteed not to cross a page
	MOVD (R4), R16
	SLD R14, R15, R7
	SLD R14, R16, R17
	SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
	SRD R14, R17, R17
	SRD R14, R15, R6 // Clear the lower (8-len) bytes
	SRD R14, R16, R9
#ifdef GOARCH_ppc64le
	ISEL $CR1LT, R7, R6, R8      // Choose the correct len bytes to compare based on alignment
	ISEL $CR0LT, R17, R9, R4
#else
	ISEL $CR1LT, R6, R7, R8
	ISEL $CR0LT, R9, R17, R4
#endif
	CMP R4, R8
	ISEL $CR0EQ, R11, R0, R3 // R3 = selected bytes equal ? 1 : 0
	RET