github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/count_s390x.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2019 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && s390x
     9  
    10  #include "textflag.h"
    11  
    12  // condition code masks, used as the mask operand of CGIJ/CIJ/LOCGR below
    13  #define EQ 8 // taken when the compared operands are equal
    14  #define NE 7 // every mask bit except EQ: taken when the operands are not equal
    15  
    16  // register assignments (general purpose registers used by countbytebody)
    17  #define R_ZERO R0 // constant 0 (scalar path): selected by LOCGR when a byte does not match
    18  #define R_VAL  R1 // byte just loaded from the input (scalar path)
    19  #define R_TMP  R2 // scratch: vector-facility flag on entry, then the 0/1 increment (scalar path)
    20  #define R_PTR  R3 // address of the next input byte or 16-byte chunk
    21  #define R_LEN  R4 // input length in bytes (scalar loop counter); the vector path rewrites it to (length mod 16) - 1
    22  #define R_CHAR R5 // byte value being counted
    23  #define R_RET  R6 // address the result is stored to
    24  #define R_ITER R7 // number of 16-byte chunks still to process (vector path)
    25  #define R_CNT  R8 // running match count (scalar path)
    26  #define R_MPTR R9 // address of countbytemask
    27  
    28  // vector register assignments (vector path of countbytebody only)
    29  #define V_ZERO V0 // all-zero vector, passed as the second operand of VSUMB/VSUMQF
    30  #define V_CHAR V1 // the counted byte replicated into all 16 byte elements
    31  #define V_MASK V2 // 0x01 in each byte element that takes part in the count
    32  #define V_VAL  V3 // current input chunk, then its match flags and partial sums
    33  #define V_CNT  V4 // 128-bit running match count
    34  
    35  // mask for trailing bytes in vector implementation: 16 bytes of 0x01, loaded with VLL using the same length index as the trailing input bytes
    36  GLOBL countbytemask<>(SB), RODATA, $16
    37  DATA countbytemask<>+0(SB)/8, $0x0101010101010101 // mask bytes 0-7
    38  DATA countbytemask<>+8(SB)/8, $0x0101010101010101 // mask bytes 8-15
    39  
    40  // func CountSlice(b []byte, c byte) int -- argument shim: loads the registers countbytebody expects, then branches to it
    41  TEXT ·CountSlice(SB), NOSPLIT|NOFRAME, $0-40
    42  	MOVD  $ret+32(FP), R_RET    // R_RET = address of the result slot
    43  	MOVBZ c+24(FP), R_CHAR      // R_CHAR = c, zero-extended
    44  	LMG   b+0(FP), R_PTR, R_LEN // R_PTR, R_LEN = first two words of b (data address, length)
    45  	BR    countbytebody<>(SB)   // tail branch: countbytebody stores the count through R_RET and returns
    46  
    47  // func Count(s string, c byte) int -- argument shim: loads the registers countbytebody expects, then branches to it
    48  TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-32
    49  	MOVD  $ret+24(FP), R_RET    // R_RET = address of the result slot
    50  	MOVBZ c+16(FP), R_CHAR      // R_CHAR = c, zero-extended
    51  	LMG   s+0(FP), R_PTR, R_LEN // R_PTR, R_LEN = the two words of s (data address, length)
    52  	BR    countbytebody<>(SB)   // tail branch: countbytebody stores the count through R_RET and returns
    53  
    54  // countbytebody counts the bytes equal to R_CHAR and stores the 64-bit total at (R_RET). Input:
    55  // R_PTR  = address of array of bytes
    56  // R_LEN  = number of bytes in array
    57  // R_CHAR = byte value to count (zero-extended to register width)
    58  // R_RET  = address of return value
    59  TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
    60  	MOVD  $·hasVX(SB), R_TMP   // R_TMP = &hasVX ('$' takes the address; MOVBZ below dereferences it). Assumes ·hasVX is a 1-byte bool -- confirm against its declaration.
    61  	MOVD  $countbytemask<>(SB), R_MPTR
    62  	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
    63  	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
    64  	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
    65  	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
    66  
    67  	// Start of vector code (have vector facility).
    68  	//
    69  	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
    70  	// vector 'load with length' (VLL). It will be in the range [-1,14].
    71  	// Also replicate c across a 16-byte vector and initialize V_ZERO.
    72  	ANDW  $0xf, R_LEN
    73  	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
    74  	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
    75  	ADDW  $-1, R_LEN
    76  	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
    77  
    78  	// Jump to loop if we have more than 15 bytes to process.
    79  	CGIJ $NE, R_ITER, $0, vxchunks
    80  
    81  	// Load 1-15 bytes and corresponding mask.
    82  	// Note: only the low 32-bits of R_LEN are used for the index.
    83  	VLL R_LEN, (R_PTR), V_VAL
    84  	VLL R_LEN, (R_MPTR), V_MASK
    85  
    86  	// Compare each byte in input chunk against byte to be counted.
    87  	// After both instructions each byte element is either 0 (no match) or 1 (match).
    88  	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
    89  	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
    90  
    91  	// Accumulate matched byte count in 128-bit integer value.
    92  	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
    93  	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
    94  
    95  	// Return rightmost (lowest) 64-bit part of accumulator.
    96  	VSTEG $1, V_CNT, (R_RET)
    97  	RET
    98  
    99  vxchunks:
   100  	// Load 0x01 into every byte element in the 16-byte mask vector.
   101  	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
   102  	VZERO  V_CNT      // initial uint128 count of 0
   103  
   104  vxloop:
   105  	// Load input bytes in 16-byte chunks.
   106  	VL (R_PTR), V_VAL
   107  
   108  	// Compare each byte in input chunk against byte to be counted.
   109  	// After both instructions each byte element is either 0 (no match) or 1 (match).
   110  	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
   111  	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
   112  
   113  	// Increment input string address.
   114  	MOVD $16(R_PTR), R_PTR
   115  
   116  	// Accumulate matched byte count in 128-bit integer value.
   117  	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
   118  	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
   119  	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
   120  
   121  	// Repeat until all 16-byte chunks are done.
   122  	BRCTG R_ITER, vxloop
   123  
   124  	// Skip to end if there are no trailing bytes.
   125  	CIJ $EQ, R_LEN, $-1, vxret
   126  
   127  	// Load 1-15 bytes and corresponding mask.
   128  	// Note: only the low 32-bits of R_LEN are used for the index.
   129  	VLL R_LEN, (R_PTR), V_VAL
   130  	VLL R_LEN, (R_MPTR), V_MASK
   131  
   132  	// Compare each byte in input chunk against byte to be counted.
   133  	// After both instructions each byte element is either 0 (no match) or 1 (match).
   134  	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
   135  	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
   136  
   137  	// Accumulate matched byte count in 128-bit integer value.
   138  	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
   139  	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
   140  	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
   141  
   142  vxret:
   143  	// Return rightmost (lowest) 64-bit part of accumulator.
   144  	VSTEG $1, V_CNT, (R_RET)
   145  	RET
   146  
   147  novx:
   148  	// Start of non-vector code (the vector facility not available).
   149  	//
   150  	// Initialise counter and constant zero.
   151  	MOVD $0, R_CNT
   152  	MOVD $0, R_ZERO
   153  
   154  loop:
   155  	// Read 1-byte from input and compare.
   156  	// Note: avoid putting LOCGR in critical path.
   157  	MOVBZ (R_PTR), R_VAL
   158  	MOVD  $1, R_TMP
   159  	MOVD  $1(R_PTR), R_PTR
   160  	CMPW  R_VAL, R_CHAR
   161  	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
   162  	ADD   R_TMP, R_CNT       // accumulate 64-bit result
   163  
   164  	// Repeat until all bytes have been checked.
   165  	BRCTG R_LEN, loop
   166  
   167  ret:
   168  	MOVD R_CNT, (R_RET) // store the scalar path's count
   169  	RET
   170  
   171  ret0:
   172  	MOVD $0, (R_RET) // empty input: the count is 0
   173  	RET