github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/index_s390x.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && s390x
     9  
    10  #include "textflag.h"
    11  
    12  // Entry stub: loads both operands and the result address, then branches to indexbody<>. Caller must confirm availability of vx facility before calling.
    13  TEXT ·indexSlice(SB),NOSPLIT|NOFRAME,$0-56 // no local frame; 56 bytes of arguments and result
    14  	LMG a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
    15  	LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep); operand starts at offset 24 (presumably a 24-byte slice header precedes it)
    16  	MOVD $ret+48(FP), R5 // R5=address of the result slot (not its value); indexbody<> stores through it
    17  	BR indexbody<>(SB) // branch, not call: indexbody<> executes the RET for this function
    18  
    19  // Entry stub: loads both operands and the result address, then branches to indexbody<>. Caller must confirm availability of vx facility before calling.
    20  TEXT ·index(SB),NOSPLIT|NOFRAME,$0-40 // no local frame; 40 bytes of arguments and result
    21  	LMG  a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
    22  	LMG  b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep); operand starts at offset 16 (presumably a 16-byte string header precedes it)
    23  	MOVD $ret+32(FP), R5 // R5=address of the result slot (not its value); indexbody<> stores through it
    24  	BR   indexbody<>(SB) // branch, not call: indexbody<> executes the RET for this function
    25  
    26  // s: string we are searching
    27  // sep: string to search for
    28  // R1=&s[0], R2=len(s)
    29  // R3=&sep[0], R4=len(sep)
    30  // R5=&ret (int); receives the offset of the first match, or -1 when there is none
    31  // Caller must confirm availability of vx facility before calling.
    32  TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
    33  	CMPBGT R4, R2, notfound // sep longer than s: no match possible
    34  	ADD R1, R2 // R2=&s[0]+len(s)
    35  	SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
    36  	CMPBEQ R4, $0, notfound // empty sep yields -1 here
    37  	SUB $1, R4 // R4=len(sep)-1 for use as VLL index
    38  	VLL R4, (R3), V0 // contains first 16 bytes of sep (fewer if sep is shorter; VLL zero-fills the rest)
    39  	MOVD R1, R7 // R7=current candidate position in s
    40  index2plus: // fast path for len(sep)==2 (R4==1)
    41  	CMPBNE R4, $1, index3plus
    42  	MOVD $15(R7), R9 // R9=R7+15
    43  	CMPBGE R9, R2, index2to16 // not enough room for a 16-position block: use the generic loop
    44  	VGBM $0xaaaa, V31       // 0xff00ff00ff00ff00...
    45  	VONE V16 // V16=all ones, the value VFEEBS searches for
    46  	VREPH $0, V0, V1 // V1=first halfword of sep replicated into every halfword
    47  	CMPBGE R9, R2, index2to16 // NOTE(review): repeats the check at listing line 43; R9 and R2 are unchanged in between
    48  index2loop: // each iteration tests 16 consecutive candidate positions
    49  	VL 0(R7), V2          // 16 bytes, even indices
    50  	VL 1(R7), V4          // 16 bytes, odd indices
    51  	VCEQH V1, V2, V5         // compare even indices
    52  	VCEQH V1, V4, V6         // compare odd indices
    53  	VSEL V5, V6, V31, V7    // merge even and odd indices
    54  	VFEEBS V16, V7, V17       // find leftmost index, set condition to 1 if found
    55  	BLT foundV17 // taken when VFEEBS found a match
    56  	MOVD $16(R7), R7        // R7+=16
    57  	ADD $15, R7, R9
    58  	CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
    59  	CMPBLE R7, R2, index2to16 // fewer than 16 candidates left: finish in the generic loop
    60  	BR notfound
    61  
    62  index3plus: // fast path for len(sep)==3 (R4==2)
    63  	CMPBNE R4, $2, index4plus
    64  	ADD $15, R7, R9 // R9=R7+15
    65  	CMPBGE R9, R2, index2to16 // not enough room for a 16-position block: use the generic loop
    66  	MOVD $1, R0 // VLL index 1: load bytes 0..1 of the following block
    67  	VGBM $0xaaaa, V31       // 0xff00ff00ff00ff00...
    68  	VONE V16 // V16=all ones, the value VFEEBS searches for
    69  	VREPH $0, V0, V1 // V1=first two bytes of sep replicated into every halfword
    70  	VREPB $2, V0, V8 // V8=third byte of sep replicated into every byte
    71  index3loop: // each iteration tests 16 consecutive candidate positions
    72  	VL (R7), V2           // load 16-bytes into V2
    73  	VLL R0, 16(R7), V3     // load 2-bytes into V3
    74  	VSLDB $1, V2, V3, V4     // V4=(V2:V3)<<1
    75  	VSLDB $2, V2, V3, V9     // V9=(V2:V3)<<2
    76  	VCEQH V1, V2, V5         // compare 2-byte even indices
    77  	VCEQH V1, V4, V6         // compare 2-byte odd indices
    78  	VCEQB V8, V9, V10        // compare last bytes
    79  	VSEL V5, V6, V31, V7    // merge even and odd indices
    80  	VN V7, V10, V7        // AND indices with last byte
    81  	VFEEBS V16, V7, V17       // find leftmost index, set condition to 1 if found
    82  	BLT foundV17 // taken when VFEEBS found a match
    83  	MOVD $16(R7), R7        // R7+=16
    84  	ADD $15, R7, R9
    85  	CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
    86  	CMPBLE R7, R2, index2to16 // fewer than 16 candidates left: finish in the generic loop
    87  	BR notfound
    88  
    89  index4plus: // fast path for len(sep)==4 (R4==3)
    90  	CMPBNE R4, $3, index5plus
    91  	ADD $15, R7, R9 // R9=R7+15
    92  	CMPBGE R9, R2, index2to16 // not enough room for a 16-position block: use the generic loop
    93  	MOVD $2, R0 // VLL index 2: load bytes 0..2 of the following block
    94  	VGBM $0x8888, V29       // 0xff000000ff000000...
    95  	VGBM $0x2222, V30       // 0x0000ff000000ff00...
    96  	VGBM $0xcccc, V31       // 0xffff0000ffff0000...
    97  	VONE V16 // V16=all ones, the value VFEEBS searches for
    98  	VREPF $0, V0, V1 // V1=the four bytes of sep replicated into every word
    99  index4loop: // each iteration tests 16 consecutive candidate positions
   100  	VL (R7), V2           // load 16-bytes into V2
   101  	VLL R0, 16(R7), V3     // load 3-bytes into V3
   102  	VSLDB $1, V2, V3, V4     // V4=(V2:V3)<<1
   103  	VSLDB $2, V2, V3, V9     // V9=(V2:V3)<<2
   104  	VSLDB $3, V2, V3, V10    // V10=(V2:V3)<<3
   105  	VCEQF V1, V2, V5         // compare index 0, 4, ...
   106  	VCEQF V1, V4, V6         // compare index 1, 5, ...
   107  	VCEQF V1, V9, V11        // compare index 2, 6, ...
   108  	VCEQF V1, V10, V12       // compare index 3, 7, ...
   109  	VSEL V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
   110  	VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
   111  	VSEL V13, V14, V31, V7  // final merge
   112  	VFEEBS V16, V7, V17       // find leftmost index, set condition to 1 if found
   113  	BLT foundV17 // taken when VFEEBS found a match
   114  	MOVD $16(R7), R7        // R7+=16
   115  	ADD $15, R7, R9
   116  	CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
   117  	CMPBLE R7, R2, index2to16 // fewer than 16 candidates left: finish in the generic loop
   118  	BR notfound
   119  
   120  index5plus:
   121  	CMPBGT R4, $15, index17plus // len(sep) > 16 needs more than one vector per candidate
   122  index2to16: // generic loop for sep of at most 16 bytes; also the tail of the fast paths above
   123  	CMPBGT R7, R2, notfound // already past the last valid start
   124  	MOVD $1(R7), R8 // R8=R7+1
   125  	CMPBGT R8, R2, index2to16tail // exactly one candidate left
   126  index2to16loop:
   127  	// unrolled 2x
   128  	VLL R4, (R7), V1 // candidate at R7, loaded with the same length (and zero fill) as V0
   129  	VLL R4, 1(R7), V2 // candidate at R7+1
   130  	VCEQGS V0, V1, V3 // compare with sep; sets the condition code tested by BEQ
   131  	BEQ found // every element equal: match at R7
   132  	MOVD $1(R7), R7 // R7++
   133  	VCEQGS V0, V2, V4 // second candidate of the unrolled pair
   134  	BEQ found
   135  	MOVD $1(R7), R7 // R7++
   136  	CMPBLT R7, R2, index2to16loop // at least two candidates remain
   137  	CMPBGT R7, R2, notfound // none remain
   138  index2to16tail: // R7 is the last valid start
   139  	VLL R4, (R7), V1
   140  	VCEQGS V0, V1, V2
   141  	BEQ found
   142  	BR notfound
   143  
   144  index17plus: // len(sep) in 17..32: two vectors per candidate
   145  	CMPBGT R4, $31, index33plus
   146  	SUB $16, R4, R0 // R0=len(sep)-17: VLL index of the last sep byte within the second block
   147  	VLL R0, 16(R3), V1 // V1=sep bytes from offset 16 on
   148  	VONE V7 // all-ones reference for the combined compare result
   149  index17to32loop: // one candidate position per iteration
   150  	VL (R7), V2
   151  	VLL R0, 16(R7), V3
   152  	VCEQG V0, V2, V4
   153  	VCEQG V1, V3, V5
   154  	VN V4, V5, V6 // both blocks must match
   155  	VCEQGS V6, V7, V8 // is the combined result all ones?
   156  	BEQ found
   157  	MOVD $1(R7), R7 // R7++
   158  	CMPBLE  R7, R2, index17to32loop // continue while R7 <= last valid start
   159  	BR notfound
   160  
   161  index33plus: // len(sep) in 33..48: three vectors per candidate
   162  	CMPBGT R4, $47, index49plus
   163  	SUB $32, R4, R0 // R0=len(sep)-33: VLL index of the last sep byte within the third block
   164  	VL 16(R3), V1 // V1=sep[16:32]
   165  	VLL R0, 32(R3), V2 // V2=sep bytes from offset 32 on
   166  	VONE V11 // all-ones reference for the combined compare result
   167  index33to48loop: // one candidate position per iteration
   168  	VL (R7), V3
   169  	VL 16(R7), V4
   170  	VLL R0, 32(R7), V5
   171  	VCEQG V0, V3, V6
   172  	VCEQG V1, V4, V7
   173  	VCEQG V2, V5, V8
   174  	VN V6, V7, V9
   175  	VN V8, V9, V10 // all three blocks must match
   176  	VCEQGS V10, V11, V12 // is the combined result all ones?
   177  	BEQ found
   178  	MOVD $1(R7), R7 // R7++
   179  	CMPBLE  R7, R2, index33to48loop // continue while R7 <= last valid start
   180  	BR notfound
   181  
   182  index49plus: // len(sep) in 49..64: four vectors per candidate
   183  	CMPBGT R4, $63, index65plus
   184  	SUB $48, R4, R0 // R0=len(sep)-49: VLL index of the last sep byte within the fourth block
   185  	VL 16(R3), V1 // V1=sep[16:32]
   186  	VL 32(R3), V2 // V2=sep[32:48]
   187  	VLL R0, 48(R3), V3 // V3=sep bytes from offset 48 on
   188  	VONE V15 // all-ones reference for the combined compare result
   189  index49to64loop: // one candidate position per iteration
   190  	VL (R7), V4
   191  	VL 16(R7), V5
   192  	VL 32(R7), V6
   193  	VLL R0, 48(R7), V7
   194  	VCEQG V0, V4, V8
   195  	VCEQG V1, V5, V9
   196  	VCEQG V2, V6, V10
   197  	VCEQG V3, V7, V11
   198  	VN V8, V9, V12
   199  	VN V10, V11, V13
   200  	VN V12, V13, V14 // all four blocks must match
   201  	VCEQGS V14, V15, V16 // is the combined result all ones?
   202  	BEQ found
   203  	MOVD $1(R7), R7 // R7++
   204  	CMPBLE  R7, R2, index49to64loop // continue while R7 <= last valid start; otherwise fall through to notfound
   205  notfound:
   206  	MOVD $-1, (R5) // *ret = -1
   207  	RET
   208  
   209  index65plus: // len(sep) > 64
   210  	// not implemented
   211  	MOVD $0, (R0) // NOTE(review): store through R0, presumably a deliberate fault; confirm callers never pass len(sep) > 64
   212  	RET
   213  
   214  foundV17: // index is in doubleword V17[0]
   215  	VLGVG $0, V17, R8 // R8=match position within the current 16-position block
   216  	ADD R8, R7 // R7=address of the match
   217  found: // R7=address of the match
   218  	SUB R1, R7 // convert to an offset from &s[0]
   219  	MOVD R7, (R5) // *ret = offset
   220  	RET