github.com/primecitizens/pcz/std@v0.2.1/core/cmp/bs_ppc64x.s

github.com/primecitizens/pcz/std@v0.2.1/core/cmp/bs_ppc64x.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && (ppc64 || ppc64le)
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·Bytes<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
    13  	// Three-way compare of byte slices a and b; result in R3 (-1, 0, +1).
    14  	// Register arguments are moved to where the shared bodies expect them:
    15  	//   R3 (a addr) -> R5,  R4 (a len) -> R3
    16  	//   R6 (b addr) stays,  R7 (b len) -> R4
    17  	//   R5 (a cap) and R8 (b cap) are not used.
    18  	// Identical base addresses are decided on length alone; anything else
    19  	// branches to cmpbodyp9 (·isPOWER9 == 1) or to cmpbody.
    20  	MOVD R3, R5
    21  	MOVD R4, R3
    22  	MOVD R7, R4
    23  	CMP R3,R4,CR6 // CR6 = len(a) vs len(b)
    24  	CMP R5,R6,CR7 // CR7 = a addr vs b addr
    25  	BEQ CR7,sameptr
    26  	MOVBZ ·isPOWER9(SB), R16
    27  	CMP R16,$1
    28  	BEQ usep9
    29  	BR cmpbody<>(SB)
    30  usep9:
    31  	BR cmpbodyp9<>(SB)
    32  sameptr:
    33  	BEQ CR6,samelen
    34  	BGT CR6,longer
    35  	MOVD $-1, R3 // len(a) < len(b)
    36  	RET
    37  longer:
    38  	MOVD $1, R3 // len(a) > len(b)
    39  	RET
    40  samelen:
    41  	MOVD $0, R3
    42  	RET
    43  
    44  TEXT ·String<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    45  	// Three-way compare of strings a and b; result in R3 (-1, 0, +1).
    46  	// Incoming registers: R3 = a addr, R4 = a len, R5 = b addr, R6 = b len.
    47  	// The shared bodies want R5 = a addr, R6 = b addr, R3 = a len, R4 = b len,
    48  	// so b len is parked in R7 while the other three values are moved.
    49  	// Identical base addresses are decided on length alone.
    50  	MOVD R6, R7
    51  	MOVD R5, R6
    52  	MOVD R3, R5
    53  	MOVD R4, R3
    54  	MOVD R7, R4
    55  	CMP R3,R4,CR6 // CR6 = len(a) vs len(b)
    56  	CMP R5,R6,CR7 // CR7 = a addr vs b addr
    57  	BEQ CR7,sameptr
    58  	MOVBZ ·isPOWER9(SB), R16
    59  	CMP R16,$1
    60  	BEQ usep9
    61  	BR cmpbody<>(SB)
    62  usep9:
    63  	BR cmpbodyp9<>(SB)
    64  sameptr:
    65  	BEQ CR6,samelen
    66  	BGT CR6,longer
    67  	MOVD $-1, R3 // len(a) < len(b)
    68  	RET
    69  longer:
    70  	MOVD $1, R3 // len(a) > len(b)
    71  	RET
    72  
    73  samelen:
    74  	MOVD $0, R3
    75  	RET
    76  
    77  #ifdef GOARCH_ppc64le
    78  DATA byteswap<>+0(SB)/8, $0x0706050403020100 // first 8 bytes of the 16-byte VPERM control loaded at cmpbody's "different" label
    79  DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 // second 8 bytes of the same control vector
    80  GLOBL byteswap<>+0(SB), RODATA, $16 // file-local, 16 bytes, read-only; only defined for ppc64le builds
    81  #define SWAP V21
    82  #endif
    83  
    84  // cmpbody: three-way compare body that ·Bytes and ·String branch to when ·isPOWER9 is not 1 (ppc64le/ppc64).
    85  // R3 = a len
    86  // R4 = b len
    87  // R5 = a addr (advanced as data is consumed)
    88  // R6 = b addr (advanced as data is consumed)
    89  // On exit:
    90  // R3 = -1 if a < b, 0 if a == b, +1 if a > b; when the common prefix matches, the lengths (CR2) decide
    91  TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
    92  	MOVD R3,R8 // R8 = len(a), provisional compare length
    93  	CMP R3,R4,CR2 // CR2 = len(a) vs len(b); consulted once the common prefix matches
    94  	BLT CR2,setuplen // len(a) < len(b): keep R8 = len(a)
    95  	MOVD R4,R8 // otherwise R8 = len(b), so R8 = min(len(a), len(b))
    96  setuplen:
    97  	CMP R8,$32 // optimize >= 32
    98  	MOVD R8,R9 // R9 = bytes still to compare
    99  	BLT setup8a // fewer than 32 bytes: doubleword/scalar path
   100  	MOVD $16,R10 // R10 = 16, index of the second 16-byte vector
   101  	CMP R8,$64
   102  	BLT cmp32 // process size 32-63
   103  
   104  	DCBT (R5)		// optimize >= 64
   105  	DCBT (R6)		// cache hint
   106  	MOVD $32,R11 // R11 = 32, index of the third 16-byte vector
   107  	MOVD $48,R12 // R12 = 48, index of the fourth 16-byte vector
   108  
   109  loop64a:// process size 64 and greater
   110  	LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
   111  	LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
   112  	VCMPEQUDCC V3,V4,V1 // doubleword equality; CR6 LT is set only when both doublewords match
   113  	BGE CR6,different // jump out if any doubleword differs
   114  
   115  	LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
   116  	LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
   117  
   118  	VCMPEQUDCC V3,V4,V1
   119  	BGE CR6,different
   120  
   121  	LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
   122  	LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
   123  
   124  	VCMPEQUDCC V3,V4,V1
   125  	BGE CR6,different
   126  
   127  	LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
   128  	LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
   129  
   130  	VCMPEQUDCC V3,V4,V1
   131  	BGE CR6,different
   132  
   133  	ADD $-64,R9,R9 // reduce remaining size by 64
   134  	ADD $64,R5,R5 // increment to next 64 bytes of A
   135  	ADD $64,R6,R6 // increment to next 64 bytes of B
   136  	CMPU R9,$64
   137  	BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
   138  	
   139  	CMPU R9,$32
   140  	BGE cmp32 // go to cmp32 if there are 32-63 bytes remaining
   141  	CMPU R9,$0
   142  	BNE rem // go to rem if the remainder is not 0
   143  
   144  	BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
   145  	BLT CR2,less // jump to less if len(A)<len(B)
   146  	BR greater // jump to greater otherwise
   147  cmp32:
   148  	LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
   149  	LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
   150  
   151  	VCMPEQUDCC V3,V4,V1
   152  	BGE CR6,different
   153  
   154  	LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
   155  	LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
   156  
   157  	VCMPEQUDCC V3,V4,V1
   158  	BGE CR6,different
   159  
   160  	ADD $-32,R9,R9 // reduce remaining size by 32
   161  	ADD $32,R5,R5 // increment to next 32 bytes of A
   162  	ADD $32,R6,R6 // increment to next 32 bytes of B
   163  	CMPU R9,$0
   164  	BNE rem // go to rem if the remainder is not 0
   165  	BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
   166  	BLT CR2,less // jump to less if len(A)<len(B)
   167  	BR greater // jump to greater otherwise
   168  rem:
   169  	MOVD R9,R8 // R8 = bytes remaining (setup8a and leftover read R8)
   170  	ANDCC $24,R8,R9 // Any 8 byte chunks?
   171  	BEQ leftover // none: only the low 3 bits of R8 can be set
   172  	BR setup8a
   173  
   174  different:
   175  #ifdef GOARCH_ppc64le
   176  	MOVD $byteswap<>+00(SB), R16
   177  	LXVD2X (R16)(R0),SWAP // Set up swap string
   178  
   179  	VPERM V3,V3,SWAP,V3 // reorder the bytes of A's vector with the byteswap control
   180  	VPERM V4,V4,SWAP,V4 // same reordering for B's vector
   181  #endif
   182  	MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison (VS35/VS36 are V3/V4)
   183  	MFVSRD VS36,R10
   184  
   185  	CMPU R16,R10
   186  	BEQ lower // upper halves match, so the difference is in the other doubleword
   187  	BGT greater
   188  	MOVD $-1,R3 // return value if A < B
   189  	RET
   190  lower:
   191  	VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
   192  	MFVSRD VS35,R16
   193  	VSLDOI $8,V4,V4,V4
   194  	MFVSRD VS36,R10
   195  
   196  	CMPU R16,R10
   197  	BGT greater
   198  	MOVD $-1,R3 // return value if A < B
   199  	RET
   200  setup8a:
   201  	SRADCC $3,R8,R9 // get the 8 byte count
   202  	BEQ leftover // shifted value is 0
   203  	CMPU R8,$8 // optimize 8byte move
   204  	BEQ size8
   205  	CMPU R8,$16
   206  	BEQ size16
   207  	MOVD R9,CTR // loop count for doublewords
   208  loop8:
   209  #ifdef  GOARCH_ppc64le
   210  	MOVDBR (R5+R0),R16 // doublewords to compare
   211  	MOVDBR (R6+R0),R10 // LE compare order
   212  #else
   213  	MOVD (R5+R0),R16 // doublewords to compare
   214  	MOVD (R6+R0),R10 // BE compare order
   215  #endif
   216  	ADD $8,R5
   217  	ADD $8,R6
   218  	CMPU R16,R10 // match?
   219  	BC 8,2,loop8 // decrement CTR; loop while CTR != 0 and CR0 EQ is set (doublewords matched)
   220  	BGT greater
   221  	BLT less
   222  leftover:
   223  	ANDCC $7,R8,R9 // check for leftover bytes
   224  	BEQ zeroremainder
   225  simplecheck:
   226  	MOVD R0,R14 // R14 = running offset, starting from R0 (used as the zero index in the loads above)
   227  	CMP R9,$4 // process 4 bytes
   228  	BLT halfword
   229  #ifdef  GOARCH_ppc64le
   230  	MOVWBR (R5)(R14),R10
   231  	MOVWBR (R6)(R14),R11
   232  #else
   233  	MOVWZ (R5)(R14),R10
   234  	MOVWZ (R6)(R14),R11
   235  #endif
   236  	CMPU R10,R11
   237  	BGT greater
   238  	BLT less
   239  	ADD $-4,R9
   240  	ADD $4,R14
   241  	PCALIGN $16
   242  
   243  halfword:
   244  	CMP R9,$2 // process 2 bytes
   245  	BLT byte
   246  #ifdef  GOARCH_ppc64le
   247  	MOVHBR (R5)(R14),R10
   248  	MOVHBR (R6)(R14),R11
   249  #else
   250  	MOVHZ (R5)(R14),R10
   251  	MOVHZ (R6)(R14),R11
   252  #endif
   253  	CMPU R10,R11
   254  	BGT greater
   255  	BLT less
   256  	ADD $-2,R9
   257  	ADD $2,R14
   258  	PCALIGN $16
   259  byte:
   260  	CMP R9,$0 // process 1 byte
   261  	BEQ skip
   262  	MOVBZ (R5)(R14),R10
   263  	MOVBZ (R6)(R14),R11
   264  	CMPU R10,R11
   265  	BGT greater
   266  	BLT less
   267  	PCALIGN $16
   268  skip:
   269  	BEQ CR2,equal // common prefix matched: the length comparison in CR2 decides
   270  	BGT CR2,greater // len(A) > len(B); otherwise fall through to less
   271  
   272  less:	MOVD $-1,R3 // return value if A < B
   273  	RET
   274  size16:
   275  	LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
   276  	LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
   277  	VCMPEQUDCC V3,V4,V1
   278  	BGE CR6,different
   279  zeroremainder:
   280  	BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
   281  	BLT CR2,less // jump to less if len(A)<len(B)
   282  	BR greater // jump to greater otherwise
   283  size8:
   284  #ifdef  GOARCH_ppc64le
   285  	MOVDBR (R5+R0),R16 // doublewords to compare
   286  	MOVDBR (R6+R0),R10 // LE compare order
   287  #else
   288  	MOVD (R5+R0),R16 // doublewords to compare
   289  	MOVD (R6+R0),R10 // BE compare order
   290  #endif
   291  	CMPU R16,R10 // match?
   292  	BGT greater
   293  	BLT less
   294  	BGT CR2,greater // contents equal: len(A) > len(B)
   295  	BLT CR2,less // contents equal: len(A) < len(B)
   296  equal:
   297  	MOVD $0, R3 // return value if A == B
   298  	RET
   299  greater:
   300  	MOVD $1,R3 // return value if A > B
   301  	RET
   302  
   303  // cmpbodyp9: three-way compare body that ·Bytes and ·String branch to when ·isPOWER9 is 1 (ppc64le/ppc64).
   304  // R3 = a len
   305  // R4 = b len
   306  // R5 = a addr (advanced as data is consumed)
   307  // R6 = b addr (advanced as data is consumed)
   308  // On exit:
   309  // R3 = -1 if a < b, 0 if a == b, +1 if a > b; when the common prefix matches, the lengths (CR2) decide
   310  TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
   311  	MOVD R3,R8 // R8 = len(a), provisional compare length
   312  	CMP R3,R4,CR2 // CR2 = len(a) vs len(b); consulted once the common prefix matches
   313  	BLT CR2,setuplen // len(a) < len(b): keep R8 = len(a)
   314  	MOVD R4,R8 // otherwise R8 = len(b), so R8 = min(len(a), len(b))
   315  setuplen:
   316  	CMP R8,$16 // optimize for size<16
   317  	MOVD R8,R9 // R9 = bytes still to compare
   318  	BLT simplecheck
   319  	MOVD $16,R10 // R10 = 16, index of the second 16-byte vector
   320  	CMP R8,$32 // optimize for size 16-31
   321  	BLT cmp16
   322  	CMP R8,$64
   323  	BLT cmp32 // optimize for size 32-63
   324  	DCBT (R5)		// optimize for size>=64
   325  	DCBT (R6)		// cache hint
   326  
   327  	MOVD $32,R11 // R11 = 32, index of the third 16-byte vector
   328  	MOVD $48,R12 // R12 = 48, index of the fourth 16-byte vector
   329  
   330  loop64a:// process size 64 and greater
   331  	LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
   332  	LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
   333  	VCMPNEBCC V3,V4,V1 // byte-wise "not equal"; CR6 EQ is set only when no byte differs
   334  	BNE CR6,different // jump out if any byte differs
   335  
   336  	LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
   337  	LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
   338  	VCMPNEBCC V3,V4,V1
   339  	BNE CR6,different
   340  
   341  	LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
   342  	LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
   343  	VCMPNEBCC V3,V4,V1
   344  	BNE CR6,different
   345  
   346  	LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
   347  	LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
   348  	VCMPNEBCC V3,V4,V1
   349  	BNE CR6,different
   350  
   351  	ADD $-64,R9,R9 // reduce remaining size by 64
   352  	ADD $64,R5,R5 // increment to next 64 bytes of A
   353  	ADD $64,R6,R6 // increment to next 64 bytes of B
   354  	CMPU R9,$64
   355  	BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
   356  
   357  	CMPU R9,$32
   358  	BGE cmp32 // go to cmp32 if there are 32-63 bytes remaining
   359  	CMPU R9,$16
   360  	BGE cmp16 // go to cmp16 if there are 16-31 bytes left
   361  	CMPU R9,$0
   362  	BNE simplecheck // go to simplecheck for remaining bytes
   363  
   364  	BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
   365  	BLT CR2,less // jump to less if len(A)<len(B)
   366  	BR greater // jump to greater otherwise
   367  cmp32:
   368  	LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
   369  	LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
   370  
   371  	VCMPNEBCC V3,V4,V1 // record comparison into V1
   372  	BNE CR6,different // jump out if any byte differs
   373  
   374  	LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
   375  	LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
   376  	VCMPNEBCC V3,V4,V1
   377  	BNE CR6,different
   378  
   379  	ADD $-32,R9,R9 // reduce remaining size by 32
   380  	ADD $32,R5,R5 // increment to next 32 bytes of A
   381  	ADD $32,R6,R6 // increment to next 32 bytes of B
   382  	CMPU R9,$16 // go to cmp16 if there are 16-31 bytes left
   383  	BGE cmp16
   384  	CMPU R9,$0
   385  	BNE simplecheck // go to simplecheck for remainder bytes
   386  	BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
   387  	BLT CR2,less // jump to less if len(A)<len(B)
   388  	BR greater // jump to greater otherwise
   389  different:
   390  
   391  	MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison (VS35/VS36 are V3/V4)
   392  	MFVSRD VS36,R10
   393  
   394  	CMPU R16,R10
   395  	BEQ lower // upper halves match, so the difference is in the lower doubleword
   396  	BGT greater
   397  	MOVD $-1,R3 // return value if A < B
   398  	RET
   399  lower:
   400  	MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
   401  	MFVSRLD VS36,R10
   402  
   403  	CMPU R16,R10
   404  	BGT greater
   405  	MOVD $-1,R3 // return value if A < B
   406  	RET
   407  
   408  greater:
   409  	MOVD $1,R3 // return value if A > B
   410  	RET
   411  cmp16:
   412  	ANDCC $16,R9,R31 // test bit 4 of the remaining count; R31 is not read again here. NOTE(review): R31 is normally the assembler temporary on ppc64 - confirm this scratch use is intended
   413  	BEQ tail // no full 16-byte chunk pending
   414  
   415  	LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
   416  	LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
   417  	VCMPEQUDCC V3,V4,V1 // doubleword equality; CR6 LT is set only when both doublewords match
   418  	BGE CR6,different
   419  
   420  	ADD $16,R5
   421  	ADD $16,R6
   422  tail:
   423  	ANDCC $15,R9 // R9 &= 15; if nonzero, compare the 16 bytes that end at the last byte to compare (may overlap bytes already checked)
   424  	BEQ end
   425  
   426  	ADD R9,R5 // R5 = one past the last byte of A to compare
   427  	ADD R9,R6 // R6 = one past the last byte of B to compare
   428  	MOVD $-16,R10 // index -16 selects the final 16-byte window
   429  
   430  	LXVB16X (R10)(R5),V3 // load the last 16 bytes of A (offset -16) into vector
   431  	LXVB16X (R10)(R6),V4 // load the last 16 bytes of B (offset -16) into vector
   432  	VCMPEQUDCC V3,V4,V1
   433  	BGE CR6,different
   434  end:
   435  	BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
   436  	BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B)
   437  	BR greater // jump to greater otherwise
   438  simplecheck:
   439  	MOVD $0,R14 // R14 = running offset; process 8 bytes
   440  	CMP R9,$8
   441  	BLT word
   442  #ifdef  GOARCH_ppc64le
   443  	MOVDBR (R5+R14),R10
   444  	MOVDBR (R6+R14),R11
   445  #else
   446  	MOVD (R5+R14),R10
   447  	MOVD (R6+R14),R11
   448  #endif
   449  	CMPU R10,R11
   450  	BGT greater
   451  	BLT less
   452  	ADD $8,R14
   453  	ADD $-8,R9
   454  	PCALIGN $16
   455  word:
   456  	CMP R9,$4 // process 4 bytes
   457  	BLT halfword
   458  #ifdef  GOARCH_ppc64le
   459  	MOVWBR (R5+R14),R10
   460  	MOVWBR (R6+R14),R11
   461  #else
   462  	MOVWZ (R5+R14),R10
   463  	MOVWZ (R6+R14),R11
   464  #endif
   465  	CMPU R10,R11
   466  	BGT greater
   467  	BLT less
   468  	ADD $4,R14
   469  	ADD $-4,R9
   470  	PCALIGN $16
   471  halfword:
   472  	CMP R9,$2 // process 2 bytes
   473  	BLT byte
   474  #ifdef  GOARCH_ppc64le
   475  	MOVHBR (R5+R14),R10
   476  	MOVHBR (R6+R14),R11
   477  #else
   478  	MOVHZ (R5+R14),R10
   479  	MOVHZ (R6+R14),R11
   480  #endif
   481  	CMPU R10,R11
   482  	BGT greater
   483  	BLT less
   484  	ADD $2,R14
   485  	ADD $-2,R9
   486  	PCALIGN $16
   487  byte:
   488  	CMP R9,$0 // process 1 byte
   489  	BEQ skip
   490  	MOVBZ (R5+R14),R10
   491  	MOVBZ (R6+R14),R11
   492  	CMPU R10,R11
   493  	BGT greater
   494  	BLT less
   495  	PCALIGN $16
   496  skip:
   497  	BEQ CR2,equal // common prefix matched: the length comparison in CR2 decides
   498  	BGT CR2,greater // len(A) > len(B); otherwise fall through to less
   499  less:
   500  	MOVD $-1,R3 // return value if A < B
   501  	RET
   502  equal:
   503  	MOVD $0, R3 // return value if A == B
   504  	RET
   504  	RET