github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/internal/bytealg/compare_ppc64x.s

github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/internal/bytealg/compare_ppc64x.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ppc64 || ppc64le
     6  
     7  #include "go_asm.h"
     8  #include "textflag.h"
     9  
    10  TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
        	// Register-ABI entry point: compares a and b, each described by an
        	// (addr, len, cap) triple in R3-R8, and leaves the result in R3:
        	// 0 if equal, 1 if a > b, -1 if a < b.
        	// When a and b start at the same address the result is derived from
        	// the two lengths alone. Otherwise control branches (it does not
        	// come back here) to cmpbodyp9<> or cmpbody<>, selected by the
        	// HasPOWER9 byte of internal/cpu·PPC64.
    11  	// incoming:
    12  	// R3 a addr -> R5
    13  	// R4 a len  -> R3
    14  	// R5 a cap unused
    15  	// R6 b addr -> R6
    16  	// R7 b len  -> R4
    17  	// R8 b cap unused
    18  	MOVD	R3, R5
    19  	MOVD	R4, R3
    20  	MOVD	R7, R4
    21  	CMP     R5,R6,CR7	// CR7: a addr vs b addr
    22  	CMP	R3,R4,CR6	// CR6: a len vs b len
    23  	BEQ	CR7,equal	// same address: only the lengths matter
    24  	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16	// R16 = HasPOWER9 flag byte
    25  	CMP	R16,$1
    26  	BNE	power8		// flag is not 1: use the POWER8 body
    27  	BR	cmpbodyp9<>(SB)	// the body sets R3 and returns to our caller
    28  power8:
    29  	BR	cmpbody<>(SB)	// the body sets R3 and returns to our caller
    30  equal:
    31  	BEQ	CR6,done	// same address and same length: result is 0
    32  	MOVD	$1, R8		// assume a len > b len
    33  	BGT	CR6,greater
    34  	NEG	R8		// a len < b len: result becomes -1
    35  greater:
    36  	MOVD	R8, R3
    37  	RET
    38  done:
    39  	MOVD	$0, R3
    40  	RET
    41  
    42  TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
        	// Register-ABI entry point: compares a and b, each described by an
        	// (addr, len) pair in R3-R6, and leaves the result in R3:
        	// 0 if equal, 1 if a > b, -1 if a < b.
        	// The arguments are first moved into the layout the shared bodies
        	// expect (R3 = a len, R4 = b len, R5 = a addr, R6 = b addr).
        	// When both addresses are identical the result is derived from the
        	// lengths alone; otherwise control branches (it does not come back
        	// here) to cmpbodyp9<> or cmpbody<>, selected by the HasPOWER9 byte
        	// of internal/cpu·PPC64.
    43  	// incoming:
    44  	// R3 a addr -> R5
    45  	// R4 a len  -> R3
    46  	// R5 b addr -> R6
    47  	// R6 b len  -> R4
    48  	MOVD	R6, R7		// stash b len in R7; R6 is overwritten next
    49  	MOVD	R5, R6		// R6 = b addr
    50  	MOVD	R3, R5		// R5 = a addr
    51  	MOVD	R4, R3		// R3 = a len
    52  	MOVD	R7, R4		// R4 = b len
    53  	CMP     R5,R6,CR7	// CR7: a addr vs b addr
    54  	CMP	R3,R4,CR6	// CR6: a len vs b len
    55  	BEQ	CR7,equal	// same address: only the lengths matter
    56  	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16	// R16 = HasPOWER9 flag byte
    57  	CMP	R16,$1
    58  	BNE	power8		// flag is not 1: use the POWER8 body
    59  	BR	cmpbodyp9<>(SB)	// the body sets R3 and returns to our caller
    60  power8:
    61  	BR	cmpbody<>(SB)	// the body sets R3 and returns to our caller
    62  equal:
    63  	BEQ	CR6,done	// same address and same length: result is 0
    64  	MOVD	$1, R8		// assume a len > b len
    65  	BGT	CR6,greater
    66  	NEG	R8		// a len < b len: result becomes -1
    67  greater:
    68  	MOVD	R8, R3
    69  	RET
    70  
    71  done:
    72  	MOVD	$0, R3
    73  	RET
    74  
    75  #ifdef GOARCH_ppc64le
        // byteswap<> is a 16-byte read-only table, defined for ppc64le only.
        // cmpbody<>'s "different" path loads it into SWAP (V21) and uses it as
        // the VPERM selector applied to V3 and V4 before their doublewords are
        // moved to GPRs and compared.
    76  DATA byteswap<>+0(SB)/8, $0x0706050403020100
    77  DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    78  GLOBL byteswap<>+0(SB), RODATA, $16
    79  #define SWAP V21
    80  #endif
    81  
    82  // Do an efficient memcmp for ppc64le/ppc64/POWER8
    83  // R3 = a len
    84  // R4 = b len
    85  // R5 = a addr
    86  // R6 = b addr
    87  // On exit:
    88  // R3 = return value
        //      (-1 if A < B, 0 if A == B, 1 if A > B)
        // Internal register use:
        // R8  = number of bytes to compare, initially min(len(A), len(B))
        // R9  = bytes remaining / chunk counts
        // CR2 = len(A) compared with len(B); it is set once at entry and
        //       decides the result when every compared byte matches
        // CR6 = result of the vector compares, CR0 = result of scalar compares
    89  TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
    90  	MOVD	R3,R8		// set up length
    91  	CMP	R3,R4,CR2	// unequal?
    92  	BLT	CR2,setuplen	// len(A) < len(B): keep R8 = len(A)
    93  	MOVD	R4,R8		// use R4 for comparison len
    94  setuplen:
    95  	CMP	R8,$32		// optimize >= 32
    96  	MOVD	R8,R9		// R9 = bytes remaining
    97  	BLT	setup8a		// optimize < 32
    98  	MOVD	$16,R10		// set offsets to load into vectors
    99  	CMP	R8,$64
   100  	BLT	cmp32		// process size 32-63
   101  
   102  	DCBT	(R5)		// optimize >= 64
   103  	DCBT	(R6)		// cache hint
   104  	MOVD	$32,R11		// set offsets to load into vector
   105  	MOVD	$48,R12		// set offsets to load into vector
   106  
   107  loop64a:// process size 64 and greater
   108  	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
   109  	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
   110  	VCMPEQUDCC	V3,V4,V1
   111  	BGE	CR6,different	// jump out if it's different
   112  
   113  	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
   114  	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
   115  
   116  	VCMPEQUDCC	V3,V4,V1
   117  	BGE	CR6,different
   118  
   119  	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
   120  	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
   121  
   122  	VCMPEQUDCC	V3,V4,V1
   123  	BGE	CR6,different
   124  
   125  	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
   126  	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
   127  
   128  	VCMPEQUDCC	V3,V4,V1
   129  	BGE	CR6,different
   130  
   131  	ADD	$-64,R9,R9	// reduce remaining size by 64
   132  	ADD	$64,R5,R5	// increment to next 64 bytes of A
   133  	ADD	$64,R6,R6	// increment to next 64 bytes of B
   134  	CMPU	R9,$64
   135  	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
   136  	
   137  	CMPU	R9,$32
   138  	BGE	cmp32		// go to cmp32 if there are 32-63 bytes remaining
   139  	CMPU	R9,$0
   140  	BNE	rem		// go to rem if the remainder is not 0
   141  
   142  	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
   143  	BLT	CR2,less	// jump to less if len(A)<len(B)
   144  	BR	greater		// jump to greater otherwise
   145  cmp32:
   146  	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
   147  	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
   148  
   149  	VCMPEQUDCC	V3,V4,V1
   150  	BGE	CR6,different
   151  
   152  	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
   153  	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
   154  
   155  	VCMPEQUDCC	V3,V4,V1
   156  	BGE	CR6,different
   157  
   158  	ADD	$-32,R9,R9	// reduce remaining size by 32
   159  	ADD	$32,R5,R5	// increment to next 32 bytes of A
   160  	ADD	$32,R6,R6	// increment to next 32 bytes of B
   161  	CMPU	R9,$0
   162  	BNE	rem		// go to rem if the remainder is not 0
   163  	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
   164  	BLT	CR2,less	// jump to less if len(A)<len(B)
   165  	BR	greater		// jump to greater otherwise
   166  rem:
   167  	MOVD	R9,R8		// R8 = bytes still to compare (fewer than 32 here)
   168  	ANDCC	$24,R8,R9	// Any 8 byte chunks?
   169  	BEQ	leftover	// and result is 0
   170  	BR	setup8a
   171  
   172  different:
        	// V3 and V4 hold the 16-byte chunks of A and B that did not match.
   173  #ifdef	GOARCH_ppc64le
   174  	MOVD	$byteswap<>+00(SB), R16
   175  	LXVD2X	(R16)(R0),SWAP	// Set up swap string
   176  
   177  	VPERM	V3,V3,SWAP,V3
   178  	VPERM	V4,V4,SWAP,V4
   179  #endif
   180  	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
   181  	MFVSRD	VS36,R10
   182  
   183  	CMPU	R16,R10
   184  	BEQ	lower		// upper halves match, so the difference is in the lower halves
   185  	BGT	greater
   186  	MOVD	$-1,R3		// return value if A < B
   187  	RET
   188  lower:
   189  	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
   190  	MFVSRD	VS35,R16
   191  	VSLDOI	$8,V4,V4,V4
   192  	MFVSRD	VS36,R10
   193  
   194  	CMPU	R16,R10
   195  	BGT	greater
   196  	MOVD	$-1,R3		// return value if A < B
   197  	RET
   198  setup8a:
   199  	SRADCC	$3,R8,R9	// get the 8 byte count
   200  	BEQ	leftover	// shifted value is 0
   201  	CMPU	R8,$8		// exactly 8 bytes: single doubleword compare
   202  	BEQ	size8
   203  	CMPU	R8,$16		// exactly 16 bytes: single vector compare
   204  	BEQ	size16
   205  	MOVD	R9,CTR		// loop count for doublewords
   206  loop8:
   207  #ifdef  GOARCH_ppc64le
   208  	MOVDBR	(R5+R0),R16	// doublewords to compare
   209  	MOVDBR	(R6+R0),R10	// LE compare order
   210  #else
   211  	MOVD	(R5+R0),R16	// doublewords to compare
   212  	MOVD	(R6+R0),R10	// BE compare order
   213  #endif
   214  	ADD	$8,R5
   215  	ADD	$8,R6
   216  	CMPU	R16,R10		// match?
   217  	BC	8,2,loop8	// decrement CTR; loop while CTR != 0 and the doublewords matched (CR0 EQ)
   218  	BGT	greater
   219  	BLT	less
   220  leftover:
   221  	ANDCC	$7,R8,R9	// check for leftover bytes
   222  	BEQ	zeroremainder
   223  simplecheck:
   224  	MOVD	R0,R14		// R14 = offset into A and B, starting at 0
   225  	CMP	R9,$4		// process 4 bytes
   226  	BLT	halfword
   227  #ifdef  GOARCH_ppc64le
   228  	MOVWBR	(R5)(R14),R10
   229  	MOVWBR	(R6)(R14),R11
   230  #else
   231  	MOVWZ	(R5)(R14),R10
   232  	MOVWZ	(R6)(R14),R11
   233  #endif
   234  	CMPU	R10,R11
   235  	BGT	greater
   236  	BLT	less
   237  	ADD	$-4,R9
   238  	ADD	$4,R14
   239  	PCALIGN	$16
   240  
   241  halfword:
   242  	CMP	R9,$2		// process 2 bytes
   243  	BLT	byte
   244  #ifdef  GOARCH_ppc64le
   245  	MOVHBR	(R5)(R14),R10
   246  	MOVHBR	(R6)(R14),R11
   247  #else
   248  	MOVHZ	(R5)(R14),R10
   249  	MOVHZ	(R6)(R14),R11
   250  #endif
   251  	CMPU	R10,R11
   252  	BGT	greater
   253  	BLT	less
   254  	ADD	$-2,R9
   255  	ADD	$2,R14
   256  	PCALIGN	$16
   257  byte:
   258  	CMP	R9,$0		// process 1 byte
   259  	BEQ	skip
   260  	MOVBZ	(R5)(R14),R10
   261  	MOVBZ	(R6)(R14),R11
   262  	CMPU	R10,R11
   263  	BGT	greater
   264  	BLT	less
   265  	PCALIGN	$16
   266  skip:
        	// all compared bytes matched: the lengths (CR2) decide the result
   267  	BEQ	CR2,equal
   268  	BGT	CR2,greater
   269  
   270  less:	MOVD	$-1,R3		// return value if A < B
   271  	RET
   272  size16:
   273  	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
   274  	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
   275  	VCMPEQUDCC	V3,V4,V1
   276  	BGE	CR6,different
   277  zeroremainder:
   278  	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
   279  	BLT	CR2,less	// jump to less if len(A)<len(B)
   280  	BR	greater		// jump to greater otherwise
   281  size8:
   282  #ifdef  GOARCH_ppc64le
   283  	MOVDBR	(R5+R0),R16	// doublewords to compare
   284  	MOVDBR	(R6+R0),R10	// LE compare order
   285  #else
   286  	MOVD	(R5+R0),R16	// doublewords to compare
   287  	MOVD	(R6+R0),R10	// BE compare order
   288  #endif
   289  	CMPU	R16,R10		// match?
   290  	BGT	greater
   291  	BLT	less
   292  	BGT	CR2,greater	// bytes match and len(A) > len(B)
   293  	BLT	CR2,less	// bytes match and len(A) < len(B)
   294  equal:
   295  	MOVD	$0, R3		// return value if A == B
   296  	RET
   297  greater:
   298  	MOVD	$1,R3		// return value if A > B
   299  	RET
   300  
   301  // Do an efficient memcmp for ppc64le/ppc64/POWER9
   302  // R3 = a len
   303  // R4 = b len
   304  // R5 = a addr
   305  // R6 = b addr
   306  // On exit:
   307  // R3 = return value
        //      (-1 if A < B, 0 if A == B, 1 if A > B)
        // Internal register use:
        // R8  = number of bytes to compare, min(len(A), len(B))
        // R9  = bytes remaining
        // CR2 = len(A) compared with len(B); it is set once at entry and
        //       decides the result when every compared byte matches
        // CR6 = result of the vector compares, CR0 = result of scalar compares
   308  TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
   309  	MOVD	R3,R8		// set up length
   310  	CMP	R3,R4,CR2	// unequal?
   311  	BLT	CR2,setuplen	// len(A) < len(B): keep R8 = len(A)
   312  	MOVD	R4,R8		// use R4 for comparison len
   313  setuplen:
   314  	CMP	R8,$16		// optimize for size<16
   315  	MOVD	R8,R9		// R9 = bytes remaining
   316  	BLT	simplecheck
   317  	MOVD	$16,R10		// set offsets to load into vectors
   318  	CMP	R8,$32		// optimize for size 16-31
   319  	BLT	cmp16
   320  	CMP	R8,$64
   321  	BLT	cmp32		// optimize for size 32-63
   322  	DCBT	(R5)		// optimize for size>=64
   323  	DCBT	(R6)		// cache hint
   324  
   325  	MOVD	$32,R11		// set offsets to load into vector
   326  	MOVD	$48,R12		// set offsets to load into vector
   327  
   328  loop64a:// process size 64 and greater
   329  	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
   330  	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
   331  	VCMPNEBCC	V3,V4,V1	// record comparison into V1
   332  	BNE	CR6,different	// jump out if it's different
   333  
   334  	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
   335  	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
   336  	VCMPNEBCC	V3,V4,V1
   337  	BNE	CR6,different
   338  
   339  	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
   340  	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
   341  	VCMPNEBCC	V3,V4,V1
   342  	BNE	CR6,different
   343  
   344  	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
   345  	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
   346  	VCMPNEBCC	V3,V4,V1
   347  	BNE	CR6,different
   348  
   349  	ADD	$-64,R9,R9	// reduce remaining size by 64
   350  	ADD	$64,R5,R5	// increment to next 64 bytes of A
   351  	ADD	$64,R6,R6	// increment to next 64 bytes of B
   352  	CMPU	R9,$64
   353  	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
   354  
   355  	CMPU	R9,$32
   356  	BGE	cmp32		// go to cmp32 if there are 32-63 bytes remaining
   357  	CMPU	R9,$16
   358  	BGE	cmp16		// go to cmp16 if there are 16-31 bytes left
   359  	CMPU	R9,$0
   360  	BNE	simplecheck	// go to simplecheck for remaining bytes
   361  
   362  	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
   363  	BLT	CR2,less	// jump to less if len(A)<len(B)
   364  	BR	greater		// jump to greater otherwise
   365  cmp32:
   366  	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
   367  	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
   368  
   369  	VCMPNEBCC	V3,V4,V1	// record comparison into V1
   370  	BNE	CR6,different	// jump out if it's different
   371  
   372  	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
   373  	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
   374  	VCMPNEBCC	V3,V4,V1
   375  	BNE	CR6,different
   376  
   377  	ADD	$-32,R9,R9	// reduce remaining size by 32
   378  	ADD	$32,R5,R5	// increment to next 32 bytes of A
   379  	ADD	$32,R6,R6	// increment to next 32 bytes of B
   380  	CMPU	R9,$16		// go to cmp16 if there are 16-31 bytes left
   381  	BGE	cmp16
   382  	CMPU	R9,$0
   383  	BNE	simplecheck	// go to simplecheck for remainder bytes
   384  	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
   385  	BLT	CR2,less	// jump to less if len(A)<len(B)
   386  	BR	greater		// jump to greater otherwise
   387  different:
        	// V3 and V4 hold the 16-byte chunks of A and B that did not match.
   388  
   389  	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
   390  	MFVSRD	VS36,R10
   391  
   392  	CMPU	R16,R10
   393  	BEQ	lower		// upper halves match, so the difference is in the lower halves
   394  	BGT	greater
   395  	MOVD	$-1,R3		// return value if A < B
   396  	RET
   397  lower:
   398  	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
   399  	MFVSRLD	VS36,R10
   400  
   401  	CMPU	R16,R10
   402  	BGT	greater
   403  	MOVD	$-1,R3		// return value if A < B
   404  	RET
   405  
   406  greater:
   407  	MOVD	$1,R3		// return value if A > B
   408  	RET
   409  cmp16:
   410  	ANDCC	$16,R9,R31	// is a full 16-byte chunk left? (R31 only receives the scratch result)
   411  	BEQ	tail
   412  
   413  	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
   414  	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
   415  	VCMPEQUDCC	V3,V4,V1
   416  	BGE	CR6,different
   417  
   418  	ADD	$16,R5
   419  	ADD	$16,R6
   420  tail:
   421  	ANDCC	$15,R9		// R9 = leftover bytes (0-15); if any, re-compare the final 16 bytes, overlapping bytes already checked
   422  	BEQ	end
   423  
   424  	ADD	R9,R5		// R5 = one past the last byte of A to compare
   425  	ADD	R9,R6		// R6 = one past the last byte of B to compare
   426  	MOVD	$-16,R10
   427  
   428  	LXVB16X	(R10)(R5),V3	// load bytes of A at offset -16 into vector
   429  	LXVB16X	(R10)(R6),V4	// load bytes of B at offset -16 into vector
   430  	VCMPEQUDCC	V3,V4,V1
   431  	BGE	CR6,different
   432  end:
   433  	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
   434  	BLT	CR2,less	// jump to less if BLT CR2 that is, len(A)<len(B)
   435  	BR	greater		// jump to greater otherwise
   436  simplecheck:
   437  	MOVD	$0,R14		// R14 = offset into A and B, starting at 0
   438  	CMP	R9,$8		// process 8 bytes
   439  	BLT	word
   440  #ifdef  GOARCH_ppc64le
   441  	MOVDBR	(R5+R14),R10
   442  	MOVDBR	(R6+R14),R11
   443  #else
   444  	MOVD	(R5+R14),R10
   445  	MOVD	(R6+R14),R11
   446  #endif
   447  	CMPU	R10,R11
   448  	BGT	greater
   449  	BLT	less
   450  	ADD	$8,R14
   451  	ADD	$-8,R9
   452  	PCALIGN	$16
   453  word:
   454  	CMP	R9,$4		// process 4 bytes
   455  	BLT	halfword
   456  #ifdef  GOARCH_ppc64le
   457  	MOVWBR	(R5+R14),R10
   458  	MOVWBR	(R6+R14),R11
   459  #else
   460  	MOVWZ	(R5+R14),R10
   461  	MOVWZ	(R6+R14),R11
   462  #endif
   463  	CMPU	R10,R11
   464  	BGT	greater
   465  	BLT	less
   466  	ADD	$4,R14
   467  	ADD	$-4,R9
   468  	PCALIGN	$16
   469  halfword:
   470  	CMP	R9,$2		// process 2 bytes
   471  	BLT	byte
   472  #ifdef  GOARCH_ppc64le
   473  	MOVHBR	(R5+R14),R10
   474  	MOVHBR	(R6+R14),R11
   475  #else
   476  	MOVHZ	(R5+R14),R10
   477  	MOVHZ	(R6+R14),R11
   478  #endif
   479  	CMPU	R10,R11
   480  	BGT	greater
   481  	BLT	less
   482  	ADD	$2,R14
   483  	ADD	$-2,R9
   484  	PCALIGN	$16
   485  byte:
   486  	CMP	R9,$0		// process 1 byte
   487  	BEQ	skip
   488  	MOVBZ	(R5+R14),R10
   489  	MOVBZ	(R6+R14),R11
   490  	CMPU	R10,R11
   491  	BGT	greater
   492  	BLT	less
   493  	PCALIGN	$16
   494  skip:
        	// all compared bytes matched: the lengths (CR2) decide the result
   495  	BEQ	CR2,equal
   496  	BGT	CR2,greater
   497  less:
   498  	MOVD	$-1,R3		// return value if A < B
   499  	RET
   500  equal:
   501  	MOVD	$0, R3		// return value if A == B
   502  	RET