// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Compare(a, b []byte) int — lexicographic comparison, result -1/0/+1 in AX.
// ABIInternal: slice arguments arrive in registers; this stub only shuffles
// them into cmpbody's expected registers.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)

// cmpstring(a, b string) int — same comparison for strings (no cap words).
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame			// same base pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8			// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	// >= 64 bytes: take the AVX2 path when the CPU supports it.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// 16 bytes per iteration via SSE2 byte-equality compare.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX		// convert EQ to NE
	JNE	diff16			// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// Entry points from big_loop: the first 1-3 16-byte chunks were equal,
	// so advance SI/DI past them before indexing into the differing chunk.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX			// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX			// 1 if a's byte > b's byte (unsigned), else 0
	LEAQ	-1(AX*2), AX		// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the final 8 bytes; this load may overlap bytes already
	// checked above, which is harmless since those bytes were equal.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX			// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX			// index of highest bit difference
	SHRQ	CX, AX			// move a's bit to bottom
	ANDQ	$1, AX			// mask bit
	LEAQ	-1(AX*2), AX		// 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX		// bytes left -> bits left
	NEGQ	CX			// -bits left (== 64 - bits left mod 64)
	JEQ	allsame			// zero bytes to compare: decide on lengths alone

	// load bytes of a into high bytes of SI
	// NOTE(review): the 0xf8 check appears to guard an 8-byte load that
	// could run past the end of the buffer into an unmapped page; in that
	// case load the 8 bytes ending at the last valid byte instead.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI			// zero out the bytes beyond the R8 in common

	// load bytes of b into high bytes of DI (same guard as for a above)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI			// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI			// find bit differences
	JEQ	allsame
	BSRQ	DI, CX			// index of highest bit difference
	SHRQ	CX, SI			// move a's bit to bottom
	ANDQ	$1, SI			// mask bit
	LEAQ	-1(SI*2), AX		// 1/0 => +1/-1
	RET

	// All compared bytes were equal; result is sign(alen - blen).
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX			// 1 if alen > blen
	SETEQ	CX			// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX		// convert EQ to NE
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop			// <= 64 bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX		// convert EQ to NE (32-bit mask)
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop