github.com/sandwichdev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/compare_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Compare(SB),NOSPLIT,$0-56	// args: a at 0(FP), b at 24(FP); result at 48(FP)
     9  	LEAQ	ret+48(FP), R9	// R9 = address of the result slot
    10  	MOVQ	b_len+32(FP), DX	// DX = blen
    11  	MOVQ	b_base+24(FP), DI	// DI = b
    12  	MOVQ	a_len+8(FP), BX	// BX = alen
    13  	MOVQ	a_base+0(FP), SI	// SI = a
    14  	JMP	cmpbody<>(SB)	// tail jump: cmpbody stores the result at (R9) and returns
    15  
    16  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40	// args: a at 0(FP), b at 16(FP); result at 32(FP)
    17  	LEAQ	ret+32(FP), R9	// R9 = address of the result slot
    18  	MOVQ	b_len+24(FP), DX	// DX = blen
    19  	MOVQ	b_base+16(FP), DI	// DI = b
    20  	MOVQ	a_len+8(FP), BX	// BX = alen
    21  	MOVQ	a_base+0(FP), SI	// SI = a
    22  	JMP	cmpbody<>(SB)	// tail jump: cmpbody stores the result at (R9) and returns
    23  
    24  // cmpbody is the shared three-way byte comparison of a and b. Input registers:
    25  //   SI = a (address of a's first byte)
    26  //   DI = b (address of b's first byte)
    27  //   BX = alen
    28  //   DX = blen
    29  //   R9 = address of output word (stores -1/0/1 here: a<b, a==b, a>b)
    30  TEXT cmpbody<>(SB),NOSPLIT,$0-0
    31  	CMPQ	SI, DI
    32  	JEQ	allsame	// same start address: only the lengths can decide the result
    33  	CMPQ	BX, DX
    34  	MOVQ	DX, R8	// MOVQ leaves the flags from CMPQ intact for the CMOV below
    35  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
    36  	CMPQ	R8, $8
    37  	JB	small	// fewer than 8 bytes in common
    38  
    39  	CMPQ	R8, $63
    40  	JBE	loop	// 8..63 bytes: 16-byte steps
    41  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1	// 64+ bytes: choose the 64-byte loop by the AVX2 flag
    42  	JEQ     big_loop_avx2
    43  	JMP	big_loop
    44  loop:	// compares 16 bytes per iteration while more than 16 remain
    45  	CMPQ	R8, $16
    46  	JBE	_0through16
    47  	MOVOU	(SI), X0
    48  	MOVOU	(DI), X1
    49  	PCMPEQB X0, X1	// each byte of X1 becomes 0xff where a and b match
    50  	PMOVMSKB X1, AX	// AX bit i = 1 iff byte i matched
    51  	XORQ	$0xffff, AX	// convert EQ to NE
    52  	JNE	diff16	// branch if at least one byte is not equal
    53  	ADDQ	$16, SI
    54  	ADDQ	$16, DI
    55  	SUBQ	$16, R8
    56  	JMP	loop
    57  
    58  diff64:	// mask in AX is for bytes 48..63 of the chunk: rebase SI/DI to match
    59  	ADDQ	$48, SI
    60  	ADDQ	$48, DI
    61  	JMP	diff16
    62  diff48:	// mask in AX starts at byte 32 of the chunk (also reached from diff64_avx2)
    63  	ADDQ	$32, SI
    64  	ADDQ	$32, DI
    65  	JMP	diff16
    66  diff32:	// mask in AX is for bytes 16..31 of the chunk
    67  	ADDQ	$16, SI
    68  	ADDQ	$16, DI
    69  	// AX = bit mask of differences, bit 0 corresponding to the byte at SI/DI
    70  diff16:
    71  	BSFQ	AX, BX	// index of first byte that differs (overwrites alen, which this path no longer reads)
    72  	XORQ	AX, AX	// clear AX so SETHI leaves exactly 0 or 1
    73  	MOVB	(SI)(BX*1), CX	// CX = a's byte at the first difference
    74  	CMPB	CX, (DI)(BX*1)	// compare it with b's byte at the same index
    75  	SETHI	AX	// AX = 1 if a's byte is higher (unsigned), else 0
    76  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
    77  	MOVQ	AX, (R9)
    78  	RET
    79  
    80  	// 0 through 16 bytes left, alen>=8, blen>=8
    81  _0through16:
    82  	CMPQ	R8, $8
    83  	JBE	_0through8
    84  	MOVQ	(SI), AX	// more than 8 left: check the first 8 bytes on their own
    85  	MOVQ	(DI), CX
    86  	CMPQ	AX, CX
    87  	JNE	diff8
    88  _0through8:
    89  	MOVQ	-8(SI)(R8*1), AX	// the 8 bytes ending at SI+R8; may overlap bytes already found equal
    90  	MOVQ	-8(DI)(R8*1), CX
    91  	CMPQ	AX, CX
    92  	JEQ	allsame
    93  
    94  	// AX and CX contain parts of a and b that differ.
    95  diff8:
    96  	BSWAPQ	AX	// reverse order of bytes, so the earliest byte in memory is most significant
    97  	BSWAPQ	CX
    98  	XORQ	AX, CX	// CX = bits where a and b differ
    99  	BSRQ	CX, CX	// index of highest bit difference
   100  	SHRQ	CX, AX	// move a's bit to bottom
   101  	ANDQ	$1, AX	// mask bit
   102  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
   103  	MOVQ	AX, (R9)
   104  	RET
   105  
   106  	// 0-7 bytes in common
   107  small:
   108  	LEAQ	(R8*8), CX	// bytes left -> bits left
   109  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
   110  	JEQ	allsame	// NEGQ set ZF: zero bytes in common
   111  
   112  	// load bytes of a into high bytes of SI
   113  	CMPB	SI, $0xf8	// looks only at the low byte of the address
   114  	JA	si_high	// low byte > 0xf8: an 8-byte load at (SI) would cross a 256-byte boundary (presumably avoided so the load cannot run into the next page)
   115  	MOVQ	(SI), SI
   116  	JMP	si_finish
   117  si_high:
   118  	MOVQ	-8(SI)(R8*1), SI	// load the 8 bytes that end at a+R8 instead
   119  	SHRQ	CX, SI	// shift out the bytes that precede a; a's R8 bytes are now at the bottom
   120  si_finish:
   121  	SHLQ	CX, SI	// a's R8 bytes end up in the high bytes, low bits zeroed
   122  
   123  	// load bytes of b into high bytes of DI
   124  	CMPB	DI, $0xf8	// same low-byte check as for SI above
   125  	JA	di_high
   126  	MOVQ	(DI), DI
   127  	JMP	di_finish
   128  di_high:
   129  	MOVQ	-8(DI)(R8*1), DI	// load the 8 bytes that end at b+R8 instead
   130  	SHRQ	CX, DI	// shift out the bytes that precede b
   131  di_finish:
   132  	SHLQ	CX, DI	// b's R8 bytes end up in the high bytes, low bits zeroed
   133  
   134  	BSWAPQ	SI	// reverse order of bytes
   135  	BSWAPQ	DI
   136  	XORQ	SI, DI	// find bit differences
   137  	JEQ	allsame
   138  	BSRQ	DI, CX	// index of highest bit difference
   139  	SHRQ	CX, SI	// move a's bit to bottom
   140  	ANDQ	$1, SI	// mask bit
   141  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
   142  	MOVQ	AX, (R9)
   143  	RET
   144  
   145  allsame:	// no differing byte in the common prefix: the result depends on the lengths only
   146  	XORQ	AX, AX
   147  	XORQ	CX, CX
   148  	CMPQ	BX, DX
   149  	SETGT	AX	// 1 if alen > blen
   150  	SETEQ	CX	// 1 if alen == blen
   151  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
   152  	MOVQ	AX, (R9)
   153  	RET
   154  
   155  	// this works for >= 64 bytes of data: four 16-byte compares per iteration.
   156  big_loop:
   157  	MOVOU	(SI), X0
   158  	MOVOU	(DI), X1
   159  	PCMPEQB X0, X1
   160  	PMOVMSKB X1, AX
   161  	XORQ	$0xffff, AX	// set bits now mark differing bytes
   162  	JNE	diff16
   163  
   164  	MOVOU	16(SI), X0
   165  	MOVOU	16(DI), X1
   166  	PCMPEQB X0, X1
   167  	PMOVMSKB X1, AX
   168  	XORQ	$0xffff, AX
   169  	JNE	diff32
   170  
   171  	MOVOU	32(SI), X0
   172  	MOVOU	32(DI), X1
   173  	PCMPEQB X0, X1
   174  	PMOVMSKB X1, AX
   175  	XORQ	$0xffff, AX
   176  	JNE	diff48
   177  
   178  	MOVOU	48(SI), X0
   179  	MOVOU	48(DI), X1
   180  	PCMPEQB X0, X1
   181  	PMOVMSKB X1, AX
   182  	XORQ	$0xffff, AX
   183  	JNE	diff64
   184  
   185  	ADDQ	$64, SI
   186  	ADDQ	$64, DI
   187  	SUBQ	$64, R8
   188  	CMPQ	R8, $64
   189  	JBE	loop	// 64 or fewer bytes left: finish in the 16-byte loop
   190  	JMP	big_loop
   191  
   192  	// Compare 64-bytes per loop iteration.
   193  	// Loop is unrolled and uses AVX2.
   194  big_loop_avx2:
   195  	VMOVDQU	(SI), Y2
   196  	VMOVDQU	(DI), Y3
   197  	VMOVDQU	32(SI), Y4
   198  	VMOVDQU	32(DI), Y5
   199  	VPCMPEQB Y2, Y3, Y0	// first 32 bytes: 0xff in each byte position that matches
   200  	VPMOVMSKB Y0, AX	// 32-bit mask, one bit per byte
   201  	XORL	$0xffffffff, AX	// invert: set bits mark differing bytes
   202  	JNE	diff32_avx2
   203  	VPCMPEQB Y4, Y5, Y6	// second 32 bytes
   204  	VPMOVMSKB Y6, AX
   205  	XORL	$0xffffffff, AX
   206  	JNE	diff64_avx2
   207  
   208  	ADDQ	$64, SI
   209  	ADDQ	$64, DI
   210  	SUBQ	$64, R8
   211  	CMPQ	R8, $64
   212  	JB	big_loop_avx2_exit	// fewer than 64 bytes left
   213  	JMP	big_loop_avx2
   214  
   215  	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
   216  diff32_avx2:
   217  	VZEROUPPER
   218  	JMP diff16	// diff16 takes the lowest set bit of AX, so the 32-bit mask works unchanged
   219  
   220  	// Same as diff32_avx2, but for last 32 bytes.
   221  diff64_avx2:
   222  	VZEROUPPER
   223  	JMP diff48	// diff48 advances SI/DI by 32 before falling into diff16
   224  
   225  	// For <64 bytes remainder jump to normal loop.
   226  big_loop_avx2_exit:
   227  	VZEROUPPER
   228  	JMP loop