github.com/m10x/go/src@v0.0.0-20220112094212-ba61592315da/runtime/memclr_amd64.s (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !plan9
     6  
     7  #include "go_asm.h"
     8  #include "textflag.h"
     9  
    10  // See memclrNoHeapPointers Go doc for important implementation constraints.
    11  
    12  // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
    13  // ABIInternal for performance. Stores zero to the n bytes starting at ptr: the arguments arrive in registers (AX, BX), and X15 is used as a ready-made zero source (Go's internal ABI reserves X15 as a fixed zero register on amd64).
    14  TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16	// $0-16: no local frame, 16 bytes of arguments
    15  	// AX = ptr
    16  	// BX = n (reused below as the count of bytes still to clear)
    17  	MOVQ	AX, DI	// DI = ptr; frees AX to become the zero source for the scalar stores
    18  	XORQ	AX, AX	// AX = 0
    19  
    20  	// MOVOU seems always faster than REP STOSQ.
    21  tail:	// size dispatch on BX; also re-entered from the SSE loop with the remainder (< 256)
    22  	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
    23  	TESTQ	BX, BX
    24  	JEQ	_0	// nothing (left) to clear
    25  	CMPQ	BX, $2
    26  	JBE	_1or2	// unsigned compares throughout: BX is a uintptr length
    27  	CMPQ	BX, $4
    28  	JBE	_3or4
    29  	CMPQ	BX, $8
    30  	JB	_5through7
    31  	JE	_8	// reuses the flags of the CMPQ against 8 above
    32  	CMPQ	BX, $16
    33  	JBE	_9through16
    34  	CMPQ	BX, $32
    35  	JBE	_17through32
    36  	CMPQ	BX, $64
    37  	JBE	_33through64
    38  	CMPQ	BX, $128
    39  	JBE	_65through128
    40  	CMPQ	BX, $256
    41  	JBE	_129through256
    42  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1	// BX > 256 from here on: check the CPU feature flag
    43  	JE loop_preheader_avx2	// AVX2 available: use the 32-byte-store loops
    44  	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
    45  
    46  loop:	// SSE path: 256 bytes per iteration as sixteen 16-byte unaligned stores of X15
    47  	MOVOU	X15, 0(DI)
    48  	MOVOU	X15, 16(DI)
    49  	MOVOU	X15, 32(DI)
    50  	MOVOU	X15, 48(DI)
    51  	MOVOU	X15, 64(DI)
    52  	MOVOU	X15, 80(DI)
    53  	MOVOU	X15, 96(DI)
    54  	MOVOU	X15, 112(DI)
    55  	MOVOU	X15, 128(DI)
    56  	MOVOU	X15, 144(DI)
    57  	MOVOU	X15, 160(DI)
    58  	MOVOU	X15, 176(DI)
    59  	MOVOU	X15, 192(DI)
    60  	MOVOU	X15, 208(DI)
    61  	MOVOU	X15, 224(DI)
    62  	MOVOU	X15, 240(DI)
    63  	SUBQ	$256, BX	// account for the 256 bytes just cleared
    64  	ADDQ	$256, DI	// advance the destination past them
    65  	CMPQ	BX, $256
    66  	JAE	loop	// another full 256-byte chunk remains
    67  	JMP	tail	// 0..255 bytes left: finish through the small-size dispatch
    68  
    69  loop_preheader_avx2:
    70  	VPXOR Y0, Y0, Y0	// Y0 = 0, the 32-byte zero source for the AVX2 stores
    71  	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
    72  	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
    73  	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
    74  	CMPQ    BX, $0x2000000	// 0x2000000 bytes = 32 MiB
    75  	JAE     loop_preheader_avx2_huge	// at or above the threshold: use non-temporal stores
    76  loop_avx2:	// 128 bytes per iteration as four 32-byte unaligned stores
    77  	VMOVDQU	Y0, 0(DI)
    78  	VMOVDQU	Y0, 32(DI)
    79  	VMOVDQU	Y0, 64(DI)
    80  	VMOVDQU	Y0, 96(DI)
    81  	SUBQ	$128, BX
    82  	ADDQ	$128, DI
    83  	CMPQ	BX, $128
    84  	JAE	loop_avx2
    85  	VMOVDQU  Y0, -32(DI)(BX*1)	// tail: BX < 128 bytes remain, so zero the final 128 bytes ending at DI+BX
    86  	VMOVDQU  Y0, -64(DI)(BX*1)	// these overlap bytes the loop already cleared, which is harmless when writing zeros
    87  	VMOVDQU  Y0, -96(DI)(BX*1)	// BX was > 256 on entry, so the loop ran and DI+BX-128 is not before ptr
    88  	VMOVDQU  Y0, -128(DI)(BX*1)
    89  	VZEROUPPER	// zero the upper YMM halves before returning to code that may use legacy SSE
    90  	RET
    91  loop_preheader_avx2_huge:
    92  	// Align to 32 byte boundary
    93  	VMOVDQU  Y0, 0(DI)	// clear the first 32 bytes unaligned; this covers whatever the rounding below skips
    94  	MOVQ	DI, SI	// SI = unaligned start
    95  	ADDQ	$32, DI
    96  	ANDQ	$~31, DI	// DI = first 32-byte boundary strictly above the old DI
    97  	SUBQ	DI, SI	// SI = old DI - new DI, i.e. minus the 1..32 bytes skipped
    98  	ADDQ	SI, BX	// shrink the remaining count by the skipped bytes
    99  loop_avx2_huge:	// 128 bytes per iteration as four 32-byte non-temporal stores; DI is 32-byte aligned here
   100  	VMOVNTDQ	Y0, 0(DI)
   101  	VMOVNTDQ	Y0, 32(DI)
   102  	VMOVNTDQ	Y0, 64(DI)
   103  	VMOVNTDQ	Y0, 96(DI)
   104  	SUBQ	$128, BX
   105  	ADDQ	$128, DI
   106  	CMPQ	BX, $128
   107  	JAE	loop_avx2_huge
   108  	// In the description of MOVNTDQ in [1]
   109  	// "... fencing operation implemented with the SFENCE or MFENCE instruction
   110  	// should be used in conjunction with MOVNTDQ instructions..."
   111  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
   112  	SFENCE
   113  	VMOVDQU  Y0, -32(DI)(BX*1)	// tail: same overlapping final-128-bytes store as the loop_avx2 tail,
   114  	VMOVDQU  Y0, -64(DI)(BX*1)	// done with ordinary unaligned stores because DI+BX need not be aligned
   115  	VMOVDQU  Y0, -96(DI)(BX*1)
   116  	VMOVDQU  Y0, -128(DI)(BX*1)
   117  	VZEROUPPER	// zero the upper YMM halves before returning to code that may use legacy SSE
   118  	RET
   119  
   120  _1or2:	// small cases: store the first and the last k bytes; the two stores may overlap
   121  	MOVB	AX, (DI)	// first byte
   122  	MOVB	AX, -1(DI)(BX*1)	// last byte (the same byte when n == 1)
   123  	RET
   124  _0:
   125  	RET
   126  _3or4:
   127  	MOVW	AX, (DI)	// first 2 bytes
   128  	MOVW	AX, -2(DI)(BX*1)	// last 2 bytes (overlaps by one byte when n == 3)
   129  	RET
   130  _5through7:
   131  	MOVL	AX, (DI)	// first 4 bytes
   132  	MOVL	AX, -4(DI)(BX*1)	// last 4 bytes (always overlaps the first store here)
   133  	RET
   134  _8:
   135  	// We need a separate case for 8 to make sure we clear pointers atomically.
   136  	MOVQ	AX, (DI)	// one 8-byte store instead of two overlapping 4-byte stores
   137  	RET
   138  _9through16:
   139  	MOVQ	AX, (DI)	// first 8 bytes
   140  	MOVQ	AX, -8(DI)(BX*1)	// last 8 bytes (disjoint only when n == 16)
   141  	RET
   142  _17through32:
   143  	MOVOU	X15, (DI)	// first 16 bytes
   144  	MOVOU	X15, -16(DI)(BX*1)	// last 16 bytes
   145  	RET
   146  _33through64:
   147  	MOVOU	X15, (DI)	// first 32 bytes ...
   148  	MOVOU	X15, 16(DI)
   149  	MOVOU	X15, -32(DI)(BX*1)	// ... and last 32 bytes
   150  	MOVOU	X15, -16(DI)(BX*1)
   151  	RET
   152  _65through128:
   153  	MOVOU	X15, (DI)	// first 64 bytes ...
   154  	MOVOU	X15, 16(DI)
   155  	MOVOU	X15, 32(DI)
   156  	MOVOU	X15, 48(DI)
   157  	MOVOU	X15, -64(DI)(BX*1)	// ... and last 64 bytes
   158  	MOVOU	X15, -48(DI)(BX*1)
   159  	MOVOU	X15, -32(DI)(BX*1)
   160  	MOVOU	X15, -16(DI)(BX*1)
   161  	RET
   162  _129through256:
   163  	MOVOU	X15, (DI)	// first 128 bytes ...
   164  	MOVOU	X15, 16(DI)
   165  	MOVOU	X15, 32(DI)
   166  	MOVOU	X15, 48(DI)
   167  	MOVOU	X15, 64(DI)
   168  	MOVOU	X15, 80(DI)
   169  	MOVOU	X15, 96(DI)
   170  	MOVOU	X15, 112(DI)
   171  	MOVOU	X15, -128(DI)(BX*1)	// ... and last 128 bytes
   172  	MOVOU	X15, -112(DI)(BX*1)
   173  	MOVOU	X15, -96(DI)(BX*1)
   174  	MOVOU	X15, -80(DI)(BX*1)
   175  	MOVOU	X15, -64(DI)(BX*1)
   176  	MOVOU	X15, -48(DI)(BX*1)
   177  	MOVOU	X15, -32(DI)(BX*1)
   178  	MOVOU	X15, -16(DI)(BX*1)
   179  	RET