github.com/lzhfromustc/gofuzz@v0.0.0-20211116160056-151b3108bbd1/runtime/memclr_amd64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// See memclrNoHeapPointers Go doc for important implementation constraints.
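//
// In particular, pointer-aligned, pointer-sized regions must be cleared
// with a single store, so that a concurrent reader never observes a
// half-cleared pointer; that is why the 8-byte case below has a
// dedicated path. On the Go side the routine is declared along these
// lines (a hedged illustration; see runtime/stubs.go in upstream Go):
//
//	//go:noescape
//	func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)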

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI	// DI = destination pointer
	MOVQ	n+8(FP), BX	// BX = byte count
	XORQ	AX, AX		// AX = 0, zero source for the scalar stores

	// MOVOU seems always faster than REP STOSQ.
tail:
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	PXOR	X0, X0
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

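// SSE fallback for clears larger than 256 bytes when AVX2 is not
// available: sixteen unaligned 16-byte stores per iteration, then
// re-dispatch through tail for the sub-256-byte remainder.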
loop:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail

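// AVX2 path for clears larger than 256 bytes: 32-byte YMM stores.
// Below the 0x2000000 (32MB) threshold, ordinary VMOVDQU stores are
// used; at or above it, non-temporal VMOVNTDQ stores bypass the cache.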
loop_preheader_avx2:
	VPXOR	Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO: take into account actual LLC size. E.g., glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge
loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
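	// BX is now in [0, 128). The four overlapping stores below clear
	// the final 128 bytes ending at DI+BX; since the loop above ran at
	// least once, any bytes before DI that they re-touch were already
	// cleared, so the overlap is harmless.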
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET
loop_preheader_avx2_huge:
	// Align DI to a 32 byte boundary before the non-temporal stores.
	VMOVDQU	Y0, 0(DI)	// clear the first, possibly unaligned, 32 bytes
	MOVQ	DI, SI		// SI = old DI
	ADDQ	$32, DI
	ANDQ	$~31, DI	// DI = first 32-byte boundary past old DI
	SUBQ	DI, SI		// SI = old DI - new DI, in [-32, -1]
	ADDQ	SI, BX		// drop the 1-32 bytes already cleared from the count
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]:
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
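	// As in loop_avx2, finish the last 0-127 bytes with overlapping
	// regular stores; everything these may re-touch before DI was
	// already cleared by the non-temporal loop above.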
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

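// Small clears (n <= 256) use pairs of potentially overlapping stores:
// one anchored at the start of the buffer and one at its end, so each
// size class is handled branch-free. For example, for n=3 the two MOVWs
// write bytes 0-1 and 1-2.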
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET