github.com/aloncn/graphics-go@v0.0.1/src/runtime/memclr_amd64.s (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !plan9
     6  
     7  #include "textflag.h"
     8  
     9  // NOTE: Windows externalthreadhandler expects memclr to preserve DX.
    10  
    11  // void runtime·memclr(void*, uintptr): stores zero into the n bytes starting at ptr.
    12  TEXT runtime·memclr(SB), NOSPLIT, $0-16	// frame size 0, 16 bytes of arguments (ptr, n)
    13  	MOVQ	ptr+0(FP), DI	// DI = ptr, address of the next byte to clear
    14  	MOVQ	n+8(FP), BX	// BX = n, count of bytes still to clear
    15  	XORQ	AX, AX	// AX = 0, the value written by the scalar stores
    16  
    17  	// MOVOU seems always faster than REP STOSQ.
    18  tail:	// size dispatch on BX, also re-entered after the SSE loop below
    19  	TESTQ	BX, BX
    20  	JEQ	_0	// n == 0: nothing to store
    21  	CMPQ	BX, $2
    22  	JBE	_1or2	// 1 <= n <= 2
    23  	CMPQ	BX, $4
    24  	JBE	_3or4	// 3 <= n <= 4
    25  	CMPQ	BX, $8
    26  	JB	_5through7	// 5 <= n <= 7
    27  	JE	_8	// n == 8, decided by the same CMPQ as the branch above
    28  	CMPQ	BX, $16
    29  	JBE	_9through16	// 9 <= n <= 16
    30  	PXOR	X0, X0	// X0 = 0, the value written by the 16-byte stores
    31  	CMPQ	BX, $32
    32  	JBE	_17through32	// 17 <= n <= 32
    33  	CMPQ	BX, $64
    34  	JBE	_33through64	// 33 <= n <= 64
    35  	CMPQ	BX, $128
    36  	JBE	_65through128	// 65 <= n <= 128
    37  	CMPQ	BX, $256
    38  	JBE	_129through256	// 129 <= n <= 256
    39  	CMPB	runtime·support_avx2(SB), $1	// n > 256 from here on
    40  	JE loop_preheader_avx2	// flag equals 1: use the 32-byte store paths
    41  	// TODO: use branch table and BSR to make this just a single dispatch
    42  	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
    43  
    44  loop:	// SSE path: sixteen 16-byte stores clear 256 bytes per iteration
    45  	MOVOU	X0, 0(DI)
    46  	MOVOU	X0, 16(DI)
    47  	MOVOU	X0, 32(DI)
    48  	MOVOU	X0, 48(DI)
    49  	MOVOU	X0, 64(DI)
    50  	MOVOU	X0, 80(DI)
    51  	MOVOU	X0, 96(DI)
    52  	MOVOU	X0, 112(DI)
    53  	MOVOU	X0, 128(DI)
    54  	MOVOU	X0, 144(DI)
    55  	MOVOU	X0, 160(DI)
    56  	MOVOU	X0, 176(DI)
    57  	MOVOU	X0, 192(DI)
    58  	MOVOU	X0, 208(DI)
    59  	MOVOU	X0, 224(DI)
    60  	MOVOU	X0, 240(DI)
    61  	SUBQ	$256, BX	// 256 fewer bytes remain
    62  	ADDQ	$256, DI	// advance past the bytes just cleared
    63  	CMPQ	BX, $256
    64  	JAE	loop	// repeat while at least 256 bytes remain
    65  	JMP	tail	// fewer than 256 bytes left (possibly 0): finish through the dispatch
    66  
    67  loop_preheader_avx2:
    68  	VPXOR Y0, Y0, Y0	// Y0 = 0, the value written by the 32-byte stores
    69  	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
    70  	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
    71  	// TODO take into account actual LLC size. E.g. glibc uses LLC size/2.
    72  	CMPQ    BX, $0x2000000	// threshold is 0x2000000 bytes (32 MiB)
    73  	JAE     loop_preheader_avx2_huge	// at or above the threshold: non-temporal store loop
    74  loop_avx2:	// four 32-byte stores clear 128 bytes per iteration
    75  	VMOVDQU	Y0, 0(DI)
    76  	VMOVDQU	Y0, 32(DI)
    77  	VMOVDQU	Y0, 64(DI)
    78  	VMOVDQU	Y0, 96(DI)
    79  	SUBQ	$128, BX	// 128 fewer bytes remain
    80  	ADDQ	$128, DI	// advance past the bytes just cleared
    81  	CMPQ	BX, $128
    82  	JAE	loop_avx2	// repeat while at least 128 bytes remain
    83  	VMOVDQU  Y0, -32(DI)(BX*1)	// BX < 128 here: clear the last 128 bytes of the region,
    84  	VMOVDQU  Y0, -64(DI)(BX*1)	// addressed backwards from its end at DI+BX. These stores
    85  	VMOVDQU  Y0, -96(DI)(BX*1)	// overlap bytes the loop already cleared, which is harmless
    86  	VMOVDQU  Y0, -128(DI)(BX*1)	// and avoids a separate remainder dispatch.
    87  	VZEROUPPER	// clear the upper halves of the YMM registers before returning
    88  	RET
    89  loop_preheader_avx2_huge:
    90  	// Align to 32 byte boundary
    91  	VMOVDQU  Y0, 0(DI)	// clear the first 32 bytes unaligned so DI can be rounded up past them
    92  	MOVQ	DI, SI	// SI = original DI
    93  	ADDQ	$32, DI
    94  	ANDQ	$~31, DI	// DI = next 32-byte boundary strictly above the original DI
    95  	SUBQ	DI, SI	// SI = original DI - aligned DI, a value in -32..-1
    96  	ADDQ	SI, BX	// shrink BX by the 1..32 bytes skipped by the alignment step
    97  loop_avx2_huge:	// four 32-byte non-temporal stores per iteration to the aligned DI
    98  	VMOVNTDQ	Y0, 0(DI)
    99  	VMOVNTDQ	Y0, 32(DI)
   100  	VMOVNTDQ	Y0, 64(DI)
   101  	VMOVNTDQ	Y0, 96(DI)
   102  	SUBQ	$128, BX	// 128 fewer bytes remain
   103  	ADDQ	$128, DI	// advance past the bytes just cleared, DI stays 32-byte aligned
   104  	CMPQ	BX, $128
   105  	JAE	loop_avx2_huge	// repeat while at least 128 bytes remain
   106  	// In the description of MOVNTDQ in [1]
   107  	// "... fencing operation implemented with the SFENCE or MFENCE instruction
   108  	// should be used in conjunction with MOVNTDQ instructions..."
   109  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
   110  	SFENCE
   111  	VMOVDQU  Y0, -32(DI)(BX*1)	// BX < 128 here: clear the last 128 bytes of the region with
   112  	VMOVDQU  Y0, -64(DI)(BX*1)	// ordinary unaligned stores, addressed backwards from DI+BX.
   113  	VMOVDQU  Y0, -96(DI)(BX*1)	// They overlap bytes already cleared by the loop above.
   114  	VMOVDQU  Y0, -128(DI)(BX*1)
   115  	VZEROUPPER	// clear the upper halves of the YMM registers before returning
   116  	RET
   117  
   118  _1or2:
   119  	MOVB	AX, (DI)	// first byte
   120  	MOVB	AX, -1(DI)(BX*1)	// last byte (the same byte when n == 1)
   121  	RET
   122  _0:
   123  	RET
   124  _3or4:
   125  	MOVW	AX, (DI)	// first 2 bytes
   126  	MOVW	AX, -2(DI)(BX*1)	// last 2 bytes, overlapping the first store when n == 3
   127  	RET
   128  _5through7:
   129  	MOVL	AX, (DI)	// first 4 bytes
   130  	MOVL	AX, -4(DI)(BX*1)	// last 4 bytes, overlapping the first store
   131  	RET
   132  _8:
   133  	// We need a separate case for 8 to make sure we clear pointers atomically.
   134  	MOVQ	AX, (DI)	// one 8-byte store covers the whole region
   135  	RET
   136  _9through16:
   137  	MOVQ	AX, (DI)	// first 8 bytes
   138  	MOVQ	AX, -8(DI)(BX*1)	// last 8 bytes, overlapping the first store when n < 16
   139  	RET
   140  _17through32:
   141  	MOVOU	X0, (DI)	// first 16 bytes
   142  	MOVOU	X0, -16(DI)(BX*1)	// last 16 bytes, overlapping the first store when n < 32
   143  	RET
   144  _33through64:
   145  	MOVOU	X0, (DI)	// first 32 bytes (two stores)
   146  	MOVOU	X0, 16(DI)
   147  	MOVOU	X0, -32(DI)(BX*1)	// last 32 bytes (two stores), overlapping the head when n < 64
   148  	MOVOU	X0, -16(DI)(BX*1)
   149  	RET
   150  _65through128:
   151  	MOVOU	X0, (DI)	// first 64 bytes (four stores)
   152  	MOVOU	X0, 16(DI)
   153  	MOVOU	X0, 32(DI)
   154  	MOVOU	X0, 48(DI)
   155  	MOVOU	X0, -64(DI)(BX*1)	// last 64 bytes (four stores), overlapping the head when n < 128
   156  	MOVOU	X0, -48(DI)(BX*1)
   157  	MOVOU	X0, -32(DI)(BX*1)
   158  	MOVOU	X0, -16(DI)(BX*1)
   159  	RET
   160  _129through256:
   161  	MOVOU	X0, (DI)	// first 128 bytes (eight stores)
   162  	MOVOU	X0, 16(DI)
   163  	MOVOU	X0, 32(DI)
   164  	MOVOU	X0, 48(DI)
   165  	MOVOU	X0, 64(DI)
   166  	MOVOU	X0, 80(DI)
   167  	MOVOU	X0, 96(DI)
   168  	MOVOU	X0, 112(DI)
   169  	MOVOU	X0, -128(DI)(BX*1)	// last 128 bytes (eight stores), overlapping the head when n < 256
   170  	MOVOU	X0, -112(DI)(BX*1)
   171  	MOVOU	X0, -96(DI)(BX*1)
   172  	MOVOU	X0, -80(DI)(BX*1)
   173  	MOVOU	X0, -64(DI)(BX*1)
   174  	MOVOU	X0, -48(DI)(BX*1)
   175  	MOVOU	X0, -32(DI)(BX*1)
   176  	MOVOU	X0, -16(DI)(BX*1)
   177  	RET