github.com/primecitizens/pcz/std@v0.2.1/core/mem/clear_amd64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2014 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && amd64 && !plan9
     9  
    10  #include "textflag.h"
    11  #include "asm_amd64.h"
    12  
    13  // See memclrNoHeapPointers Go doc for important implementation constraints.
    14  
    15  // func Clear(ptr unsafe.Pointer, n uintptr)
    16  // ABIInternal for performance.
TEXT ·Clear<ABIInternal>(SB), NOSPLIT, $0-16
	// Zeroes n bytes starting at ptr.
	//
	// ABIInternal register arguments:
	//   AX = ptr
	//   BX = n
	// Register roles inside the function:
	//   DI  = current destination pointer (also the implicit REP STOSQ dest)
	//   BX  = bytes remaining
	//   AX  = 0 (scalar zero source for the small cases)
	//   X15 = zero (the Go internal ABI keeps X15 zeroed; the MOVOU X15
	//         stores below rely on that convention)
	MOVQ AX, DI // DI = ptr
	XORQ AX, AX // AX = 0

	// MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
	// Size-dispatch ladder on BX. The wide-clear loops below jump back here
	// with BX < 256 to finish the trailing bytes, so every case must be
	// correct for any BX in its range, not just the first pass.
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ BX, BX
	JEQ _0
	CMPQ BX, $2
	JBE _1or2
	CMPQ BX, $4
	JBE _3or4
	CMPQ BX, $8
	JB _5through7
	JE _8
	CMPQ BX, $16
	JBE _9through16
	CMPQ BX, $32
	JBE _17through32
	CMPQ BX, $64
	JBE _33through64
	CMPQ BX, $128
	JBE _65through128
	CMPQ BX, $256
	JBE _129through256

	// BX > 256 from here on: pick one of the bulk-clear strategies.
	CMPB ·hasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE skip_erms

	// If the size is less than 2kb, do not use ERMS as it has a big start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
	CMPQ    BX, $2048
	JAE loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	// AVX2 not guaranteed at build time: check the runtime feature flag and
	// fall back to the 256-byte-per-iteration SSE loop when absent.
	CMPB ·hasAVX2(SB), $1
	JE loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE fallback: 16 unaligned 16-byte zero stores = 256 bytes/iteration.
	MOVOU X15, 0(DI)
	MOVOU X15, 16(DI)
	MOVOU X15, 32(DI)
	MOVOU X15, 48(DI)
	MOVOU X15, 64(DI)
	MOVOU X15, 80(DI)
	MOVOU X15, 96(DI)
	MOVOU X15, 112(DI)
	MOVOU X15, 128(DI)
	MOVOU X15, 144(DI)
	MOVOU X15, 160(DI)
	MOVOU X15, 176(DI)
	MOVOU X15, 192(DI)
	MOVOU X15, 208(DI)
	MOVOU X15, 224(DI)
	MOVOU X15, 240(DI)
	SUBQ $256, BX
	ADDQ $256, DI
	CMPQ BX, $256
	JAE loop
	JMP tail // finish the (< 256 byte) remainder via the dispatch ladder
#endif

loop_preheader_avx2:
	VPXOR X0, X0, X0 // Y0 = 0 (VEX-encoded zeroing clears the full YMM)
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ    BX, $0x2000000 // >= 32MB: switch to non-temporal stores
	JAE loop_preheader_avx2_huge

loop_avx2:
	// 128 bytes per iteration with 32-byte unaligned AVX stores.
	VMOVDQU Y0, 0(DI)
	VMOVDQU Y0, 32(DI)
	VMOVDQU Y0, 64(DI)
	VMOVDQU Y0, 96(DI)
	SUBQ $128, BX
	ADDQ $128, DI
	CMPQ BX, $128
	JAE loop_avx2
	// BX is now in [0, 128). These four stores end exactly at DI+BX and
	// reach back 128 bytes; they may overlap each other and the already
	// cleared region, which is harmless when storing zeros.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER // avoid AVX->SSE transition penalties in the caller
	RET

loop_preheader_erms:
#ifndef hasAVX2
	// Without AVX2 there is no non-temporal escape hatch, so always use
	// REP STOSQ once the 2KB ERMS threshold was passed.
	CMPB ·hasAVX2(SB), $1
	JNE loop_erms
#endif

	VPXOR X0, X0, X0 // Y0 = 0 for the huge-clear path below
	// At this point both ERMS and AVX2 is supported. While REP STOS can use a no-RFO
	// write protocol, ERMS could show the same or slower performance comparing to
	// Non-Temporal Stores when the size is bigger than LLC depending on hardware.
	CMPQ BX, $0x2000000
	JAE loop_preheader_avx2_huge

loop_erms:
	// STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
	// for a memory subsystem as the GC requires this.
	MOVQ BX, CX  // CX = number of 8-byte words to store
	SHRQ $3, CX
	ANDQ $7, BX  // BX = leftover bytes (< 8), finished via tail
	REP;	STOSQ    // stores CX quadwords of AX (=0) at DI, advancing DI
	JMP tail

loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	VMOVDQU  Y0, 0(DI) // clear the first (possibly unaligned) 32 bytes
	MOVQ DI, SI
	ADDQ $32, DI
	ANDQ $~31, DI      // DI = next 32-byte-aligned address past the start
	SUBQ DI, SI        // SI = old DI - aligned DI (in [-32, -1])
	ADDQ SI, BX        // drop the bytes already covered by the store above
loop_avx2_huge:
	// Non-temporal 128-byte iterations: bypass the cache for huge clears.
	VMOVNTDQ Y0, 0(DI)
	VMOVNTDQ Y0, 32(DI)
	VMOVNTDQ Y0, 64(DI)
	VMOVNTDQ Y0, 96(DI)
	SUBQ $128, BX
	ADDQ $128, DI
	CMPQ BX, $128
	JAE loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Same overlapping-tail trick as loop_avx2 for the final (< 128) bytes,
	// using regular (temporal) stores after the fence.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Small cases: each pair of stores covers [ptr, ptr+n) with possible
	// overlap (for n at the low end of the range both stores hit the same
	// bytes), avoiding any branching on the exact length.
_1or2:
	MOVB AX, (DI)
	MOVB AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW AX, (DI)
	MOVW AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL AX, (DI)
	MOVL AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ AX, (DI)
	RET
_9through16:
	MOVQ AX, (DI)
	MOVQ AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU X15, (DI)
	MOVOU X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU X15, (DI)
	MOVOU X15, 16(DI)
	MOVOU X15, -32(DI)(BX*1)
	MOVOU X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU X15, (DI)
	MOVOU X15, 16(DI)
	MOVOU X15, 32(DI)
	MOVOU X15, 48(DI)
	MOVOU X15, -64(DI)(BX*1)
	MOVOU X15, -48(DI)(BX*1)
	MOVOU X15, -32(DI)(BX*1)
	MOVOU X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU X15, (DI)
	MOVOU X15, 16(DI)
	MOVOU X15, 32(DI)
	MOVOU X15, 48(DI)
	MOVOU X15, 64(DI)
	MOVOU X15, 80(DI)
	MOVOU X15, 96(DI)
	MOVOU X15, 112(DI)
	MOVOU X15, -128(DI)(BX*1)
	MOVOU X15, -112(DI)(BX*1)
	MOVOU X15, -96(DI)(BX*1)
	MOVOU X15, -80(DI)(BX*1)
	MOVOU X15, -64(DI)(BX*1)
	MOVOU X15, -48(DI)(BX*1)
	MOVOU X15, -32(DI)(BX*1)
	MOVOU X15, -16(DI)(BX*1)
	RET