github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/runtime/memclr_amd64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX

	// MOVOU seems to be consistently faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256

	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	skip_erms

	// If the size is less than 2 KB, do not use ERMS, as it has a big start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// from 2 KB. Benchmarks show a similar threshold for REP STOS vs AVX.
	CMPQ	BX, $2048
	JAE	loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail
#endif

loop_preheader_avx2:
	VPXOR	X0, X0, X0
	// For smaller sizes, MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30 MB of cache.
	// TODO: take the actual LLC size into account, e.g. glibc uses LLC size / 2.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

loop_preheader_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	loop_erms
#endif

	VPXOR	X0, X0, X0
	// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
	// write protocol, ERMS can be as fast as or slower than non-temporal stores when
	// the size is bigger than the LLC, depending on hardware.
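	// 0x2000000 is 32 MiB, the same cutoff used in loop_preheader_avx2 above: at or
	// above it we jump to the non-temporal loop_preheader_avx2_huge path, below it we
	// fall through to loop_erms and REP STOSQ. As the TODO above notes, this is a
	// fixed heuristic rather than the machine's actual LLC size.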
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

loop_erms:
	// STOSQ is used so that each pointer-sized word is zeroed with a single store and
	// the memory subsystem never observes a partially cleared pointer, as the GC requires.
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	STOSQ
	JMP	tail

loop_preheader_avx2_huge:
	// Align to a 32-byte boundary.
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]:
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
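
// The small-size cases above all rely on the same overlapping-store trick: one group of
// stores anchored at the start of the buffer and one anchored at its end together cover
// every length in the bucket, re-clearing a few middle bytes instead of branching on the
// exact size. A minimal Go sketch of the idea for the _9through16 bucket (the helper
// below is hypothetical and not part of the runtime; it only illustrates the pattern):
//
//	package main
//
//	import (
//		"encoding/binary"
//		"fmt"
//	)
//
//	// clear9through16 zeroes b with exactly two 8-byte stores, assuming 9 <= len(b) <= 16.
//	// The stores overlap whenever len(b) < 16, mirroring MOVQ AX, (DI) / MOVQ AX, -8(DI)(BX*1).
//	func clear9through16(b []byte) {
//		binary.LittleEndian.PutUint64(b[:8], 0)        // first 8 bytes
//		binary.LittleEndian.PutUint64(b[len(b)-8:], 0) // last 8 bytes
//	}
//
//	func main() {
//		b := []byte("0123456789abc") // 13 bytes
//		clear9through16(b)
//		fmt.Println(b) // all zeros
//	}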