// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && amd64 && !plan9

#include "textflag.h"
#include "asm_amd64.h"

// The Go doc of memclrNoHeapPointers lists important constraints that this
// implementation has to honor.

// func Clear(ptr unsafe.Pointer, n uintptr)
//
// Clear stores zero into the n bytes that start at ptr.
// It is declared ABIInternal for performance, so the arguments arrive in
// registers (AX = ptr, BX = n).
//
// Strategy, chosen by n:
//   - n <= 256: a handful of stores anchored at both ends of the buffer,
//     overlapping in the middle where necessary.
//   - n >= 2048 with ERMS: REP STOSQ (or non-temporal AVX2 stores when AVX2
//     is also present and n >= 0x2000000).
//   - otherwise: an AVX2 loop when available, else a 256-byte SSE loop.
//
// NOTE(review): X15 is stored by the SSE paths without being initialized in
// this routine; this presumably relies on the ABIInternal convention that X15
// always holds zero — confirm against the Go internal ABI document.
TEXT ·Clear<ABIInternal>(SB), NOSPLIT, $0-16
	MOVQ	AX, DI		// DI = write cursor (ptr)
	XORQ	AX, AX		// AX = 0, the value used by scalar and STOSQ stores

	// When Enhanced REP STOSQ is not available, MOVOU seems to always be
	// faster than REP STOSQ.
small:
	// A BSR + branch table makes almost every memmove/memclr benchmark
	// worse, so a plain compare ladder is used instead.
	TESTQ	BX, BX
	JEQ	clr_0
	CMPQ	BX, $2
	JBE	clr_1_2
	CMPQ	BX, $4
	JBE	clr_3_4
	CMPQ	BX, $8
	JB	clr_5_7
	JE	clr_8
	CMPQ	BX, $16
	JBE	clr_9_16
	CMPQ	BX, $32
	JBE	clr_17_32
	CMPQ	BX, $64
	JBE	clr_33_64
	CMPQ	BX, $128
	JBE	clr_65_128
	CMPQ	BX, $256
	JBE	clr_129_256

	// More than 256 bytes from here on.
	CMPB	·hasERMS(SB), $1	// enhanced REP MOVSB/STOSB
	JNE	no_erms

	// ERMS has a big start-up cost, so it is not used below 2KB.
	// "Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit
	// AVX" in the Intel Optimization Guide shows ERMSB winning from 2KB
	// upwards, and benchmarks show a similar threshold for REP STOS vs AVX.
	CMPQ	BX, $2048
	JAE	erms_setup

no_erms:
#ifndef hasAVX2
	// AVX2 is not guaranteed at build time: check for it at run time and
	// fall through to the SSE loop when it is absent.
	CMPB	·hasAVX2(SB), $1
	JE	avx2_setup
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

sse_loop:
	// 16 x 16-byte stores = 256 bytes per iteration.
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	sse_loop
	// Fewer than 256 bytes remain: let the size ladder finish the job.
	JMP	small
#endif

avx2_setup:
	VPXOR	X0, X0, X0
	// For smaller sizes MOVNTDQ may be faster or slower depending on the
	// hardware; for larger sizes it is always faster, even on dual Xeons
	// with 30M of cache.
	// TODO: take the actual LLC size into account, e.g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	nt_setup

avx2_loop:
	// 4 x 32-byte stores = 128 bytes per iteration.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	avx2_loop
	// Fewer than 128 bytes remain. Store the 128 bytes that end at DI+BX,
	// overlapping bytes that have already been written.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

erms_setup:
#ifndef hasAVX2
	// Without AVX2 the only choice left is REP STOSQ.
	CMPB	·hasAVX2(SB), $1
	JNE	erms_stos
#endif

	VPXOR	X0, X0, X0
	// Both ERMS and AVX2 are supported at this point. Although REP STOS can
	// use a no-RFO write protocol, ERMS may perform the same as or slower
	// than non-temporal stores once the size exceeds the LLC, depending on
	// the hardware.
	CMPQ	BX, $0x2000000
	JAE	nt_setup

erms_stos:
	// STOSQ is used so that every zeroed pointer-sized word becomes visible
	// to the memory subsystem as a whole, which the GC requires.
	MOVQ	BX, CX
	SHRQ	$3, CX		// CX = number of 8-byte words
	ANDQ	$7, BX		// BX = leftover bytes (0..7)
	REP; STOSQ
	// Clear the leftover bytes through the size ladder.
	JMP	small

nt_setup:
	// Store the first 32 bytes unaligned, then round DI up to the next
	// 32-byte boundary and shrink BX by the number of bytes skipped.
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI		// SI = old DI - aligned DI (negative)
	ADDQ	SI, BX
nt_loop:
	// 4 x 32-byte non-temporal stores = 128 bytes per iteration.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	nt_loop
	// The description of MOVNTDQ in [1] says:
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Fewer than 128 bytes remain. Store the 128 bytes that end at DI+BX
	// with regular (unaligned) stores.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Small sizes. Each case below stores from the start of the buffer and
	// from its end; the two groups of stores overlap when n is not the
	// maximum of the range.
clr_1_2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
clr_0:
	RET
clr_3_4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
clr_5_7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
clr_8:
	// Exactly 8 bytes gets its own case so that a pointer is cleared
	// atomically, with a single store.
	MOVQ	AX, (DI)
	RET
clr_9_16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
clr_17_32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
clr_33_64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
clr_65_128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
clr_129_256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET