// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
//
// Zeroes the n bytes starting at ptr.
//
// Register use:
//	AX   ptr on entry; zeroed right away and used as the source of the
//	     1/2/4/8-byte stores.
//	BX   n on entry; afterwards the number of bytes still to clear.
//	DI   current destination address.
//	SI   scratch for the alignment adjustment on the non-temporal path.
//	X15  source of every 16-byte store. It is never written in this
//	     function, so it has to hold zero on entry.
//	     NOTE(review): presumably the fixed zero register of the Go
//	     internal ABI on amd64 - confirm against the ABI document.
//	Y0   zeroed with VPXOR; source of every 32-byte store.
//
// Strategy:
//   - n <= 256: one of the fixed-size cases at the bottom of the file. Each
//     case writes a run of stores from the start of the buffer and a run
//     of the same length that ends exactly at ptr+n. The two runs overlap
//     whenever n is not the largest size of its bucket, which removes the
//     need for a byte-granular remainder loop.
//   - n > 256 without AVX2: 256 bytes per iteration with 16-byte stores,
//     then back to the dispatch for the remainder.
//   - n > 256 with AVX2: 128 bytes per iteration with 32-byte stores; from
//     0x2000000 bytes (32 MiB) upwards the stores are non-temporal.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX	// AX = 0, the value written by the scalar stores

	// MOVOU seems always faster than REP STOSQ.
tail:
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	// Compare chain on the remaining byte count. It is also re-entered
	// from the SSE loop below with fewer than 256 bytes left (possibly 0).
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	// More than 256 bytes: take the AVX2 path if the CPU feature byte is set.
	// NOTE(review): const_offsetX86HasAVX2 is presumably generated into
	// go_asm.h from the internal/cpu.X86 struct layout - confirm.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE path: sixteen 16-byte stores clear 256 bytes per iteration.
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	// Fewer than 256 bytes remain; the dispatch above finishes the job.
	JMP	tail

loop_preheader_avx2:
	VPXOR Y0, Y0, Y0	// Y0 = 0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ    BX, $0x2000000
	JAE     loop_preheader_avx2_huge
loop_avx2:
	// Four 32-byte stores clear 128 bytes per iteration.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// Fewer than 128 bytes remain. Clear the 128 bytes that end at DI+BX;
	// they overlap bytes already cleared by the loop, which is safe because
	// this path is only entered with more than 256 bytes to clear.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	// NOTE(review): VZEROUPPER is presumably here to avoid AVX/SSE
	// transition penalties in the caller - confirm.
	VZEROUPPER
	RET
loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	// One unaligned store covers the first 32 bytes. DI is then rounded up
	// to the next 32-byte boundary (always 1..32 bytes further on), and BX
	// is reduced by the distance skipped: SI = old DI - new DI is negative.
	VMOVDQU  Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	// Same 128-byte step as loop_avx2, using non-temporal stores.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Final 128 bytes ending at DI+BX, written with ordinary unaligned
	// stores; they may overlap bytes the loop already cleared.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Fixed-size cases. In each one the first group of stores starts at DI
	// and the second group ends at DI+BX.
_1or2:
	// n == 1: both stores hit the same byte. n == 2: bytes 0 and 1.
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET