// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Zeroes the n bytes starting at ptr.
// Register use throughout: DI = current destination, BX = bytes remaining,
// AX = 0 (for 1..16 byte stores), X0/Y0 = 0 (for SSE/AVX stores).
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI
	MOVQ	n+8(FP), BX
	XORQ	AX, AX		// AX = 0, source for the small (<=16 byte) stores

	// MOVOU seems always faster than REP STOSQ.
tail:
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	// Dispatch on the remaining length. Each small-size case below clears
	// [DI, DI+BX) with a fixed number of possibly-overlapping stores:
	// one run from the front and one run ending exactly at DI+BX.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	PXOR	X0, X0		// X0 = 0, source for all 16-byte-wide stores below
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	// More than 256 bytes: prefer the AVX2 path when available.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE bulk path: clear 256 bytes per iteration, then re-dispatch via
	// tail for whatever remains (< 256 bytes).
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail

loop_preheader_avx2:
	VPXOR	Y0, Y0, Y0	// Y0 = 0, source for all 32-byte AVX stores
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000	// >= 32 MiB: use the non-temporal path
	JAE	loop_preheader_avx2_huge
loop_avx2:
	// Clear 128 bytes per iteration with ordinary (cacheable) stores.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// BX < 128 here. Clear the final 128 bytes ending at DI+BX; these
	// stores may overlap bytes already cleared above, which is harmless.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER		// avoid AVX->SSE transition penalty in the caller
	RET
loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	VMOVDQU	Y0, 0(DI)	// clear the first 32 bytes so rounding DI up skips nothing
	MOVQ	DI, SI		// SI = original DI
	ADDQ	$32, DI
	ANDQ	$~31, DI	// DI = next 32-byte boundary at or above original DI+1
	SUBQ	DI, SI		// SI = -(bytes consumed by alignment), <= 0
	ADDQ	SI, BX		// shrink BX by the bytes already covered
loop_avx2_huge:
	// Clear 128 bytes per iteration with non-temporal stores, which
	// bypass the cache — a win for very large clears.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Final (possibly overlapping) 128 bytes ending at DI+BX, with
	// ordinary stores.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER		// avoid AVX->SSE transition penalty in the caller
	RET

// Small-size cases: two stores, one anchored at DI and one ending at
// DI+BX, overlapping as needed to cover the exact length.
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET