// Origin: github.com/aloncn/graphics-go@v0.0.1/src/runtime/memclr_amd64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// void runtime·memclr(void*, uintptr)
//
// Writes zero to the n bytes starting at ptr.
//
// Register roles inside this routine:
//	DI  write cursor, initially ptr
//	BX  bytes still to be cleared, initially n
//	AX  zero; source for every store of 8 bytes or fewer
//	X0  zero; source for 16-byte stores (set once more than 16 bytes remain)
//	Y0  zero; source for 32-byte stores (AVX2 paths only)
//	SI  scratch, touched only while aligning in the non-temporal path
//
// Clears of up to 256 bytes are done by a straight-line handler that
// writes one group of stores anchored at the start of the region and one
// group anchored at its end; the two groups may overlap, which is what
// lets a single handler serve a whole range of sizes.
TEXT runtime·memclr(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI
	MOVQ	n+8(FP), BX
	XORQ	AX, AX

	// MOVOU seems always faster than REP STOSQ, so no string
	// instructions are used anywhere below.

	// Pick a handler from the remaining byte count. The SSE bulk loop
	// jumps back here to finish whatever it leaves behind (< 256 bytes,
	// possibly 0).
dispatch:
	TESTQ	BX, BX
	JEQ	clr_0
	CMPQ	BX, $2
	JBE	clr_1or2
	CMPQ	BX, $4
	JBE	clr_3or4
	CMPQ	BX, $8			// one compare feeds both branches below
	JB	clr_5to7
	JEQ	clr_8
	CMPQ	BX, $16
	JBE	clr_9to16
	PXOR	X0, X0			// every handler from here on stores from X0
	CMPQ	BX, $32
	JBE	clr_17to32
	CMPQ	BX, $64
	JBE	clr_33to64
	CMPQ	BX, $128
	JBE	clr_65to128
	CMPQ	BX, $256
	JBE	clr_129to256
	CMPB	runtime·support_avx2(SB), $1
	JEQ	avx2_entry
	// TODO: use branch table and BSR to make this just a single dispatch
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

	// SSE bulk path: entered with BX > 256, clears 256 bytes per pass.
sse_loop:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	sse_loop
	JMP	dispatch		// fewer than 256 bytes left

	// AVX2 bulk path: entered with BX > 256.
avx2_entry:
	VPXOR	Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// The cut-over below is 0x2000000 bytes (32 MiB).
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	avx2_nt_entry

	// Regular (cached) stores, 128 bytes per pass.
avx2_loop:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	avx2_loop
	// Fewer than 128 bytes remain. Clear the final 128 bytes of the
	// region (the ones ending at DI+BX); this overlaps bytes the loop
	// has already cleared.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Non-temporal path for clears of 0x2000000 bytes or more.
avx2_nt_entry:
	// Align to 32 byte boundary: clear the first 32 bytes with an
	// unaligned store, then round DI up to the next multiple of 32
	// (an advance of 1 to 32 bytes) and take that advance off BX.
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI			// SI = old DI - new DI = -(advance)
	ADDQ	SI, BX			// BX -= advance
avx2_nt_loop:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	avx2_nt_loop
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Same overlapping 128-byte tail as the cached AVX2 path.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Straight-line handlers for 0..256 bytes. Each is reached only by a
	// jump from dispatch and returns directly.
clr_0:
	RET
clr_1or2:
	MOVB	AX, 0(DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
clr_3or4:
	MOVW	AX, 0(DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
clr_5to7:
	MOVL	AX, 0(DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
clr_8:
	// We need a separate case for 8 to make sure we clear pointers
	// atomically: exactly one 8-byte store, no overlapping pair.
	MOVQ	AX, 0(DI)
	RET
clr_9to16:
	MOVQ	AX, 0(DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
clr_17to32:
	MOVOU	X0, 0(DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clr_33to64:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clr_65to128:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clr_129to256:
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET