github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/runtime/memmove_386.s (about) 1 // Inferno's libkern/memmove-386.s 2 // https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s 3 // 4 // Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. 5 // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. 6 // Portions Copyright 2009 The Go Authors. All rights reserved. 7 // 8 // Permission is hereby granted, free of charge, to any person obtaining a copy 9 // of this software and associated documentation files (the "Software"), to deal 10 // in the Software without restriction, including without limitation the rights 11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 // copies of the Software, and to permit persons to whom the Software is 13 // furnished to do so, subject to the following conditions: 14 // 15 // The above copyright notice and this permission notice shall be included in 16 // all copies or substantial portions of the Software. 17 // 18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 // THE SOFTWARE. 25 26 // +build !plan9 27 28 #include "go_asm.h" 29 #include "textflag.h" 30 31 // See memmove Go doc for important implementation constraints. 
// func memmove(to, from unsafe.Pointer, n uintptr)
//
// memmove copies n bytes from "from" to "to"; the two regions may overlap.
// Register use throughout: DI = destination, SI = source, BX = bytes left.
TEXT runtime·memmove(SB), NOSPLIT, $0-12
	MOVL	to+0(FP), DI
	MOVL	from+4(FP), SI
	MOVL	n+8(FP), BX

	// Small copies use straight-line code because REP instructions are
	// expensive to start; REP MOVSL is only really fast for large sizes
	// (the cutover is roughly 1K). The straight-line cases stop at 128
	// bytes because that is the maximum SSE register load. Every case
	// loads all of its data into registers before storing any of it, so
	// copy direction does not matter there.
bySize:
	// BSR + branch table makes almost all memmove/memclr benchmarks worse,
	// so the size classes are tested one after another instead.
	TESTL	BX, BX
	JEQ	copy0
	CMPL	BX, $2
	JBE	copy1or2
	CMPL	BX, $4
	JB	copy3
	JEQ	copy4
	CMPL	BX, $8
	JBE	copy5to8
	CMPL	BX, $16
	JBE	copy9to16
	// Sizes above 16 use the XMM registers; without SSE2 go straight to
	// the REP-based copy.
	CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
	JNE	bulk
	CMPL	BX, $32
	JBE	copy17to32
	CMPL	BX, $64
	JBE	copy33to64
	CMPL	BX, $128
	JBE	copy65to128

bulk:
	// Choose a direction. A source at or below the destination may be
	// overlapped by it, so that case is examined further.
	CMPL	SI, DI
	JBE	checkOverlap

copyForward:
	// REP MOVSB is only used when the CPU reports ERMS
	// (enhanced REP MOVSB/STOSB); otherwise it isn't fast.
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1
	JNE	forwardLongs

	// If both pointers are 4-byte aligned, copy longs instead.
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$3, AX
	JEQ	forwardLongs

	// Misaligned with ERMS: copy all n bytes one byte at a time.
	MOVL	BX, CX
	REP;	MOVSB
	RET

forwardLongs:
	// Copy n/4 longs, then hand the remaining n%4 bytes back to the
	// small-size dispatch.
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX
	REP;	MOVSL
	JMP	bySize

checkOverlap:
	// Here from <= to. If from+n <= to as well, the regions do not
	// overlap and the forward copy can be used.
	MOVL	SI, CX
	ADDL	BX, CX
	CMPL	CX, DI
	JBE	copyForward

	// The regions overlap with the destination at or above the source:
	// copy backwards, starting from the end of both regions.
	ADDL	BX, DI
	ADDL	BX, SI
	STD

	// n/4 longs go through REP MOVSL; BX keeps the n%4 leftover.
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX

	// Point at the last full long of each region.
	SUBL	$4, DI
	SUBL	$4, SI
	REP;	MOVSL

	// Clear the direction flag again, undo the 4-byte bias and step back
	// over the leftover bytes so the small-size dispatch can copy them.
	CLD
	ADDL	$4, DI
	ADDL	$4, SI
	SUBL	BX, DI
	SUBL	BX, SI
	JMP	bySize

copy1or2:
	// First byte and last byte; for n == 1 these are the same byte.
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
copy0:
	RET
copy3:
	// One word plus the trailing byte.
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
copy4:
	// Exactly 4 bytes gets its own single-MOVL case to make sure pointers
	// are written atomically.
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
copy5to8:
	// First 4 and last 4 bytes; the two longs may overlap.
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
copy9to16:
	// First 8 and last 8 bytes as four longs; the halves may overlap.
	MOVL	(SI), AX
	MOVL	4(SI), CX
	MOVL	-8(SI)(BX*1), DX
	MOVL	-4(SI)(BX*1), BP
	MOVL	AX, (DI)
	MOVL	CX, 4(DI)
	MOVL	DX, -8(DI)(BX*1)
	MOVL	BP, -4(DI)(BX*1)
	RET
copy17to32:
	// First 16 and last 16 bytes via unaligned XMM moves.
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
copy33to64:
	// First 32 and last 32 bytes via unaligned XMM moves.
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
copy65to128:
	// First 64 and last 64 bytes via unaligned XMM moves (X0-X7).
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET