github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_386.s (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Inferno's libkern/memmove-386.s 5 // https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s 6 // 7 // Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. 8 // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. 9 // Portions Copyright 2009 The Go Authors. All rights reserved. 10 // 11 // Permission is hereby granted, free of charge, to any person obtaining a copy 12 // of this software and associated documentation files (the "Software"), to deal 13 // in the Software without restriction, including without limitation the rights 14 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 // copies of the Software, and to permit persons to whom the Software is 16 // furnished to do so, subject to the following conditions: 17 // 18 // The above copyright notice and this permission notice shall be included in 19 // all copies or substantial portions of the Software. 20 // 21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 27 // THE SOFTWARE. 28 29 //go:build pcz && 386 && !plan9 30 31 #include "textflag.h" 32 33 // See memmove Go doc for important implementation constraints. 
// func Move(to, from unsafe.Pointer, n uintptr)
//
// Move copies n bytes from `from` to `to`; the two regions may overlap.
//
// Register roles throughout the routine:
//	DI  destination cursor
//	SI  source cursor
//	BX  number of bytes still to be copied
//
// Short lengths are served by straight-line code that reads every source
// byte into registers before issuing the first store, which makes those
// paths independent of copy direction. Anything longer goes through REP
// string moves, with the direction chosen from how the regions overlap.
TEXT ·Move(SB), NOSPLIT, $0-12
	MOVL	to+0(FP), DI
	MOVL	from+4(FP), SI
	MOVL	n+8(FP), BX

	// The original authors note that REP instructions are expensive to
	// start, hence the straight-line cases below. They stop at 128 bytes
	// because eight 16-byte SSE registers (X0-X7) hold at most that much.
	//
	// This label is also the re-entry point after a REP MOVSL pass, to
	// finish off the 0-3 bytes that did not fill a whole dword.
dispatch:
	// A plain compare chain is used on purpose: per the original authors,
	// BSR plus a branch table made nearly all memmove/memclr benchmarks worse.
	TESTL	BX, BX
	JEQ	len_0
	CMPL	BX, $2
	JBE	len_1or2
	CMPL	BX, $4
	JB	len_3
	JE	len_4
	CMPL	BX, $8
	JBE	len_5to8
	CMPL	BX, $16
	JBE	len_9to16
#ifdef GO386_softfloat
	// Softfloat builds skip every path that touches X registers.
	JMP	bulk
#endif
	CMPL	BX, $32
	JBE	len_17to32
	CMPL	BX, $64
	JBE	len_33to64
	CMPL	BX, $128
	JBE	len_65to128

bulk:
	// Unsigned compare: when from <= to the destination may sit inside the
	// source range, so go and test for real overlap.
	CMPL	SI, DI
	JLS	check_overlap

copy_fwd:
	// Ascending copy.
	// REP MOVSB is only considered when ·hasERMS (declared outside this
	// file) equals 1; the original comment ties it to enhanced
	// REP MOVSB/STOSB.
	CMPB	·hasERMS(SB), $1
	JNE	fwd_dwords

	// When source and destination are both 4-byte aligned, the dword loop
	// is used even if ·hasERMS is set.
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$3, AX
	JEQ	fwd_dwords

	// Unaligned with ·hasERMS set: move all n bytes with one REP MOVSB.
	MOVL	BX, CX
	REP;	MOVSB
	RET

fwd_dwords:
	// CX = n / 4 dwords to move now, BX = n % 4 bytes left for dispatch.
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX
	REP;	MOVSL
	JMP	dispatch

check_overlap:
	// Reached with from <= to. If from + n <= to the regions are disjoint
	// and the ascending copy is safe.
	MOVL	SI, CX
	ADDL	BX, CX
	CMPL	CX, DI
	JLS	copy_fwd

	// Overlapping with the destination above the source: copy descending.
	// Move both cursors one past the end of their regions and set the
	// direction flag so string instructions decrement.
	ADDL	BX, DI
	ADDL	BX, SI
	STD

	// CX = n / 4 dwords, BX = n % 4 leftover bytes.
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX

	// Step back onto the last full dword of each region before the move.
	SUBL	$4, DI
	SUBL	$4, SI
	REP;	MOVSL

	// Back to ascending mode. Undo the 4-byte bias and step down over the
	// BX leftover bytes so both cursors land on the start of their regions,
	// where those leftover bytes live; dispatch copies them.
	CLD
	ADDL	$4, DI
	ADDL	$4, SI
	SUBL	BX, DI
	SUBL	BX, SI
	JMP	dispatch

	// ---- fixed-size cases: all loads are issued before any store ----

len_0:
	RET

len_1or2:
	// First byte and last byte; for n == 1 they are the same byte.
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET

len_3:
	// One 16-bit word followed by one byte.
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET

len_4:
	// Kept as its own case so a pointer-sized value is written by a
	// single 32-bit store (pointers must be written atomically).
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET

len_5to8:
	// Leading dword plus trailing dword; they overlap when n < 8.
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET

len_9to16:
	// Two leading and two trailing dwords; the pairs overlap when n < 16.
	MOVL	(SI), AX
	MOVL	4(SI), CX
	MOVL	-8(SI)(BX*1), DX
	MOVL	-4(SI)(BX*1), BP
	MOVL	AX, (DI)
	MOVL	CX, 4(DI)
	MOVL	DX, -8(DI)(BX*1)
	MOVL	BP, -4(DI)(BX*1)
	RET

len_17to32:
	// One leading and one trailing 16-byte unaligned vector.
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET

len_33to64:
	// Two leading and two trailing 16-byte unaligned vectors.
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET

len_65to128:
	// Four leading and four trailing 16-byte unaligned vectors (X0-X7).
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET