// Source listing: github.com/panjjo/go@v0.0.0-20161104043856-d62b31386338/src/runtime/memmove_386.s

// Inferno's libkern/memmove-386.s
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// runtime·memmove copies n bytes from the address in `from` to the address
// in `to`. The two regions may overlap:
//   - every small-size case loads all of its source bytes into registers
//     before it stores anything, so copy direction does not matter;
//   - the bulk path copies backwards when from <= to < from+n, and forwards
//     otherwise.
//
// Arguments are read from the caller's frame at to+0(FP), from+4(FP) and
// n+8(FP): 12 bytes of arguments, no local frame ($0-12), NOSPLIT.
// NOTE(review): the Go-side prototype is not visible in this file; confirm
// the argument types against the declaration.
//
// Register roles throughout:
//   DI = destination pointer, SI = source pointer, BX = bytes still to copy.
//   AX, CX, DX, BP and X0-X7 are used as scratch.
TEXT runtime·memmove(SB), NOSPLIT, $0-12
	MOVL	to+0(FP), DI
	MOVL	from+4(FP), SI
	MOVL	n+8(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSL instruction is really fast
	// for large sizes. The cutover is approximately 1K. We implement up to
	// 128 because that is the maximum SSE register load (loading all data
	// into registers lets us ignore copy direction).
	//
	// tail is both the initial size dispatch and the re-entry point used by
	// the bulk loops below: they jump back here with BX reduced to n&3 and
	// SI/DI pointing at the bytes that are still uncopied.
tail:
	TESTL	BX, BX
	JEQ	move_0			// n == 0: nothing to copy
	CMPL	BX, $2
	JBE	move_1or2
	CMPL	BX, $4
	JB	move_3			// flags still from CMPL BX, $4: n < 4 means n == 3 here
	JE	move_4
	CMPL	BX, $8
	JBE	move_5through8
	CMPL	BX, $16
	JBE	move_9through16
	// Sizes above 16 use XMM registers, so they are only taken when the
	// SSE2 feature bit (mask 0x4000000, i.e. bit 26) is set.
	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
	JEQ	nosse2
	CMPL	BX, $32
	JBE	move_17through32
	CMPL	BX, $64
	JBE	move_33through64
	CMPL	BX, $128
	JBE	move_65through128
	// TODO: use branch table and BSR to make this just a single dispatch

	// Reached when n > 128, or when n > 16 and SSE2 is unavailable.
nosse2:
	// Choose the copy direction. JLS is an unsigned "lower or same"
	// branch: if from <= to the regions might overlap in a way that a
	// forward copy would corrupt, so go check in `back`.
	CMPL	SI, DI
	JLS	back

	// Forward copy.
forward:
	// If REP MOVSB isn't fast, don't use it
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy4

	// Check alignment: (SI|DI)&3 == 0 means both pointers are 4-byte
	// aligned, in which case the dword loop is used instead of REP MOVSB.
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$3, AX
	JEQ	fwdBy4

	// Do 1 byte at a time; this copies all n bytes, so return directly.
	MOVL	BX, CX
	REP;	MOVSB
	RET

fwdBy4:
	// Do 4 bytes at a time: CX = n/4 dwords for REP MOVSL, BX = n&3
	// leftover bytes. REP MOVSL advances SI/DI past the copied dwords, so
	// jumping to tail finishes the remaining 0-3 bytes.
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX
	REP;	MOVSL
	JMP	tail

	// Overlap check for from <= to: if from+n <= to the regions do not
	// overlap and the forward copy is safe.
back:
	MOVL	SI, CX
	ADDL	BX, CX
	CMPL	CX, DI
	JLS	forward

	// The regions overlap with the destination above the source, so copy
	// the whole thing backwards. Point DI and SI one past the end of their
	// regions and set the direction flag so REP MOVSL decrements.
	ADDL	BX, DI
	ADDL	BX, SI
	STD

	// CX = n/4 dwords to copy, BX = n&3 bytes left over at the low end.
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX

	// Step back to the last full dword of each region, then copy downwards.
	SUBL	$4, DI
	SUBL	$4, SI
	REP;	MOVSL

	// Clear the direction flag again. After the loop DI/SI sit 4 bytes
	// below the lowest dword copied; +4 moves them onto that dword and
	// -BX moves them down to the first of the n&3 uncopied bytes, which
	// tail then handles.
	CLD
	ADDL	$4, DI
	ADDL	$4, SI
	SUBL	BX, DI
	SUBL	BX, SI
	JMP	tail

	// Small-size cases. Each one reads a chunk anchored at the start of the
	// source and a chunk anchored at its end (offset from SI+BX); for sizes
	// below the maximum of the range the two chunks overlap. All loads are
	// issued before any store.

	// n is 1 or 2: first byte and last byte (the same byte when n == 1).
move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
	// n is 3: one 16-bit word followed by one byte.
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_4:
	// We need a separate case for 4 to make sure we write pointers atomically.
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
	// n is 5..8: first dword and last dword.
move_5through8:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
	// n is 9..16: first two dwords and last two dwords.
move_9through16:
	MOVL	(SI), AX
	MOVL	4(SI), CX
	MOVL	-8(SI)(BX*1), DX
	MOVL	-4(SI)(BX*1), BP
	MOVL	AX, (DI)
	MOVL	CX, 4(DI)
	MOVL	DX, -8(DI)(BX*1)
	MOVL	BP, -4(DI)(BX*1)
	RET
	// n is 17..32: first 16 bytes and last 16 bytes via unaligned XMM moves.
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
	// n is 33..64: first 32 bytes and last 32 bytes.
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
	// n is 65..128: first 64 bytes and last 64 bytes, using all of X0-X7.
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET