// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build gc

#include "textflag.h"

// func decode(dst, src []byte) int
//
// decode decodes the Snappy-encoded bytes in src into dst and reports the
// outcome as an integer code written to ret+48(FP):
//	- 0 on success, which requires that src is consumed exactly and that dst
//	  is filled exactly (d == len(dst) at the end of the loop).
//	- 1 (decodeErrCodeCorrupt) if any bounds or consistency check fails.
//	- 3 (decodeErrCodeUnsupportedCopy4Tag) if a tag with low bits 0x03 is seen.
//
// On a non-zero return, dst may have been partially written, including up to
// a few bytes past the last validly decoded position (see the "!!!" fast
// paths below); the contents of dst are then meaningless.
//
// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".
//
// Argument layout, as read by this function: dst_base at 0(FP), dst_len at
// 8(FP), src_base at 24(FP), src_len at 32(FP) and the result at 48(FP). The
// capacity words of the two slices are never read.
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL: of the 48 bytes of frame,
// 0(SP)..16(SP) hold the three arguments to runtime·memmove and
// 24(SP)..40(SP) hold the spilled copies of DI, SI and CX. The register
// allocation:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	used by doCopy
//	- R15	used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
//
// A note on branch mnemonics: JA/JAE are unsigned comparisons and are used
// where the Go code compares uint values; JGT/JLT/JLE/JGE are signed
// comparisons and are used where the Go code compares int values.
TEXT ·decode(SB), NOSPLIT, $48-56
	// Initialize SI, DI and R8-R13.
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, DI
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, SI
	MOVQ R11, R13
	ADDQ R12, R13

loop:
	// for s < len(src)
	//
	// Testing for equality is sufficient: every path below that advances SI
	// first checks that it does not move past R13.
	CMPQ SI, R13
	JEQ end

	// CX = uint32(src[s])
	//
	// switch src[s] & 0x03
	//
	// A low-bits value of 0 is a literal tag; 1, 2 and 3 are copy tags and are
	// dispatched further at tagCopy.
	MOVBLZX (SI), CX
	MOVL CX, BX
	ANDL $3, BX
	CMPL BX, $1
	JAE tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, CX
	CMPL CX, $60
	JAE tagLit60Plus

	// case x < 60:
	// s++
	INCQ SI

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that CX == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// CX can hold 64 bits, so the increment cannot overflow.
	INCQ CX

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// AX = len(dst) - d
	// BX = len(src) - s
	MOVQ R10, AX
	SUBQ DI, AX
	MOVQ R13, BX
	SUBQ SI, BX

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ CX, $16
	JGT callMemmove
	CMPQ AX, $16
	JLT callMemmove
	CMPQ BX, $16
	JLT callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. The three checks above guarantee that at least 16 bytes remain in
	// both src and dst, so the access stays in bounds. If the input is a valid
	// Snappy encoding then subsequent iterations will fix up the overrun.
	// Otherwise, Decode returns a nil []byte (and a non-nil error), so the
	// overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(SI), X0
	MOVOU X0, 0(DI)

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	//
	// AX and BX still hold len(dst)-d and len(src)-s, as computed in doLit.
	CMPQ CX, AX
	JGT errCorrupt
	CMPQ CX, BX
	JGT errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	// The first three stores are the call arguments; the last three are the
	// spilled copies that are reloaded after the call.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R8-R13 from the function arguments, since no register
	// value is assumed to survive the CALL.
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, R13
	ADDQ R12, R13

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch
	// case. Since x is in the range [60, 63], x - 58 is the one tag byte plus
	// the 1 to 4 bytes that encode the literal length.
	ADDQ CX, SI
	SUBQ $58, SI
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA errCorrupt

	// case x == 60:
	//
	// A single comparison against 61 dispatches three ways: equal is x == 61,
	// above is x == 62 or 63, and falling through is x == 60.
	CMPL CX, $61
	JEQ tagLit61
	JA tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	//
	// amd64 is little-endian, so one zero-extended 16-bit load yields this.
	MOVWLZX -2(SI), CX
	JMP doLit

tagLit62Plus:
	CMPL CX, $62
	JA tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	//
	// The 24-bit value is assembled from a 16-bit load of the low two bytes
	// and an 8-bit load of the high byte.
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL $16, BX
	ORL BX, CX
	JMP doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA errCorrupt

	// length = 1 + int(src[s-3])>>2
	//
	// CX still holds the tag byte, which is now src[s-3].
	SHRQ $2, CX
	INCQ CX

	// offset = int(src[s-2]) | int(src[s-1])<<8
	MOVWQZX -2(SI), DX
	JMP doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	//
	// BX is 1, 2 or 3 here: 2 goes to tagCopy2, 3 is the unsupported copy-4
	// tag, and 1 falls through to the tagCopy1 case below.
	CMPQ BX, $2
	JEQ tagCopy2
	JA errUC4T

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA errCorrupt

	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
	//
	// CX still holds the tag byte, which is now src[s-2].
	MOVQ CX, DX
	ANDQ $0xe0, DX
	SHLQ $3, DX
	MOVBQZX -1(SI), BX
	ORQ BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- CX == length && CX > 0
	//	- DX == offset

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE errCorrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R14 = len(dst)-d
	//	- R15 = &dst[d-offset]
	MOVQ R10, R14
	SUBQ DI, R14
	MOVQ DI, R15
	SUBQ DX, R15

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ CX, $16
	JGT slowForwardCopy
	CMPQ DX, $8
	JLT slowForwardCopy
	CMPQ R14, $16
	JLT slowForwardCopy
	MOVQ 0(R15), AX
	MOVQ AX, 0(DI)
	MOVQ 8(R15), BX
	MOVQ BX, 8(DI)
	ADDQ CX, DI
	JMP loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes.  However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//    abxxxxxxxxxxxx
	//    [------]           d-offset
	//      [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//    ababxxxxxxxxxx
	//    [------]           d-offset
	//        [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	//
	// R14 held len(dst)-d; it is not needed again after this check, so it is
	// adjusted in place. The comparison is signed, so a negative R14 (fewer
	// than 10 bytes left in dst) also takes the verySlowForwardCopy branch.
	SUBQ $10, R14
	CMPQ CX, R14
	JGT verySlowForwardCopy

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together mean that d-offset, and therefore
	//   // R15, is unchanged.
	// }
	CMPQ DX, $8
	JGE fixUpSlowForwardCopy
	MOVQ (R15), BX
	MOVQ BX, (DI)
	SUBQ DX, CX
	ADDQ DX, DI
	ADDQ DX, DX
	JMP makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by DI being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save DI to AX so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ DI, AX
	ADDQ CX, DI

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	//
	// DI already has its final value; this loop advances only the read
	// pointer R15, the write pointer AX and the remaining length CX.
	CMPQ CX, $0
	JLE loop
	MOVQ (R15), BX
	MOVQ BX, (AX)
	ADDQ $8, R15
	ADDQ $8, AX
	SUBQ $8, CX
	JMP finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	//
	// The JNZ tests the zero flag set by the DECQ immediately before it, so
	// no instruction that modifies flags may be placed between the two.
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ verySlowForwardCopy
	JMP loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET

errUC4T:
	// return decodeErrCodeUnsupportedCopy4Tag
	MOVQ $3, ret+48(FP)
	RET