// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build amd64,!appengine

// firstGreater8SSSE3Asm scans the bytes arg[startPos:endPos] and returns the
// index (relative to arg) of the first byte that is strictly greater than val
// when compared as unsigned 8-bit values, or endPos when there is none.
//
// Frame layout ($0-40): arg+0, val+8, startPos+16, endPos+24, ret+32.
// The Go prototype lives outside this file; presumably it is
//   func(arg unsafe.Pointer, val, startPos, endPos int) int
// TODO confirm against the Go declaration.
// The TEXT flag 4 is the numeric value of NOSPLIT.
//
// The last vector is always loaded from &arg[endPos-16], so the routine reads
// 16 bytes ending at endPos regardless of startPos.  NOTE(review): this looks
// like it requires endPos-startPos >= 16; presumably the Go wrapper handles
// shorter inputs itself -- verify against the caller.
//
// Register roles:
//   DI  base address of arg
//   AX  scan cursor (address of the vector currently being tested)
//   R8  address of the last 16-byte vector, &arg[endPos-16]
//   R9  endPos, returned when nothing matches
//   BX  val, then the per-byte bias derived from it
//   CX  movemask result for the current vector
//   X0  bias replicated into all 16 lanes
//   X1  zero shuffle mask during setup, then the data vector
TEXT ·firstGreater8SSSE3Asm(SB),4,$0-40
	MOVQ	arg+0(FP), DI
	MOVQ	startPos+16(FP), AX
	MOVQ	endPos+24(FP), R9
	MOVQ	val+8(FP), BX

	// An all-zero PSHUFB control mask copies source byte 0 into every lane.
	PXOR	X1, X1
	// R8 = &arg[endPos-16]
	LEAQ	-16(DI)(R9*1), R8
	// AX = &arg[startPos]
	ADDQ	DI, AX

	// PMOVMSKB only reports the top bit of each byte, so "byte > val" has to
	// be turned into "top bit set":
	//   val <= 127: saturating-add (127 - val); exactly the bytes > val
	//               end up >= 128.
	//   val >  127: saturating-subtract (val - 127); exactly the bytes > val
	//               stay >= 128.
	CMPQ	BX, $127
	JG	fg8High

	// For 0 <= val <= 127, val XOR 127 equals 127 - val.
	XORQ	$127, BX
	MOVD	BX, X0
	PSHUFB	X1, X0
	// Every lane of X0 now holds (127 - val).
	CMPQ	R8, AX
	JLE	fg8LowTail

fg8LowLoop:
	MOVOU	(AX), X1
	PADDUSB	X0, X1
	PMOVMSKB	X1, CX
	TESTQ	CX, CX
	JNE	fg8Hit
	ADDQ	$16, AX
	CMPQ	R8, AX
	JG	fg8LowLoop

fg8LowTail:
	// Test the final vector, which may overlap bytes the loop already
	// cleared; those cannot produce a new match.
	MOVQ	R8, AX
	MOVOU	(R8), X1
	PADDUSB	X0, X1
	PMOVMSKB	X1, CX
	TESTQ	CX, CX
	JNE	fg8Hit
	// No byte exceeded val.
	MOVQ	R9, ret+32(FP)
	RET

fg8Hit:
	// Lowest set mask bit = offset of the first match inside the vector at
	// AX; result = (AX - arg) + offset.
	BSFQ	CX, DX
	SUBQ	DI, AX
	ADDQ	DX, AX
	MOVQ	AX, ret+32(FP)
	RET

fg8High:
	SUBQ	$127, BX
	MOVD	BX, X0
	PSHUFB	X1, X0
	// Every lane of X0 now holds (val - 127).
	CMPQ	R8, AX
	JLE	fg8HighTail

fg8HighLoop:
	MOVOU	(AX), X1
	PSUBUSB	X0, X1
	PMOVMSKB	X1, CX
	TESTQ	CX, CX
	JNE	fg8Hit
	ADDQ	$16, AX
	CMPQ	R8, AX
	JG	fg8HighLoop

fg8HighTail:
	// Same overlapping final-vector check as the low-val path.
	MOVQ	R8, AX
	MOVOU	(R8), X1
	PSUBUSB	X0, X1
	PMOVMSKB	X1, CX
	TESTQ	CX, CX
	JNE	fg8Hit
	MOVQ	R9, ret+32(FP)
	RET


// firstLeq8SSSE3Asm scans the bytes arg[startPos:endPos] and returns the
// index (relative to arg) of the first byte that is less than or equal to
// val when compared as unsigned 8-bit values, or endPos when there is none.
//
// Frame layout ($0-40): arg+0, val+8, startPos+16, endPos+24, ret+32.
// The Go prototype lives outside this file; presumably it matches
// firstGreater8SSSE3Asm -- TODO confirm against the Go declaration.
// The TEXT flag 4 is the numeric value of NOSPLIT.
//
// As in firstGreater8SSSE3Asm, the last vector is loaded from
// &arg[endPos-16].  NOTE(review): this looks like it requires
// endPos-startPos >= 16 -- verify against the caller.
//
// Register roles:
//   DI  base address of arg
//   AX  scan cursor
//   R8  address of the last 16-byte vector, &arg[endPos-16]
//   R9  endPos, returned when nothing matches
//   CX  movemask result for the current vector
//   X0  val replicated into all 16 lanes
//   X1  all-zero vector (shuffle mask and comparison operand)
//   X2  data vector
TEXT ·firstLeq8SSSE3Asm(SB),4,$0-40
	MOVQ	arg+0(FP), DI
	MOVQ	startPos+16(FP), AX
	MOVQ	endPos+24(FP), R9
	MOVD	val+8(FP), X0

	// X1 stays zero for the whole routine.
	PXOR	X1, X1
	// Replicate the low byte of val into every lane of X0.
	PSHUFB	X1, X0
	// R8 = &arg[endPos-16]
	LEAQ	-16(DI)(R9*1), R8
	// AX = &arg[startPos]
	ADDQ	DI, AX
	CMPQ	R8, AX
	JLE	leq8Tail

leq8Loop:
	MOVOU	(AX), X2
	// Saturating subtract: a lane becomes 0 exactly when its byte <= val.
	PSUBUSB	X0, X2
	// Compare with zero: lanes that were <= val become 0xff, others 0.
	PCMPEQB	X1, X2
	PMOVMSKB	X2, CX
	TESTQ	CX, CX
	JNE	leq8Hit
	ADDQ	$16, AX
	CMPQ	R8, AX
	JG	leq8Loop

leq8Tail:
	// Test the final vector, which may overlap bytes the loop already
	// cleared; those cannot produce a new match.
	MOVQ	R8, AX
	MOVOU	(R8), X2
	PSUBUSB	X0, X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, CX
	TESTQ	CX, CX
	JNE	leq8Hit
	// Every byte was greater than val.
	MOVQ	R9, ret+32(FP)
	RET

leq8Hit:
	// Lowest set mask bit = offset of the first match inside the vector at
	// AX; result = (AX - arg) + offset.
	BSFQ	CX, DX
	SUBQ	DI, AX
	ADDQ	DX, AX
	MOVQ	AX, ret+32(FP)
	RET