// Origin: github.com/grailbio/base@v0.0.11/simd/float_amd64.s
// Copyright 2021 GRAIL, Inc.  All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build amd64,!appengine

// Syntax: Go (Plan 9 style) assembler, amd64.  Arguments and results are
// passed on the stack and addressed through the FP pseudo-register.
//
// Numeric flags are used instead of the textflag.h names:
//   TEXT  flag 4  = NOSPLIT
//   GLOBL flag 24 = NOPTR (16) | RODATA (8)
//
// Every table below is 32 bytes long: the same 16-byte pattern repeated
// twice, so that the SSSE3 routine can load the first 16 bytes into an X
// register and the AVX2 routine can load all 32 bytes into a Y register.

// ExponentMask holds 0x7ff0 in every 16-bit word.  These are the float64
// exponent bits as they appear in the top two bytes of the value.  After
// masking, a word equals 0x7ff0 exactly when all exponent bits are set,
// i.e. when the float64 is NaN or +/-Inf.
DATA ·ExponentMask<>+0x00(SB)/8, $0x7ff07ff07ff07ff0
DATA ·ExponentMask<>+0x08(SB)/8, $0x7ff07ff07ff07ff0
DATA ·ExponentMask<>+0x10(SB)/8, $0x7ff07ff07ff07ff0
DATA ·ExponentMask<>+0x18(SB)/8, $0x7ff07ff07ff07ff0
// NOPTR = 16, RODATA = 8
GLOBL ·ExponentMask<>(SB), 24, $32

// The four shuffle masks are PSHUFB control vectors.  Within each 16-byte
// lane, source bytes 6, 7, 14 and 15 (the top two bytes of the lane's two
// float64s) are copied into one 4-byte slot, and every other destination
// byte is zeroed (control byte 0xff).  The slot is bytes 0-3 for
// FirstShuffle, 4-7 for SecondShuffle, 8-11 for ThirdShuffle and 12-15 for
// FourthShuffle, so the four shuffled vectors can simply be ORed together.
DATA ·FirstShuffle<>+0x00(SB)/8, $0xffffffff0f0e0706
DATA ·FirstShuffle<>+0x08(SB)/8, $0xffffffffffffffff
DATA ·FirstShuffle<>+0x10(SB)/8, $0xffffffff0f0e0706
DATA ·FirstShuffle<>+0x18(SB)/8, $0xffffffffffffffff
GLOBL ·FirstShuffle<>(SB), 24, $32

DATA ·SecondShuffle<>+0x00(SB)/8, $0x0f0e0706ffffffff
DATA ·SecondShuffle<>+0x08(SB)/8, $0xffffffffffffffff
DATA ·SecondShuffle<>+0x10(SB)/8, $0x0f0e0706ffffffff
DATA ·SecondShuffle<>+0x18(SB)/8, $0xffffffffffffffff
GLOBL ·SecondShuffle<>(SB), 24, $32

DATA ·ThirdShuffle<>+0x00(SB)/8, $0xffffffffffffffff
DATA ·ThirdShuffle<>+0x08(SB)/8, $0xffffffff0f0e0706
DATA ·ThirdShuffle<>+0x10(SB)/8, $0xffffffffffffffff
DATA ·ThirdShuffle<>+0x18(SB)/8, $0xffffffff0f0e0706
GLOBL ·ThirdShuffle<>(SB), 24, $32

DATA ·FourthShuffle<>+0x00(SB)/8, $0xffffffffffffffff
DATA ·FourthShuffle<>+0x08(SB)/8, $0x0f0e0706ffffffff
DATA ·FourthShuffle<>+0x10(SB)/8, $0xffffffffffffffff
DATA ·FourthShuffle<>+0x18(SB)/8, $0x0f0e0706ffffffff
GLOBL ·FourthShuffle<>(SB), 24, $32

TEXT ·findNaNOrInf64SSSE3Asm(SB),4,$0-24
	// findNaNOrInf64SSSE3Asm returns x if the first NaN/inf in data is at
	// position x, or -1 if no NaN/inf is present.  nElem must be at least
	// 8: the first loop iteration unconditionally reads data[0..7].
	//
	// Frame: no locals; 24 bytes of arguments and results:
	//   data+0(FP)   pointer to the first float64
	//   nElem+8(FP)  number of float64s
	//   ret+16(FP)   result index, or -1
	//
	// The implementation exploits the fact that we only need to look at
	// the exponent bits to determine NaN/inf status, and these occupy just
	// the top two bytes of each 8-byte float.  Thus, we can pack the
	// exponent-containing-bytes of 8 consecutive float64s into a single
	// 16-byte vector, and check them in parallel.
	//
	// Register allocation:
	//   AX: data
	//   BX: nElem - 8
	//   CX: current index
	//   DX: comparison result
	//   SI: &(data[2])
	//   DI: &(data[4])
	//   R8: &(data[6])
	//   R9: nElem
	//   X0: exponent mask
	//   X1: first shuffle mask
	//   X2: second shuffle mask
	//   X3: third shuffle mask
	//   X4: fourth shuffle mask
	MOVQ	data+0(FP), AX
	MOVQ	nElem+8(FP), BX
	MOVQ	BX, R9
	SUBQ	$8, BX
	XORL	CX, CX
	MOVQ	AX, SI
	MOVQ	AX, DI
	MOVQ	AX, R8
	ADDQ	$16, SI
	ADDQ	$32, DI
	ADDQ	$48, R8

	// Only the first 16 bytes of each 32-byte table are needed here.
	MOVOU	·ExponentMask<>(SB), X0
	MOVOU	·FirstShuffle<>(SB), X1
	MOVOU	·SecondShuffle<>(SB), X2
	MOVOU	·ThirdShuffle<>(SB), X3
	MOVOU	·FourthShuffle<>(SB), X4

findNaNOrInf64SSSE3AsmLoop:
	// Scan 8 float64s, starting from &(data[CX]), into X5..X8.
	MOVOU	(AX)(CX*8), X5
	MOVOU	(SI)(CX*8), X6
	MOVOU	(DI)(CX*8), X7
	MOVOU	(R8)(CX*8), X8

	// Extract exponent bytes.  Each shuffle writes a disjoint 4-byte slot
	// and zeroes the rest.
	PSHUFB	X1, X5
	PSHUFB	X2, X6
	PSHUFB	X3, X7
	PSHUFB	X4, X8

	// Collect into X5.  Word k of X5 now holds the top two bytes of
	// data[CX+k].
	POR	X6, X5
	POR	X8, X7
	POR	X7, X5

	// Mask out non-exponent bits, and then compare 2-byte groups in
	// parallel.
	PAND	X0, X5
	PCMPEQW	X0, X5

	// Check result.  DX gets two identical mask bits per 2-byte group.
	PMOVMSKB	X5, DX
	TESTQ	DX, DX
	JNE	findNaNOrInf64SSSE3AsmFound

	// Advance loop.  Continue while nElem - 8 >= CX, i.e. while a full
	// group of 8 remains.
	ADDQ	$8, CX
	CMPQ	BX, CX
	JGE	findNaNOrInf64SSSE3AsmLoop

	// Less than 8 float64s left...
	CMPQ	R9, CX
	JE	findNaNOrInf64SSSE3AsmNotFound

	// ...but more than zero.  Set CX := nElem - 8, and start one last
	// loop iteration.  It overlaps elements that were already found to be
	// clean, so the first hit it reports is still the first hit overall.
	MOVQ	BX, CX
	JMP	findNaNOrInf64SSSE3AsmLoop

findNaNOrInf64SSSE3AsmNotFound:
	MOVQ	$-1, ret+16(FP)
	RET

findNaNOrInf64SSSE3AsmFound:
	// Determine the position of the lowest set bit in DX, i.e. the byte
	// offset of the first comparison success.
	BSFQ	DX, BX
	// We compared 2-byte groups, so divide by 2 to determine the original
	// index.
	SHRQ	$1, BX
	ADDQ	CX, BX
	MOVQ	BX, ret+16(FP)
	RET


TEXT ·findNaNOrInf64AVX2Asm(SB),4,$0-24
	// findNaNOrInf64AVX2Asm is nearly identical to the SSSE3 version; it
	// just compares 16 float64s at a time instead of 8.  It returns x if
	// the first NaN/inf in data is at position x, or -1 if no NaN/inf is
	// present.  nElem must be at least 16: the first loop iteration
	// unconditionally reads data[0..15].
	//
	// Frame: no locals; 24 bytes of arguments and results:
	//   data+0(FP)   pointer to the first float64
	//   nElem+8(FP)  number of float64s
	//   ret+16(FP)   result index, or -1
	//
	// Register allocation:
	//   AX: data
	//   BX: nElem - 16 (reused as scratch once a match is found)
	//   CX: current index
	//   DX: comparison result
	//   SI: &(data[4])
	//   DI: &(data[8])
	//   R8: &(data[12])
	//   R9: nElem
	//   Y0: exponent mask
	//   Y1: first shuffle mask
	//   Y2: second shuffle mask
	//   Y3: third shuffle mask
	//   Y4: fourth shuffle mask
	//
	// Both exits execute VZEROUPPER so that the caller's legacy-encoded
	// SSE code does not run with dirty upper YMM halves.
	MOVQ	data+0(FP), AX
	MOVQ	nElem+8(FP), BX
	MOVQ	BX, R9
	SUBQ	$16, BX
	XORL	CX, CX
	MOVQ	AX, SI
	MOVQ	AX, DI
	MOVQ	AX, R8
	ADDQ	$32, SI
	ADDQ	$64, DI
	ADDQ	$96, R8

	VMOVDQU	·ExponentMask<>(SB), Y0
	VMOVDQU	·FirstShuffle<>(SB), Y1
	VMOVDQU	·SecondShuffle<>(SB), Y2
	VMOVDQU	·ThirdShuffle<>(SB), Y3
	VMOVDQU	·FourthShuffle<>(SB), Y4

findNaNOrInf64AVX2AsmLoop:
	// Scan 16 float64s, starting from &(data[CX]), into Y5..Y8.
	VMOVDQU	(AX)(CX*8), Y5
	VMOVDQU	(SI)(CX*8), Y6
	VMOVDQU	(DI)(CX*8), Y7
	VMOVDQU	(R8)(CX*8), Y8

	// Extract exponent bytes.  VPSHUFB shuffles each 16-byte lane
	// independently, which is why the result order is scrambled (see
	// findNaNOrInf64AVX2AsmFound).
	VPSHUFB	Y1, Y5, Y5
	VPSHUFB	Y2, Y6, Y6
	VPSHUFB	Y3, Y7, Y7
	VPSHUFB	Y4, Y8, Y8

	// Collect into Y5.
	VPOR	Y6, Y5, Y5
	VPOR	Y8, Y7, Y7
	VPOR	Y7, Y5, Y5

	// Mask out non-exponent bits, and then compare 2-byte groups in
	// parallel.
	VPAND	Y0, Y5, Y5
	VPCMPEQW	Y0, Y5, Y5

	// Check result.  DX gets two identical mask bits per 2-byte group.
	VPMOVMSKB	Y5, DX
	TESTQ	DX, DX
	JNE	findNaNOrInf64AVX2AsmFound

	// Advance loop.  Continue while nElem - 16 >= CX, i.e. while a full
	// group of 16 remains.
	ADDQ	$16, CX
	CMPQ	BX, CX
	JGE	findNaNOrInf64AVX2AsmLoop

	// Less than 16 float64s left...
	CMPQ	R9, CX
	JE	findNaNOrInf64AVX2AsmNotFound

	// ...but more than zero.  Set CX := nElem - 16, and start one last
	// loop iteration.  It overlaps elements that were already found to be
	// clean, so the first hit it reports is still the first hit overall.
	MOVQ	BX, CX
	JMP	findNaNOrInf64AVX2AsmLoop

findNaNOrInf64AVX2AsmNotFound:
	MOVQ	$-1, ret+16(FP)
	// Clear the upper YMM halves before returning to non-AVX code.
	VZEROUPPER
	RET

findNaNOrInf64AVX2AsmFound:
	// Since the PSHUFB instruction acts separately on the two 16-byte
	// "lanes", the 2-byte chunks in Y5, and consequently the 2-bit groups
	// in DX here, are drawn from &(data[CX])..&(data[CX+15]) in the
	// following order:
	//   0 1 4 5 8 9 12 13 2 3 6 7 10 11 14 15
	// We "unscramble" this before grabbing the lowest set bit.

	// Clear odd bits, leaving one bit (bit 2k) per 2-byte group k.
	ANDQ	$0x55555555, DX

	// Rearrange to
	//   0 1 * * 4 5 * * 8 9 * * 12 13 * * 2 3 ...
	// where the above refers to single bits, and * denotes a cleared bit.
	MOVQ	DX, BX
	SHRQ	$1, BX
	ORQ	BX, DX
	ANDQ	$0x33333333, DX

	//   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ...
	// The high-lane bits sit 14 positions above their element index, so
	// fold them down into the gaps.  The copies left at bit 16 and above
	// are harmless: each has a lower duplicate, and only the lowest set
	// bit is used.
	MOVQ	DX, BX
	SHRQ	$14, BX
	ORQ	BX, DX

	// Okay, now we're ready.
	BSFQ	DX, BX
	ADDQ	CX, BX
	MOVQ	BX, ret+16(FP)
	// Clear the upper YMM halves before returning to non-AVX code.
	VZEROUPPER
	RET