// Source: github.com/as/shiny@v0.8.2/driver/internal/swizzle/swizzle_amd64.s
// Copyright 2018 (as). Added avx and avx2 support for capable CPUs
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Syntax: Go assembler (Plan 9 operand order: source first, destination last).
// Arguments and results are accessed on the stack through the FP
// pseudo-register. A []byte argument occupies 24 bytes: base, len, cap.
//
// Every bgra*sd routine below copies pixels from p to q, exchanging bytes 0
// and 2 of each 4-byte pixel and leaving bytes 1 and 3 in place. None of them
// reads q's length: the caller must make q at least as long as p. A trailing
// fragment of p shorter than 4 bytes is ignored.

#include "textflag.h"

// CPUID.1:ECX bits 27 (OSXSAVE) and 28 (AVX).
#define CPUID1_OSXSAVE_AVX 0x18000000
// XCR0 bits 1 (SSE/XMM state) and 2 (AVX/YMM state).
#define XCR0_XMM_YMM 6

// Shuffle control for (V)PSHUFB, low-order byte first:
//
//	02 01 00 03 06 05 04 07 0a 09 08 0b 0e 0d 0c 0f   (low 16 bytes)
//	12 11 10 13 16 15 14 17 1a 19 18 1b 1e 1d 1c 1f   (high 16 bytes)
//
// The low 16 bytes alone are loaded by bgra128sd; all 32 by bgra256sd.
DATA ·AVX2_swizzletab<>+0x00(SB)/8, $0x0704050603000102
DATA ·AVX2_swizzletab<>+0x08(SB)/8, $0x0f0c0d0e0b08090a
DATA ·AVX2_swizzletab<>+0x10(SB)/8, $0x1714151613101112
DATA ·AVX2_swizzletab<>+0x18(SB)/8, $0x1f1c1d1e1b18191a
GLOBL ·AVX2_swizzletab<>(SB), (NOPTR+RODATA), $32

// func haveSSSE3() bool
//
// Returns bit 9 of ECX from CPUID leaf 1 (the SSSE3 feature flag).
// Clobbers AX, BX, CX, DX (CPUID) and flags.
TEXT ·haveSSSE3(SB),NOSPLIT,$0
	MOVL	$1, AX
	XORL	CX, CX
	CPUID
	SHRL	$9, CX
	ANDL	$1, CX
	MOVB	CX, ret+0(FP)
	RET

// func haveAVX() bool
//
// Returns true only when all of the following hold:
//   - CPUID.1:ECX reports both OSXSAVE (bit 27) and AVX (bit 28);
//   - XCR0, read with XGETBV, has the XMM and YMM state bits (1 and 2) set.
//
// The CPUID AVX bit alone only says the CPU implements the instructions;
// they still fault unless the OS has enabled the extended register state.
// Clobbers AX, BX, CX, DX and flags.
TEXT ·haveAVX(SB),NOSPLIT,$0
	MOVL	$1, AX
	XORL	CX, CX
	CPUID
	ANDL	$CPUID1_OSXSAVE_AVX, CX
	CMPL	CX, $CPUID1_OSXSAVE_AVX
	JNE	no

	XORL	CX, CX			// select XCR0
	XGETBV				// XCR0 -> DX:AX (legal because OSXSAVE is set)
	ANDL	$XCR0_XMM_YMM, AX
	CMPL	AX, $XCR0_XMM_YMM
	JNE	no

	MOVB	$1, ret+0(FP)
	RET

no:
	MOVB	$0, ret+0(FP)
	RET

// func haveAVX2() bool
//
// Returns true only when:
//   - the highest basic CPUID leaf is at least 7, so leaf 7 is meaningful;
//   - the OS has enabled XMM and YMM state (same test as haveAVX);
//   - CPUID.(EAX=7,ECX=0):EBX bit 5 (AVX2) is set.
//
// Clobbers AX, BX, CX, DX and flags.
TEXT ·haveAVX2(SB),NOSPLIT,$0
	XORL	AX, AX
	CPUID				// AX = highest supported basic leaf
	CMPL	AX, $7
	JB	no

	MOVL	$1, AX
	XORL	CX, CX
	CPUID
	ANDL	$CPUID1_OSXSAVE_AVX, CX
	CMPL	CX, $CPUID1_OSXSAVE_AVX
	JNE	no

	XORL	CX, CX			// select XCR0
	XGETBV
	ANDL	$XCR0_XMM_YMM, AX
	CMPL	AX, $XCR0_XMM_YMM
	JNE	no

	MOVL	$7, AX
	XORL	CX, CX			// sub-leaf 0
	CPUID
	SHRL	$5, BX
	ANDL	$1, BX
	MOVB	BX, ret+0(FP)
	RET

no:
	MOVB	$0, ret+0(FP)
	RET

// func bgra256sd(p, q []byte)
//
// AVX2 variant: 256 bytes per iteration, then 32, then 4.
//
// Registers:
//	SI = read cursor in p
//	DI = write cursor in q
//	CX = bytes of p still to process
//	Y0 = shuffle control, Y1-Y8 = data
//
// Each loop is entered only after checking that a full block remains, so no
// access goes beyond p[len(p)-1] or the matching offset in q.
TEXT ·bgra256sd(SB),NOSPLIT,$0
	MOVQ	p_base+0(FP), SI
	MOVQ	p_len+8(FP), CX
	MOVQ	q_base+24(FP), DI

	VMOVDQU	·AVX2_swizzletab<>(SB), Y0

	CMPQ	CX, $256
	JB	tail32

loop256:
	VMOVDQU	(0*32)(SI), Y1
	VMOVDQU	(1*32)(SI), Y2
	VMOVDQU	(2*32)(SI), Y3
	VMOVDQU	(3*32)(SI), Y4
	VMOVDQU	(4*32)(SI), Y5
	VMOVDQU	(5*32)(SI), Y6
	VMOVDQU	(6*32)(SI), Y7
	VMOVDQU	(7*32)(SI), Y8
	VPSHUFB	Y0, Y1, Y1
	VPSHUFB	Y0, Y2, Y2
	VPSHUFB	Y0, Y3, Y3
	VPSHUFB	Y0, Y4, Y4
	VPSHUFB	Y0, Y5, Y5
	VPSHUFB	Y0, Y6, Y6
	VPSHUFB	Y0, Y7, Y7
	VPSHUFB	Y0, Y8, Y8
	VMOVDQU	Y1, (0*32)(DI)
	VMOVDQU	Y2, (1*32)(DI)
	VMOVDQU	Y3, (2*32)(DI)
	VMOVDQU	Y4, (3*32)(DI)
	VMOVDQU	Y5, (4*32)(DI)
	VMOVDQU	Y6, (5*32)(DI)
	VMOVDQU	Y7, (6*32)(DI)
	VMOVDQU	Y8, (7*32)(DI)
	ADDQ	$256, SI
	ADDQ	$256, DI
	SUBQ	$256, CX
	CMPQ	CX, $256
	JAE	loop256

tail32:
	CMPQ	CX, $32
	JB	tail4

loop32:
	VMOVDQU	(SI), Y1
	VPSHUFB	Y0, Y1, Y1
	VMOVDQU	Y1, (DI)
	ADDQ	$32, SI
	ADDQ	$32, DI
	SUBQ	$32, CX
	CMPQ	CX, $32
	JAE	loop32

tail4:
	CMPQ	CX, $4
	JB	done

loop4:
	// MOVL, not MOVD: the Go amd64 assembler treats MOVD as the 64-bit MOVQ,
	// which would touch 4 bytes beyond the current pixel.
	MOVL	(SI), AX		// r g b a
	BSWAPL	AX			// a b g r
	RORL	$8, AX			// b g r a
	MOVL	AX, (DI)
	ADDQ	$4, SI
	ADDQ	$4, DI
	SUBQ	$4, CX
	CMPQ	CX, $4
	JAE	loop4

done:
	// Clear the upper YMM halves so later legacy-SSE code does not pay an
	// AVX/SSE transition penalty.
	VZEROUPPER
	RET

// func bgra128sd(p, q []byte)
//
// AVX (VEX-encoded, 128-bit) variant: 128 bytes per iteration, then 16,
// then 4.
//
// Registers:
//	SI = read cursor in p
//	DI = write cursor in q
//	CX = bytes of p still to process
//	X0 = shuffle control, X1-X8 = data
//
// Each loop is entered only after checking that a full block remains.
TEXT ·bgra128sd(SB),NOSPLIT,$0
	MOVQ	p_base+0(FP), SI
	MOVQ	p_len+8(FP), CX
	MOVQ	q_base+24(FP), DI

	VMOVDQU	·AVX2_swizzletab<>(SB), X0	// low 16 bytes of the table

	CMPQ	CX, $128
	JB	tail16

loop128:
	VMOVDQU	(0*16)(SI), X1
	VMOVDQU	(1*16)(SI), X2
	VMOVDQU	(2*16)(SI), X3
	VMOVDQU	(3*16)(SI), X4
	VMOVDQU	(4*16)(SI), X5
	VMOVDQU	(5*16)(SI), X6
	VMOVDQU	(6*16)(SI), X7
	VMOVDQU	(7*16)(SI), X8
	VPSHUFB	X0, X1, X1
	VPSHUFB	X0, X2, X2
	VPSHUFB	X0, X3, X3
	VPSHUFB	X0, X4, X4
	VPSHUFB	X0, X5, X5
	VPSHUFB	X0, X6, X6
	VPSHUFB	X0, X7, X7
	VPSHUFB	X0, X8, X8
	VMOVDQU	X1, (0*16)(DI)
	VMOVDQU	X2, (1*16)(DI)
	VMOVDQU	X3, (2*16)(DI)
	VMOVDQU	X4, (3*16)(DI)
	VMOVDQU	X5, (4*16)(DI)
	VMOVDQU	X6, (5*16)(DI)
	VMOVDQU	X7, (6*16)(DI)
	VMOVDQU	X8, (7*16)(DI)
	ADDQ	$128, SI
	ADDQ	$128, DI
	SUBQ	$128, CX
	CMPQ	CX, $128
	JAE	loop128

tail16:
	CMPQ	CX, $16
	JB	tail4

loop16:
	VMOVDQU	(SI), X1
	VPSHUFB	X0, X1, X1
	VMOVDQU	X1, (DI)
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, CX
	CMPQ	CX, $16
	JAE	loop16

tail4:
	CMPQ	CX, $4
	JB	done

loop4:
	// MOVL, not MOVD: the Go amd64 assembler treats MOVD as the 64-bit MOVQ,
	// which would touch 4 bytes beyond the current pixel.
	MOVL	(SI), AX		// r g b a
	BSWAPL	AX			// a b g r
	RORL	$8, AX			// b g r a
	MOVL	AX, (DI)
	ADDQ	$4, SI
	ADDQ	$4, DI
	SUBQ	$4, CX
	CMPQ	CX, $4
	JAE	loop4

done:
	RET

// func bgra16sd(p, q []byte)
//
// SSSE3 variant: 16 bytes per iteration with PSHUFB, then 4 bytes at a time.
//
// Registers:
//	SI = read cursor in p
//	DI = write cursor in q
//	CX = bytes of p still to process
//	X0 = shuffle control, X1 = data / scratch
//
// The 16-byte loop is entered only when at least 16 bytes remain, so a short
// or empty p never causes an access outside the slices.
TEXT ·bgra16sd(SB),NOSPLIT,$0
	MOVQ	p_base+0(FP), SI
	MOVQ	p_len+8(FP), CX
	MOVQ	q_base+24(FP), DI

	// Make the shuffle control mask (16-byte register X0) look like this,
	// where the low order byte comes first:
	//
	// 02 01 00 03 06 05 04 07 0a 09 08 0b 0e 0d 0c 0f
	//
	// Load the bottom 8 bytes into X0, the top into X1, then interleave them
	// into X0.
	MOVQ	$0x0704050603000102, AX
	MOVQ	AX, X0
	MOVQ	$0x0f0c0d0e0b08090a, AX
	MOVQ	AX, X1
	PUNPCKLQDQ	X1, X0

	CMPQ	CX, $16
	JB	tail4

loop16:
	MOVOU	(SI), X1
	PSHUFB	X0, X1
	MOVOU	X1, (DI)
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, CX
	CMPQ	CX, $16
	JAE	loop16

tail4:
	CMPQ	CX, $4
	JB	done

loop4:
	// MOVL, not MOVD: the Go amd64 assembler treats MOVD as the 64-bit MOVQ,
	// which would touch 4 bytes beyond the current pixel.
	MOVL	(SI), AX		// r g b a
	BSWAPL	AX			// a b g r
	RORL	$8, AX			// b g r a
	MOVL	AX, (DI)
	ADDQ	$4, SI
	ADDQ	$4, DI
	SUBQ	$4, CX
	CMPQ	CX, $4
	JAE	loop4

done:
	RET

// func bgra4sd(p, q []byte)
//
// Scalar variant: one 4-byte pixel per iteration using general-purpose
// registers only.
//
// Registers:
//	SI = read cursor in p
//	DI = write cursor in q
//	CX = bytes of p still to process
//	AX = current pixel
//
// The loop runs while at least 4 bytes remain, so it terminates for any
// length, including lengths that are not a multiple of 4.
TEXT ·bgra4sd(SB),NOSPLIT,$0
	MOVQ	p_base+0(FP), SI
	MOVQ	p_len+8(FP), CX
	MOVQ	q_base+24(FP), DI

loop:
	CMPQ	CX, $4
	JB	done

	// MOVL, not MOVD: the Go amd64 assembler treats MOVD as the 64-bit MOVQ,
	// which would touch 4 bytes beyond the current pixel.
	MOVL	(SI), AX		// r g b a
	BSWAPL	AX			// a b g r
	RORL	$8, AX			// b g r a
	MOVL	AX, (DI)

	ADDQ	$4, SI
	ADDQ	$4, DI
	SUBQ	$4, CX
	JMP	loop

done:
	RET