github.com/as/shiny@v0.8.2/driver/internal/swizzle/swizzle_amd64.s (about)

     1  // Copyright 2018 (as). Added avx and avx2 support for capable CPUs
     2  // Copyright 2015 The Go Authors. All rights reserved.
     3  // Use of this source code is governed by a BSD-style
     4  // license that can be found in the LICENSE file.
     5  
     6  #include "textflag.h"
     7  
     8  DATA ·AVX2_swizzletab<>+0x00(SB)/8, $0x0704050603000102
     9  DATA ·AVX2_swizzletab<>+0x08(SB)/8, $0x0f0c0d0e0b08090a
    10  DATA ·AVX2_swizzletab<>+0x10(SB)/8, $0x1714151613101112
    11  DATA ·AVX2_swizzletab<>+0x18(SB)/8, $0x1f1c1d1e1b18191a
    12  GLOBL ·AVX2_swizzletab<>(SB), (NOPTR+RODATA), $32
    13  
    14  // func haveSSSE3() bool
    15  TEXT ·haveSSSE3(SB),NOSPLIT,$0
    16  	MOVQ	$1, AX
    17  	CPUID
    18  	SHRQ	$9, CX
    19  	ANDQ	$1, CX
    20  	MOVB	CX, ret+0(FP)
    21  	RET
    22  
    23  // func haveAVX() bool
    24  TEXT ·haveAVX(SB),NOSPLIT,$0
    25  	MOVQ	$1, AX
    26  	CPUID
    27  	SHRQ	$28, CX
    28  	ANDQ	$1, CX
    29  	MOVB	CX, ret+0(FP)
    30  	RET
    31  	
    32  // func haveAVX2() bool
    33  TEXT ·haveAVX2(SB),NOSPLIT,$0
    34  	MOVQ	$7, AX
    35  	MOVQ	$0, CX
    36  	CPUID
    37  	SHRQ	$5, BX
    38  	ANDQ	$1, BX
    39  	MOVB	BX, ret+0(FP)
    40  	RET
    41  
    42  // func bgra256sd(p, q []byte)
    43  TEXT ·bgra256sd(SB),NOSPLIT,$0
    44  	MOVQ	p+0(FP), SI
    45  	MOVQ	len+8(FP), CX
    46  	MOVQ	q+24(FP), DI
    47  	
    48  	VMOVDQU ·AVX2_swizzletab<>(SB), Y0
    49  	ADDQ SI, CX
    50  	ADDQ $256, SI
    51  	CMPQ CX, SI
    52  	JL prep32
    53  	SUBQ	$256, SI
    54  	
    55  loop256:
    56  	VMOVDQU 	(0*32)(SI),Y1 
    57  	VMOVDQU 	(1*32)(SI),Y2 
    58  	VMOVDQU 	(2*32)(SI),Y3 
    59  	VMOVDQU 	(3*32)(SI),Y4 
    60  	VMOVDQU 	(4*32)(SI),Y5 
    61  	VMOVDQU 	(5*32)(SI),Y6 
    62  	VMOVDQU 	(6*32)(SI),Y7 
    63  	VMOVDQU 	(7*32)(SI),Y8 
    64  	VPSHUFB Y0, Y1,  Y1
    65  	VPSHUFB Y0, Y2,  Y2
    66  	VPSHUFB Y0, Y3,  Y3
    67  	VPSHUFB Y0, Y4,  Y4
    68  	VPSHUFB Y0, Y5,  Y5
    69  	VPSHUFB Y0, Y6,  Y6
    70  	VPSHUFB Y0, Y7,  Y7
    71  	VPSHUFB Y0, Y8,  Y8
    72  	VMOVDQU	Y1, (0*32)(DI)
    73  	VMOVDQU	Y2, (1*32)(DI)
    74  	VMOVDQU	Y3, (2*32)(DI)
    75  	VMOVDQU	Y4, (3*32)(DI)
    76  	VMOVDQU	Y5, (4*32)(DI)
    77  	VMOVDQU	Y6, (5*32)(DI)
    78  	VMOVDQU	Y7, (6*32)(DI)
    79  	VMOVDQU	Y8, (7*32)(DI)
    80  	ADDQ	$256, SI
    81  	ADDQ	$256, DI
    82  	CMPQ CX, SI
    83  	JGT loop256
    84  	JEQ done
    85  	
    86  	SUBQ	$256, DI
    87  prep32:
    88  	SUBQ	$256, SI
    89  	ADDQ $32, SI
    90  	CMPQ CX, SI
    91  	JL prep4
    92  	SUBQ	$32, SI
    93  
    94  loop32:
    95  	VMOVDQU 	(0*32)(SI),Y1 
    96  	VPSHUFB Y0, Y1,  Y1
    97  	VMOVDQU	Y1, (0*32)(DI)
    98  	ADDQ	$32, SI
    99  	ADDQ	$32, DI
   100  	CMPQ CX, SI
   101  	JGT	loop32
   102  	JEQ done
   103  	
   104  	SUBQ	$32, DI
   105  prep4:
   106  	SUBQ	$32, SI
   107  	
   108  loop4:
   109  	MOVD	0(SI), AX	// r g b a
   110  	BSWAPL AX   // a b g r 
   111  	RORL	$8, AX 	// b g r a 
   112  	MOVD	AX, (DI)
   113  
   114  	ADDQ	$4, SI
   115  	ADDQ	$4, DI
   116  	CMPQ CX, SI
   117  	JGT	loop4
   118  
   119  done:
   120  	RET
   121  
   122  // func bgra128sd(p, q []byte)
   123  TEXT ·bgra128sd(SB),NOSPLIT,$0
   124  	MOVQ	p+0(FP), SI
   125  	MOVQ	len+8(FP), CX
   126  	MOVQ	q+24(FP), DI
   127  	
   128  	VMOVDQU ·AVX2_swizzletab<>(SB), X0
   129  	ADDQ SI, CX
   130  	ADDQ $128, SI
   131  	CMPQ CX, SI
   132  	JL prep16
   133  	SUBQ	$128, SI
   134  	
   135  loop128:
   136  	VMOVDQU 	(0*16)(SI),X1 
   137  	VMOVDQU 	(1*16)(SI),X2 
   138  	VMOVDQU 	(2*16)(SI),X3 
   139  	VMOVDQU 	(3*16)(SI),X4 
   140  	VMOVDQU 	(4*16)(SI),X5 
   141  	VMOVDQU 	(5*16)(SI),X6 
   142  	VMOVDQU 	(6*16)(SI),X7 
   143  	VMOVDQU 	(7*16)(SI),X8 
   144  	VPSHUFB X0, X1,  X1
   145  	VPSHUFB X0, X2,  X2
   146  	VPSHUFB X0, X3,  X3
   147  	VPSHUFB X0, X4,  X4
   148  	VPSHUFB X0, X5,  X5
   149  	VPSHUFB X0, X6,  X6
   150  	VPSHUFB X0, X7,  X7
   151  	VPSHUFB X0, X8,  X8
   152  	VMOVDQU	X1, (0*16)(DI)
   153  	VMOVDQU	X2, (1*16)(DI)
   154  	VMOVDQU	X3, (2*16)(DI)
   155  	VMOVDQU	X4, (3*16)(DI)
   156  	VMOVDQU	X5, (4*16)(DI)
   157  	VMOVDQU	X6, (5*16)(DI)
   158  	VMOVDQU	X7, (6*16)(DI)
   159  	VMOVDQU	X8, (7*16)(DI)
   160  	ADDQ	$128, SI
   161  	ADDQ	$128, DI
   162  	CMPQ CX, SI
   163  	JGT loop128
   164  	JEQ done
   165  	
   166  	SUBQ	$128, DI
   167  prep16:
   168  	SUBQ	$128, SI
   169  	ADDQ $16, SI
   170  	CMPQ CX, SI
   171  	JL prep4
   172  	SUBQ	$16, SI
   173  
   174  loop16:
   175  	VMOVDQU 	(0*16)(SI),X1 
   176  	VPSHUFB X0, X1,  X1
   177  	VMOVDQU	X1, (0*16)(DI)
   178  	ADDQ	$16, SI
   179  	ADDQ	$16, DI
   180  	CMPQ CX, SI
   181  	JGT	loop16
   182  	JEQ done
   183  	
   184  	SUBQ	$16, DI
   185  prep4:
   186  	SUBQ	$16, SI
   187  	
   188  loop4:
   189  	MOVD	0(SI), AX	// r g b a
   190  	BSWAPL AX   // a b g r 
   191  	RORL	$8, AX 	// b g r a 
   192  	MOVD	AX, (DI)
   193  
   194  	ADDQ	$4, SI
   195  	ADDQ	$4, DI
   196  	CMPQ CX, SI
   197  	JGT	loop4
   198  
   199  done:
   200  	RET
   201  	
   202  // func bgra16sd(p, q []byte)
   203  TEXT ·bgra16sd(SB),NOSPLIT,$0
   204  	MOVQ	p+0(FP), SI
   205  	MOVQ	len+8(FP), CX
   206  	MOVQ	q+24(FP), DI
   207  
   208  	// Sanity check that len is a multiple of 16.
   209  	//	MOVQ	CX, AX
   210  	//	ANDQ	$15, AX
   211  	//	JNZ	done
   212  	ADDQ SI, CX
   213  
   214  	// Make the shuffle control mask (16-byte register X0) look like this,
   215  	// where the low order byte comes first:
   216  	//
   217  	// 02 01 00 03  06 05 04 07  0a 09 08 0b  0e 0d 0c 0f
   218  	//
   219  	// Load the bottom 8 bytes into X0, the top into X1, then interleave them
   220  	// into X0.
   221  	MOVQ	$0x0704050603000102, AX
   222  	MOVQ	AX, X0
   223  	MOVQ	$0x0f0c0d0e0b08090a, AX
   224  	MOVQ	AX, X1
   225  	PUNPCKLQDQ	X1, X0
   226  
   227  loop16:
   228  	MOVOU	(SI), X1
   229  	PSHUFB	X0, X1
   230  	MOVOU	X1, (DI)
   231  
   232  	ADDQ	$16, SI
   233  	ADDQ	$16, DI
   234  	CMPQ CX, SI
   235  	JGT	loop16
   236  	JEQ done
   237  
   238  prep4:
   239  	SUBQ	$16, DI
   240  	SUBQ	$16, SI
   241  loop4:
   242  	MOVD	0(SI), AX	// r g b a
   243  	BSWAPL AX   // a b g r 
   244  	RORL	$8, AX 	// b g r a 
   245  	MOVD	AX, (DI)
   246  
   247  	ADDQ	$4, SI
   248  	ADDQ	$4, DI
   249  	CMPQ CX, SI
   250  	JGT	loop4
   251  done:
   252  	RET
   253  
   254  // func bgra4sd(p, q []byte)
   255  TEXT ·bgra4sd(SB),NOSPLIT,$0
   256  	MOVQ	p+0(FP), SI
   257  	MOVQ	len+8(FP), CX
   258  	MOVQ	q+24(FP), DI
   259  
   260  	ADDQ SI, CX
   261  loop:
   262  	CMPQ	SI, CX
   263  	JEQ	done
   264  
   265  	MOVD	0(SI), AX	// r g b a
   266  	BSWAPL AX   // a b g r 
   267  	RORL	$8, AX 	// b g r a 
   268  	MOVD	AX, (DI)
   269  
   270  	ADDQ	$4, SI
   271  	ADDQ	$4, DI
   272  	JMP	loop
   273  done:
   274  	RET