github.com/gop9/olt@v0.0.0-20200202132135-d956aad50b08/framework/drawfillover_avx.s (about)

     1  //+build amd64,go1.10
     2  
     3  #include "textflag.h"
     4  
     5  GLOBL drawFillOver_SIMD_shufflemap<>(SB), (NOPTR+RODATA), $4
     6  DATA drawFillOver_SIMD_shufflemap<>+0x00(SB)/4, $0x0d090501
     7  
     8  TEXT ·drawFillOver_SIMD_internal(SB),0,$0-60
     9  	// base+0(FP)
    10  	// i0+8(FP)
    11  	// i1+16(FP)
    12  	// stride+24(FP)
    13  	// n+32(FP)
    14  	// adivm+40(FP)
    15  	// sr+44(FP)
    16  	// sg+48(FP)
    17  	// sb+52(FP)
    18  	// sa+56(FP)
    19  	
    20  	// DX row index
    21  	// CX column index
    22  	// AX pointer to current pixel
    23  	// R14 i0
    24  	// R15 i1
    25  	
    26  	// X0 zeroed register
    27  	// X1 current pixel
    28  	// X3 source pixel
    29  	// X4 is the shuffle map to do the >> 8 and pack everything back into a single 32bit value
    30  	
    31  	MOVSS drawFillOver_SIMD_shufflemap<>(SB), X4
    32  	
    33  	PXOR X0, X0
    34  	MOVQ i0+8(FP), R14
    35  	MOVQ i1+16(FP), R15
    36  	
    37  	// load adivm to X2, fill all uint32s with it
    38  	MOVSS advim+40(FP), X2
    39  	VBROADCASTSS X2, X2
    40  	
    41  	// load source pixel to X3
    42  	VMOVDQU sr+44(FP), X3
    43  	
    44  	MOVQ $0, DX
    45  row_loop:
    46  	CMPQ DX, n+32(FP)
    47  	JGE row_loop_end
    48  	
    49  	MOVQ R14, CX
    50  	MOVQ base+0(FP), AX
    51  	LEAQ (AX)(CX*1), AX
    52  column_loop:
    53  	CMPQ CX, R15
    54  	JGE column_loop_end
    55  	
    56  	// load current pixel to X1, unpack twice to get uint32s
    57  	MOVSS (AX), X1
    58  	PUNPCKLBW X0, X1
    59  	VPUNPCKLWD X0, X1, X1
    60  	
    61  	VPMULLD X2, X1, X1 // component * a/m
    62  	VPADDD X3, X1, X1 // (component * a/m) + source_component
    63  	
    64  	VPSHUFB X4, X1, X1 // get the second byte of every 32bit word and pack it into the lowest word of X1
    65  	MOVSS X1, (AX) // write back to memory
    66  	
    67  	ADDQ $4, CX
    68  	ADDQ $4, AX
    69  	JMP column_loop
    70  	
    71  column_loop_end:
    72  	ADDQ stride+24(FP), R14
    73  	ADDQ stride+24(FP), R15
    74  	INCQ DX
    75  	JMP row_loop
    76  	
    77  row_loop_end:
    78  	
    79  	RET
    80  
    81  TEXT ·getCPUID1(SB),$0
    82  	MOVQ $1, AX
    83  	CPUID
    84  	MOVD DX, ret+0(FP)
    85  	MOVD CX, ret+4(FP)
    86  	RET
    87  
    88  TEXT ·getCPUID70(SB),$0
    89  	MOVQ $7, AX
    90  	MOVQ $0, CX
    91  	CPUID
    92  	MOVD BX, ret+0(FP)
    93  	MOVD CX, ret+4(FP)
    94  	RET