// Origin: github.com/gop9/olt@v0.0.0-20200202132135-d956aad50b08/framework/drawfillover_avx.s

//+build amd64,go1.10

#include "textflag.h"

// VPSHUFB control word: source byte indices 1, 5, 9 and 13, i.e. byte 1 of each
// of the four 32-bit lanes, gathered into the low four bytes of the result.
GLOBL drawFillOver_SIMD_shufflemap<>(SB), (NOPTR+RODATA), $4
DATA drawFillOver_SIMD_shufflemap<>+0x00(SB)/4, $0x0d090501

// drawFillOver_SIMD_internal rewrites 4-byte pixels in place. For every byte c
// of a pixel it stores bits 8..15 of (c*adivm + s), where s is the 32-bit source
// value at the same position (sr, sg, sb, sa in memory order). The arithmetic is
// done in 32-bit lanes and the result is truncated to one byte, not saturated.
//
// n rows are processed. Row r covers byte offsets [i0 + r*stride, i1 + r*stride)
// from base, advancing 4 bytes per pixel.
// NOTE(review): the loop only tests offset >= i1, so if (i1 - i0) is not a
// multiple of 4 the last pixel of a row extends past i1 — presumably callers
// guarantee alignment; confirm.
// NOTE(review): the register-source form of VBROADCASTSS is an AVX2 encoding;
// presumably callers gate this routine on a CPUID check — confirm.
TEXT ·drawFillOver_SIMD_internal(SB),0,$0-60
	// Arguments (60 bytes in total):
	//   base+0(FP)    address the byte offsets below are added to
	//   i0+8(FP)      byte offset of the first pixel of the first row
	//   i1+16(FP)     byte offset at which the first row stops (exclusive)
	//   stride+24(FP) amount added to both offsets after each row
	//   n+32(FP)      number of rows
	//   adivm+40(FP)  32-bit multiplier applied to every channel
	//   sr+44(FP)     \
	//   sg+48(FP)      | four consecutive 32-bit addends, loaded as one
	//   sb+52(FP)      | 16-byte vector starting at sr
	//   sa+56(FP)     /

	// Register usage:
	//   DX  row index
	//   CX  column byte offset
	//   AX  pointer to current pixel
	//   R14 start offset of the current row (initially i0)
	//   R15 end offset of the current row (initially i1)
	//   X0  all zeroes, used to widen bytes to 32-bit lanes
	//   X1  current pixel
	//   X2  adivm in every 32-bit lane
	//   X3  source pixel (sr, sg, sb, sa)
	//   X4  shuffle map that performs the >> 8 and packs the four channels
	//       back into a single 32-bit value

	MOVSS drawFillOver_SIMD_shufflemap<>(SB), X4

	PXOR X0, X0
	MOVQ i0+8(FP), R14
	MOVQ i1+16(FP), R15

	// Load adivm into X2 and replicate it into all four 32-bit lanes.
	MOVSS adivm+40(FP), X2
	VBROADCASTSS X2, X2

	// Load the source pixel (sr, sg, sb, sa) into X3.
	VMOVDQU sr+44(FP), X3

	MOVQ $0, DX
row_loop:
	CMPQ DX, n+32(FP)
	JGE row_loop_end

	// Start of this row: AX = base + R14.
	MOVQ R14, CX
	MOVQ base+0(FP), AX
	LEAQ (AX)(CX*1), AX
column_loop:
	CMPQ CX, R15
	JGE column_loop_end

	// Load the current pixel into X1 and unpack twice against zero so each
	// byte becomes a zero-extended 32-bit lane.
	MOVSS (AX), X1
	PUNPCKLBW X0, X1
	VPUNPCKLWD X0, X1, X1

	VPMULLD X2, X1, X1 // component * adivm
	VPADDD X3, X1, X1  // (component * adivm) + source component

	// Take byte 1 of every 32-bit lane (the >> 8) and pack the four bytes
	// into the low 32 bits of X1.
	VPSHUFB X4, X1, X1
	MOVSS X1, (AX) // write the pixel back to memory

	ADDQ $4, CX
	ADDQ $4, AX
	JMP column_loop

column_loop_end:
	// Move both row bounds down by one stride.
	ADDQ stride+24(FP), R14
	ADDQ stride+24(FP), R15
	INCQ DX
	JMP row_loop

row_loop_end:

	RET

// getCPUID1 executes CPUID with EAX=1 and stores EDX at ret+0(FP) and ECX at
// ret+4(FP).
// NOTE(review): the offsets imply two 4-byte results; the Go declaration is not
// visible here — confirm. MOVL writes exactly 4 bytes per slot. The Go amd64
// assembler is understood to treat MOVD as MOVQ (confirm), an 8-byte store that
// at ret+4 would spill 4 bytes past the slot.
TEXT ·getCPUID1(SB),$0
	MOVQ $1, AX
	CPUID
	MOVL DX, ret+0(FP)
	MOVL CX, ret+4(FP)
	RET

// getCPUID70 executes CPUID with EAX=7, ECX=0 and stores EBX at ret+0(FP) and
// ECX at ret+4(FP).
// NOTE(review): as with getCPUID1, two 4-byte results are assumed from the
// offsets — confirm against the Go declaration. MOVL keeps each store inside
// its own 4-byte slot.
TEXT ·getCPUID70(SB),$0
	MOVQ $7, AX
	MOVQ $0, CX
	CPUID
	MOVL BX, ret+0(FP)
	MOVL CX, ret+4(FP)
	RET