// Source: github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/internal/bytealg/count_amd64.s

//go:build !purego

#include "textflag.h"

// func Count(data []byte, value byte) int
//
// Count returns how many bytes of data are equal to value.
//
// Inputs shorter than 256 bytes, and any run where hasAVX2 is zero, are
// handled entirely by the byte-at-a-time loop at the bottom. Otherwise the
// largest prefix whose length is a multiple of the vector block size (256
// bytes when hasAVX512Count is non-zero, 64 bytes on the AVX2 path) is counted
// with SIMD compares, and whatever is left falls through to the scalar loop.
//
// Register roles:
//   AX  read cursor into data
//   BX  byte being searched for (low 8 bits)
//   CX  address one past the last byte of data
//   DX  len(data), then the address where the vectorised prefix ends
//   SI  running match count (the result)
TEXT ·Count(SB), NOSPLIT, $0-40
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), DX
	MOVB value+24(FP), BX
	XORQ SI, SI             // count = 0
	LEAQ (AX)(DX*1), CX     // end = base + len

	// The vector paths are only taken for at least 256 bytes of input. This
	// also guarantees one full block for either SIMD loop, both of which test
	// their exit condition at the bottom.
	CMPQ DX, $256
	JB tailCheck

	CMPB ·hasAVX2(SB), $0
	JE tailCheck

	// Independent partial counters, so the adds inside the vector loops do
	// not all chain through a single register.
	XORQ R15, R15
	XORQ R14, R14
	XORQ R13, R13
	XORQ R12, R12

	CMPB ·hasAVX512Count(SB), $0
	JE avx2Init

	// AVX-512 path: 4 x 64 bytes per iteration.
	ANDQ $~255, DX          // length rounded down to a multiple of 256
	ADDQ AX, DX             // address where the 256-byte blocks stop
	VPBROADCASTB BX, Z0     // value replicated into every byte lane
avx512Loop:
	VMOVDQU64 (AX), Z1
	VMOVDQU64 64(AX), Z2
	VMOVDQU64 128(AX), Z3
	VMOVDQU64 192(AX), Z4
	// Predicate 0 is "equal": each mask receives one bit per matching byte.
	VPCMPUB $0, Z0, Z1, K1
	VPCMPUB $0, Z0, Z2, K2
	VPCMPUB $0, Z0, Z3, K3
	VPCMPUB $0, Z0, Z4, K4
	KMOVQ K1, R8
	KMOVQ K2, R9
	KMOVQ K3, R10
	KMOVQ K4, R11
	// Number of set mask bits == number of matches in that 64-byte lane.
	POPCNTQ R8, R8
	POPCNTQ R9, R9
	POPCNTQ R10, R10
	POPCNTQ R11, R11
	ADDQ R8, R12
	ADDQ R9, R13
	ADDQ R10, R14
	ADDQ R11, R15
	ADDQ $256, AX
	CMPQ AX, DX
	JNE avx512Loop
	// Fold the four partial counters into the result.
	ADDQ R12, SI
	ADDQ R13, SI
	ADDQ R14, SI
	ADDQ R15, SI
	JMP simdDone

avx2Init:
	// AVX2 path: 2 x 32 bytes per iteration.
	ANDQ $~63, DX           // length rounded down to a multiple of 64
	ADDQ AX, DX             // address where the 64-byte blocks stop
	VPBROADCASTB value+24(FP), Y0
avx2Loop:
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	// Matching bytes become 0xFF, everything else 0x00.
	VPCMPEQB Y0, Y1, Y1
	VPCMPEQB Y0, Y2, Y2
	// Collect the top bit of every byte into a 32-bit mask, then count bits.
	VPMOVMSKB Y1, R12
	VPMOVMSKB Y2, R13
	POPCNTL R12, R12
	POPCNTL R13, R13
	ADDQ R12, R14
	ADDQ R13, R15
	ADDQ $64, AX
	CMPQ AX, DX
	JNE avx2Loop
	ADDQ R14, SI
	ADDQ R15, SI

simdDone:
	// Zero the upper parts of the vector registers used above before
	// continuing with scalar code.
	VZEROUPPER
	JMP tailCheck

	// Scalar tail: one byte per iteration, branch-free on the comparison.
tailLoop:
	MOVB (AX), R8
	LEAQ 1(SI), DI          // candidate count if this byte matches
	CMPB BX, R8
	CMOVQEQ DI, SI          // keep count+1 only on equality
	INCQ AX
tailCheck:
	CMPQ AX, CX
	JNE tailLoop

	MOVQ SI, ret+32(FP)
	RET