github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/bmi/_lib/bitmap_bmi2.s (about)

     1  	.text
     2  	.intel_syntax noprefix
     3  	.file	"bitmap_bmi2.c"
     4  	.globl	extract_bits                    # -- Begin function extract_bits
     5  	.p2align	4, 0x90
     6  	.type	extract_bits,@function
        # ---------------------------------------------------------------------
        # extract_bits: parallel bit extract (BMI2 PEXT) of one 64-bit value.
        # Compiler output (clang, from bitmap_bmi2.c); Intel syntax.
        # In:    rdi = source bits
        #        rsi = selection mask
        #        (rdi/rsi are the first two integer argument registers of the
        #        SysV AMD64 ABI)
        # Out:   rax = the bits of rdi at the positions where rsi has a 1,
        #        packed contiguously into the low-order bits; the remaining
        #        upper bits are zero
        # Clobb: rax, flags (rbp is saved and restored)
        # Stack: pushes rbp only; no locals, no calls (leaf function)
        # ---------------------------------------------------------------------
     7  extract_bits:                           # @extract_bits
     8  # %bb.0:
     9  	push	rbp
    10  	mov	rbp, rsp
        # Round rsp down to a multiple of 8. Nothing below touches the stack,
        # and rsp is reloaded from rbp before returning.
    11  	and	rsp, -8
        # rax = bits of rdi selected by mask rsi, compacted toward bit 0.
    12  	pext	rax, rdi, rsi
    13  	mov	rsp, rbp
    14  	pop	rbp
    15  	ret
    16  .Lfunc_end0:
    17  	.size	extract_bits, .Lfunc_end0-extract_bits
    18                                          # -- End function
    19  	.section	.rodata.cst32,"aM",@progbits,32
    20  	.p2align	5                               # -- Begin function levels_to_bitmap
        # Constant pool read by levels_to_bitmap.
        # .LCPI1_0 is 32-byte aligned (.p2align 5) and is fetched with an
        # aligned 256-bit load (vmovdqa). It holds the initial bit position of
        # each of the four 64-bit lanes: [0, 1, 2, 3].
    21  .LCPI1_0:
    22  	.quad	0                               # 0x0
    23  	.quad	1                               # 0x1
    24  	.quad	2                               # 0x2
    25  	.quad	3                               # 0x3
    26  	.section	.rodata.cst8,"aM",@progbits,8
    27  	.p2align	3
        # 8-byte scalars. levels_to_bitmap broadcasts each one to all four
        # 64-bit lanes of a ymm register with vpbroadcastq.
        # .LCPI1_1 / _2 / _3 (4, 8, 12): added to the lane bit positions to get
        # the positions for the 2nd, 3rd and 4th group of four elements in a
        # 16-element chunk.
    28  .LCPI1_1:
    29  	.quad	4                               # 0x4
    30  .LCPI1_2:
    31  	.quad	8                               # 0x8
    32  .LCPI1_3:
    33  	.quad	12                              # 0xc
        # .LCPI1_4 (1): AND mask that reduces each widened compare result to a
        # single 0/1 bit before it is shifted into place.
    34  .LCPI1_4:
    35  	.quad	1                               # 0x1
        # .LCPI1_5 (16): amount the lane bit positions advance per loop
        # iteration (one iteration consumes 16 elements).
    36  .LCPI1_5:
    37  	.quad	16                              # 0x10
    38  	.text
    39  	.globl	levels_to_bitmap
    40  	.p2align	4, 0x90
    41  	.type	levels_to_bitmap,@function
        # ---------------------------------------------------------------------
        # levels_to_bitmap: build a 64-bit mask from an array of 16-bit values,
        # setting bit i when element i is greater (signed) than a threshold.
        # Compiler output (clang, from bitmap_bmi2.c); Intel syntax; uses AVX2
        # vector instructions and BMI2 shlx.
        # In:    rdi = base address of the 16-bit elements (read at rdi + 2*i)
        #        esi = element count, signed 32-bit; a count <= 0 returns 0
        #        dx  = 16-bit threshold
        #        (rdi/rsi/rdx are the first three integer argument registers
        #        of the SysV AMD64 ABI)
        # Out:   rax = mask; bit i = (element[i] > dx), signed comparison
        # Clobb: rax, rcx, rsi, r8, ymm0-ymm12, flags; vzeroupper additionally
        #        clears the upper 128 bits of every ymm register
        # Stack: pushes rbp only; no locals, no calls (leaf function)
        # Note:  the result is one 64-bit register, so only 64 elements can
        #        map to distinct bits. For bit positions >= 64 the vector path
        #        (vpsllvq) contributes zero while the scalar path (shlx) uses
        #        the position modulo 64. Presumably callers pass count <= 64
        #        -- TODO confirm against the C source.
        # ---------------------------------------------------------------------
    42  levels_to_bitmap:                       # @levels_to_bitmap
    43  # %bb.0:
    44  	push	rbp
    45  	mov	rbp, rsp
        # Round rsp down to a multiple of 8. Nothing below touches the stack,
        # and rsp is reloaded from rbp before returning.
    46  	and	rsp, -8
        # count <= 0: nothing to scan, return an empty mask.
    47  	test	esi, esi
    48  	jle	.LBB1_1
    49  # %bb.2:
        # r8 = count, zero-extended to 64 bits (count is positive here).
    50  	mov	r8d, esi
        # 16 or more elements: take the vector path first.
    51  	cmp	esi, 15
    52  	ja	.LBB1_4
    53  # %bb.3:
        # Fewer than 16 elements: scalar loop only, starting at index 0 with
        # an empty mask.
    54  	xor	esi, esi
    55  	xor	eax, eax
    56  	jmp	.LBB1_7
    57  .LBB1_1:
    58  	xor	eax, eax
    59  	jmp	.LBB1_8
    60  .LBB1_4:
        # rsi = count rounded down to a multiple of 16, i.e. the number of
        # elements the vector loop consumes.
    61  	mov	esi, r8d
    62  	and	esi, -16
        # xmm1 = threshold (low 16 bits of edx) replicated into every 16-bit
        # lane.
    63  	vmovd	xmm0, edx
    64  	vpbroadcastw	xmm1, xmm0
        # ymm0 and ymm8-ymm10 (zeroed below) are four independent mask
        # accumulators, one per group of four elements.
    65  	vpxor	xmm0, xmm0, xmm0
        # ymm2 = per-lane bit positions for the first group, ymm12/ymm4/ymm5 =
        # position offsets for the other three groups, ymm6 = 0/1 mask,
        # ymm7 = per-iteration position step.
    66  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI1_0] # ymm2 = [0,1,2,3]
    67  	vpbroadcastq	ymm12, qword ptr [rip + .LCPI1_1] # ymm12 = [4,4,4,4]
    68  	vpbroadcastq	ymm4, qword ptr [rip + .LCPI1_2] # ymm4 = [8,8,8,8]
    69  	vpbroadcastq	ymm5, qword ptr [rip + .LCPI1_3] # ymm5 = [12,12,12,12]
    70  	vpbroadcastq	ymm6, qword ptr [rip + .LCPI1_4] # ymm6 = [1,1,1,1]
    71  	vpbroadcastq	ymm7, qword ptr [rip + .LCPI1_5] # ymm7 = [16,16,16,16]
        # rax = index of the first element of the current 16-element chunk.
    72  	xor	eax, eax
    73  	vpxor	xmm8, xmm8, xmm8
    74  	vpxor	xmm9, xmm9, xmm9
    75  	vpxor	xmm10, xmm10, xmm10
    76  	.p2align	4, 0x90
        # Vector loop. Invariant at the top of each iteration:
        #   rax  = index of the chunk's first element (multiple of 16)
        #   ymm2 = [rax, rax+1, rax+2, rax+3] (bit positions of group 0)
        # Each chunk is processed as four groups of four elements:
        #   load 4 words -> compare > threshold (all-ones / zero per word)
        #   -> zero-extend to 4 qwords -> keep bit 0 -> shift each lane left
        #   by its bit position -> OR into that group's accumulator.
    77  .LBB1_5:                                # =>This Inner Loop Header: Depth=1
        # Group 1: elements rax+4 .. rax+7, accumulated in ymm8.
    78  	vpaddq	ymm11, ymm12, ymm2
    79  	vmovq	xmm3, qword ptr [rdi + 2*rax + 8] # xmm3 = mem[0],zero
    80  	vpcmpgtw	xmm3, xmm3, xmm1
    81  	vpmovzxwq	ymm3, xmm3              # ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    82  	vpand	ymm3, ymm3, ymm6
    83  	vpsllvq	ymm3, ymm3, ymm11
        # ymm11 is recomputed here for group 2 before group 1 is merged.
    84  	vpaddq	ymm11, ymm2, ymm4
    85  	vpor	ymm8, ymm8, ymm3
        # Group 2: elements rax+8 .. rax+11, accumulated in ymm9.
    86  	vmovq	xmm3, qword ptr [rdi + 2*rax + 16] # xmm3 = mem[0],zero
    87  	vpcmpgtw	xmm3, xmm3, xmm1
    88  	vpmovzxwq	ymm3, xmm3              # ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    89  	vpand	ymm3, ymm3, ymm6
    90  	vpsllvq	ymm3, ymm3, ymm11
        # ymm11 is recomputed here for group 3 before group 2 is merged.
    91  	vpaddq	ymm11, ymm2, ymm5
    92  	vpor	ymm9, ymm9, ymm3
        # Group 3: elements rax+12 .. rax+15, accumulated in ymm10.
    93  	vmovq	xmm3, qword ptr [rdi + 2*rax + 24] # xmm3 = mem[0],zero
    94  	vpcmpgtw	xmm3, xmm3, xmm1
    95  	vpmovzxwq	ymm3, xmm3              # ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    96  	vpand	ymm3, ymm3, ymm6
    97  	vpsllvq	ymm3, ymm3, ymm11
    98  	vpor	ymm10, ymm10, ymm3
        # Group 0: elements rax .. rax+3, shifted by ymm2 directly and
        # accumulated in ymm0.
    99  	vmovq	xmm3, qword ptr [rdi + 2*rax]   # xmm3 = mem[0],zero
   100  	vpcmpgtw	xmm3, xmm3, xmm1
   101  	vpmovzxwq	ymm3, xmm3              # ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
   102  	vpand	ymm3, ymm3, ymm6
   103  	vpsllvq	ymm3, ymm3, ymm2
   104  	vpor	ymm0, ymm3, ymm0
        # Advance to the next chunk: index += 16, bit positions += 16.
   105  	add	rax, 16
   106  	vpaddq	ymm2, ymm2, ymm7
   107  	cmp	rsi, rax
   108  	jne	.LBB1_5
   109  # %bb.6:
        # Merge the four accumulators, then OR the four 64-bit lanes together
        # (256 -> 128 -> 64 bits) and move the combined mask into rax.
   110  	vpor	ymm0, ymm8, ymm0
   111  	vpor	ymm0, ymm9, ymm0
   112  	vpor	ymm0, ymm10, ymm0
   113  	vextracti128	xmm1, ymm0, 1
   114  	vpor	xmm0, xmm0, xmm1
   115  	vpshufd	xmm1, xmm0, 78                  # xmm1 = xmm0[2,3,0,1]
   116  	vpor	xmm0, xmm0, xmm1
   117  	vmovq	rax, xmm0
        # If count was an exact multiple of 16 there is no scalar remainder.
   118  	cmp	rsi, r8
   119  	je	.LBB1_8
   120  	.p2align	4, 0x90
        # Scalar loop. rsi = index of the next element, r8 = count,
        # rax = mask built so far. One element per iteration:
        # rcx = (element[rsi] > dx) ? 1 : 0, shifted left by rsi, ORed in.
   121  .LBB1_7:                                # =>This Inner Loop Header: Depth=1
   122  	xor	ecx, ecx
   123  	cmp	word ptr [rdi + 2*rsi], dx
   124  	setg	cl
   125  	shlx	rcx, rcx, rsi
   126  	or	rax, rcx
   127  	add	rsi, 1
   128  	cmp	r8, rsi
   129  	jne	.LBB1_7
        # Common exit for all paths; rax holds the result.
   130  .LBB1_8:
   131  	mov	rsp, rbp
   132  	pop	rbp
        # Executed on every path, including those that never wrote a ymm
        # register.
   133  	vzeroupper
   134  	ret
   135  .Lfunc_end1:
   136  	.size	levels_to_bitmap, .Lfunc_end1-levels_to_bitmap
   137                                          # -- End function
   138  	.ident	"Ubuntu clang version 11.1.0-++20210204121720+1fdec59bffc1-1~exp1~20210203232336.162"
   139  	.section	".note.GNU-stack","",@progbits
   140  	.addrsig