github.com/apache/arrow/go/v14@v14.0.2/internal/utils/_lib/min_max_avx2_amd64.s (about)

     1  	.text
     2  	.intel_syntax noprefix
     3  	.file	"min_max.c"
     4  	.section	.rodata.cst32,"aM",@progbits,32
     5  	.p2align	5                               # -- Begin function int8_max_min_avx2
     6  .LCPI0_0:
     7  	.zero	32,128	# 32 bytes of 0x80 (int8 -128): seed for the running-maximum vectors
     8  .LCPI0_1:
     9  	.zero	32,127	# 32 bytes of 0x7f (int8 127): seed for the running-minimum vectors
    10  	.section	.rodata.cst16,"aM",@progbits,16
    11  	.p2align	4
    12  .LCPI0_2:
    13  	.zero	16,127	# xor mask 0x7f: turns the signed-max search into an unsigned-min search
    14  .LCPI0_3:
    15  	.zero	16,128	# xor mask 0x80: maps signed byte order onto unsigned byte order
    16  	.text
    17  	.globl	int8_max_min_avx2
    18  	.p2align	4, 0x90
    19  	.type	int8_max_min_avx2,@function
        # int8_max_min_avx2 -- minimum and maximum of an array of signed 8-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (1 byte)
        #   rcx = address that receives the maximum (1 byte)
        # A count <= 0 stores min = 127 and max = -128 (the accumulator seeds).
        # Inputs of 64 or more elements are reduced 64 bytes at a time with AVX2;
        # the remaining (count mod 64) elements go through the scalar loop .LBB0_11.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm7 and flags; rbp/rsp are restored.
    20  int8_max_min_avx2:                      # @int8_max_min_avx2
    21  # %bb.0:
    22  	push	rbp
    23  	mov	rbp, rsp
    24  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
    25  	test	esi, esi
    26  	jle	.LBB0_1	# count <= 0: store the seed values and return
    27  # %bb.2:
    28  	mov	r9d, esi	# r9 = count, zero-extended to 64 bits
    29  	cmp	esi, 63
    30  	ja	.LBB0_4	# 64 or more elements: take the vector path
    31  # %bb.3:
    32  	mov	r8b, -128	# r8b = running maximum
    33  	mov	sil, 127	# sil = running minimum
    34  	xor	r10d, r10d	# r10 = index of the next element for the scalar loop
    35  	jmp	.LBB0_11
    36  .LBB0_1:
    37  	mov	sil, 127	# empty input: min result = 127
    38  	mov	r8b, -128	# empty input: max result = -128
    39  	jmp	.LBB0_12
    40  .LBB0_4:
    41  	mov	r10d, r9d
    42  	and	r10d, -64	# r10 = count rounded down to a multiple of 64
    43  	lea	rax, [r10 - 64]
    44  	mov	r8, rax
    45  	shr	r8, 6
    46  	add	r8, 1	# r8 = number of 64-byte chunks = r10 / 64
    47  	test	rax, rax
    48  	je	.LBB0_5	# exactly one chunk: skip the two-chunk loop
    49  # %bb.6:
    50  	mov	rsi, r8
    51  	and	rsi, -2
    52  	neg	rsi	# rsi = -(chunk count rounded down to even); counts up to 0 in steps of 2
    53  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
    54  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    55  	xor	eax, eax	# rax = byte offset into the input
    56  	vmovdqa	ymm2, ymm0	# ymm0/ymm2 = two independent min accumulators
    57  	vmovdqa	ymm3, ymm1	# ymm1/ymm3 = two independent max accumulators
    58  	.p2align	4, 0x90
    59  .LBB0_7:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes two 64-byte chunks (128 elements).
    60  	vmovdqu	ymm4, ymmword ptr [rdi + rax]
    61  	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
    62  	vmovdqu	ymm6, ymmword ptr [rdi + rax + 64]
    63  	vmovdqu	ymm7, ymmword ptr [rdi + rax + 96]
    64  	vpminsb	ymm0, ymm0, ymm4
    65  	vpminsb	ymm2, ymm2, ymm5
    66  	vpmaxsb	ymm1, ymm1, ymm4
    67  	vpmaxsb	ymm3, ymm3, ymm5
    68  	vpminsb	ymm0, ymm0, ymm6
    69  	vpminsb	ymm2, ymm2, ymm7
    70  	vpmaxsb	ymm1, ymm1, ymm6
    71  	vpmaxsb	ymm3, ymm3, ymm7
    72  	sub	rax, -128	# rax += 128
    73  	add	rsi, 2
    74  	jne	.LBB0_7
    75  # %bb.8:
    76  	test	r8b, 1
    77  	je	.LBB0_10	# even chunk count: nothing left for the single-chunk step
    78  .LBB0_9:
        # One remaining 64-byte chunk.
    79  	vmovdqu	ymm4, ymmword ptr [rdi + rax]
    80  	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
    81  	vpmaxsb	ymm3, ymm3, ymm5
    82  	vpmaxsb	ymm1, ymm1, ymm4
    83  	vpminsb	ymm2, ymm2, ymm5
    84  	vpminsb	ymm0, ymm0, ymm4
    85  .LBB0_10:
        # Horizontal reduction of the accumulators to one max (r8b) and one min (sil).
    86  	vpmaxsb	ymm1, ymm1, ymm3	# merge the two max accumulators
    87  	vextracti128	xmm3, ymm1, 1
    88  	vpmaxsb	xmm1, xmm1, xmm3	# fold the upper 128 bits into the lower 128 bits
    89  	vpxor	xmm1, xmm1, xmmword ptr [rip + .LCPI0_2]	# x ^ 0x7f: largest signed byte becomes smallest unsigned byte
    90  	vpminsb	ymm0, ymm0, ymm2	# merge the two min accumulators
    91  	vpsrlw	xmm2, xmm1, 8
    92  	vpminub	xmm1, xmm1, xmm2	# each word: low byte = min of its two bytes, high byte = 0
    93  	vphminposuw	xmm1, xmm1	# smallest word lands in bits 15:0
    94  	vmovd	r8d, xmm1
    95  	xor	r8b, 127	# undo the 0x7f transform: r8b = signed maximum
    96  	vextracti128	xmm1, ymm0, 1
    97  	vpminsb	xmm0, xmm0, xmm1
    98  	vpxor	xmm0, xmm0, xmmword ptr [rip + .LCPI0_3]	# x ^ 0x80: signed order becomes unsigned order
    99  	vpsrlw	xmm1, xmm0, 8
   100  	vpminub	xmm0, xmm0, xmm1
   101  	vphminposuw	xmm0, xmm0
   102  	vmovd	esi, xmm0
   103  	xor	sil, -128	# undo the 0x80 transform: sil = signed minimum
   104  	cmp	r10, r9
   105  	je	.LBB0_12	# count was a multiple of 64: no scalar tail
   106  	.p2align	4, 0x90
   107  .LBB0_11:                               # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r10 .. r9-1.
   108  	movzx	eax, byte ptr [rdi + r10]
   109  	cmp	sil, al
   110  	movzx	esi, sil	# movzx leaves the flags from the cmp untouched
   111  	cmovg	esi, eax	# min > element (signed): min = element
   112  	cmp	r8b, al
   113  	movzx	r8d, r8b
   114  	cmovl	r8d, eax	# max < element (signed): max = element
   115  	add	r10, 1
   116  	cmp	r9, r10
   117  	jne	.LBB0_11
   118  .LBB0_12:
   119  	mov	byte ptr [rcx], r8b	# store the maximum
   120  	mov	byte ptr [rdx], sil	# store the minimum
   121  	mov	rsp, rbp
   122  	pop	rbp
   123  	vzeroupper
   124  	ret
   125  .LBB0_5:
        # Single-chunk case (64 <= count < 128): seed the accumulators, then run .LBB0_9 once.
   126  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   127  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   128  	xor	eax, eax
   129  	vmovdqa	ymm2, ymm0
   130  	vmovdqa	ymm3, ymm1
   131  	test	r8b, 1	# r8 is 1 on this path, so the jne below is taken
   132  	jne	.LBB0_9
   133  	jmp	.LBB0_10
   134  .Lfunc_end0:
   135  	.size	int8_max_min_avx2, .Lfunc_end0-int8_max_min_avx2
   136                                          # -- End function
   137  	.globl	uint8_max_min_avx2              # -- Begin function uint8_max_min_avx2
   138  	.p2align	4, 0x90
   139  	.type	uint8_max_min_avx2,@function
        # uint8_max_min_avx2 -- minimum and maximum of an array of unsigned 8-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (1 byte)
        #   rcx = address that receives the maximum (1 byte)
        # A count <= 0 stores min = 0xff and max = 0 (the accumulator seeds).
        # Inputs of 64 or more elements are reduced 64 bytes at a time with AVX2;
        # the remaining (count mod 64) elements go through the scalar loop .LBB1_11.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm7 and flags; rbp/rsp are restored.
   140  uint8_max_min_avx2:                     # @uint8_max_min_avx2
   141  # %bb.0:
   142  	push	rbp
   143  	mov	rbp, rsp
   144  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   145  	test	esi, esi
   146  	jle	.LBB1_1	# count <= 0: store the seed values and return
   147  # %bb.2:
   148  	mov	r9d, esi	# r9 = count, zero-extended to 64 bits
   149  	cmp	esi, 63
   150  	ja	.LBB1_4	# 64 or more elements: take the vector path
   151  # %bb.3:
   152  	mov	sil, -1	# sil = running minimum, seeded with 0xff
   153  	xor	r10d, r10d	# r10 = index of the next element for the scalar loop
   154  	xor	eax, eax	# al = running maximum, seeded with 0
   155  	jmp	.LBB1_11
   156  .LBB1_1:
   157  	mov	sil, -1	# empty input: min result = 0xff
   158  	xor	eax, eax	# empty input: max result = 0
   159  	jmp	.LBB1_12
   160  .LBB1_4:
   161  	mov	r10d, r9d
   162  	and	r10d, -64	# r10 = count rounded down to a multiple of 64
   163  	lea	rax, [r10 - 64]
   164  	mov	r8, rax
   165  	shr	r8, 6
   166  	add	r8, 1	# r8 = number of 64-byte chunks = r10 / 64
   167  	test	rax, rax
   168  	je	.LBB1_5	# exactly one chunk: skip the two-chunk loop
   169  # %bb.6:
   170  	mov	rsi, r8
   171  	and	rsi, -2
   172  	neg	rsi	# rsi = -(chunk count rounded down to even); counts up to 0 in steps of 2
   173  	vpxor	xmm0, xmm0, xmm0	# ymm0 = 0: first max accumulator
   174  	vpcmpeqd	ymm1, ymm1, ymm1	# ymm1 = all ones: first min accumulator
   175  	xor	eax, eax	# rax = byte offset into the input
   176  	vpcmpeqd	ymm2, ymm2, ymm2	# ymm2 = all ones: second min accumulator
   177  	vpxor	xmm3, xmm3, xmm3	# ymm3 = 0: second max accumulator
   178  	.p2align	4, 0x90
   179  .LBB1_7:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes two 64-byte chunks (128 elements).
   180  	vmovdqu	ymm4, ymmword ptr [rdi + rax]
   181  	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
   182  	vmovdqu	ymm6, ymmword ptr [rdi + rax + 64]
   183  	vmovdqu	ymm7, ymmword ptr [rdi + rax + 96]
   184  	vpminub	ymm1, ymm1, ymm4
   185  	vpminub	ymm2, ymm2, ymm5
   186  	vpmaxub	ymm0, ymm0, ymm4
   187  	vpmaxub	ymm3, ymm3, ymm5
   188  	vpminub	ymm1, ymm1, ymm6
   189  	vpminub	ymm2, ymm2, ymm7
   190  	vpmaxub	ymm0, ymm0, ymm6
   191  	vpmaxub	ymm3, ymm3, ymm7
   192  	sub	rax, -128	# rax += 128
   193  	add	rsi, 2
   194  	jne	.LBB1_7
   195  # %bb.8:
   196  	test	r8b, 1
   197  	je	.LBB1_10	# even chunk count: nothing left for the single-chunk step
   198  .LBB1_9:
        # One remaining 64-byte chunk.
   199  	vmovdqu	ymm4, ymmword ptr [rdi + rax]
   200  	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
   201  	vpmaxub	ymm3, ymm3, ymm5
   202  	vpmaxub	ymm0, ymm0, ymm4
   203  	vpminub	ymm2, ymm2, ymm5
   204  	vpminub	ymm1, ymm1, ymm4
   205  .LBB1_10:
        # Horizontal reduction of the accumulators to one max (al) and one min (sil).
   206  	vpminub	ymm1, ymm1, ymm2	# merge the two min accumulators
   207  	vpmaxub	ymm0, ymm0, ymm3	# merge the two max accumulators
   208  	vextracti128	xmm2, ymm0, 1
   209  	vpmaxub	xmm0, xmm0, xmm2	# fold the upper 128 bits into the lower 128 bits
   210  	vpcmpeqd	xmm2, xmm2, xmm2
   211  	vpxor	xmm0, xmm0, xmm2	# bitwise NOT: the largest byte becomes the smallest
   212  	vpsrlw	xmm2, xmm0, 8
   213  	vpminub	xmm0, xmm0, xmm2	# each word: low byte = min of its two bytes, high byte = 0
   214  	vphminposuw	xmm0, xmm0	# smallest word lands in bits 15:0
   215  	vmovd	eax, xmm0
   216  	not	al	# undo the NOT: al = maximum
   217  	vextracti128	xmm0, ymm1, 1
   218  	vpminub	xmm0, xmm1, xmm0
   219  	vpsrlw	xmm1, xmm0, 8
   220  	vpminub	xmm0, xmm0, xmm1
   221  	vphminposuw	xmm0, xmm0
   222  	vmovd	esi, xmm0	# sil = minimum
   223  	cmp	r10, r9
   224  	je	.LBB1_12	# count was a multiple of 64: no scalar tail
   225  	.p2align	4, 0x90
   226  .LBB1_11:                               # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r10 .. r9-1.
   227  	movzx	r8d, byte ptr [rdi + r10]
   228  	cmp	sil, r8b
   229  	movzx	esi, sil	# movzx leaves the flags from the cmp untouched
   230  	cmovae	esi, r8d	# min >= element (unsigned): min = element
   231  	cmp	al, r8b
   232  	movzx	eax, al
   233  	cmovbe	eax, r8d	# max <= element (unsigned): max = element
   234  	add	r10, 1
   235  	cmp	r9, r10
   236  	jne	.LBB1_11
   237  .LBB1_12:
   238  	mov	byte ptr [rcx], al	# store the maximum
   239  	mov	byte ptr [rdx], sil	# store the minimum
   240  	mov	rsp, rbp
   241  	pop	rbp
   242  	vzeroupper
   243  	ret
   244  .LBB1_5:
        # Single-chunk case (64 <= count < 128): seed the accumulators, then run .LBB1_9 once.
   245  	vpxor	xmm0, xmm0, xmm0
   246  	vpcmpeqd	ymm1, ymm1, ymm1
   247  	xor	eax, eax
   248  	vpcmpeqd	ymm2, ymm2, ymm2
   249  	vpxor	xmm3, xmm3, xmm3
   250  	test	r8b, 1	# r8 is 1 on this path, so the jne below is taken
   251  	jne	.LBB1_9
   252  	jmp	.LBB1_10
   253  .Lfunc_end1:
   254  	.size	uint8_max_min_avx2, .Lfunc_end1-uint8_max_min_avx2
   255                                          # -- End function
   256  	.section	.rodata.cst32,"aM",@progbits,32
   257  	.p2align	5                               # -- Begin function int16_max_min_avx2
        # .LCPI2_0: sixteen words of 0x8000 (int16 -32768), seed for the running-maximum vectors.
   258  .LCPI2_0:
   259  	.short	32768                           # 0x8000
   260  	.short	32768                           # 0x8000
   261  	.short	32768                           # 0x8000
   262  	.short	32768                           # 0x8000
   263  	.short	32768                           # 0x8000
   264  	.short	32768                           # 0x8000
   265  	.short	32768                           # 0x8000
   266  	.short	32768                           # 0x8000
   267  	.short	32768                           # 0x8000
   268  	.short	32768                           # 0x8000
   269  	.short	32768                           # 0x8000
   270  	.short	32768                           # 0x8000
   271  	.short	32768                           # 0x8000
   272  	.short	32768                           # 0x8000
   273  	.short	32768                           # 0x8000
   274  	.short	32768                           # 0x8000
        # .LCPI2_1: sixteen words of 0x7fff (int16 32767), seed for the running-minimum vectors.
   275  .LCPI2_1:
   276  	.short	32767                           # 0x7fff
   277  	.short	32767                           # 0x7fff
   278  	.short	32767                           # 0x7fff
   279  	.short	32767                           # 0x7fff
   280  	.short	32767                           # 0x7fff
   281  	.short	32767                           # 0x7fff
   282  	.short	32767                           # 0x7fff
   283  	.short	32767                           # 0x7fff
   284  	.short	32767                           # 0x7fff
   285  	.short	32767                           # 0x7fff
   286  	.short	32767                           # 0x7fff
   287  	.short	32767                           # 0x7fff
   288  	.short	32767                           # 0x7fff
   289  	.short	32767                           # 0x7fff
   290  	.short	32767                           # 0x7fff
   291  	.short	32767                           # 0x7fff
   292  	.section	.rodata.cst16,"aM",@progbits,16
   293  	.p2align	4
        # .LCPI2_2: xor mask 0x7fff, turns the signed-max search into an unsigned-min search.
   294  .LCPI2_2:
   295  	.short	32767                           # 0x7fff
   296  	.short	32767                           # 0x7fff
   297  	.short	32767                           # 0x7fff
   298  	.short	32767                           # 0x7fff
   299  	.short	32767                           # 0x7fff
   300  	.short	32767                           # 0x7fff
   301  	.short	32767                           # 0x7fff
   302  	.short	32767                           # 0x7fff
        # .LCPI2_3: xor mask 0x8000, maps signed word order onto unsigned word order.
   303  .LCPI2_3:
   304  	.short	32768                           # 0x8000
   305  	.short	32768                           # 0x8000
   306  	.short	32768                           # 0x8000
   307  	.short	32768                           # 0x8000
   308  	.short	32768                           # 0x8000
   309  	.short	32768                           # 0x8000
   310  	.short	32768                           # 0x8000
   311  	.short	32768                           # 0x8000
   312  	.text
   313  	.globl	int16_max_min_avx2
   314  	.p2align	4, 0x90
   315  	.type	int16_max_min_avx2,@function
        # int16_max_min_avx2 -- minimum and maximum of an array of signed 16-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (2 bytes)
        #   rcx = address that receives the maximum (2 bytes)
        # A count <= 0 stores min = 32767 and max = -32768 (the accumulator seeds).
        # Inputs of 32 or more elements are reduced 32 elements (64 bytes) at a time
        # with AVX2; the remaining (count mod 32) elements go through the scalar
        # loop .LBB2_11.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm7 and flags; rbp/rsp are restored.
   316  int16_max_min_avx2:                     # @int16_max_min_avx2
   317  # %bb.0:
   318  	push	rbp
   319  	mov	rbp, rsp
   320  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   321  	test	esi, esi
   322  	jle	.LBB2_1	# count <= 0: store the seed values and return
   323  # %bb.2:
   324  	mov	r9d, esi	# r9 = count, zero-extended to 64 bits
   325  	cmp	esi, 31
   326  	ja	.LBB2_4	# 32 or more elements: take the vector path
   327  # %bb.3:
   328  	mov	r8w, -32768	# r8w = running maximum
   329  	mov	si, 32767	# si = running minimum
   330  	xor	r10d, r10d	# r10 = index of the next element for the scalar loop
   331  	jmp	.LBB2_11
   332  .LBB2_1:
   333  	mov	si, 32767	# empty input: min result = 32767
   334  	mov	r8w, -32768	# empty input: max result = -32768
   335  	jmp	.LBB2_12
   336  .LBB2_4:
   337  	mov	r10d, r9d
   338  	and	r10d, -32	# r10 = count rounded down to a multiple of 32
   339  	lea	rax, [r10 - 32]
   340  	mov	r8, rax
   341  	shr	r8, 5
   342  	add	r8, 1	# r8 = number of 32-element chunks = r10 / 32
   343  	test	rax, rax
   344  	je	.LBB2_5	# exactly one chunk: skip the two-chunk loop
   345  # %bb.6:
   346  	mov	rsi, r8
   347  	and	rsi, -2
   348  	neg	rsi	# rsi = -(chunk count rounded down to even); counts up to 0 in steps of 2
   349  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI2_0] # ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
   350  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI2_1] # ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
   351  	xor	eax, eax	# rax = element index into the input (scaled by 2 in the addresses)
   352  	vmovdqa	ymm2, ymm0	# ymm0/ymm2 = two independent min accumulators
   353  	vmovdqa	ymm3, ymm1	# ymm1/ymm3 = two independent max accumulators
   354  	.p2align	4, 0x90
   355  .LBB2_7:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes two 32-element chunks (128 bytes).
   356  	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
   357  	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
   358  	vmovdqu	ymm6, ymmword ptr [rdi + 2*rax + 64]
   359  	vmovdqu	ymm7, ymmword ptr [rdi + 2*rax + 96]
   360  	vpminsw	ymm0, ymm0, ymm4
   361  	vpminsw	ymm2, ymm2, ymm5
   362  	vpmaxsw	ymm1, ymm1, ymm4
   363  	vpmaxsw	ymm3, ymm3, ymm5
   364  	vpminsw	ymm0, ymm0, ymm6
   365  	vpminsw	ymm2, ymm2, ymm7
   366  	vpmaxsw	ymm1, ymm1, ymm6
   367  	vpmaxsw	ymm3, ymm3, ymm7
   368  	add	rax, 64	# advance by 64 elements
   369  	add	rsi, 2
   370  	jne	.LBB2_7
   371  # %bb.8:
   372  	test	r8b, 1
   373  	je	.LBB2_10	# even chunk count: nothing left for the single-chunk step
   374  .LBB2_9:
        # One remaining 32-element chunk.
   375  	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
   376  	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
   377  	vpmaxsw	ymm3, ymm3, ymm5
   378  	vpmaxsw	ymm1, ymm1, ymm4
   379  	vpminsw	ymm2, ymm2, ymm5
   380  	vpminsw	ymm0, ymm0, ymm4
   381  .LBB2_10:
        # Horizontal reduction of the accumulators to one max (r8w) and one min (si).
   382  	vpmaxsw	ymm1, ymm1, ymm3	# merge the two max accumulators
   383  	vextracti128	xmm3, ymm1, 1
   384  	vpmaxsw	xmm1, xmm1, xmm3	# fold the upper 128 bits into the lower 128 bits
   385  	vpxor	xmm1, xmm1, xmmword ptr [rip + .LCPI2_2]	# x ^ 0x7fff: largest signed word becomes smallest unsigned word
   386  	vpminsw	ymm0, ymm0, ymm2	# merge the two min accumulators
   387  	vphminposuw	xmm1, xmm1	# smallest word lands in bits 15:0
   388  	vmovd	r8d, xmm1
   389  	xor	r8d, 32767	# undo the 0x7fff transform; only r8w is used afterwards
   390  	vextracti128	xmm1, ymm0, 1
   391  	vpminsw	xmm0, xmm0, xmm1
   392  	vpxor	xmm0, xmm0, xmmword ptr [rip + .LCPI2_3]	# x ^ 0x8000: signed order becomes unsigned order
   393  	vphminposuw	xmm0, xmm0
   394  	vmovd	esi, xmm0
   395  	xor	esi, 32768	# undo the 0x8000 transform; only si is used afterwards
   396  	cmp	r10, r9
   397  	je	.LBB2_12	# count was a multiple of 32: no scalar tail
   398  	.p2align	4, 0x90
   399  .LBB2_11:                               # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r10 .. r9-1.
   400  	movzx	eax, word ptr [rdi + 2*r10]
   401  	cmp	si, ax
   402  	cmovg	esi, eax	# min > element (signed): min = element
   403  	cmp	r8w, ax
   404  	cmovl	r8d, eax	# max < element (signed): max = element
   405  	add	r10, 1
   406  	cmp	r9, r10
   407  	jne	.LBB2_11
   408  .LBB2_12:
   409  	mov	word ptr [rcx], r8w	# store the maximum
   410  	mov	word ptr [rdx], si	# store the minimum
   411  	mov	rsp, rbp
   412  	pop	rbp
   413  	vzeroupper
   414  	ret
   415  .LBB2_5:
        # Single-chunk case (32 <= count < 64): seed the accumulators, then run .LBB2_9 once.
   416  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI2_0] # ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
   417  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI2_1] # ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
   418  	xor	eax, eax
   419  	vmovdqa	ymm2, ymm0
   420  	vmovdqa	ymm3, ymm1
   421  	test	r8b, 1	# r8 is 1 on this path, so the jne below is taken
   422  	jne	.LBB2_9
   423  	jmp	.LBB2_10
   424  .Lfunc_end2:
   425  	.size	int16_max_min_avx2, .Lfunc_end2-int16_max_min_avx2
   426                                          # -- End function
   427  	.globl	uint16_max_min_avx2             # -- Begin function uint16_max_min_avx2
   428  	.p2align	4, 0x90
   429  	.type	uint16_max_min_avx2,@function
        # uint16_max_min_avx2 -- minimum and maximum of an array of unsigned 16-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (2 bytes)
        #   rcx = address that receives the maximum (2 bytes)
        # A count <= 0 stores min = 0xffff and max = 0 (the accumulator seeds).
        # Inputs of 32 or more elements are reduced 32 elements (64 bytes) at a time
        # with AVX2; the remaining (count mod 32) elements go through the scalar
        # loop .LBB3_11.
        # Note that, unlike the 8-bit routines, the minimum lives in r8w and the
        # maximum in si here.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm7 and flags; rbp/rsp are restored.
   430  uint16_max_min_avx2:                    # @uint16_max_min_avx2
   431  # %bb.0:
   432  	push	rbp
   433  	mov	rbp, rsp
   434  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   435  	test	esi, esi
   436  	jle	.LBB3_1	# count <= 0: store the seed values and return
   437  # %bb.2:
   438  	mov	r9d, esi	# r9 = count, zero-extended to 64 bits
   439  	cmp	esi, 31
   440  	ja	.LBB3_4	# 32 or more elements: take the vector path
   441  # %bb.3:
   442  	mov	r8w, -1	# r8w = running minimum, seeded with 0xffff
   443  	xor	r10d, r10d	# r10 = index of the next element for the scalar loop
   444  	xor	esi, esi	# si = running maximum, seeded with 0
   445  	jmp	.LBB3_11
   446  .LBB3_1:
   447  	mov	r8w, -1	# empty input: min result = 0xffff
   448  	xor	esi, esi	# empty input: max result = 0
   449  	jmp	.LBB3_12
   450  .LBB3_4:
   451  	mov	r10d, r9d
   452  	and	r10d, -32	# r10 = count rounded down to a multiple of 32
   453  	lea	rax, [r10 - 32]
   454  	mov	r8, rax
   455  	shr	r8, 5
   456  	add	r8, 1	# r8 = number of 32-element chunks = r10 / 32
   457  	test	rax, rax
   458  	je	.LBB3_5	# exactly one chunk: skip the two-chunk loop
   459  # %bb.6:
   460  	mov	rsi, r8
   461  	and	rsi, -2
   462  	neg	rsi	# rsi = -(chunk count rounded down to even); counts up to 0 in steps of 2
   463  	vpxor	xmm0, xmm0, xmm0	# ymm0 = 0: first max accumulator
   464  	vpcmpeqd	ymm1, ymm1, ymm1	# ymm1 = all ones: first min accumulator
   465  	xor	eax, eax	# rax = element index into the input (scaled by 2 in the addresses)
   466  	vpcmpeqd	ymm2, ymm2, ymm2	# ymm2 = all ones: second min accumulator
   467  	vpxor	xmm3, xmm3, xmm3	# ymm3 = 0: second max accumulator
   468  	.p2align	4, 0x90
   469  .LBB3_7:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes two 32-element chunks (128 bytes).
   470  	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
   471  	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
   472  	vmovdqu	ymm6, ymmword ptr [rdi + 2*rax + 64]
   473  	vmovdqu	ymm7, ymmword ptr [rdi + 2*rax + 96]
   474  	vpminuw	ymm1, ymm1, ymm4
   475  	vpminuw	ymm2, ymm2, ymm5
   476  	vpmaxuw	ymm0, ymm0, ymm4
   477  	vpmaxuw	ymm3, ymm3, ymm5
   478  	vpminuw	ymm1, ymm1, ymm6
   479  	vpminuw	ymm2, ymm2, ymm7
   480  	vpmaxuw	ymm0, ymm0, ymm6
   481  	vpmaxuw	ymm3, ymm3, ymm7
   482  	add	rax, 64	# advance by 64 elements
   483  	add	rsi, 2
   484  	jne	.LBB3_7
   485  # %bb.8:
   486  	test	r8b, 1
   487  	je	.LBB3_10	# even chunk count: nothing left for the single-chunk step
   488  .LBB3_9:
        # One remaining 32-element chunk.
   489  	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
   490  	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
   491  	vpmaxuw	ymm3, ymm3, ymm5
   492  	vpmaxuw	ymm0, ymm0, ymm4
   493  	vpminuw	ymm2, ymm2, ymm5
   494  	vpminuw	ymm1, ymm1, ymm4
   495  .LBB3_10:
        # Horizontal reduction of the accumulators to one max (si) and one min (r8w).
   496  	vpminuw	ymm1, ymm1, ymm2	# merge the two min accumulators
   497  	vpmaxuw	ymm0, ymm0, ymm3	# merge the two max accumulators
   498  	vextracti128	xmm2, ymm0, 1
   499  	vpmaxuw	xmm0, xmm0, xmm2	# fold the upper 128 bits into the lower 128 bits
   500  	vpcmpeqd	xmm2, xmm2, xmm2
   501  	vpxor	xmm0, xmm0, xmm2	# bitwise NOT: the largest word becomes the smallest
   502  	vphminposuw	xmm0, xmm0	# smallest word lands in bits 15:0
   503  	vmovd	esi, xmm0
   504  	not	esi	# undo the NOT; only si is used afterwards
   505  	vextracti128	xmm0, ymm1, 1
   506  	vpminuw	xmm0, xmm1, xmm0
   507  	vphminposuw	xmm0, xmm0
   508  	vmovd	r8d, xmm0	# r8w = minimum; only the low word is used afterwards
   509  	cmp	r10, r9
   510  	je	.LBB3_12	# count was a multiple of 32: no scalar tail
   511  	.p2align	4, 0x90
   512  .LBB3_11:                               # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r10 .. r9-1.
   513  	movzx	eax, word ptr [rdi + 2*r10]
   514  	cmp	r8w, ax
   515  	cmovae	r8d, eax	# min >= element (unsigned): min = element
   516  	cmp	si, ax
   517  	cmovbe	esi, eax	# max <= element (unsigned): max = element
   518  	add	r10, 1
   519  	cmp	r9, r10
   520  	jne	.LBB3_11
   521  .LBB3_12:
   522  	mov	word ptr [rcx], si	# store the maximum
   523  	mov	word ptr [rdx], r8w	# store the minimum
   524  	mov	rsp, rbp
   525  	pop	rbp
   526  	vzeroupper
   527  	ret
   528  .LBB3_5:
        # Single-chunk case (32 <= count < 64): seed the accumulators, then run .LBB3_9 once.
   529  	vpxor	xmm0, xmm0, xmm0
   530  	vpcmpeqd	ymm1, ymm1, ymm1
   531  	xor	eax, eax
   532  	vpcmpeqd	ymm2, ymm2, ymm2
   533  	vpxor	xmm3, xmm3, xmm3
   534  	test	r8b, 1	# r8 is 1 on this path, so the jne below is taken
   535  	jne	.LBB3_9
   536  	jmp	.LBB3_10
   537  .Lfunc_end3:
   538  	.size	uint16_max_min_avx2, .Lfunc_end3-uint16_max_min_avx2
   539                                          # -- End function
   540  	.section	.rodata.cst4,"aM",@progbits,4
   541  	.p2align	2                               # -- Begin function int32_max_min_avx2
   542  .LCPI4_0:
   543  	.long	2147483648                      # 0x80000000
   544  .LCPI4_1:
   545  	.long	2147483647                      # 0x7fffffff
   546  	.text
   547  	.globl	int32_max_min_avx2
   548  	.p2align	4, 0x90
   549  	.type	int32_max_min_avx2,@function
        # int32_max_min_avx2 -- minimum and maximum of an array of signed 32-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (4 bytes)
        #   rcx = address that receives the maximum (4 bytes)
        # A count <= 0 stores min = 0x7fffffff and max = 0x80000000 (the seeds).
        # Inputs of 32 or more elements are reduced 32 elements (128 bytes) per
        # vector-loop iteration; the remaining (count mod 32) elements go through
        # the scalar loop .LBB4_7.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm11 and flags; rbp/rsp are restored.
   550  int32_max_min_avx2:                     # @int32_max_min_avx2
   551  # %bb.0:
   552  	push	rbp
   553  	mov	rbp, rsp
   554  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   555  	test	esi, esi
   556  	jle	.LBB4_1	# count <= 0: store the seed values and return
   557  # %bb.2:
   558  	mov	r8d, esi	# r8 = count, zero-extended to 64 bits
   559  	cmp	esi, 31
   560  	ja	.LBB4_4	# 32 or more elements: take the vector path
   561  # %bb.3:
   562  	mov	r10d, -2147483648	# r10d = running maximum
   563  	mov	eax, 2147483647	# eax = running minimum
   564  	xor	r9d, r9d	# r9 = index of the next element for the scalar loop
   565  	jmp	.LBB4_7
   566  .LBB4_1:
   567  	mov	eax, 2147483647	# empty input: min result
   568  	mov	esi, -2147483648	# empty input: max result
   569  	jmp	.LBB4_8
   570  .LBB4_4:
   571  	mov	r9d, r8d
   572  	vpbroadcastd	ymm4, dword ptr [rip + .LCPI4_0] # ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
   573  	and	r9d, -32	# r9 = count rounded down to a multiple of 32
   574  	vpbroadcastd	ymm0, dword ptr [rip + .LCPI4_1] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
   575  	xor	eax, eax	# rax = element index into the input (scaled by 4 in the addresses)
   576  	vmovdqa	ymm1, ymm0	# ymm0-ymm3 = four independent min accumulators
   577  	vmovdqa	ymm2, ymm0
   578  	vmovdqa	ymm3, ymm0
   579  	vmovdqa	ymm5, ymm4	# ymm4-ymm7 = four independent max accumulators
   580  	vmovdqa	ymm6, ymm4
   581  	vmovdqa	ymm7, ymm4
   582  	.p2align	4, 0x90
   583  .LBB4_5:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes 32 elements (four 32-byte vectors).
   584  	vmovdqu	ymm8, ymmword ptr [rdi + 4*rax]
   585  	vmovdqu	ymm9, ymmword ptr [rdi + 4*rax + 32]
   586  	vmovdqu	ymm10, ymmword ptr [rdi + 4*rax + 64]
   587  	vmovdqu	ymm11, ymmword ptr [rdi + 4*rax + 96]
   588  	vpminsd	ymm0, ymm0, ymm8
   589  	vpminsd	ymm1, ymm1, ymm9
   590  	vpminsd	ymm2, ymm2, ymm10
   591  	vpminsd	ymm3, ymm3, ymm11
   592  	vpmaxsd	ymm4, ymm4, ymm8
   593  	vpmaxsd	ymm5, ymm5, ymm9
   594  	vpmaxsd	ymm6, ymm6, ymm10
   595  	vpmaxsd	ymm7, ymm7, ymm11
   596  	add	rax, 32
   597  	cmp	r9, rax
   598  	jne	.LBB4_5
   599  # %bb.6:
        # Horizontal reduction: merge the four accumulators, then fold 256 -> 128 -> 64 -> 32 bits.
   600  	vpmaxsd	ymm4, ymm4, ymm5
   601  	vpmaxsd	ymm4, ymm4, ymm6
   602  	vpmaxsd	ymm4, ymm4, ymm7
   603  	vextracti128	xmm5, ymm4, 1
   604  	vpmaxsd	xmm4, xmm4, xmm5
   605  	vpshufd	xmm5, xmm4, 78                  # xmm5 = xmm4[2,3,0,1]
   606  	vpmaxsd	xmm4, xmm4, xmm5
   607  	vpshufd	xmm5, xmm4, 229                 # xmm5 = xmm4[1,1,2,3]
   608  	vpmaxsd	xmm4, xmm4, xmm5
   609  	vmovd	r10d, xmm4	# r10d = maximum of the vectorised prefix
   610  	vpminsd	ymm0, ymm0, ymm1
   611  	vpminsd	ymm0, ymm0, ymm2
   612  	vpminsd	ymm0, ymm0, ymm3
   613  	vextracti128	xmm1, ymm0, 1
   614  	vpminsd	xmm0, xmm0, xmm1
   615  	vpshufd	xmm1, xmm0, 78                  # xmm1 = xmm0[2,3,0,1]
   616  	vpminsd	xmm0, xmm0, xmm1
   617  	vpshufd	xmm1, xmm0, 229                 # xmm1 = xmm0[1,1,2,3]
   618  	vpminsd	xmm0, xmm0, xmm1
   619  	vmovd	eax, xmm0	# eax = minimum of the vectorised prefix
   620  	mov	esi, r10d	# esi carries the maximum to the store at .LBB4_8
   621  	cmp	r9, r8
   622  	je	.LBB4_8	# count was a multiple of 32: no scalar tail
   623  	.p2align	4, 0x90
   624  .LBB4_7:                                # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r9 .. r8-1; on exit esi holds the maximum.
   625  	mov	esi, dword ptr [rdi + 4*r9]
   626  	cmp	eax, esi
   627  	cmovg	eax, esi	# min > element (signed): min = element
   628  	cmp	r10d, esi
   629  	cmovge	esi, r10d	# esi = larger of the previous maximum and the element
   630  	add	r9, 1
   631  	mov	r10d, esi	# r10d = updated maximum
   632  	cmp	r8, r9
   633  	jne	.LBB4_7
   634  .LBB4_8:
   635  	mov	dword ptr [rcx], esi	# store the maximum
   636  	mov	dword ptr [rdx], eax	# store the minimum
   637  	mov	rsp, rbp
   638  	pop	rbp
   639  	vzeroupper
   640  	ret
   641  .Lfunc_end4:
   642  	.size	int32_max_min_avx2, .Lfunc_end4-int32_max_min_avx2
   643                                          # -- End function
   644  	.globl	uint32_max_min_avx2             # -- Begin function uint32_max_min_avx2
   645  	.p2align	4, 0x90
   646  	.type	uint32_max_min_avx2,@function
        # uint32_max_min_avx2 -- minimum and maximum of an array of unsigned 32-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (4 bytes)
        #   rcx = address that receives the maximum (4 bytes)
        # A count <= 0 stores min = 0xffffffff and max = 0 (the seeds).
        # Inputs of 32 or more elements are reduced 32 elements (128 bytes) per
        # vector-loop iteration; the remaining (count mod 32) elements go through
        # the scalar loop .LBB5_7.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm11 and flags; rbp/rsp are restored.
   647  uint32_max_min_avx2:                    # @uint32_max_min_avx2
   648  # %bb.0:
   649  	push	rbp
   650  	mov	rbp, rsp
   651  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   652  	test	esi, esi
   653  	jle	.LBB5_1	# count <= 0: store the seed values and return
   654  # %bb.2:
   655  	mov	r8d, esi	# r8 = count, zero-extended to 64 bits
   656  	cmp	esi, 31
   657  	ja	.LBB5_4	# 32 or more elements: take the vector path
   658  # %bb.3:
   659  	xor	r9d, r9d	# r9 = index of the next element for the scalar loop
   660  	mov	eax, -1	# eax = running minimum, seeded with 0xffffffff
   661  	xor	r10d, r10d	# r10d = running maximum, seeded with 0
   662  	jmp	.LBB5_7
   663  .LBB5_1:
   664  	mov	eax, -1	# empty input: min result = 0xffffffff
   665  	xor	esi, esi	# empty input: max result = 0
   666  	jmp	.LBB5_8
   667  .LBB5_4:
   668  	mov	r9d, r8d
   669  	and	r9d, -32	# r9 = count rounded down to a multiple of 32
   670  	vpxor	xmm4, xmm4, xmm4	# ymm4-ymm7 = 0: four independent max accumulators
   671  	vpcmpeqd	ymm0, ymm0, ymm0	# ymm0-ymm3 = all ones: four independent min accumulators
   672  	xor	eax, eax	# rax = element index into the input (scaled by 4 in the addresses)
   673  	vpcmpeqd	ymm1, ymm1, ymm1
   674  	vpcmpeqd	ymm2, ymm2, ymm2
   675  	vpcmpeqd	ymm3, ymm3, ymm3
   676  	vpxor	xmm5, xmm5, xmm5
   677  	vpxor	xmm6, xmm6, xmm6
   678  	vpxor	xmm7, xmm7, xmm7
   679  	.p2align	4, 0x90
   680  .LBB5_5:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes 32 elements (four 32-byte vectors).
   681  	vmovdqu	ymm8, ymmword ptr [rdi + 4*rax]
   682  	vmovdqu	ymm9, ymmword ptr [rdi + 4*rax + 32]
   683  	vmovdqu	ymm10, ymmword ptr [rdi + 4*rax + 64]
   684  	vmovdqu	ymm11, ymmword ptr [rdi + 4*rax + 96]
   685  	vpminud	ymm0, ymm0, ymm8
   686  	vpminud	ymm1, ymm1, ymm9
   687  	vpminud	ymm2, ymm2, ymm10
   688  	vpminud	ymm3, ymm3, ymm11
   689  	vpmaxud	ymm4, ymm4, ymm8
   690  	vpmaxud	ymm5, ymm5, ymm9
   691  	vpmaxud	ymm6, ymm6, ymm10
   692  	vpmaxud	ymm7, ymm7, ymm11
   693  	add	rax, 32
   694  	cmp	r9, rax
   695  	jne	.LBB5_5
   696  # %bb.6:
        # Horizontal reduction: merge the four accumulators, then fold 256 -> 128 -> 64 -> 32 bits.
   697  	vpmaxud	ymm4, ymm4, ymm5
   698  	vpmaxud	ymm4, ymm4, ymm6
   699  	vpmaxud	ymm4, ymm4, ymm7
   700  	vextracti128	xmm5, ymm4, 1
   701  	vpmaxud	xmm4, xmm4, xmm5
   702  	vpshufd	xmm5, xmm4, 78                  # xmm5 = xmm4[2,3,0,1]
   703  	vpmaxud	xmm4, xmm4, xmm5
   704  	vpshufd	xmm5, xmm4, 229                 # xmm5 = xmm4[1,1,2,3]
   705  	vpmaxud	xmm4, xmm4, xmm5
   706  	vmovd	r10d, xmm4	# r10d = maximum of the vectorised prefix
   707  	vpminud	ymm0, ymm0, ymm1
   708  	vpminud	ymm0, ymm0, ymm2
   709  	vpminud	ymm0, ymm0, ymm3
   710  	vextracti128	xmm1, ymm0, 1
   711  	vpminud	xmm0, xmm0, xmm1
   712  	vpshufd	xmm1, xmm0, 78                  # xmm1 = xmm0[2,3,0,1]
   713  	vpminud	xmm0, xmm0, xmm1
   714  	vpshufd	xmm1, xmm0, 229                 # xmm1 = xmm0[1,1,2,3]
   715  	vpminud	xmm0, xmm0, xmm1
   716  	vmovd	eax, xmm0	# eax = minimum of the vectorised prefix
   717  	mov	esi, r10d	# esi carries the maximum to the store at .LBB5_8
   718  	cmp	r9, r8
   719  	je	.LBB5_8	# count was a multiple of 32: no scalar tail
   720  	.p2align	4, 0x90
   721  .LBB5_7:                                # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r9 .. r8-1; on exit esi holds the maximum.
   722  	mov	esi, dword ptr [rdi + 4*r9]
   723  	cmp	eax, esi
   724  	cmovae	eax, esi	# min >= element (unsigned): min = element
   725  	cmp	r10d, esi
   726  	cmova	esi, r10d	# esi = larger of the previous maximum and the element
   727  	add	r9, 1
   728  	mov	r10d, esi	# r10d = updated maximum
   729  	cmp	r8, r9
   730  	jne	.LBB5_7
   731  .LBB5_8:
   732  	mov	dword ptr [rcx], esi	# store the maximum
   733  	mov	dword ptr [rdx], eax	# store the minimum
   734  	mov	rsp, rbp
   735  	pop	rbp
   736  	vzeroupper
   737  	ret
   738  .Lfunc_end5:
   739  	.size	uint32_max_min_avx2, .Lfunc_end5-uint32_max_min_avx2
   740                                          # -- End function
   741  	.section	.rodata.cst8,"aM",@progbits,8
   742  	.p2align	3                               # -- Begin function int64_max_min_avx2
   743  .LCPI6_0:
   744  	.quad	-9223372036854775808            # 0x8000000000000000
   745  .LCPI6_1:
   746  	.quad	9223372036854775807             # 0x7fffffffffffffff
   747  	.text
   748  	.globl	int64_max_min_avx2
   749  	.p2align	4, 0x90
   750  	.type	int64_max_min_avx2,@function
        # int64_max_min_avx2 -- minimum and maximum of an array of signed 64-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (8 bytes)
        #   rcx = address that receives the maximum (8 bytes)
        # A count <= 0 stores min = 0x7fffffffffffffff and max = 0x8000000000000000.
        # Inputs of 16 or more elements are reduced 16 elements (128 bytes) per
        # vector-loop iteration; the remaining (count mod 16) elements go through
        # the scalar loop .LBB6_7.
        # The code uses no packed 64-bit min/max instruction; every min/max step is a
        # vpcmpgtq compare followed by a vblendvpd select.  In this operand order
        # vblendvpd keeps the accumulator lane where the mask's sign bit is set and
        # takes the other operand elsewhere.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm12 and flags; rbp/rsp are restored.
   751  int64_max_min_avx2:                     # @int64_max_min_avx2
   752  # %bb.0:
   753  	push	rbp
   754  	mov	rbp, rsp
   755  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   756  	movabs	rax, 9223372036854775807	# rax = running minimum, seeded with the largest int64
   757  	test	esi, esi
   758  	jle	.LBB6_1	# count <= 0: store the seed values and return
   759  # %bb.2:
   760  	mov	r8d, esi	# r8 = count, zero-extended to 64 bits
   761  	cmp	esi, 15
   762  	ja	.LBB6_4	# 16 or more elements: take the vector path
   763  # %bb.3:
   764  	lea	r10, [rax + 1]	# r10 = running maximum; 0x7fff...ffff + 1 wraps to the smallest int64
   765  	xor	r9d, r9d	# r9 = index of the next element for the scalar loop
   766  	jmp	.LBB6_7
   767  .LBB6_1:
   768  	lea	rsi, [rax + 1]	# empty input: max result = smallest int64 (min result stays in rax)
   769  	jmp	.LBB6_8
   770  .LBB6_4:
   771  	mov	r9d, r8d
   772  	vpbroadcastq	ymm4, qword ptr [rip + .LCPI6_0] # ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
   773  	and	r9d, -16	# r9 = count rounded down to a multiple of 16
   774  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI6_1] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
   775  	xor	eax, eax	# rax = element index into the input (scaled by 8 in the addresses)
   776  	vmovdqa	ymm3, ymm0	# ymm0, ymm3, ymm2, ymm1 = min accumulators for vectors 0..3
   777  	vmovdqa	ymm2, ymm0
   778  	vmovdqa	ymm1, ymm0
   779  	vmovdqa	ymm7, ymm4	# ymm4, ymm7, ymm6, ymm5 = max accumulators for vectors 0..3
   780  	vmovdqa	ymm6, ymm4
   781  	vmovdqa	ymm5, ymm4
   782  	.p2align	4, 0x90
   783  .LBB6_5:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes 16 elements (four 32-byte vectors).
   784  	vmovdqu	ymm8, ymmword ptr [rdi + 8*rax]
   785  	vpcmpgtq	ymm9, ymm8, ymm0	# mask = element > current min
   786  	vblendvpd	ymm0, ymm8, ymm0, ymm9	# min = mask ? min : element
   787  	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax + 32]
   788  	vpcmpgtq	ymm10, ymm9, ymm3
   789  	vblendvpd	ymm3, ymm9, ymm3, ymm10
   790  	vmovdqu	ymm10, ymmword ptr [rdi + 8*rax + 64]
   791  	vpcmpgtq	ymm11, ymm10, ymm2
   792  	vblendvpd	ymm2, ymm10, ymm2, ymm11
   793  	vmovdqu	ymm11, ymmword ptr [rdi + 8*rax + 96]
   794  	vpcmpgtq	ymm12, ymm11, ymm1
   795  	vblendvpd	ymm1, ymm11, ymm1, ymm12
   796  	vpcmpgtq	ymm12, ymm4, ymm8	# mask = current max > element
   797  	vblendvpd	ymm4, ymm8, ymm4, ymm12	# max = mask ? max : element
   798  	vpcmpgtq	ymm8, ymm7, ymm9
   799  	vblendvpd	ymm7, ymm9, ymm7, ymm8
   800  	vpcmpgtq	ymm8, ymm6, ymm10
   801  	vblendvpd	ymm6, ymm10, ymm6, ymm8
   802  	vpcmpgtq	ymm8, ymm5, ymm11
   803  	vblendvpd	ymm5, ymm11, ymm5, ymm8
   804  	add	rax, 16
   805  	cmp	r9, rax
   806  	jne	.LBB6_5
   807  # %bb.6:
        # Horizontal reduction of the max accumulators into r10.
   808  	vpcmpgtq	ymm8, ymm4, ymm7
   809  	vblendvpd	ymm4, ymm7, ymm4, ymm8
   810  	vpcmpgtq	ymm7, ymm4, ymm6
   811  	vblendvpd	ymm4, ymm6, ymm4, ymm7
   812  	vpcmpgtq	ymm6, ymm4, ymm5
   813  	vblendvpd	ymm4, ymm5, ymm4, ymm6
   814  	vextractf128	xmm5, ymm4, 1
   815  	vpcmpgtq	xmm6, xmm4, xmm5
   816  	vblendvpd	xmm4, xmm5, xmm4, xmm6
   817  	vpermilps	xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
   818  	vpcmpgtq	xmm6, xmm4, xmm5
   819  	vblendvpd	xmm4, xmm5, xmm4, xmm6
   820  	vmovq	r10, xmm4	# r10 = maximum of the vectorised prefix
        # Horizontal reduction of the min accumulators into rax.
   821  	vpcmpgtq	ymm4, ymm3, ymm0
   822  	vblendvpd	ymm0, ymm3, ymm0, ymm4
   823  	vpcmpgtq	ymm3, ymm2, ymm0
   824  	vblendvpd	ymm0, ymm2, ymm0, ymm3
   825  	vpcmpgtq	ymm2, ymm1, ymm0
   826  	vblendvpd	ymm0, ymm1, ymm0, ymm2
   827  	vextractf128	xmm1, ymm0, 1
   828  	vpcmpgtq	xmm2, xmm1, xmm0
   829  	vblendvpd	xmm0, xmm1, xmm0, xmm2
   830  	vpermilps	xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
   831  	vpcmpgtq	xmm2, xmm1, xmm0
   832  	vblendvpd	xmm0, xmm1, xmm0, xmm2
   833  	vmovq	rax, xmm0	# rax = minimum of the vectorised prefix
   834  	mov	rsi, r10	# rsi carries the maximum to the store at .LBB6_8
   835  	cmp	r9, r8
   836  	je	.LBB6_8	# count was a multiple of 16: no scalar tail
   837  	.p2align	4, 0x90
   838  .LBB6_7:                                # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r9 .. r8-1; on exit rsi holds the maximum.
   839  	mov	rsi, qword ptr [rdi + 8*r9]
   840  	cmp	rax, rsi
   841  	cmovg	rax, rsi	# min > element (signed): min = element
   842  	cmp	r10, rsi
   843  	cmovge	rsi, r10	# rsi = larger of the previous maximum and the element
   844  	add	r9, 1
   845  	mov	r10, rsi	# r10 = updated maximum
   846  	cmp	r8, r9
   847  	jne	.LBB6_7
   848  .LBB6_8:
   849  	mov	qword ptr [rcx], rsi	# store the maximum
   850  	mov	qword ptr [rdx], rax	# store the minimum
   851  	mov	rsp, rbp
   852  	pop	rbp
   853  	vzeroupper
   854  	ret
   855  .Lfunc_end6:
   856  	.size	int64_max_min_avx2, .Lfunc_end6-int64_max_min_avx2
   857                                          # -- End function
   858  	.section	.rodata.cst8,"aM",@progbits,8
   859  	.p2align	3                               # -- Begin function uint64_max_min_avx2
   860  .LCPI7_0:
   861  	.quad	-9223372036854775808            # 0x8000000000000000
   862  	.text
   863  	.globl	uint64_max_min_avx2
   864  	.p2align	4, 0x90
   865  	.type	uint64_max_min_avx2,@function
        # uint64_max_min_avx2 -- minimum and maximum of an array of unsigned 64-bit values.
        #
        # Register usage as read from the code below:
        #   rdi = address of the first element
        #   esi = element count, tested as a signed 32-bit value
        #   rdx = address that receives the minimum (8 bytes)
        #   rcx = address that receives the maximum (8 bytes)
        # A count <= 0 stores min = 0xffffffffffffffff and max = 0 (the seeds).
        # Inputs of 16 or more elements are reduced 16 elements (128 bytes) per
        # vector-loop iteration; the remaining (count mod 16) elements go through
        # the scalar loop .LBB7_7.
        # vpcmpgtq compares signed quadwords, so both operands are first xor-ed
        # with 0x8000000000000000 (held in ymm0); flipping the sign bit makes the
        # signed comparison order unsigned values correctly.  The following
        # vblendvpd keeps the accumulator lane where the mask's sign bit is set
        # and takes the other operand elsewhere.
        # Writes rax, rsi, r8, r9, r10, ymm0-ymm11 and flags; rbp/rsp are restored.
   866  uint64_max_min_avx2:                    # @uint64_max_min_avx2
   867  # %bb.0:
   868  	push	rbp
   869  	mov	rbp, rsp
   870  	and	rsp, -8	# round rsp down to a multiple of 8; restored from rbp on exit
   871  	test	esi, esi
   872  	jle	.LBB7_1	# count <= 0: store the seed values and return
   873  # %bb.2:
   874  	mov	r8d, esi	# r8 = count, zero-extended to 64 bits
   875  	cmp	esi, 15
   876  	ja	.LBB7_4	# 16 or more elements: take the vector path
   877  # %bb.3:
   878  	mov	rax, -1	# rax = running minimum, seeded with all ones
   879  	xor	r9d, r9d	# r9 = index of the next element for the scalar loop
   880  	xor	r10d, r10d	# r10 = running maximum, seeded with 0
   881  	jmp	.LBB7_7
   882  .LBB7_1:
   883  	mov	rax, -1	# empty input: min result = all ones
   884  	xor	esi, esi	# empty input: max result = 0
   885  	jmp	.LBB7_8
   886  .LBB7_4:
   887  	mov	r9d, r8d
   888  	and	r9d, -16	# r9 = count rounded down to a multiple of 16
   889  	vpxor	xmm5, xmm5, xmm5	# ymm5, ymm8, ymm7, ymm6 = 0: max accumulators for vectors 0..3
   890  	vpcmpeqd	ymm1, ymm1, ymm1	# ymm1, ymm4, ymm3, ymm2 = all ones: min accumulators for vectors 0..3
   891  	xor	eax, eax	# rax = element index into the input (scaled by 8 in the addresses)
   892  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI7_0] # ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
   893  	vpcmpeqd	ymm4, ymm4, ymm4
   894  	vpcmpeqd	ymm3, ymm3, ymm3
   895  	vpcmpeqd	ymm2, ymm2, ymm2
   896  	vpxor	xmm8, xmm8, xmm8
   897  	vpxor	xmm7, xmm7, xmm7
   898  	vpxor	xmm6, xmm6, xmm6
   899  	.p2align	4, 0x90
   900  .LBB7_5:                                # =>This Inner Loop Header: Depth=1
        # Each iteration consumes 16 elements (four 32-byte vectors).
   901  	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax]
   902  	vpxor	ymm10, ymm1, ymm0	# sign-flipped current min
   903  	vpxor	ymm11, ymm9, ymm0	# sign-flipped element
   904  	vpcmpgtq	ymm10, ymm11, ymm10	# mask = element > min (unsigned)
   905  	vblendvpd	ymm1, ymm9, ymm1, ymm10	# min = mask ? min : element
   906  	vpxor	ymm10, ymm5, ymm0	# sign-flipped current max
   907  	vpcmpgtq	ymm10, ymm10, ymm11	# mask = max > element (unsigned)
   908  	vblendvpd	ymm5, ymm9, ymm5, ymm10	# max = mask ? max : element
   909  	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax + 32]
   910  	vpxor	ymm10, ymm4, ymm0
   911  	vpxor	ymm11, ymm9, ymm0
   912  	vpcmpgtq	ymm10, ymm11, ymm10
   913  	vblendvpd	ymm4, ymm9, ymm4, ymm10
   914  	vpxor	ymm10, ymm8, ymm0
   915  	vpcmpgtq	ymm10, ymm10, ymm11
   916  	vmovdqu	ymm11, ymmword ptr [rdi + 8*rax + 64]
   917  	vblendvpd	ymm8, ymm9, ymm8, ymm10
   918  	vpxor	ymm9, ymm3, ymm0
   919  	vpxor	ymm10, ymm11, ymm0
   920  	vpcmpgtq	ymm9, ymm10, ymm9
   921  	vblendvpd	ymm3, ymm11, ymm3, ymm9
   922  	vpxor	ymm9, ymm7, ymm0
   923  	vpcmpgtq	ymm9, ymm9, ymm10
   924  	vblendvpd	ymm7, ymm11, ymm7, ymm9
   925  	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax + 96]
   926  	vpxor	ymm10, ymm2, ymm0
   927  	vpxor	ymm11, ymm9, ymm0
   928  	vpcmpgtq	ymm10, ymm11, ymm10
   929  	vblendvpd	ymm2, ymm9, ymm2, ymm10
   930  	vpxor	ymm10, ymm6, ymm0
   931  	vpcmpgtq	ymm10, ymm10, ymm11
   932  	vblendvpd	ymm6, ymm9, ymm6, ymm10
   933  	add	rax, 16
   934  	cmp	r9, rax
   935  	jne	.LBB7_5
   936  # %bb.6:
        # Horizontal reduction of the max accumulators (ymm5, ymm8, ymm7, ymm6) into xmm5.
   937  	vpxor	ymm9, ymm8, ymm0
   938  	vpxor	ymm10, ymm5, ymm0
   939  	vpcmpgtq	ymm9, ymm10, ymm9
   940  	vblendvpd	ymm5, ymm8, ymm5, ymm9
   941  	vxorpd	ymm8, ymm5, ymm0
   942  	vpxor	ymm9, ymm7, ymm0
   943  	vpcmpgtq	ymm8, ymm8, ymm9
   944  	vblendvpd	ymm5, ymm7, ymm5, ymm8
   945  	vxorpd	ymm7, ymm5, ymm0
   946  	vpxor	ymm8, ymm6, ymm0
   947  	vpcmpgtq	ymm7, ymm7, ymm8
   948  	vblendvpd	ymm5, ymm6, ymm5, ymm7
   949  	vextractf128	xmm6, ymm5, 1
   950  	vxorpd	xmm8, xmm6, xmm0
   951  	vxorpd	xmm7, xmm5, xmm0
   952  	vpcmpgtq	xmm7, xmm7, xmm8
   953  	vblendvpd	xmm5, xmm6, xmm5, xmm7
   954  	vpermilps	xmm6, xmm5, 78          # xmm6 = xmm5[2,3,0,1]
   955  	vxorpd	xmm8, xmm5, xmm0
   956  	vxorpd	xmm7, xmm6, xmm0
   957  	vpcmpgtq	xmm7, xmm8, xmm7
   958  	vblendvpd	xmm5, xmm6, xmm5, xmm7
        # Horizontal reduction of the min accumulators (ymm1, ymm4, ymm3, ymm2) into xmm0.
   959  	vpxor	ymm6, ymm1, ymm0
   960  	vpxor	ymm7, ymm4, ymm0
   961  	vpcmpgtq	ymm6, ymm7, ymm6
   962  	vblendvpd	ymm1, ymm4, ymm1, ymm6
   963  	vxorpd	ymm4, ymm1, ymm0
   964  	vpxor	ymm6, ymm3, ymm0
   965  	vpcmpgtq	ymm4, ymm6, ymm4
   966  	vblendvpd	ymm1, ymm3, ymm1, ymm4
   967  	vmovq	r10, xmm5	# r10 = maximum of the vectorised prefix
   968  	vxorpd	ymm3, ymm1, ymm0
   969  	vpxor	ymm4, ymm2, ymm0
   970  	vpcmpgtq	ymm3, ymm4, ymm3
   971  	vblendvpd	ymm1, ymm2, ymm1, ymm3
   972  	vextractf128	xmm2, ymm1, 1
   973  	vxorpd	xmm3, xmm1, xmm0
   974  	vxorpd	xmm4, xmm2, xmm0
   975  	vpcmpgtq	xmm3, xmm4, xmm3
   976  	vblendvpd	xmm1, xmm2, xmm1, xmm3
   977  	vpermilps	xmm2, xmm1, 78          # xmm2 = xmm1[2,3,0,1]
   978  	vxorpd	xmm3, xmm1, xmm0
   979  	vxorpd	xmm0, xmm2, xmm0	# the sign-flip constant in xmm0 is overwritten here; it is not needed again
   980  	vpcmpgtq	xmm0, xmm0, xmm3
   981  	vblendvpd	xmm0, xmm2, xmm1, xmm0
   982  	vmovq	rax, xmm0	# rax = minimum of the vectorised prefix
   983  	mov	rsi, r10	# rsi carries the maximum to the store at .LBB7_8
   984  	cmp	r9, r8
   985  	je	.LBB7_8	# count was a multiple of 16: no scalar tail
   986  	.p2align	4, 0x90
   987  .LBB7_7:                                # =>This Inner Loop Header: Depth=1
        # Scalar loop over elements r9 .. r8-1; on exit rsi holds the maximum.
   988  	mov	rsi, qword ptr [rdi + 8*r9]
   989  	cmp	rax, rsi
   990  	cmovae	rax, rsi	# min >= element (unsigned): min = element
   991  	cmp	r10, rsi
   992  	cmova	rsi, r10	# rsi = larger of the previous maximum and the element
   993  	add	r9, 1
   994  	mov	r10, rsi	# r10 = updated maximum
   995  	cmp	r8, r9
   996  	jne	.LBB7_7
   997  .LBB7_8:
   998  	mov	qword ptr [rcx], rsi	# store the maximum
   999  	mov	qword ptr [rdx], rax	# store the minimum
  1000  	mov	rsp, rbp
  1001  	pop	rbp
  1002  	vzeroupper
  1003  	ret
  1004  .Lfunc_end7:
  1005  	.size	uint64_max_min_avx2, .Lfunc_end7-uint64_max_min_avx2
  1006                                          # -- End function
  1007  	.ident	"Debian clang version 11.0.1-2"
  1008  	.section	".note.GNU-stack","",@progbits
  1009  	.addrsig