//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// Listing origin (as shown in the source view):
// github.com/apache/arrow/go/v14@v14.0.1/internal/utils/min_max_avx2_amd64.s
//
// Go (Plan 9 style) assembler source for amd64. Apart from the argument
// loads, the jumps, VZEROUPPER and RET, every instruction is emitted as a raw
// WORD/LONG/QUAD/BYTE encoding; the trailing comment on each such line is the
// Intel-syntax disassembly of those bytes.
//
// Every routine in this file takes four 8-byte arguments (32 bytes in total):
//   values+0(FP)  -> DI  start of the input elements
//   length+8(FP)  -> SI  element count; only the low 32 bits are examined
//   minout+16(FP) -> DX  address the minimum is stored to
//   maxout+24(FP) -> CX  address the maximum is stored to
// A count <= 0 (as a signed 32-bit value) stores the identity values
// (largest representable value as min, smallest as max) without reading
// any element.
//
// Manual patch on top of the generated code: the routines that load the
// address of a constant table into BP declare an 8-byte frame ($8-32 rather
// than $0-32). BP is the callee-saved frame pointer on amd64 and the Go
// assembler only inserts the BP save/restore when the frame size is non-zero;
// with a $0 frame these routines returned with the caller's frame pointer
// overwritten. Re-apply this change if the file is regenerated.

// LCDATA1: byte constants for the int8 routine.
//   +0x00 (32 bytes) 0x80 : initial value of the max accumulators (-128)
//   +0x20 (32 bytes) 0x7f : initial value of the min accumulators (127)
//   +0x40 (16 bytes) 0x7f : XOR mask used while reducing the max
//   +0x50 (16 bytes) 0x80 : XOR mask used while reducing the min
DATA LCDATA1<>+0x000(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x008(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x010(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x018(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x020(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x028(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x030(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x038(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x040(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x048(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x050(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x058(SB)/8, $0x8080808080808080
GLOBL LCDATA1<>(SB), 8, $96

// _int8_max_min_avx2: minimum and maximum of signed 8-bit elements.
//   count <= 0  : stores min = 127, max = -128.
//   count <= 63 : scalar loop only (LBB0_11).
//   otherwise   : 64-byte vector steps over the first (count & -64) elements,
//                 then the scalar loop for whatever is left.
// In the scalar loop: R9 = count, R10 = current index, SIL = running min,
// R8B = running max.
// Frame is $8 (not $0) so the assembler preserves BP, which holds the
// LCDATA1 address below.
TEXT ·_int8_max_min_avx2(SB), $8-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA1<>(SB), BP

	WORD $0xf685 // test esi, esi
	JLE LBB0_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x3f // cmp esi, 63
	JA LBB0_4
	// Short input: seed the scalar loop with the identity values, index 0.
	WORD $0xb041; BYTE $0x80 // mov r8b, -128
	WORD $0xb640; BYTE $0x7f // mov sil, 127
	WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
	JMP LBB0_11

// Empty input: store the identity values.
LBB0_1:
	WORD $0xb640; BYTE $0x7f // mov sil, 127
	WORD $0xb041; BYTE $0x80 // mov r8b, -128
	JMP LBB0_12

// Vector setup: R10 = count rounded down to a multiple of 64,
// R8 = number of 64-byte steps. A single step skips the unrolled loop.
LBB0_4:
	WORD $0x8945; BYTE $0xca // mov r10d, r9d
	LONG $0xc0e28341 // and r10d, -64
	LONG $0xc0428d49 // lea rax, [r10 - 64]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x06e8c149 // shr r8, 6
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB0_5
	// RSI = -(steps & -2): counts up to zero, two steps per loop iteration.
	WORD $0x894c; BYTE $0xc6 // mov rsi, r8
	LONG $0xfee68348 // and rsi, -2
	WORD $0xf748; BYTE $0xde // neg rsi
	// ymm1/ymm3 = max accumulators, ymm0/ymm2 = min accumulators.
	LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI0_0] */
	LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI0_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
	LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1

// Main loop: 128 bytes (two 64-byte steps) per iteration; RAX = byte offset.
LBB0_7:
	LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
	LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
	LONG $0x746ffec5; WORD $0x4007 // vmovdqu ymm6, yword [rdi + rax + 64]
	LONG $0x7c6ffec5; WORD $0x6007 // vmovdqu ymm7, yword [rdi + rax + 96]
	LONG $0x387de2c4; BYTE $0xc4 // vpminsb ymm0, ymm0, ymm4
	LONG $0x386de2c4; BYTE $0xd5 // vpminsb ymm2, ymm2, ymm5
	LONG $0x3c75e2c4; BYTE $0xcc // vpmaxsb ymm1, ymm1, ymm4
	LONG $0x3c65e2c4; BYTE $0xdd // vpmaxsb ymm3, ymm3, ymm5
	LONG $0x387de2c4; BYTE $0xc6 // vpminsb ymm0, ymm0, ymm6
	LONG $0x386de2c4; BYTE $0xd7 // vpminsb ymm2, ymm2, ymm7
	LONG $0x3c75e2c4; BYTE $0xce // vpmaxsb ymm1, ymm1, ymm6
	LONG $0x3c65e2c4; BYTE $0xdf // vpmaxsb ymm3, ymm3, ymm7
	LONG $0x80e88348 // sub rax, -128
	LONG $0x02c68348 // add rsi, 2
	JNE LBB0_7
	// Odd number of steps: one more 64-byte step remains.
	LONG $0x01c0f641 // test r8b, 1
	JE LBB0_10

// Single leftover 64-byte step.
LBB0_9:
	LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
	LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
	LONG $0x3c65e2c4; BYTE $0xdd // vpmaxsb ymm3, ymm3, ymm5
	LONG $0x3c75e2c4; BYTE $0xcc // vpmaxsb ymm1, ymm1, ymm4
	LONG $0x386de2c4; BYTE $0xd5 // vpminsb ymm2, ymm2, ymm5
	LONG $0x387de2c4; BYTE $0xc4 // vpminsb ymm0, ymm0, ymm4

// Horizontal reduction. The max is XORed with 0x7f and the min with 0x80 so
// that both become an unsigned-minimum search; byte pairs are folded with
// vpsrlw/vpminub, vphminposuw picks the smallest word, and the scalar XOR
// undoes the mask. Results: R8B = max, SIL = min.
LBB0_10:
	LONG $0x3c75e2c4; BYTE $0xcb // vpmaxsb ymm1, ymm1, ymm3
	LONG $0x397de3c4; WORD $0x01cb // vextracti128 xmm3, ymm1, 1
	LONG $0x3c71e2c4; BYTE $0xcb // vpmaxsb xmm1, xmm1, xmm3
	LONG $0x4deff1c5; BYTE $0x40 // vpxor xmm1, xmm1, oword 64[rbp] /* [rip + .LCPI0_2] */
	LONG $0x387de2c4; BYTE $0xc2 // vpminsb ymm0, ymm0, ymm2
	LONG $0xd171e9c5; BYTE $0x08 // vpsrlw xmm2, xmm1, 8
	LONG $0xcadaf1c5 // vpminub xmm1, xmm1, xmm2
	LONG $0x4179e2c4; BYTE $0xc9 // vphminposuw xmm1, xmm1
	LONG $0x7e79c1c4; BYTE $0xc8 // vmovd r8d, xmm1
	LONG $0x7ff08041 // xor r8b, 127
	LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
	LONG $0x3879e2c4; BYTE $0xc1 // vpminsb xmm0, xmm0, xmm1
	LONG $0x45eff9c5; BYTE $0x50 // vpxor xmm0, xmm0, oword 80[rbp] /* [rip + .LCPI0_3] */
	LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8
	LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1
	LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
	LONG $0xc67ef9c5 // vmovd esi, xmm0
	LONG $0x80f68040 // xor sil, -128
	// Nothing left for the scalar loop when count is a multiple of 64.
	WORD $0x394d; BYTE $0xca // cmp r10, r9
	JE LBB0_12

// Scalar tail: one element per iteration, signed compares.
LBB0_11:
	LONG $0x04b60f42; BYTE $0x17 // movzx eax, byte [rdi + r10]
	WORD $0x3840; BYTE $0xc6 // cmp sil, al
	LONG $0xf6b60f40 // movzx esi, sil
	WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
	WORD $0x3841; BYTE $0xc0 // cmp r8b, al
	LONG $0xc0b60f45 // movzx r8d, r8b
	LONG $0xc04c0f44 // cmovl r8d, eax
	LONG $0x01c28349 // add r10, 1
	WORD $0x394d; BYTE $0xd1 // cmp r9, r10
	JNE LBB0_11

// Store the results: max through CX (maxout), min through DX (minout).
LBB0_12:
	WORD $0x8844; BYTE $0x01 // mov byte [rcx], r8b
	WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
	VZEROUPPER
	RET

// Exactly one 64-byte step: initialise the accumulators and go straight to
// the leftover-step / reduction code.
LBB0_5:
	LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI0_0] */
	LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI0_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
	LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB0_9
	JMP LBB0_10

// _uint8_max_min_avx2: minimum and maximum of unsigned 8-bit elements.
//   count <= 0  : stores min = 0xff, max = 0.
//   count <= 63 : scalar loop only (LBB1_11).
//   otherwise   : 64-byte vector steps, then the scalar loop for the rest.
// In the scalar loop: R9 = count, R10 = current index, SIL = running min,
// AL = running max. This routine does not use BP, so it keeps a $0 frame.
TEXT ·_uint8_max_min_avx2(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX

	WORD $0xf685 // test esi, esi
	JLE LBB1_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x3f // cmp esi, 63
	JA LBB1_4
	// Short input: seed the scalar loop with the identity values, index 0.
	WORD $0xb640; BYTE $0xff // mov sil, -1
	WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
	WORD $0xc031 // xor eax, eax
	JMP LBB1_11

// Empty input: store the identity values.
LBB1_1:
	WORD $0xb640; BYTE $0xff // mov sil, -1
	WORD $0xc031 // xor eax, eax
	JMP LBB1_12

// Vector setup: R10 = count rounded down to a multiple of 64,
// R8 = number of 64-byte steps.
LBB1_4:
	WORD $0x8945; BYTE $0xca // mov r10d, r9d
	LONG $0xc0e28341 // and r10d, -64
	LONG $0xc0428d49 // lea rax, [r10 - 64]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x06e8c149 // shr r8, 6
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB1_5
	WORD $0x894c; BYTE $0xc6 // mov rsi, r8
	LONG $0xfee68348 // and rsi, -2
	WORD $0xf748; BYTE $0xde // neg rsi
	// ymm0/ymm3 = max accumulators (all zero), ymm1/ymm2 = min accumulators
	// (all ones).
	LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
	LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
	WORD $0xc031 // xor eax, eax
	LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
	LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3

// Main loop: 128 bytes per iteration; RAX = byte offset.
LBB1_7:
	LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
	LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
	LONG $0x746ffec5; WORD $0x4007 // vmovdqu ymm6, yword [rdi + rax + 64]
	LONG $0x7c6ffec5; WORD $0x6007 // vmovdqu ymm7, yword [rdi + rax + 96]
	LONG $0xccdaf5c5 // vpminub ymm1, ymm1, ymm4
	LONG $0xd5daedc5 // vpminub ymm2, ymm2, ymm5
	LONG $0xc4defdc5 // vpmaxub ymm0, ymm0, ymm4
	LONG $0xdddee5c5 // vpmaxub ymm3, ymm3, ymm5
	LONG $0xcedaf5c5 // vpminub ymm1, ymm1, ymm6
	LONG $0xd7daedc5 // vpminub ymm2, ymm2, ymm7
	LONG $0xc6defdc5 // vpmaxub ymm0, ymm0, ymm6
	LONG $0xdfdee5c5 // vpmaxub ymm3, ymm3, ymm7
	LONG $0x80e88348 // sub rax, -128
	LONG $0x02c68348 // add rsi, 2
	JNE LBB1_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB1_10

// Single leftover 64-byte step.
LBB1_9:
	LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
	LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
	LONG $0xdddee5c5 // vpmaxub ymm3, ymm3, ymm5
	LONG $0xc4defdc5 // vpmaxub ymm0, ymm0, ymm4
	LONG $0xd5daedc5 // vpminub ymm2, ymm2, ymm5
	LONG $0xccdaf5c5 // vpminub ymm1, ymm1, ymm4

// Horizontal reduction. The max is complemented (XOR with all ones) so it
// can be found as an unsigned minimum, then complemented back with NOT.
// Results: AL = max, SIL = min.
LBB1_10:
	LONG $0xcadaf5c5 // vpminub ymm1, ymm1, ymm2
	LONG $0xc3defdc5 // vpmaxub ymm0, ymm0, ymm3
	LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1
	LONG $0xc2def9c5 // vpmaxub xmm0, xmm0, xmm2
	LONG $0xd276e9c5 // vpcmpeqd xmm2, xmm2, xmm2
	LONG $0xc2eff9c5 // vpxor xmm0, xmm0, xmm2
	LONG $0xd071e9c5; BYTE $0x08 // vpsrlw xmm2, xmm0, 8
	LONG $0xc2daf9c5 // vpminub xmm0, xmm0, xmm2
	LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
	LONG $0xc07ef9c5 // vmovd eax, xmm0
	WORD $0xd0f6 // not al
	LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1
	LONG $0xc0daf1c5 // vpminub xmm0, xmm1, xmm0
	LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8
	LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1
	LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
	LONG $0xc67ef9c5 // vmovd esi, xmm0
	WORD $0x394d; BYTE $0xca // cmp r10, r9
	JE LBB1_12

// Scalar tail: one element per iteration, unsigned compares.
LBB1_11:
	LONG $0x04b60f46; BYTE $0x17 // movzx r8d, byte [rdi + r10]
	WORD $0x3844; BYTE $0xc6 // cmp sil, r8b
	LONG $0xf6b60f40 // movzx esi, sil
	LONG $0xf0430f41 // cmovae esi, r8d
	WORD $0x3844; BYTE $0xc0 // cmp al, r8b
	WORD $0xb60f; BYTE $0xc0 // movzx eax, al
	LONG $0xc0460f41 // cmovbe eax, r8d
	LONG $0x01c28349 // add r10, 1
	WORD $0x394d; BYTE $0xd1 // cmp r9, r10
	JNE LBB1_11

// Store the results: max through CX (maxout), min through DX (minout).
LBB1_12:
	WORD $0x0188 // mov byte [rcx], al
	WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
	VZEROUPPER
	RET

// Exactly one 64-byte step: initialise the accumulators and go straight to
// the leftover-step / reduction code.
LBB1_5:
	LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
	LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
	WORD $0xc031 // xor eax, eax
	LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
	LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB1_9
	JMP LBB1_10

// LCDATA2: 16-bit constants for the int16 routine.
//   +0x00 (32 bytes) 0x8000 : initial value of the max accumulators (-32768)
//   +0x20 (32 bytes) 0x7fff : initial value of the min accumulators (32767)
//   +0x40 (16 bytes) 0x7fff : XOR mask used while reducing the max
//   +0x50 (16 bytes) 0x8000 : XOR mask used while reducing the min
DATA LCDATA2<>+0x000(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x008(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x010(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x018(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x020(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x028(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x030(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x038(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x040(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x048(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x050(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x058(SB)/8, $0x8000800080008000
GLOBL LCDATA2<>(SB), 8, $96

// _int16_max_min_avx2: minimum and maximum of signed 16-bit elements.
//   count <= 0  : stores min = 32767, max = -32768.
//   count <= 31 : scalar loop only (LBB2_11).
//   otherwise   : 32-element vector steps over the first (count & -32)
//                 elements, then the scalar loop for the rest.
// In the scalar loop: R9 = count, R10 = current index, SI = running min,
// R8W = running max.
// Frame is $8 (not $0) so the assembler preserves BP, which holds the
// LCDATA2 address below.
TEXT ·_int16_max_min_avx2(SB), $8-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA2<>(SB), BP

	WORD $0xf685 // test esi, esi
	JLE LBB2_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x1f // cmp esi, 31
	JA LBB2_4
	// Short input: seed the scalar loop with the identity values, index 0.
	LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
	LONG $0x7fffbe66 // mov si, 32767
	WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
	JMP LBB2_11

// Empty input: store the identity values.
LBB2_1:
	LONG $0x7fffbe66 // mov si, 32767
	LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
	JMP LBB2_12

// Vector setup: R10 = count rounded down to a multiple of 32,
// R8 = number of 32-element steps.
LBB2_4:
	WORD $0x8945; BYTE $0xca // mov r10d, r9d
	LONG $0xe0e28341 // and r10d, -32
	LONG $0xe0428d49 // lea rax, [r10 - 32]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x05e8c149 // shr r8, 5
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB2_5
	WORD $0x894c; BYTE $0xc6 // mov rsi, r8
	LONG $0xfee68348 // and rsi, -2
	WORD $0xf748; BYTE $0xde // neg rsi
	// ymm1/ymm3 = max accumulators, ymm0/ymm2 = min accumulators.
	LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI2_0] */
	LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI2_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
	LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1

// Main loop: 64 elements per iteration; RAX = element index.
LBB2_7:
	LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
	LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
	LONG $0x746ffec5; WORD $0x4047 // vmovdqu ymm6, yword [rdi + 2*rax + 64]
	LONG $0x7c6ffec5; WORD $0x6047 // vmovdqu ymm7, yword [rdi + 2*rax + 96]
	LONG $0xc4eafdc5 // vpminsw ymm0, ymm0, ymm4
	LONG $0xd5eaedc5 // vpminsw ymm2, ymm2, ymm5
	LONG $0xcceef5c5 // vpmaxsw ymm1, ymm1, ymm4
	LONG $0xddeee5c5 // vpmaxsw ymm3, ymm3, ymm5
	LONG $0xc6eafdc5 // vpminsw ymm0, ymm0, ymm6
	LONG $0xd7eaedc5 // vpminsw ymm2, ymm2, ymm7
	LONG $0xceeef5c5 // vpmaxsw ymm1, ymm1, ymm6
	LONG $0xdfeee5c5 // vpmaxsw ymm3, ymm3, ymm7
	LONG $0x40c08348 // add rax, 64
	LONG $0x02c68348 // add rsi, 2
	JNE LBB2_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB2_10

// Single leftover 32-element step.
LBB2_9:
	LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
	LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
	LONG $0xddeee5c5 // vpmaxsw ymm3, ymm3, ymm5
	LONG $0xcceef5c5 // vpmaxsw ymm1, ymm1, ymm4
	LONG $0xd5eaedc5 // vpminsw ymm2, ymm2, ymm5
	LONG $0xc4eafdc5 // vpminsw ymm0, ymm0, ymm4

// Horizontal reduction. The max is XORed with 0x7fff and the min with 0x8000
// so vphminposuw (unsigned word minimum) finds each; the scalar XOR undoes
// the mask. Results: R8W = max, SI = min.
LBB2_10:
	LONG $0xcbeef5c5 // vpmaxsw ymm1, ymm1, ymm3
	LONG $0x397de3c4; WORD $0x01cb // vextracti128 xmm3, ymm1, 1
	LONG $0xcbeef1c5 // vpmaxsw xmm1, xmm1, xmm3
	LONG $0x4deff1c5; BYTE $0x40 // vpxor xmm1, xmm1, oword 64[rbp] /* [rip + .LCPI2_2] */
	LONG $0xc2eafdc5 // vpminsw ymm0, ymm0, ymm2
	LONG $0x4179e2c4; BYTE $0xc9 // vphminposuw xmm1, xmm1
	LONG $0x7e79c1c4; BYTE $0xc8 // vmovd r8d, xmm1
	LONG $0xfff08141; WORD $0x007f; BYTE $0x00 // xor r8d, 32767
	LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
	LONG $0xc1eaf9c5 // vpminsw xmm0, xmm0, xmm1
	LONG $0x45eff9c5; BYTE $0x50 // vpxor xmm0, xmm0, oword 80[rbp] /* [rip + .LCPI2_3] */
	LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
	LONG $0xc67ef9c5 // vmovd esi, xmm0
	LONG $0x8000f681; WORD $0x0000 // xor esi, 32768
	WORD $0x394d; BYTE $0xca // cmp r10, r9
	JE LBB2_12

// Scalar tail: one element per iteration, signed compares.
LBB2_11:
	LONG $0x04b70f42; BYTE $0x57 // movzx eax, word [rdi + 2*r10]
	WORD $0x3966; BYTE $0xc6 // cmp si, ax
	WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
	LONG $0xc0394166 // cmp r8w, ax
	LONG $0xc04c0f44 // cmovl r8d, eax
	LONG $0x01c28349 // add r10, 1
	WORD $0x394d; BYTE $0xd1 // cmp r9, r10
	JNE LBB2_11

// Store the results: max through CX (maxout), min through DX (minout).
LBB2_12:
	LONG $0x01894466 // mov word [rcx], r8w
	WORD $0x8966; BYTE $0x32 // mov word [rdx], si
	VZEROUPPER
	RET

// Exactly one 32-element step: initialise the accumulators and go straight
// to the leftover-step / reduction code.
LBB2_5:
	LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI2_0] */
	LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI2_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
	LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB2_9
	JMP LBB2_10

// _uint16_max_min_avx2: minimum and maximum of unsigned 16-bit elements.
//   count <= 0  : stores min = 0xffff, max = 0.
//   count <= 31 : scalar loop only (LBB3_11).
//   otherwise   : 32-element vector steps, then the scalar loop for the rest.
// In the scalar loop: R9 = count, R10 = current index, R8W = running min,
// SI = running max. This routine does not use BP, so it keeps a $0 frame.
TEXT ·_uint16_max_min_avx2(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX

	WORD $0xf685 // test esi, esi
	JLE LBB3_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x1f // cmp esi, 31
	JA LBB3_4
	// Short input: seed the scalar loop with the identity values, index 0.
	LONG $0xffb84166; BYTE $0xff // mov r8w, -1
	WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
	WORD $0xf631 // xor esi, esi
	JMP LBB3_11

// Empty input: store the identity values.
LBB3_1:
	LONG $0xffb84166; BYTE $0xff // mov r8w, -1
	WORD $0xf631 // xor esi, esi
	JMP LBB3_12

// Vector setup: R10 = count rounded down to a multiple of 32,
// R8 = number of 32-element steps.
LBB3_4:
	WORD $0x8945; BYTE $0xca // mov r10d, r9d
	LONG $0xe0e28341 // and r10d, -32
	LONG $0xe0428d49 // lea rax, [r10 - 32]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x05e8c149 // shr r8, 5
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB3_5
	WORD $0x894c; BYTE $0xc6 // mov rsi, r8
	LONG $0xfee68348 // and rsi, -2
	WORD $0xf748; BYTE $0xde // neg rsi
	// ymm0/ymm3 = max accumulators (all zero), ymm1/ymm2 = min accumulators
	// (all ones).
	LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
	LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
	WORD $0xc031 // xor eax, eax
	LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
	LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3

// Main loop: 64 elements per iteration; RAX = element index.
LBB3_7:
	LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
	LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
	LONG $0x746ffec5; WORD $0x4047 // vmovdqu ymm6, yword [rdi + 2*rax + 64]
	LONG $0x7c6ffec5; WORD $0x6047 // vmovdqu ymm7, yword [rdi + 2*rax + 96]
	LONG $0x3a75e2c4; BYTE $0xcc // vpminuw ymm1, ymm1, ymm4
	LONG $0x3a6de2c4; BYTE $0xd5 // vpminuw ymm2, ymm2, ymm5
	LONG $0x3e7de2c4; BYTE $0xc4 // vpmaxuw ymm0, ymm0, ymm4
	LONG $0x3e65e2c4; BYTE $0xdd // vpmaxuw ymm3, ymm3, ymm5
	LONG $0x3a75e2c4; BYTE $0xce // vpminuw ymm1, ymm1, ymm6
	LONG $0x3a6de2c4; BYTE $0xd7 // vpminuw ymm2, ymm2, ymm7
	LONG $0x3e7de2c4; BYTE $0xc6 // vpmaxuw ymm0, ymm0, ymm6
	LONG $0x3e65e2c4; BYTE $0xdf // vpmaxuw ymm3, ymm3, ymm7
	LONG $0x40c08348 // add rax, 64
	LONG $0x02c68348 // add rsi, 2
	JNE LBB3_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB3_10

// Single leftover 32-element step.
LBB3_9:
	LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
	LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
	LONG $0x3e65e2c4; BYTE $0xdd // vpmaxuw ymm3, ymm3, ymm5
	LONG $0x3e7de2c4; BYTE $0xc4 // vpmaxuw ymm0, ymm0, ymm4
	LONG $0x3a6de2c4; BYTE $0xd5 // vpminuw ymm2, ymm2, ymm5
	LONG $0x3a75e2c4; BYTE $0xcc // vpminuw ymm1, ymm1, ymm4

// Horizontal reduction. The max is complemented so vphminposuw can find it
// as an unsigned minimum, then complemented back with NOT.
// Results: SI = max, R8W = min.
LBB3_10:
	LONG $0x3a75e2c4; BYTE $0xca // vpminuw ymm1, ymm1, ymm2
	LONG $0x3e7de2c4; BYTE $0xc3 // vpmaxuw ymm0, ymm0, ymm3
	LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1
	LONG $0x3e79e2c4; BYTE $0xc2 // vpmaxuw xmm0, xmm0, xmm2
	LONG $0xd276e9c5 // vpcmpeqd xmm2, xmm2, xmm2
	LONG $0xc2eff9c5 // vpxor xmm0, xmm0, xmm2
	LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
	LONG $0xc67ef9c5 // vmovd esi, xmm0
	WORD $0xd6f7 // not esi
	LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1
	LONG $0x3a71e2c4; BYTE $0xc0 // vpminuw xmm0, xmm1, xmm0
	LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
	LONG $0x7e79c1c4; BYTE $0xc0 // vmovd r8d, xmm0
	WORD $0x394d; BYTE $0xca // cmp r10, r9
	JE LBB3_12

// Scalar tail: one element per iteration, unsigned compares.
LBB3_11:
	LONG $0x04b70f42; BYTE $0x57 // movzx eax, word [rdi + 2*r10]
	LONG $0xc0394166 // cmp r8w, ax
	LONG $0xc0430f44 // cmovae r8d, eax
	WORD $0x3966; BYTE $0xc6 // cmp si, ax
	WORD $0x460f; BYTE $0xf0 // cmovbe esi, eax
	LONG $0x01c28349 // add r10, 1
	WORD $0x394d; BYTE $0xd1 // cmp r9, r10
	JNE LBB3_11

// Store the results: max through CX (maxout), min through DX (minout).
LBB3_12:
	WORD $0x8966; BYTE $0x31 // mov word [rcx], si
	LONG $0x02894466 // mov word [rdx], r8w
	VZEROUPPER
	RET

// Exactly one 32-element step: initialise the accumulators and go straight
// to the leftover-step / reduction code.
LBB3_5:
	LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
	LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
	WORD $0xc031 // xor eax, eax
	LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
	LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB3_9
	JMP LBB3_10

// LCDATA3: two 32-bit constants for the int32 routine.
//   +0 : 0x80000000, broadcast as the initial max accumulator value
//   +4 : 0x7fffffff, broadcast as the initial min accumulator value
DATA LCDATA3<>+0x000(SB)/8, $0x7fffffff80000000
GLOBL LCDATA3<>(SB), 8, $8

// _int32_max_min_avx2: minimum and maximum of signed 32-bit elements.
//   count <= 0  : stores min = 2147483647, max = -2147483648.
//   count <= 31 : scalar loop only (LBB4_7).
//   otherwise   : 32-element vector iterations over the first (count & -32)
//                 elements, then the scalar loop for the rest.
// In the scalar loop: R8 = count, R9 = current index, EAX = running min,
// R10D = running max (ESI holds the max at the point it is stored).
// Frame is $8 (not $0) so the assembler preserves BP, which holds the
// LCDATA3 address below.
TEXT ·_int32_max_min_avx2(SB), $8-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA3<>(SB), BP

	WORD $0xf685 // test esi, esi
	JLE LBB4_1
	WORD $0x8941; BYTE $0xf0 // mov r8d, esi
	WORD $0xfe83; BYTE $0x1f // cmp esi, 31
	JA LBB4_4
	// Short input: seed the scalar loop with the identity values, index 0.
	LONG $0x0000ba41; WORD $0x8000 // mov r10d, -2147483648
	LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	JMP LBB4_7

// Empty input: store the identity values.
LBB4_1:
	LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647
	LONG $0x000000be; BYTE $0x80 // mov esi, -2147483648
	JMP LBB4_8

// Vector setup: R9 = count rounded down to a multiple of 32.
// ymm0..ymm3 = min accumulators, ymm4..ymm7 = max accumulators.
LBB4_4:
	WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
	LONG $0x587de2c4; WORD $0x0065 // vpbroadcastd ymm4, dword 0[rbp] /* [rip + .LCPI4_0] */
	LONG $0xe0e18341 // and r9d, -32
	LONG $0x587de2c4; WORD $0x0445 // vpbroadcastd ymm0, dword 4[rbp] /* [rip + .LCPI4_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0
	LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
	LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0
	LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4
	LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4
	LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4

// Main loop: 32 elements (four ymm loads) per iteration; RAX = element index.
LBB4_5:
	LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax]
	LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32]
	LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64]
	LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96]
	LONG $0x397dc2c4; BYTE $0xc0 // vpminsd ymm0, ymm0, ymm8
	LONG $0x3975c2c4; BYTE $0xc9 // vpminsd ymm1, ymm1, ymm9
	LONG $0x396dc2c4; BYTE $0xd2 // vpminsd ymm2, ymm2, ymm10
	LONG $0x3965c2c4; BYTE $0xdb // vpminsd ymm3, ymm3, ymm11
	LONG $0x3d5dc2c4; BYTE $0xe0 // vpmaxsd ymm4, ymm4, ymm8
	LONG $0x3d55c2c4; BYTE $0xe9 // vpmaxsd ymm5, ymm5, ymm9
	LONG $0x3d4dc2c4; BYTE $0xf2 // vpmaxsd ymm6, ymm6, ymm10
	LONG $0x3d45c2c4; BYTE $0xfb // vpmaxsd ymm7, ymm7, ymm11
	LONG $0x20c08348 // add rax, 32
	WORD $0x3949; BYTE $0xc1 // cmp r9, rax
	JNE LBB4_5
	// Horizontal reduction of the max accumulators into R10D.
	LONG $0x3d5de2c4; BYTE $0xe5 // vpmaxsd ymm4, ymm4, ymm5
	LONG $0x3d5de2c4; BYTE $0xe6 // vpmaxsd ymm4, ymm4, ymm6
	LONG $0x3d5de2c4; BYTE $0xe7 // vpmaxsd ymm4, ymm4, ymm7
	LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1
	LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5
	LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78
	LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5
	LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229
	LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5
	LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4
	// Horizontal reduction of the min accumulators into EAX.
	LONG $0x397de2c4; BYTE $0xc1 // vpminsd ymm0, ymm0, ymm1
	LONG $0x397de2c4; BYTE $0xc2 // vpminsd ymm0, ymm0, ymm2
	LONG $0x397de2c4; BYTE $0xc3 // vpminsd ymm0, ymm0, ymm3
	LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
	LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1
	LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78
	LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1
	LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229
	LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1
	LONG $0xc07ef9c5 // vmovd eax, xmm0
	WORD $0x8944; BYTE $0xd6 // mov esi, r10d
	WORD $0x394d; BYTE $0xc1 // cmp r9, r8
	JE LBB4_8

// Scalar tail: one element per iteration, signed compares.
LBB4_7:
	LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9]
	WORD $0xf039 // cmp eax, esi
	WORD $0x4f0f; BYTE $0xc6 // cmovg eax, esi
	WORD $0x3941; BYTE $0xf2 // cmp r10d, esi
	LONG $0xf24d0f41 // cmovge esi, r10d
	LONG $0x01c18349 // add r9, 1
	WORD $0x8941; BYTE $0xf2 // mov r10d, esi
	WORD $0x394d; BYTE $0xc8 // cmp r8, r9
	JNE LBB4_7

// Store the results: max through CX (maxout), min through DX (minout).
LBB4_8:
	WORD $0x3189 // mov dword [rcx], esi
	WORD $0x0289 // mov dword [rdx], eax
	VZEROUPPER
	RET

// _uint32_max_min_avx2: minimum and maximum of unsigned 32-bit elements.
//   count <= 0  : stores min = 0xffffffff, max = 0.
//   count <= 31 : scalar loop only (LBB5_7).
//   otherwise   : 32-element vector iterations, then the scalar loop.
// In the scalar loop: R8 = count, R9 = current index, EAX = running min,
// R10D = running max (ESI holds the max at the point it is stored).
// This routine does not use BP, so it keeps a $0 frame.
TEXT ·_uint32_max_min_avx2(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX

	WORD $0xf685 // test esi, esi
	JLE LBB5_1
	WORD $0x8941; BYTE $0xf0 // mov r8d, esi
	WORD $0xfe83; BYTE $0x1f // cmp esi, 31
	JA LBB5_4
	// Short input: seed the scalar loop with the identity values, index 0.
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	LONG $0xffffffb8; BYTE $0xff // mov eax, -1
	WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
	JMP LBB5_7

// Empty input: store the identity values.
LBB5_1:
	LONG $0xffffffb8; BYTE $0xff // mov eax, -1
	WORD $0xf631 // xor esi, esi
	JMP LBB5_8

// Vector setup: R9 = count rounded down to a multiple of 32.
// ymm0..ymm3 = min accumulators (all ones), ymm4..ymm7 = max accumulators
// (all zero).
LBB5_4:
	WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
	LONG $0xe0e18341 // and r9d, -32
	LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4
	LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0
	WORD $0xc031 // xor eax, eax
	LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
	LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
	LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3
	LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5
	LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6
	LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7

// Main loop: 32 elements (four ymm loads) per iteration; RAX = element index.
LBB5_5:
	LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax]
	LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32]
	LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64]
	LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96]
	LONG $0x3b7dc2c4; BYTE $0xc0 // vpminud ymm0, ymm0, ymm8
	LONG $0x3b75c2c4; BYTE $0xc9 // vpminud ymm1, ymm1, ymm9
	LONG $0x3b6dc2c4; BYTE $0xd2 // vpminud ymm2, ymm2, ymm10
	LONG $0x3b65c2c4; BYTE $0xdb // vpminud ymm3, ymm3, ymm11
	LONG $0x3f5dc2c4; BYTE $0xe0 // vpmaxud ymm4, ymm4, ymm8
	LONG $0x3f55c2c4; BYTE $0xe9 // vpmaxud ymm5, ymm5, ymm9
	LONG $0x3f4dc2c4; BYTE $0xf2 // vpmaxud ymm6, ymm6, ymm10
	LONG $0x3f45c2c4; BYTE $0xfb // vpmaxud ymm7, ymm7, ymm11
	LONG $0x20c08348 // add rax, 32
	WORD $0x3949; BYTE $0xc1 // cmp r9, rax
	JNE LBB5_5
	// Horizontal reduction of the max accumulators into R10D.
	LONG $0x3f5de2c4; BYTE $0xe5 // vpmaxud ymm4, ymm4, ymm5
	LONG $0x3f5de2c4; BYTE $0xe6 // vpmaxud ymm4, ymm4, ymm6
	LONG $0x3f5de2c4; BYTE $0xe7 // vpmaxud ymm4, ymm4, ymm7
	LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1
	LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5
	LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78
	LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5
	LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229
	LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5
	LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4
	// Horizontal reduction of the min accumulators into EAX.
	LONG $0x3b7de2c4; BYTE $0xc1 // vpminud ymm0, ymm0, ymm1
	LONG $0x3b7de2c4; BYTE $0xc2 // vpminud ymm0, ymm0, ymm2
	LONG $0x3b7de2c4; BYTE $0xc3 // vpminud ymm0, ymm0, ymm3
	LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
	LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1
	LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78
	LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1
	LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229
	LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1
	LONG $0xc07ef9c5 // vmovd eax, xmm0
	WORD $0x8944; BYTE $0xd6 // mov esi, r10d
	WORD $0x394d; BYTE $0xc1 // cmp r9, r8
	JE LBB5_8

// Scalar tail: one element per iteration, unsigned compares.
LBB5_7:
	LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9]
	WORD $0xf039 // cmp eax, esi
	WORD $0x430f; BYTE $0xc6 // cmovae eax, esi
	WORD $0x3941; BYTE $0xf2 // cmp r10d, esi
	LONG $0xf2470f41 // cmova esi, r10d
	LONG $0x01c18349 // add r9, 1
	WORD $0x8941; BYTE $0xf2 // mov r10d, esi
	WORD $0x394d; BYTE $0xc8 // cmp r8, r9
	JNE LBB5_7

// Store the results: max through CX (maxout), min through DX (minout).
LBB5_8:
	WORD $0x3189 // mov dword [rcx], esi
	WORD $0x0289 // mov dword [rdx], eax
	VZEROUPPER
	RET

// LCDATA4: two 64-bit constants for the int64 routine.
//   +0 : 0x8000000000000000, broadcast as the initial max accumulator value
//   +8 : 0x7fffffffffffffff, broadcast as the initial min accumulator value
DATA LCDATA4<>+0x000(SB)/8, $0x8000000000000000
DATA LCDATA4<>+0x008(SB)/8, $0x7fffffffffffffff
GLOBL LCDATA4<>(SB), 8, $16

// _int64_max_min_avx2: minimum and maximum of signed 64-bit elements.
//   count <= 0  : stores min = 9223372036854775807 and max = that value + 1
//                 wrapped (the most negative 64-bit value).
//   count <= 15 : scalar loop only (LBB6_7).
//   otherwise   : 16-element vector iterations over the first (count & -16)
//                 elements, then the scalar loop for the rest.
// Min/max selection in the vector code is done with vpcmpgtq (signed 64-bit
// compare) feeding vblendvpd.
// In the scalar loop: R8 = count, R9 = current index, RAX = running min,
// R10 = running max (RSI holds the max at the point it is stored).
// Frame is $8 (not $0) so the assembler preserves BP, which holds the
// LCDATA4 address below.
TEXT ·_int64_max_min_avx2(SB), $8-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA4<>(SB), BP

	QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807
	WORD $0xf685 // test esi, esi
	JLE LBB6_1
	WORD $0x8941; BYTE $0xf0 // mov r8d, esi
	WORD $0xfe83; BYTE $0x0f // cmp esi, 15
	JA LBB6_4
	// Short input: RAX already holds the min identity; R10 = RAX + 1 wraps
	// to the max identity.
	LONG $0x01508d4c // lea r10, [rax + 1]
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	JMP LBB6_7

// Empty input: store the identity values.
LBB6_1:
	LONG $0x01708d48 // lea rsi, [rax + 1]
	JMP LBB6_8

// Vector setup: R9 = count rounded down to a multiple of 16.
// ymm0, ymm3, ymm2, ymm1 = min accumulators; ymm4, ymm7, ymm6, ymm5 = max
// accumulators (one of each per 32-byte lane of the 128-byte iteration).
LBB6_4:
	WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
	LONG $0x597de2c4; WORD $0x0065 // vpbroadcastq ymm4, qword 0[rbp] /* [rip + .LCPI6_0] */
	LONG $0xf0e18341 // and r9d, -16
	LONG $0x597de2c4; WORD $0x0845 // vpbroadcastq ymm0, qword 8[rbp] /* [rip + .LCPI6_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0
	LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
	LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0
	LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4
	LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4
	LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4

// Main loop: 16 elements (four ymm loads) per iteration; RAX = element index.
// The first four compare/blend pairs update the minima, the last four the
// maxima.
LBB6_5:
	LONG $0x046f7ec5; BYTE $0xc7 // vmovdqu ymm8, yword [rdi + 8*rax]
	LONG $0x373d62c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm8, ymm0
	LONG $0x4b3de3c4; WORD $0x90c0 // vblendvpd ymm0, ymm8, ymm0, ymm9
	LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32]
	LONG $0x373562c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm9, ymm3
	LONG $0x4b35e3c4; WORD $0xa0db // vblendvpd ymm3, ymm9, ymm3, ymm10
	LONG $0x546f7ec5; WORD $0x40c7 // vmovdqu ymm10, yword [rdi + 8*rax + 64]
	LONG $0x372d62c4; BYTE $0xda // vpcmpgtq ymm11, ymm10, ymm2
	LONG $0x4b2de3c4; WORD $0xb0d2 // vblendvpd ymm2, ymm10, ymm2, ymm11
	LONG $0x5c6f7ec5; WORD $0x60c7 // vmovdqu ymm11, yword [rdi + 8*rax + 96]
	LONG $0x372562c4; BYTE $0xe1 // vpcmpgtq ymm12, ymm11, ymm1
	LONG $0x4b25e3c4; WORD $0xc0c9 // vblendvpd ymm1, ymm11, ymm1, ymm12
	LONG $0x375d42c4; BYTE $0xe0 // vpcmpgtq ymm12, ymm4, ymm8
	LONG $0x4b3de3c4; WORD $0xc0e4 // vblendvpd ymm4, ymm8, ymm4, ymm12
	LONG $0x374542c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm7, ymm9
	LONG $0x4b35e3c4; WORD $0x80ff // vblendvpd ymm7, ymm9, ymm7, ymm8
	LONG $0x374d42c4; BYTE $0xc2 // vpcmpgtq ymm8, ymm6, ymm10
	LONG $0x4b2de3c4; WORD $0x80f6 // vblendvpd ymm6, ymm10, ymm6, ymm8
	LONG $0x375542c4; BYTE $0xc3 // vpcmpgtq ymm8, ymm5, ymm11
	LONG $0x4b25e3c4; WORD $0x80ed // vblendvpd ymm5, ymm11, ymm5, ymm8
	LONG $0x10c08348 // add rax, 16
	WORD $0x3949; BYTE $0xc1 // cmp r9, rax
	JNE LBB6_5
	// Horizontal reduction of the max accumulators into R10.
	LONG $0x375d62c4; BYTE $0xc7 // vpcmpgtq ymm8, ymm4, ymm7
	LONG $0x4b45e3c4; WORD $0x80e4 // vblendvpd ymm4, ymm7, ymm4, ymm8
	LONG $0x375de2c4; BYTE $0xfe // vpcmpgtq ymm7, ymm4, ymm6
	LONG $0x4b4de3c4; WORD $0x70e4 // vblendvpd ymm4, ymm6, ymm4, ymm7
	LONG $0x375de2c4; BYTE $0xf5 // vpcmpgtq ymm6, ymm4, ymm5
	LONG $0x4b55e3c4; WORD $0x60e4 // vblendvpd ymm4, ymm5, ymm4, ymm6
	LONG $0x197de3c4; WORD $0x01e5 // vextractf128 xmm5, ymm4, 1
	LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5
	LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6
	LONG $0x0479e3c4; WORD $0x4eec // vpermilps xmm5, xmm4, 78
	LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5
	LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6
	LONG $0x7ef9c1c4; BYTE $0xe2 // vmovq r10, xmm4
	// Horizontal reduction of the min accumulators into RAX.
	LONG $0x3765e2c4; BYTE $0xe0 // vpcmpgtq ymm4, ymm3, ymm0
	LONG $0x4b65e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm3, ymm0, ymm4
	LONG $0x376de2c4; BYTE $0xd8 // vpcmpgtq ymm3, ymm2, ymm0
	LONG $0x4b6de3c4; WORD $0x30c0 // vblendvpd ymm0, ymm2, ymm0, ymm3
	LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0
	LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
	LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0
	LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2
	LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78
	LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0
	LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2
	LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0
	WORD $0x894c; BYTE $0xd6 // mov rsi, r10
	WORD $0x394d; BYTE $0xc1 // cmp r9, r8
	JE LBB6_8

// Scalar tail: one element per iteration, signed compares.
LBB6_7:
	LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9]
	WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
	LONG $0xc64f0f48 // cmovg rax, rsi
	WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
	LONG $0xf24d0f49 // cmovge rsi, r10
	LONG $0x01c18349 // add r9, 1
	WORD $0x8949; BYTE $0xf2 // mov r10, rsi
	WORD $0x394d; BYTE $0xc8 // cmp r8, r9
	JNE LBB6_7

// Store the results: max through CX (maxout), min through DX (minout).
LBB6_8:
	WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi
	WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax
	VZEROUPPER
	RET

// NOTE(review): the text below opens LCDATA5 and _uint64_max_min_avx2, whose body continues on the following lines of the listing and is not complete here; it is left exactly as found (viewer line numbers included). It also loads BP with a $0 frame, so it presumably needs the same $8-32 frame change - confirm once the whole routine is visible.
786 DATA LCDATA5<>+0x000(SB)/8, $0x8000000000000000 787 GLOBL LCDATA5<>(SB), 8, $8 788 789 TEXT ·_uint64_max_min_avx2(SB), $0-32 790 791 MOVQ values+0(FP), DI 792 MOVQ length+8(FP), SI 793 MOVQ
minout+16(FP), DX 794 MOVQ maxout+24(FP), CX 795 LEAQ LCDATA5<>(SB), BP 796 797 WORD $0xf685 // test esi, esi 798 JLE LBB7_1 799 WORD $0x8941; BYTE $0xf0 // mov r8d, esi 800 WORD $0xfe83; BYTE $0x0f // cmp esi, 15 801 JA LBB7_4 802 LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1 803 WORD $0x3145; BYTE $0xc9 // xor r9d, r9d 804 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d 805 JMP LBB7_7 806 807 LBB7_1: 808 LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1 809 WORD $0xf631 // xor esi, esi 810 JMP LBB7_8 811 812 LBB7_4: 813 WORD $0x8945; BYTE $0xc1 // mov r9d, r8d 814 LONG $0xf0e18341 // and r9d, -16 815 LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 816 LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 817 WORD $0xc031 // xor eax, eax 818 LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI7_0] */ 819 LONG $0xe476ddc5 // vpcmpeqd ymm4, ymm4, ymm4 820 LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3 821 LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 822 LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 823 LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7 824 LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6 825 826 LBB7_5: 827 LONG $0x0c6f7ec5; BYTE $0xc7 // vmovdqu ymm9, yword [rdi + 8*rax] 828 LONG $0xd0ef75c5 // vpxor ymm10, ymm1, ymm0 829 LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 830 LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 831 LONG $0x4b35e3c4; WORD $0xa0c9 // vblendvpd ymm1, ymm9, ymm1, ymm10 832 LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0 833 LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 834 LONG $0x4b35e3c4; WORD $0xa0ed // vblendvpd ymm5, ymm9, ymm5, ymm10 835 LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32] 836 LONG $0xd0ef5dc5 // vpxor ymm10, ymm4, ymm0 837 LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 838 LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 839 LONG $0x4b35e3c4; WORD $0xa0e4 // vblendvpd ymm4, ymm9, ymm4, ymm10 840 LONG $0xd0ef3dc5 // vpxor ymm10, 
ymm8, ymm0 841 LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 842 LONG $0x5c6f7ec5; WORD $0x40c7 // vmovdqu ymm11, yword [rdi + 8*rax + 64] 843 LONG $0x4b3543c4; WORD $0xa0c0 // vblendvpd ymm8, ymm9, ymm8, ymm10 844 LONG $0xc8ef65c5 // vpxor ymm9, ymm3, ymm0 845 LONG $0xd0ef25c5 // vpxor ymm10, ymm11, ymm0 846 LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9 847 LONG $0x4b25e3c4; WORD $0x90db // vblendvpd ymm3, ymm11, ymm3, ymm9 848 LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0 849 LONG $0x373542c4; BYTE $0xca // vpcmpgtq ymm9, ymm9, ymm10 850 LONG $0x4b25e3c4; WORD $0x90ff // vblendvpd ymm7, ymm11, ymm7, ymm9 851 LONG $0x4c6f7ec5; WORD $0x60c7 // vmovdqu ymm9, yword [rdi + 8*rax + 96] 852 LONG $0xd0ef6dc5 // vpxor ymm10, ymm2, ymm0 853 LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 854 LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 855 LONG $0x4b35e3c4; WORD $0xa0d2 // vblendvpd ymm2, ymm9, ymm2, ymm10 856 LONG $0xd0ef4dc5 // vpxor ymm10, ymm6, ymm0 857 LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 858 LONG $0x4b35e3c4; WORD $0xa0f6 // vblendvpd ymm6, ymm9, ymm6, ymm10 859 LONG $0x10c08348 // add rax, 16 860 WORD $0x3949; BYTE $0xc1 // cmp r9, rax 861 JNE LBB7_5 862 LONG $0xc8ef3dc5 // vpxor ymm9, ymm8, ymm0 863 LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0 864 LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9 865 LONG $0x4b3de3c4; WORD $0x90ed // vblendvpd ymm5, ymm8, ymm5, ymm9 866 LONG $0xc05755c5 // vxorpd ymm8, ymm5, ymm0 867 LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0 868 LONG $0x373d42c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm8, ymm9 869 LONG $0x4b45e3c4; WORD $0x80ed // vblendvpd ymm5, ymm7, ymm5, ymm8 870 LONG $0xf857d5c5 // vxorpd ymm7, ymm5, ymm0 871 LONG $0xc0ef4dc5 // vpxor ymm8, ymm6, ymm0 872 LONG $0x3745c2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm7, ymm8 873 LONG $0x4b4de3c4; WORD $0x70ed // vblendvpd ymm5, ymm6, ymm5, ymm7 874 LONG $0x197de3c4; WORD $0x01ee // vextractf128 xmm6, ymm5, 1 875 LONG $0xc05749c5 
// vxorpd xmm8, xmm6, xmm0 876 LONG $0xf857d1c5 // vxorpd xmm7, xmm5, xmm0 877 LONG $0x3741c2c4; BYTE $0xf8 // vpcmpgtq xmm7, xmm7, xmm8 878 LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7 879 LONG $0x0479e3c4; WORD $0x4ef5 // vpermilps xmm6, xmm5, 78 880 LONG $0xc05751c5 // vxorpd xmm8, xmm5, xmm0 881 LONG $0xf857c9c5 // vxorpd xmm7, xmm6, xmm0 882 LONG $0x3739e2c4; BYTE $0xff // vpcmpgtq xmm7, xmm8, xmm7 883 LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7 884 LONG $0xf0eff5c5 // vpxor ymm6, ymm1, ymm0 885 LONG $0xf8efddc5 // vpxor ymm7, ymm4, ymm0 886 LONG $0x3745e2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm7, ymm6 887 LONG $0x4b5de3c4; WORD $0x60c9 // vblendvpd ymm1, ymm4, ymm1, ymm6 888 LONG $0xe057f5c5 // vxorpd ymm4, ymm1, ymm0 889 LONG $0xf0efe5c5 // vpxor ymm6, ymm3, ymm0 890 LONG $0x374de2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm6, ymm4 891 LONG $0x4b65e3c4; WORD $0x40c9 // vblendvpd ymm1, ymm3, ymm1, ymm4 892 LONG $0x7ef9c1c4; BYTE $0xea // vmovq r10, xmm5 893 LONG $0xd857f5c5 // vxorpd ymm3, ymm1, ymm0 894 LONG $0xe0efedc5 // vpxor ymm4, ymm2, ymm0 895 LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 896 LONG $0x4b6de3c4; WORD $0x30c9 // vblendvpd ymm1, ymm2, ymm1, ymm3 897 LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 898 LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 899 LONG $0xe057e9c5 // vxorpd xmm4, xmm2, xmm0 900 LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3 901 LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 902 LONG $0x0479e3c4; WORD $0x4ed1 // vpermilps xmm2, xmm1, 78 903 LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 904 LONG $0xc057e9c5 // vxorpd xmm0, xmm2, xmm0 905 LONG $0x3779e2c4; BYTE $0xc3 // vpcmpgtq xmm0, xmm0, xmm3 906 LONG $0x4b69e3c4; WORD $0x00c1 // vblendvpd xmm0, xmm2, xmm1, xmm0 907 LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 908 WORD $0x894c; BYTE $0xd6 // mov rsi, r10 909 WORD $0x394d; BYTE $0xc1 // cmp r9, r8 910 JE LBB7_8 911 912 LBB7_7: 913 LONG 
$0xcf348b4a // mov rsi, qword [rdi + 8*r9] 914 WORD $0x3948; BYTE $0xf0 // cmp rax, rsi 915 LONG $0xc6430f48 // cmovae rax, rsi 916 WORD $0x3949; BYTE $0xf2 // cmp r10, rsi 917 LONG $0xf2470f49 // cmova rsi, r10 918 LONG $0x01c18349 // add r9, 1 919 WORD $0x8949; BYTE $0xf2 // mov r10, rsi 920 WORD $0x394d; BYTE $0xc8 // cmp r8, r9 921 JNE LBB7_7 922 923 LBB7_8: 924 WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi 925 WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax 926 VZEROUPPER 927 RET