//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// Source: github.com/apache/arrow/go/v14@v14.0.2/internal/utils/min_max_sse4_amd64.s
//
// Every routine below loads the same four arguments from the Go frame:
//   values+0(FP)  -> DI   address of the first element
//   length+8(FP)  -> SI   element count (only the low 32 bits, esi, are used)
//   minout+16(FP) -> DX   address that receives the minimum
//   maxout+24(FP) -> CX   address that receives the maximum
// and has the same shape: an early exit for a non-positive count, a vector
// loop unrolled twice, an optional odd vector step, a horizontal reduction,
// and a scalar loop for the remaining elements.
//
// NOTE(review): the signed routines and the uint64 routine overwrite BP with
// the address of a constant table (LEAQ LCDATAn<>(SB), BP) while declaring a
// $0 frame, and nothing in them restores BP. Presumably the assembler emits no
// BP save/restore for a zero-size frame, which would leave the caller's frame
// pointer clobbered on return -- confirm (e.g. with the go vet framepointer
// check) and fix in the generator rather than by hand.

// LCDATA1: 16 bytes of 0x80 (offset 0) followed by 16 bytes of 0x7f (offset 16).
DATA LCDATA1<>+0x000(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x008(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x010(SB)/8, $0x7f7f7f7f7f7f7f7f
DATA LCDATA1<>+0x018(SB)/8, $0x7f7f7f7f7f7f7f7f
GLOBL LCDATA1<>(SB), 8, $32

// _int8_max_min_sse4 writes the signed minimum of the int8 elements at
// `values` to *minout and the signed maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 127, max = -128. Counts of 32 or more use the vector
// path (32 bytes per step); the remaining elements go through the scalar
// loop at LBB0_11.
TEXT ·_int8_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA1<>(SB), BP // BP = &LCDATA1 (previous BP value is overwritten)

	WORD $0xf685 // test esi, esi
	JLE LBB0_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x1f // cmp esi, 31
	JA LBB0_4
	WORD $0xb041; BYTE $0x80 // mov r8b, -128
	WORD $0xb640; BYTE $0x7f // mov sil, 127
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	JMP LBB0_11

// Non-positive count: store the initial values (min = 127, max = -128).
LBB0_1:
	WORD $0xb640; BYTE $0x7f // mov sil, 127
	WORD $0xb041; BYTE $0x80 // mov r8b, -128
	JMP LBB0_12

// Vector setup: r11 = count rounded down to a multiple of 32,
// r8 = number of 32-byte steps, r10 = -(r8 & ~1) as the loop counter.
LBB0_4:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xe0e38341 // and r11d, -32
	LONG $0xe0438d49 // lea rax, [r11 - 32]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x05e8c149 // shr r8, 5
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB0_5
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI0_0] */
	LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI0_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06f0f66 // movdqa xmm2, xmm0
	LONG $0xd96f0f66 // movdqa xmm3, xmm1

// Main loop: four 16-byte loads (64 bytes) per iteration.
// xmm0/xmm2 accumulate minima, xmm1/xmm3 accumulate maxima.
LBB0_7:
	LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
	LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
	LONG $0x746f0ff3; WORD $0x2007 // movdqu xmm6, oword [rdi + rax + 32]
	LONG $0x7c6f0ff3; WORD $0x3007 // movdqu xmm7, oword [rdi + rax + 48]
	LONG $0x38380f66; BYTE $0xc4 // pminsb xmm0, xmm4
	LONG $0x38380f66; BYTE $0xd5 // pminsb xmm2, xmm5
	LONG $0x3c380f66; BYTE $0xcc // pmaxsb xmm1, xmm4
	LONG $0x3c380f66; BYTE $0xdd // pmaxsb xmm3, xmm5
	LONG $0x38380f66; BYTE $0xc6 // pminsb xmm0, xmm6
	LONG $0x38380f66; BYTE $0xd7 // pminsb xmm2, xmm7
	LONG $0x3c380f66; BYTE $0xce // pmaxsb xmm1, xmm6
	LONG $0x3c380f66; BYTE $0xdf // pmaxsb xmm3, xmm7
	LONG $0x40c08348 // add rax, 64
	LONG $0x02c28349 // add r10, 2
	JNE LBB0_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB0_10

// One additional 32-byte step, executed when the step count in r8 is odd.
LBB0_9:
	LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
	LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
	LONG $0x3c380f66; BYTE $0xdd // pmaxsb xmm3, xmm5
	LONG $0x3c380f66; BYTE $0xcc // pmaxsb xmm1, xmm4
	LONG $0x38380f66; BYTE $0xd5 // pminsb xmm2, xmm5
	LONG $0x38380f66; BYTE $0xc4 // pminsb xmm0, xmm4

// Horizontal reduction: r8b receives the maximum, sil the minimum.
// The accumulators are XORed with the LCDATA1 constants before phminposuw
// and the extracted byte is XORed back afterwards.
LBB0_10:
	LONG $0x38380f66; BYTE $0xc2 // pminsb xmm0, xmm2
	LONG $0x3c380f66; BYTE $0xcb // pmaxsb xmm1, xmm3
	LONG $0x4def0f66; BYTE $0x10 // pxor xmm1, oword 16[rbp] /* [rip + .LCPI0_1] */
	LONG $0xd16f0f66 // movdqa xmm2, xmm1
	LONG $0xd2710f66; BYTE $0x08 // psrlw xmm2, 8
	LONG $0xd1da0f66 // pminub xmm2, xmm1
	LONG $0x41380f66; BYTE $0xca // phminposuw xmm1, xmm2
	LONG $0x7e0f4166; BYTE $0xc8 // movd r8d, xmm1
	LONG $0x7ff08041 // xor r8b, 127
	LONG $0x45ef0f66; BYTE $0x00 // pxor xmm0, oword 0[rbp] /* [rip + .LCPI0_0] */
	LONG $0xc86f0f66 // movdqa xmm1, xmm0
	LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8
	LONG $0xc8da0f66 // pminub xmm1, xmm0
	LONG $0x41380f66; BYTE $0xc1 // phminposuw xmm0, xmm1
	LONG $0xc67e0f66 // movd esi, xmm0
	LONG $0x80f68040 // xor sil, -128
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB0_12

// Scalar loop over elements r11 .. r9-1 (signed compares).
LBB0_11:
	LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte [rdi + r11]
	WORD $0x3840; BYTE $0xc6 // cmp sil, al
	LONG $0xf6b60f40 // movzx esi, sil
	WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
	WORD $0x3841; BYTE $0xc0 // cmp r8b, al
	LONG $0xc0b60f45 // movzx r8d, r8b
	LONG $0xc04c0f44 // cmovl r8d, eax
	LONG $0x01c38349 // add r11, 1
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB0_11

// Store results: *maxout (rcx) = r8b, *minout (rdx) = sil.
LBB0_12:
	WORD $0x8844; BYTE $0x01 // mov byte [rcx], r8b
	WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
	RET

// Reached when rax == 0 (a single 32-byte step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB0_5:
	LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI0_0] */
	LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI0_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06f0f66 // movdqa xmm2, xmm0
	LONG $0xd96f0f66 // movdqa xmm3, xmm1
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB0_9
	JMP LBB0_10

// _uint8_max_min_sse4 writes the unsigned minimum of the uint8 elements at
// `values` to *minout and the unsigned maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 0xff, max = 0. Counts of 32 or more use the vector path
// (32 bytes per step); the remaining elements go through the scalar loop at
// LBB1_11. This routine does not use a constant table and does not touch BP.
TEXT ·_uint8_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX

	WORD $0xf685 // test esi, esi
	JLE LBB1_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x1f // cmp esi, 31
	JA LBB1_4
	WORD $0xb640; BYTE $0xff // mov sil, -1
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	WORD $0xc031 // xor eax, eax
	JMP LBB1_11

// Non-positive count: store the initial values (min = 0xff, max = 0).
LBB1_1:
	WORD $0xb640; BYTE $0xff // mov sil, -1
	WORD $0xc031 // xor eax, eax
	JMP LBB1_12

// Vector setup: r11 = count rounded down to a multiple of 32,
// r8 = number of 32-byte steps, r10 = -(r8 & ~1) as the loop counter.
LBB1_4:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xe0e38341 // and r11d, -32
	LONG $0xe0438d49 // lea rax, [r11 - 32]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x05e8c149 // shr r8, 5
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB1_5
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0xc9ef0f66 // pxor xmm1, xmm1
	LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
	WORD $0xc031 // xor eax, eax
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xdbef0f66 // pxor xmm3, xmm3

// Main loop: four 16-byte loads (64 bytes) per iteration.
// xmm0/xmm2 (all ones) accumulate minima, xmm1/xmm3 (zero) accumulate maxima.
LBB1_7:
	LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
	LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
	LONG $0x746f0ff3; WORD $0x2007 // movdqu xmm6, oword [rdi + rax + 32]
	LONG $0x7c6f0ff3; WORD $0x3007 // movdqu xmm7, oword [rdi + rax + 48]
	LONG $0xc4da0f66 // pminub xmm0, xmm4
	LONG $0xd5da0f66 // pminub xmm2, xmm5
	LONG $0xccde0f66 // pmaxub xmm1, xmm4
	LONG $0xddde0f66 // pmaxub xmm3, xmm5
	LONG $0xc6da0f66 // pminub xmm0, xmm6
	LONG $0xd7da0f66 // pminub xmm2, xmm7
	LONG $0xcede0f66 // pmaxub xmm1, xmm6
	LONG $0xdfde0f66 // pmaxub xmm3, xmm7
	LONG $0x40c08348 // add rax, 64
	LONG $0x02c28349 // add r10, 2
	JNE LBB1_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB1_10

// One additional 32-byte step, executed when the step count in r8 is odd.
LBB1_9:
	LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
	LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
	LONG $0xddde0f66 // pmaxub xmm3, xmm5
	LONG $0xccde0f66 // pmaxub xmm1, xmm4
	LONG $0xd5da0f66 // pminub xmm2, xmm5
	LONG $0xc4da0f66 // pminub xmm0, xmm4

// Horizontal reduction: al receives the maximum, sil the minimum.
// The maximum is found by complementing, taking the minimum, and
// complementing the extracted byte again (not al).
LBB1_10:
	LONG $0xc2da0f66 // pminub xmm0, xmm2
	LONG $0xcbde0f66 // pmaxub xmm1, xmm3
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xd1ef0f66 // pxor xmm2, xmm1
	LONG $0xca6f0f66 // movdqa xmm1, xmm2
	LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8
	LONG $0xcada0f66 // pminub xmm1, xmm2
	LONG $0x41380f66; BYTE $0xc9 // phminposuw xmm1, xmm1
	LONG $0xc87e0f66 // movd eax, xmm1
	WORD $0xd0f6 // not al
	LONG $0xc86f0f66 // movdqa xmm1, xmm0
	LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8
	LONG $0xc8da0f66 // pminub xmm1, xmm0
	LONG $0x41380f66; BYTE $0xc1 // phminposuw xmm0, xmm1
	LONG $0xc67e0f66 // movd esi, xmm0
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB1_12

// Scalar loop over elements r11 .. r9-1 (unsigned compares).
LBB1_11:
	LONG $0x04b60f46; BYTE $0x1f // movzx r8d, byte [rdi + r11]
	WORD $0x3844; BYTE $0xc6 // cmp sil, r8b
	LONG $0xf6b60f40 // movzx esi, sil
	LONG $0xf0430f41 // cmovae esi, r8d
	WORD $0x3844; BYTE $0xc0 // cmp al, r8b
	WORD $0xb60f; BYTE $0xc0 // movzx eax, al
	LONG $0xc0460f41 // cmovbe eax, r8d
	LONG $0x01c38349 // add r11, 1
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB1_11

// Store results: *maxout (rcx) = al, *minout (rdx) = sil.
LBB1_12:
	WORD $0x0188 // mov byte [rcx], al
	WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
	RET

// Reached when rax == 0 (a single 32-byte step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB1_5:
	LONG $0xc9ef0f66 // pxor xmm1, xmm1
	LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
	WORD $0xc031 // xor eax, eax
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xdbef0f66 // pxor xmm3, xmm3
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB1_9
	JMP LBB1_10

// LCDATA2: eight 16-bit words of 0x8000 (offset 0) followed by eight words
// of 0x7fff (offset 16).
DATA LCDATA2<>+0x000(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x008(SB)/8, $0x8000800080008000
DATA LCDATA2<>+0x010(SB)/8, $0x7fff7fff7fff7fff
DATA LCDATA2<>+0x018(SB)/8, $0x7fff7fff7fff7fff
GLOBL LCDATA2<>(SB), 8, $32

// _int16_max_min_sse4 writes the signed minimum of the int16 elements at
// `values` to *minout and the signed maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 32767, max = -32768. Counts of 16 or more use the vector
// path (16 elements per step); the remaining elements go through the scalar
// loop at LBB2_11.
TEXT ·_int16_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA2<>(SB), BP // BP = &LCDATA2 (previous BP value is overwritten)

	WORD $0xf685 // test esi, esi
	JLE LBB2_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x0f // cmp esi, 15
	JA LBB2_4
	LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
	LONG $0x7fffbe66 // mov si, 32767
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	JMP LBB2_11

// Non-positive count: store the initial values (min = 32767, max = -32768).
LBB2_1:
	LONG $0x7fffbe66 // mov si, 32767
	LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
	JMP LBB2_12

// Vector setup: r11 = count rounded down to a multiple of 16,
// r8 = number of 16-element steps, r10 = -(r8 & ~1) as the loop counter.
LBB2_4:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xf0e38341 // and r11d, -16
	LONG $0xf0438d49 // lea rax, [r11 - 16]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x04e8c149 // shr r8, 4
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB2_5
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */
	LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI2_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06f0f66 // movdqa xmm2, xmm0
	LONG $0xd96f0f66 // movdqa xmm3, xmm1

// Main loop: four 16-byte loads (32 elements) per iteration; rax is an
// element index scaled by 2 in the addressing.
LBB2_7:
	LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
	LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
	LONG $0x746f0ff3; WORD $0x2047 // movdqu xmm6, oword [rdi + 2*rax + 32]
	LONG $0x7c6f0ff3; WORD $0x3047 // movdqu xmm7, oword [rdi + 2*rax + 48]
	LONG $0xc4ea0f66 // pminsw xmm0, xmm4
	LONG $0xd5ea0f66 // pminsw xmm2, xmm5
	LONG $0xccee0f66 // pmaxsw xmm1, xmm4
	LONG $0xddee0f66 // pmaxsw xmm3, xmm5
	LONG $0xc6ea0f66 // pminsw xmm0, xmm6
	LONG $0xd7ea0f66 // pminsw xmm2, xmm7
	LONG $0xceee0f66 // pmaxsw xmm1, xmm6
	LONG $0xdfee0f66 // pmaxsw xmm3, xmm7
	LONG $0x20c08348 // add rax, 32
	LONG $0x02c28349 // add r10, 2
	JNE LBB2_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB2_10

// One additional 16-element step, executed when the step count in r8 is odd.
LBB2_9:
	LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
	LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
	LONG $0xddee0f66 // pmaxsw xmm3, xmm5
	LONG $0xccee0f66 // pmaxsw xmm1, xmm4
	LONG $0xd5ea0f66 // pminsw xmm2, xmm5
	LONG $0xc4ea0f66 // pminsw xmm0, xmm4

// Horizontal reduction: r8 receives the maximum, esi the minimum.
// The accumulators are XORed with the LCDATA2 constants before phminposuw
// and the extracted value is XORed back afterwards.
LBB2_10:
	LONG $0xc2ea0f66 // pminsw xmm0, xmm2
	LONG $0xcbee0f66 // pmaxsw xmm1, xmm3
	LONG $0x4def0f66; BYTE $0x10 // pxor xmm1, oword 16[rbp] /* [rip + .LCPI2_1] */
	LONG $0x41380f66; BYTE $0xc9 // phminposuw xmm1, xmm1
	LONG $0x7e0f4166; BYTE $0xc8 // movd r8d, xmm1
	LONG $0xfff08141; WORD $0x007f; BYTE $0x00 // xor r8d, 32767
	LONG $0x45ef0f66; BYTE $0x00 // pxor xmm0, oword 0[rbp] /* [rip + .LCPI2_0] */
	LONG $0x41380f66; BYTE $0xc0 // phminposuw xmm0, xmm0
	LONG $0xc67e0f66 // movd esi, xmm0
	LONG $0x8000f681; WORD $0x0000 // xor esi, 32768
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB2_12

// Scalar loop over elements r11 .. r9-1 (signed compares).
LBB2_11:
	LONG $0x04b70f42; BYTE $0x5f // movzx eax, word [rdi + 2*r11]
	WORD $0x3966; BYTE $0xc6 // cmp si, ax
	WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
	LONG $0xc0394166 // cmp r8w, ax
	LONG $0xc04c0f44 // cmovl r8d, eax
	LONG $0x01c38349 // add r11, 1
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB2_11

// Store results: *maxout (rcx) = r8w, *minout (rdx) = si.
LBB2_12:
	LONG $0x01894466 // mov word [rcx], r8w
	WORD $0x8966; BYTE $0x32 // mov word [rdx], si
	RET

// Reached when rax == 0 (a single 16-element step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB2_5:
	LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */
	LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI2_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06f0f66 // movdqa xmm2, xmm0
	LONG $0xd96f0f66 // movdqa xmm3, xmm1
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB2_9
	JMP LBB2_10

// _uint16_max_min_sse4 writes the unsigned minimum of the uint16 elements at
// `values` to *minout and the unsigned maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 0xffff, max = 0. Counts of 16 or more use the vector
// path (16 elements per step); the remaining elements go through the scalar
// loop at LBB3_11. This routine does not use a constant table and does not
// touch BP.
TEXT ·_uint16_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX

	WORD $0xf685 // test esi, esi
	JLE LBB3_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x0f // cmp esi, 15
	JA LBB3_4
	LONG $0xffb84166; BYTE $0xff // mov r8w, -1
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	WORD $0xf631 // xor esi, esi
	JMP LBB3_11

// Non-positive count: store the initial values (min = 0xffff, max = 0).
LBB3_1:
	LONG $0xffb84166; BYTE $0xff // mov r8w, -1
	WORD $0xf631 // xor esi, esi
	JMP LBB3_12

// Vector setup: r11 = count rounded down to a multiple of 16,
// r8 = number of 16-element steps, r10 = -(r8 & ~1) as the loop counter.
LBB3_4:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xf0e38341 // and r11d, -16
	LONG $0xf0438d49 // lea rax, [r11 - 16]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x04e8c149 // shr r8, 4
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB3_5
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0xc9ef0f66 // pxor xmm1, xmm1
	LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
	WORD $0xc031 // xor eax, eax
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xdbef0f66 // pxor xmm3, xmm3

// Main loop: four 16-byte loads (32 elements) per iteration.
// xmm0/xmm2 (all ones) accumulate minima, xmm1/xmm3 (zero) accumulate maxima.
LBB3_7:
	LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
	LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
	LONG $0x746f0ff3; WORD $0x2047 // movdqu xmm6, oword [rdi + 2*rax + 32]
	LONG $0x7c6f0ff3; WORD $0x3047 // movdqu xmm7, oword [rdi + 2*rax + 48]
	LONG $0x3a380f66; BYTE $0xc4 // pminuw xmm0, xmm4
	LONG $0x3a380f66; BYTE $0xd5 // pminuw xmm2, xmm5
	LONG $0x3e380f66; BYTE $0xcc // pmaxuw xmm1, xmm4
	LONG $0x3e380f66; BYTE $0xdd // pmaxuw xmm3, xmm5
	LONG $0x3a380f66; BYTE $0xc6 // pminuw xmm0, xmm6
	LONG $0x3a380f66; BYTE $0xd7 // pminuw xmm2, xmm7
	LONG $0x3e380f66; BYTE $0xce // pmaxuw xmm1, xmm6
	LONG $0x3e380f66; BYTE $0xdf // pmaxuw xmm3, xmm7
	LONG $0x20c08348 // add rax, 32
	LONG $0x02c28349 // add r10, 2
	JNE LBB3_7
	LONG $0x01c0f641 // test r8b, 1
	JE LBB3_10

// One additional 16-element step, executed when the step count in r8 is odd.
LBB3_9:
	LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
	LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
	LONG $0x3e380f66; BYTE $0xdd // pmaxuw xmm3, xmm5
	LONG $0x3e380f66; BYTE $0xcc // pmaxuw xmm1, xmm4
	LONG $0x3a380f66; BYTE $0xd5 // pminuw xmm2, xmm5
	LONG $0x3a380f66; BYTE $0xc4 // pminuw xmm0, xmm4

// Horizontal reduction: esi receives the maximum (complement, phminposuw,
// complement again), r8d the minimum.
LBB3_10:
	LONG $0x3a380f66; BYTE $0xc2 // pminuw xmm0, xmm2
	LONG $0x3e380f66; BYTE $0xcb // pmaxuw xmm1, xmm3
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xd1ef0f66 // pxor xmm2, xmm1
	LONG $0x41380f66; BYTE $0xca // phminposuw xmm1, xmm2
	LONG $0xce7e0f66 // movd esi, xmm1
	WORD $0xd6f7 // not esi
	LONG $0x41380f66; BYTE $0xc0 // phminposuw xmm0, xmm0
	LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB3_12

// Scalar loop over elements r11 .. r9-1 (unsigned compares).
LBB3_11:
	LONG $0x04b70f42; BYTE $0x5f // movzx eax, word [rdi + 2*r11]
	LONG $0xc0394166 // cmp r8w, ax
	LONG $0xc0430f44 // cmovae r8d, eax
	WORD $0x3966; BYTE $0xc6 // cmp si, ax
	WORD $0x460f; BYTE $0xf0 // cmovbe esi, eax
	LONG $0x01c38349 // add r11, 1
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB3_11

// Store results: *maxout (rcx) = si, *minout (rdx) = r8w.
LBB3_12:
	WORD $0x8966; BYTE $0x31 // mov word [rcx], si
	LONG $0x02894466 // mov word [rdx], r8w
	RET

// Reached when rax == 0 (a single 16-element step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB3_5:
	LONG $0xc9ef0f66 // pxor xmm1, xmm1
	LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
	WORD $0xc031 // xor eax, eax
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xdbef0f66 // pxor xmm3, xmm3
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB3_9
	JMP LBB3_10

// LCDATA3: four 32-bit words of 0x80000000 (offset 0) followed by four words
// of 0x7fffffff (offset 16).
DATA LCDATA3<>+0x000(SB)/8, $0x8000000080000000
DATA LCDATA3<>+0x008(SB)/8, $0x8000000080000000
DATA LCDATA3<>+0x010(SB)/8, $0x7fffffff7fffffff
DATA LCDATA3<>+0x018(SB)/8, $0x7fffffff7fffffff
GLOBL LCDATA3<>(SB), 8, $32

// _int32_max_min_sse4 writes the signed minimum of the int32 elements at
// `values` to *minout and the signed maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 2147483647, max = -2147483648. Counts of 8 or more use
// the vector path (8 elements per step); the remaining elements go through
// the scalar loop at LBB4_5.
TEXT ·_int32_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA3<>(SB), BP // BP = &LCDATA3 (previous BP value is overwritten)

	WORD $0xf685 // test esi, esi
	JLE LBB4_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x07 // cmp esi, 7
	JA LBB4_6
	LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648
	LONG $0xffffb841; WORD $0x7fff // mov r8d, 2147483647
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	JMP LBB4_4

// Non-positive count: store the initial values.
LBB4_1:
	LONG $0xffffb841; WORD $0x7fff // mov r8d, 2147483647
	LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648
	JMP LBB4_13

// Vector setup: r11 = count rounded down to a multiple of 8,
// r8 = number of 8-element steps, r10 = -(r8 & ~1) as the loop counter.
LBB4_6:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xf8e38341 // and r11d, -8
	LONG $0xf8438d49 // lea rax, [r11 - 8]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x03e8c149 // shr r8, 3
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB4_7
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI4_0] */
	LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI4_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06f0f66 // movdqa xmm2, xmm0
	LONG $0xd96f0f66 // movdqa xmm3, xmm1

// Main loop: four 16-byte loads (16 elements) per iteration; rax is an
// element index scaled by 4 in the addressing.
LBB4_9:
	LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
	LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
	LONG $0x746f0ff3; WORD $0x2087 // movdqu xmm6, oword [rdi + 4*rax + 32]
	LONG $0x7c6f0ff3; WORD $0x3087 // movdqu xmm7, oword [rdi + 4*rax + 48]
	LONG $0x39380f66; BYTE $0xc4 // pminsd xmm0, xmm4
	LONG $0x39380f66; BYTE $0xd5 // pminsd xmm2, xmm5
	LONG $0x3d380f66; BYTE $0xcc // pmaxsd xmm1, xmm4
	LONG $0x3d380f66; BYTE $0xdd // pmaxsd xmm3, xmm5
	LONG $0x39380f66; BYTE $0xc6 // pminsd xmm0, xmm6
	LONG $0x39380f66; BYTE $0xd7 // pminsd xmm2, xmm7
	LONG $0x3d380f66; BYTE $0xce // pmaxsd xmm1, xmm6
	LONG $0x3d380f66; BYTE $0xdf // pmaxsd xmm3, xmm7
	LONG $0x10c08348 // add rax, 16
	LONG $0x02c28349 // add r10, 2
	JNE LBB4_9
	LONG $0x01c0f641 // test r8b, 1
	JE LBB4_12

// One additional 8-element step, executed when the step count in r8 is odd.
LBB4_11:
	LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
	LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
	LONG $0x3d380f66; BYTE $0xdd // pmaxsd xmm3, xmm5
	LONG $0x3d380f66; BYTE $0xcc // pmaxsd xmm1, xmm4
	LONG $0x39380f66; BYTE $0xd5 // pminsd xmm2, xmm5
	LONG $0x39380f66; BYTE $0xc4 // pminsd xmm0, xmm4

// Horizontal reduction with pshufd + pmaxsd/pminsd:
// eax receives the maximum, r8d the minimum.
LBB4_12:
	LONG $0x39380f66; BYTE $0xc2 // pminsd xmm0, xmm2
	LONG $0x3d380f66; BYTE $0xcb // pmaxsd xmm1, xmm3
	LONG $0xd1700f66; BYTE $0x4e // pshufd xmm2, xmm1, 78
	LONG $0x3d380f66; BYTE $0xd1 // pmaxsd xmm2, xmm1
	LONG $0xca700f66; BYTE $0xe5 // pshufd xmm1, xmm2, 229
	LONG $0x3d380f66; BYTE $0xca // pmaxsd xmm1, xmm2
	LONG $0xc87e0f66 // movd eax, xmm1
	LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78
	LONG $0x39380f66; BYTE $0xc8 // pminsd xmm1, xmm0
	LONG $0xc1700f66; BYTE $0xe5 // pshufd xmm0, xmm1, 229
	LONG $0x39380f66; BYTE $0xc1 // pminsd xmm0, xmm1
	LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB4_13

// Scalar loop entry: the running maximum is moved from eax to esi.
LBB4_4:
	WORD $0xc689 // mov esi, eax

// Scalar loop over elements r11 .. r9-1 (signed compares).
LBB4_5:
	LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11]
	WORD $0x3941; BYTE $0xc0 // cmp r8d, eax
	LONG $0xc04f0f44 // cmovg r8d, eax
	WORD $0xc639 // cmp esi, eax
	WORD $0x4d0f; BYTE $0xc6 // cmovge eax, esi
	LONG $0x01c38349 // add r11, 1
	WORD $0xc689 // mov esi, eax
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB4_5

// Store results: *maxout (rcx) = eax, *minout (rdx) = r8d.
LBB4_13:
	WORD $0x0189 // mov dword [rcx], eax
	WORD $0x8944; BYTE $0x02 // mov dword [rdx], r8d
	RET

// Reached when rax == 0 (a single 8-element step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB4_7:
	LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI4_0] */
	LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI4_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0xd06f0f66 // movdqa xmm2, xmm0
	LONG $0xd96f0f66 // movdqa xmm3, xmm1
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB4_11
	JMP LBB4_12

// _uint32_max_min_sse4 writes the unsigned minimum of the uint32 elements at
// `values` to *minout and the unsigned maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 0xffffffff, max = 0. Counts of 8 or more use the vector
// path (8 elements per step); the remaining elements go through the scalar
// loop at LBB5_5. This routine does not use a constant table and does not
// touch BP.
TEXT ·_uint32_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX

	WORD $0xf685 // test esi, esi
	JLE LBB5_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x07 // cmp esi, 7
	JA LBB5_6
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	LONG $0xffffb841; WORD $0xffff // mov r8d, -1
	WORD $0xf631 // xor esi, esi
	JMP LBB5_4

// Non-positive count: store the initial values (min = 0xffffffff, max = 0).
LBB5_1:
	LONG $0xffffb841; WORD $0xffff // mov r8d, -1
	WORD $0xf631 // xor esi, esi
	JMP LBB5_13

// Vector setup: r11 = count rounded down to a multiple of 8,
// r8 = number of 8-element steps, r10 = -(r8 & ~1) as the loop counter.
LBB5_6:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xf8e38341 // and r11d, -8
	LONG $0xf8438d49 // lea rax, [r11 - 8]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x03e8c149 // shr r8, 3
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB5_7
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0xc9ef0f66 // pxor xmm1, xmm1
	LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
	WORD $0xc031 // xor eax, eax
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xdbef0f66 // pxor xmm3, xmm3

// Main loop: four 16-byte loads (16 elements) per iteration.
// xmm0/xmm2 (all ones) accumulate minima, xmm1/xmm3 (zero) accumulate maxima.
LBB5_9:
	LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
	LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
	LONG $0x746f0ff3; WORD $0x2087 // movdqu xmm6, oword [rdi + 4*rax + 32]
	LONG $0x7c6f0ff3; WORD $0x3087 // movdqu xmm7, oword [rdi + 4*rax + 48]
	LONG $0x3b380f66; BYTE $0xc4 // pminud xmm0, xmm4
	LONG $0x3b380f66; BYTE $0xd5 // pminud xmm2, xmm5
	LONG $0x3f380f66; BYTE $0xcc // pmaxud xmm1, xmm4
	LONG $0x3f380f66; BYTE $0xdd // pmaxud xmm3, xmm5
	LONG $0x3b380f66; BYTE $0xc6 // pminud xmm0, xmm6
	LONG $0x3b380f66; BYTE $0xd7 // pminud xmm2, xmm7
	LONG $0x3f380f66; BYTE $0xce // pmaxud xmm1, xmm6
	LONG $0x3f380f66; BYTE $0xdf // pmaxud xmm3, xmm7
	LONG $0x10c08348 // add rax, 16
	LONG $0x02c28349 // add r10, 2
	JNE LBB5_9
	LONG $0x01c0f641 // test r8b, 1
	JE LBB5_12

// One additional 8-element step, executed when the step count in r8 is odd.
LBB5_11:
	LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
	LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
	LONG $0x3f380f66; BYTE $0xdd // pmaxud xmm3, xmm5
	LONG $0x3f380f66; BYTE $0xcc // pmaxud xmm1, xmm4
	LONG $0x3b380f66; BYTE $0xd5 // pminud xmm2, xmm5
	LONG $0x3b380f66; BYTE $0xc4 // pminud xmm0, xmm4

// Horizontal reduction with pshufd + pmaxud/pminud:
// esi receives the maximum, r8d the minimum.
LBB5_12:
	LONG $0x3b380f66; BYTE $0xc2 // pminud xmm0, xmm2
	LONG $0x3f380f66; BYTE $0xcb // pmaxud xmm1, xmm3
	LONG $0xd1700f66; BYTE $0x4e // pshufd xmm2, xmm1, 78
	LONG $0x3f380f66; BYTE $0xd1 // pmaxud xmm2, xmm1
	LONG $0xca700f66; BYTE $0xe5 // pshufd xmm1, xmm2, 229
	LONG $0x3f380f66; BYTE $0xca // pmaxud xmm1, xmm2
	LONG $0xce7e0f66 // movd esi, xmm1
	LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78
	LONG $0x3b380f66; BYTE $0xc8 // pminud xmm1, xmm0
	LONG $0xc1700f66; BYTE $0xe5 // pshufd xmm0, xmm1, 229
	LONG $0x3b380f66; BYTE $0xc1 // pminud xmm0, xmm1
	LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB5_13

// Scalar loop entry: the running maximum is moved from esi to eax.
LBB5_4:
	WORD $0xf089 // mov eax, esi

// Scalar loop over elements r11 .. r9-1 (unsigned compares).
LBB5_5:
	LONG $0x9f348b42 // mov esi, dword [rdi + 4*r11]
	WORD $0x3941; BYTE $0xf0 // cmp r8d, esi
	LONG $0xc6430f44 // cmovae r8d, esi
	WORD $0xf039 // cmp eax, esi
	WORD $0x470f; BYTE $0xf0 // cmova esi, eax
	LONG $0x01c38349 // add r11, 1
	WORD $0xf089 // mov eax, esi
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB5_5

// Store results: *maxout (rcx) = esi, *minout (rdx) = r8d.
LBB5_13:
	WORD $0x3189 // mov dword [rcx], esi
	WORD $0x8944; BYTE $0x02 // mov dword [rdx], r8d
	RET

// Reached when rax == 0 (a single 8-element step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB5_7:
	LONG $0xc9ef0f66 // pxor xmm1, xmm1
	LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
	WORD $0xc031 // xor eax, eax
	LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
	LONG $0xdbef0f66 // pxor xmm3, xmm3
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB5_11
	JMP LBB5_12

// LCDATA4: two 64-bit words of 0x8000000000000000 (offset 0) followed by two
// words of 0x7fffffffffffffff (offset 16).
DATA LCDATA4<>+0x000(SB)/8, $0x8000000000000000
DATA LCDATA4<>+0x008(SB)/8, $0x8000000000000000
DATA LCDATA4<>+0x010(SB)/8, $0x7fffffffffffffff
DATA LCDATA4<>+0x018(SB)/8, $0x7fffffffffffffff
GLOBL LCDATA4<>(SB), 8, $32

// _int64_max_min_sse4 writes the signed minimum of the int64 elements at
// `values` to *minout and the signed maximum to *maxout.
//
// If the low 32 bits of length are <= 0 (signed), no element is read and the
// results are min = 9223372036854775807 (r8) and max = r8 + 1, i.e. the value
// with only the top bit set. Counts of 4 or more use the vector path
// (4 elements per step); the remaining elements go through the scalar loop
// at LBB6_5. Vector min/max are composed from pcmpgtq followed by blendvpd,
// with xmm0 holding the comparison mask.
TEXT ·_int64_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA4<>(SB), BP // BP = &LCDATA4 (previous BP value is overwritten)

	QUAD $0xffffffffffffb849; WORD $0x7fff // mov r8, 9223372036854775807
	WORD $0xf685 // test esi, esi
	JLE LBB6_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
	WORD $0xfe83; BYTE $0x03 // cmp esi, 3
	JA LBB6_6
	LONG $0x01708d49 // lea rsi, [r8 + 1]
	WORD $0x3145; BYTE $0xdb // xor r11d, r11d
	JMP LBB6_4

// Non-positive count: store the initial values.
LBB6_1:
	LONG $0x01708d49 // lea rsi, [r8 + 1]
	JMP LBB6_13

// Vector setup: r11 = count rounded down to a multiple of 4,
// r8 = number of 4-element steps, r10 = -(r8 & ~1) as the loop counter.
LBB6_6:
	WORD $0x8945; BYTE $0xcb // mov r11d, r9d
	LONG $0xfce38341 // and r11d, -4
	LONG $0xfc438d49 // lea rax, [r11 - 4]
	WORD $0x8949; BYTE $0xc0 // mov r8, rax
	LONG $0x02e8c149 // shr r8, 2
	LONG $0x01c08349 // add r8, 1
	WORD $0x8548; BYTE $0xc0 // test rax, rax
	JE LBB6_7
	WORD $0x894d; BYTE $0xc2 // mov r10, r8
	LONG $0xfee28349 // and r10, -2
	WORD $0xf749; BYTE $0xda // neg r10
	LONG $0x6f0f4466; WORD $0x004d // movdqa xmm9, oword 0[rbp] /* [rip + .LCPI6_0] */
	LONG $0x6f0f4466; WORD $0x1045 // movdqa xmm8, oword 16[rbp] /* [rip + .LCPI6_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0x6f0f4166; BYTE $0xd0 // movdqa xmm2, xmm8
	LONG $0x6f0f4166; BYTE $0xf1 // movdqa xmm6, xmm9

// Main loop: four 16-byte loads (8 elements) per iteration.
// xmm8/xmm2 carry the running minima, xmm9/xmm6 the running maxima
// (mirrored in xmm3/xmm4 at the bottom of the loop).
LBB6_9:
	LONG $0x3c6f0ff3; BYTE $0xc7 // movdqu xmm7, oword [rdi + 8*rax]
	LONG $0xc76f0f66 // movdqa xmm0, xmm7
	LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8
	LONG $0xe76f0f66 // movdqa xmm4, xmm7
	LONG $0x380f4166; WORD $0xe015 // blendvpd xmm4, xmm8, xmm0
	LONG $0x4c6f0ff3; WORD $0x10c7 // movdqu xmm1, oword [rdi + 8*rax + 16]
	LONG $0xc16f0f66 // movdqa xmm0, xmm1
	LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
	LONG $0xe96f0f66 // movdqa xmm5, xmm1
	LONG $0x15380f66; BYTE $0xea // blendvpd xmm5, xmm2, xmm0
	LONG $0x6f0f4166; BYTE $0xc1 // movdqa xmm0, xmm9
	LONG $0x37380f66; BYTE $0xc7 // pcmpgtq xmm0, xmm7
	LONG $0x380f4166; WORD $0xf915 // blendvpd xmm7, xmm9, xmm0
	LONG $0xc66f0f66 // movdqa xmm0, xmm6
	LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
	LONG $0x15380f66; BYTE $0xce // blendvpd xmm1, xmm6, xmm0
	LONG $0x5c6f0ff3; WORD $0x20c7 // movdqu xmm3, oword [rdi + 8*rax + 32]
	LONG $0xc36f0f66 // movdqa xmm0, xmm3
	LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
	LONG $0x6f0f4466; BYTE $0xc3 // movdqa xmm8, xmm3
	LONG $0x380f4466; WORD $0xc415 // blendvpd xmm8, xmm4, xmm0
	LONG $0x646f0ff3; WORD $0x30c7 // movdqu xmm4, oword [rdi + 8*rax + 48]
	LONG $0xc46f0f66 // movdqa xmm0, xmm4
	LONG $0x37380f66; BYTE $0xc5 // pcmpgtq xmm0, xmm5
	LONG $0xd46f0f66 // movdqa xmm2, xmm4
	LONG $0x15380f66; BYTE $0xd5 // blendvpd xmm2, xmm5, xmm0
	LONG $0xc7280f66 // movapd xmm0, xmm7
	LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3
	LONG $0x15380f66; BYTE $0xdf // blendvpd xmm3, xmm7, xmm0
	LONG $0xc1280f66 // movapd xmm0, xmm1
	LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
	LONG $0x15380f66; BYTE $0xe1 // blendvpd xmm4, xmm1, xmm0
	LONG $0x08c08348 // add rax, 8
	LONG $0x280f4466; BYTE $0xcb // movapd xmm9, xmm3
	LONG $0xf4280f66 // movapd xmm6, xmm4
	LONG $0x02c28349 // add r10, 2
	JNE LBB6_9
	LONG $0x01c0f641 // test r8b, 1
	JE LBB6_12

// One additional 4-element step, executed when the step count in r8 is odd.
// On entry xmm3/xmm4 hold the maxima and xmm8/xmm2 the minima.
LBB6_11:
	LONG $0x4c6f0ff3; WORD $0x10c7 // movdqu xmm1, oword [rdi + 8*rax + 16]
	LONG $0xc4280f66 // movapd xmm0, xmm4
	LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
	LONG $0xe96f0f66 // movdqa xmm5, xmm1
	LONG $0x15380f66; BYTE $0xec // blendvpd xmm5, xmm4, xmm0
	LONG $0x246f0ff3; BYTE $0xc7 // movdqu xmm4, oword [rdi + 8*rax]
	LONG $0xc3280f66 // movapd xmm0, xmm3
	LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
	LONG $0xf46f0f66 // movdqa xmm6, xmm4
	LONG $0x15380f66; BYTE $0xf3 // blendvpd xmm6, xmm3, xmm0
	LONG $0xc16f0f66 // movdqa xmm0, xmm1
	LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
	LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0
	LONG $0xc46f0f66 // movdqa xmm0, xmm4
	LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8
	LONG $0x380f4166; WORD $0xe015 // blendvpd xmm4, xmm8, xmm0
	LONG $0x280f4466; BYTE $0xc4 // movapd xmm8, xmm4
	LONG $0xd1280f66 // movapd xmm2, xmm1
	LONG $0xde280f66 // movapd xmm3, xmm6
	LONG $0xe5280f66 // movapd xmm4, xmm5

// Horizontal reduction: rsi receives the maximum (from xmm3/xmm4),
// r8 the minimum (from xmm2/xmm8).
LBB6_12:
	LONG $0xc3280f66 // movapd xmm0, xmm3
	LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
	LONG $0x15380f66; BYTE $0xe3 // blendvpd xmm4, xmm3, xmm0
	LONG $0xcc700f66; BYTE $0x4e // pshufd xmm1, xmm4, 78
	LONG $0xc46f0f66 // movdqa xmm0, xmm4
	LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
	LONG $0x15380f66; BYTE $0xcc // blendvpd xmm1, xmm4, xmm0
	LONG $0x7e0f4866; BYTE $0xce // movq rsi, xmm1
	LONG $0xc26f0f66 // movdqa xmm0, xmm2
	LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8
	LONG $0x380f4166; WORD $0xd015 // blendvpd xmm2, xmm8, xmm0
	LONG $0xca700f66; BYTE $0x4e // pshufd xmm1, xmm2, 78
	LONG $0xc16f0f66 // movdqa xmm0, xmm1
	LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
	LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0
	LONG $0x7e0f4966; BYTE $0xc8 // movq r8, xmm1
	WORD $0x394d; BYTE $0xcb // cmp r11, r9
	JE LBB6_13

// Scalar loop entry: the running maximum is moved from rsi to rax.
LBB6_4:
	WORD $0x8948; BYTE $0xf0 // mov rax, rsi

// Scalar loop over elements r11 .. r9-1 (signed compares).
LBB6_5:
	LONG $0xdf348b4a // mov rsi, qword [rdi + 8*r11]
	WORD $0x3949; BYTE $0xf0 // cmp r8, rsi
	LONG $0xc64f0f4c // cmovg r8, rsi
	WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
	LONG $0xf04d0f48 // cmovge rsi, rax
	LONG $0x01c38349 // add r11, 1
	WORD $0x8948; BYTE $0xf0 // mov rax, rsi
	WORD $0x394d; BYTE $0xd9 // cmp r9, r11
	JNE LBB6_5

// Store results: *maxout (rcx) = rsi, *minout (rdx) = r8.
LBB6_13:
	WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi
	WORD $0x894c; BYTE $0x02 // mov qword [rdx], r8
	RET

// Reached when rax == 0 (a single 4-element step): initialise the
// accumulators and dispatch to the odd-step code or the reduction.
LBB6_7:
	LONG $0x5d280f66; BYTE $0x00 // movapd xmm3, oword 0[rbp] /* [rip + .LCPI6_0] */
	LONG $0x6f0f4466; WORD $0x1045 // movdqa xmm8, oword 16[rbp] /* [rip + .LCPI6_1] */
	WORD $0xc031 // xor eax, eax
	LONG $0x6f0f4166; BYTE $0xd0 // movdqa xmm2, xmm8
	LONG $0xe3280f66 // movapd xmm4, xmm3
	LONG $0x01c0f641 // test r8b, 1
	JNE LBB6_11
	JMP LBB6_12

DATA LCDATA5<>+0x000(SB)/8, $0x8000000000000000
DATA LCDATA5<>+0x008(SB)/8, $0x8000000000000000
GLOBL LCDATA5<>(SB), 8, $16

TEXT ·_uint64_max_min_sse4(SB), $0-32

	MOVQ values+0(FP), DI
	MOVQ length+8(FP), SI
	MOVQ minout+16(FP), DX
	MOVQ maxout+24(FP), CX
	LEAQ LCDATA5<>(SB), BP

	WORD $0xf685 // test esi, esi
	JLE LBB7_1
	WORD $0x8941; BYTE $0xf1 // mov r9d, esi
WORD $0xfe83; BYTE $0x03 // cmp esi, 3 861 JA LBB7_6 862 LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1 863 WORD $0x3145; BYTE $0xdb // xor r11d, r11d 864 WORD $0xc031 // xor eax, eax 865 JMP LBB7_4 866 867 LBB7_1: 868 LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1 869 WORD $0xc031 // xor eax, eax 870 JMP LBB7_13 871 872 LBB7_6: 873 WORD $0x8945; BYTE $0xcb // mov r11d, r9d 874 LONG $0xfce38341 // and r11d, -4 875 LONG $0xfc438d49 // lea rax, [r11 - 4] 876 WORD $0x8949; BYTE $0xc0 // mov r8, rax 877 LONG $0x02e8c149 // shr r8, 2 878 LONG $0x01c08349 // add r8, 1 879 WORD $0x8548; BYTE $0xc0 // test rax, rax 880 JE LBB7_7 881 WORD $0x894d; BYTE $0xc2 // mov r10, r8 882 LONG $0xfee28349 // and r10, -2 883 WORD $0xf749; BYTE $0xda // neg r10 884 LONG $0xef0f4566; BYTE $0xc9 // pxor xmm9, xmm9 885 LONG $0x760f4566; BYTE $0xd2 // pcmpeqd xmm10, xmm10 886 WORD $0xc031 // xor eax, eax 887 LONG $0x6f0f4466; WORD $0x0045 // movdqa xmm8, oword 0[rbp] /* [rip + .LCPI7_0] */ 888 LONG $0x760f4566; BYTE $0xdb // pcmpeqd xmm11, xmm11 889 LONG $0xef0f4566; BYTE $0xe4 // pxor xmm12, xmm12 890 891 LBB7_9: 892 LONG $0x6f0f4166; BYTE $0xd2 // movdqa xmm2, xmm10 893 LONG $0xef0f4166; BYTE $0xd0 // pxor xmm2, xmm8 894 LONG $0x246f0ff3; BYTE $0xc7 // movdqu xmm4, oword [rdi + 8*rax] 895 LONG $0x6c6f0ff3; WORD $0x10c7 // movdqu xmm5, oword [rdi + 8*rax + 16] 896 LONG $0x6f0f44f3; WORD $0xc76c; BYTE $0x20 // movdqu xmm13, oword [rdi + 8*rax + 32] 897 LONG $0xc46f0f66 // movdqa xmm0, xmm4 898 LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 899 LONG $0x6f0f4166; BYTE $0xc9 // movdqa xmm1, xmm9 900 LONG $0xef0f4166; BYTE $0xc8 // pxor xmm1, xmm8 901 LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 902 LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 903 LONG $0xdc6f0f66 // movdqa xmm3, xmm4 904 LONG $0x380f4166; WORD $0xda15 // blendvpd xmm3, xmm10, xmm0 905 LONG $0x746f0ff3; WORD $0x30c7 // movdqu xmm6, oword [rdi + 8*rax + 48] 906 LONG $0x6f0f4166; BYTE $0xfb // 
movdqa xmm7, xmm11 907 LONG $0xef0f4166; BYTE $0xf8 // pxor xmm7, xmm8 908 LONG $0xc56f0f66 // movdqa xmm0, xmm5 909 LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 910 LONG $0x6f0f4166; BYTE $0xd4 // movdqa xmm2, xmm12 911 LONG $0xef0f4166; BYTE $0xd0 // pxor xmm2, xmm8 912 LONG $0x37380f66; BYTE $0xd0 // pcmpgtq xmm2, xmm0 913 LONG $0x37380f66; BYTE $0xc7 // pcmpgtq xmm0, xmm7 914 LONG $0xfd6f0f66 // movdqa xmm7, xmm5 915 LONG $0x380f4166; WORD $0xfb15 // blendvpd xmm7, xmm11, xmm0 916 LONG $0xc16f0f66 // movdqa xmm0, xmm1 917 LONG $0x380f4166; WORD $0xe115 // blendvpd xmm4, xmm9, xmm0 918 LONG $0xc26f0f66 // movdqa xmm0, xmm2 919 LONG $0x380f4166; WORD $0xec15 // blendvpd xmm5, xmm12, xmm0 920 LONG $0xd3280f66 // movapd xmm2, xmm3 921 LONG $0x570f4166; BYTE $0xd0 // xorpd xmm2, xmm8 922 LONG $0x6f0f4166; BYTE $0xc5 // movdqa xmm0, xmm13 923 LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 924 LONG $0xcc280f66 // movapd xmm1, xmm4 925 LONG $0x570f4166; BYTE $0xc8 // xorpd xmm1, xmm8 926 LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 927 LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 928 LONG $0x6f0f4566; BYTE $0xd5 // movdqa xmm10, xmm13 929 LONG $0x380f4466; WORD $0xd315 // blendvpd xmm10, xmm3, xmm0 930 LONG $0xdf280f66 // movapd xmm3, xmm7 931 LONG $0x570f4166; BYTE $0xd8 // xorpd xmm3, xmm8 932 LONG $0xc66f0f66 // movdqa xmm0, xmm6 933 LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 934 LONG $0xd5280f66 // movapd xmm2, xmm5 935 LONG $0x570f4166; BYTE $0xd0 // xorpd xmm2, xmm8 936 LONG $0x37380f66; BYTE $0xd0 // pcmpgtq xmm2, xmm0 937 LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 938 LONG $0x6f0f4466; BYTE $0xde // movdqa xmm11, xmm6 939 LONG $0x380f4466; WORD $0xdf15 // blendvpd xmm11, xmm7, xmm0 940 LONG $0xc16f0f66 // movdqa xmm0, xmm1 941 LONG $0x380f4466; WORD $0xec15 // blendvpd xmm13, xmm4, xmm0 942 LONG $0xc26f0f66 // movdqa xmm0, xmm2 943 LONG $0x15380f66; BYTE $0xf5 // blendvpd xmm6, xmm5, xmm0 944 LONG $0x08c08348 // add rax, 8 945 LONG 
$0x280f4566; BYTE $0xcd // movapd xmm9, xmm13 946 LONG $0x280f4466; BYTE $0xe6 // movapd xmm12, xmm6 947 LONG $0x02c28349 // add r10, 2 948 JNE LBB7_9 949 LONG $0x01c0f641 // test r8b, 1 950 JE LBB7_12 951 952 LBB7_11: 953 LONG $0x24100f66; BYTE $0xc7 // movupd xmm4, oword [rdi + 8*rax] 954 LONG $0x5c100f66; WORD $0x10c7 // movupd xmm3, oword [rdi + 8*rax + 16] 955 LONG $0x6d280f66; BYTE $0x00 // movapd xmm5, oword 0[rbp] /* [rip + .LCPI7_0] */ 956 LONG $0xc6280f66 // movapd xmm0, xmm6 957 LONG $0xc5570f66 // xorpd xmm0, xmm5 958 LONG $0xcb280f66 // movapd xmm1, xmm3 959 LONG $0xcd570f66 // xorpd xmm1, xmm5 960 LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 961 LONG $0xfb280f66 // movapd xmm7, xmm3 962 LONG $0x15380f66; BYTE $0xfe // blendvpd xmm7, xmm6, xmm0 963 LONG $0x280f4166; BYTE $0xc5 // movapd xmm0, xmm13 964 LONG $0xc5570f66 // xorpd xmm0, xmm5 965 LONG $0xd4280f66 // movapd xmm2, xmm4 966 LONG $0xd5570f66 // xorpd xmm2, xmm5 967 LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 968 LONG $0xf4280f66 // movapd xmm6, xmm4 969 LONG $0x380f4166; WORD $0xf515 // blendvpd xmm6, xmm13, xmm0 970 LONG $0x280f4166; BYTE $0xc3 // movapd xmm0, xmm11 971 LONG $0xc5570f66 // xorpd xmm0, xmm5 972 LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 973 LONG $0xc16f0f66 // movdqa xmm0, xmm1 974 LONG $0x380f4166; WORD $0xdb15 // blendvpd xmm3, xmm11, xmm0 975 LONG $0x570f4166; BYTE $0xea // xorpd xmm5, xmm10 976 LONG $0x37380f66; BYTE $0xd5 // pcmpgtq xmm2, xmm5 977 LONG $0xc26f0f66 // movdqa xmm0, xmm2 978 LONG $0x380f4166; WORD $0xe215 // blendvpd xmm4, xmm10, xmm0 979 LONG $0x280f4466; BYTE $0xd4 // movapd xmm10, xmm4 980 LONG $0x280f4466; BYTE $0xdb // movapd xmm11, xmm3 981 LONG $0x280f4466; BYTE $0xee // movapd xmm13, xmm6 982 LONG $0xf7280f66 // movapd xmm6, xmm7 983 984 LBB7_12: 985 LONG $0x4d280f66; BYTE $0x00 // movapd xmm1, oword 0[rbp] /* [rip + .LCPI7_0] */ 986 LONG $0xd6280f66 // movapd xmm2, xmm6 987 LONG $0xd1570f66 // xorpd xmm2, xmm1 988 LONG 
$0x280f4166; BYTE $0xc5 // movapd xmm0, xmm13 989 LONG $0xc1570f66 // xorpd xmm0, xmm1 990 LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 991 LONG $0x380f4166; WORD $0xf515 // blendvpd xmm6, xmm13, xmm0 992 LONG $0xd6700f66; BYTE $0x4e // pshufd xmm2, xmm6, 78 993 LONG $0xc6280f66 // movapd xmm0, xmm6 994 LONG $0xc1570f66 // xorpd xmm0, xmm1 995 LONG $0xda6f0f66 // movdqa xmm3, xmm2 996 LONG $0xd9ef0f66 // pxor xmm3, xmm1 997 LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 998 LONG $0x15380f66; BYTE $0xd6 // blendvpd xmm2, xmm6, xmm0 999 LONG $0x7e0f4866; BYTE $0xd0 // movq rax, xmm2 1000 LONG $0x6f0f4166; BYTE $0xd2 // movdqa xmm2, xmm10 1001 LONG $0xd1ef0f66 // pxor xmm2, xmm1 1002 LONG $0x6f0f4166; BYTE $0xc3 // movdqa xmm0, xmm11 1003 LONG $0xc1ef0f66 // pxor xmm0, xmm1 1004 LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 1005 LONG $0x380f4566; WORD $0xda15 // blendvpd xmm11, xmm10, xmm0 1006 LONG $0x700f4166; WORD $0x4ed3 // pshufd xmm2, xmm11, 78 1007 LONG $0x6f0f4166; BYTE $0xc3 // movdqa xmm0, xmm11 1008 LONG $0xc1ef0f66 // pxor xmm0, xmm1 1009 LONG $0xcaef0f66 // pxor xmm1, xmm2 1010 LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 1011 LONG $0xc16f0f66 // movdqa xmm0, xmm1 1012 LONG $0x380f4166; WORD $0xd315 // blendvpd xmm2, xmm11, xmm0 1013 LONG $0x7e0f4966; BYTE $0xd0 // movq r8, xmm2 1014 WORD $0x394d; BYTE $0xcb // cmp r11, r9 1015 JE LBB7_13 1016 1017 LBB7_4: 1018 WORD $0x8948; BYTE $0xc6 // mov rsi, rax 1019 1020 LBB7_5: 1021 LONG $0xdf048b4a // mov rax, qword [rdi + 8*r11] 1022 WORD $0x3949; BYTE $0xc0 // cmp r8, rax 1023 LONG $0xc0430f4c // cmovae r8, rax 1024 WORD $0x3948; BYTE $0xc6 // cmp rsi, rax 1025 LONG $0xc6470f48 // cmova rax, rsi 1026 LONG $0x01c38349 // add r11, 1 1027 WORD $0x8948; BYTE $0xc6 // mov rsi, rax 1028 WORD $0x394d; BYTE $0xd9 // cmp r9, r11 1029 JNE LBB7_5 1030 1031 LBB7_13: 1032 WORD $0x8948; BYTE $0x01 // mov qword [rcx], rax 1033 WORD $0x894c; BYTE $0x02 // mov qword [rdx], r8 1034 RET 1035 1036 LBB7_7: 1037 
LONG $0x570f4566; BYTE $0xed // xorpd xmm13, xmm13 1038 LONG $0x760f4566; BYTE $0xd2 // pcmpeqd xmm10, xmm10 1039 WORD $0xc031 // xor eax, eax 1040 LONG $0x760f4566; BYTE $0xdb // pcmpeqd xmm11, xmm11 1041 LONG $0xf6570f66 // xorpd xmm6, xmm6 1042 LONG $0x01c0f641 // test r8b, 1 1043 JNE LBB7_11 1044 JMP LBB7_12