github.com/apache/arrow/go/v14@v14.0.2/internal/utils/transpose_ints_sse4_amd64.s (about) 1 //+build !noasm !appengine 2 // AUTO-GENERATED BY C2GOASM -- DO NOT EDIT 3 4 TEXT ·_transpose_uint8_uint8_sse4(SB), $0-32 5 6 MOVQ src+0(FP), DI 7 MOVQ dest+8(FP), SI 8 MOVQ length+16(FP), DX 9 MOVQ transposeMap+24(FP), CX 10 11 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 12 JL LBB0_1 13 14 LBB0_5: 15 WORD $0xd089 // mov eax, edx 16 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 17 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 18 WORD $0x1688 // mov byte [rsi], dl 19 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 20 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 21 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 22 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 23 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 24 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 25 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 26 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 27 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 28 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 29 LONG $0x04c78348 // add rdi, 4 30 LONG $0x04c68348 // add rsi, 4 31 WORD $0xf883; BYTE $0x07 // cmp eax, 7 32 JG LBB0_5 33 34 LBB0_1: 35 WORD $0xd285 // test edx, edx 36 JLE LBB0_4 37 WORD $0xc283; BYTE $0x01 // add edx, 1 38 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 39 40 LBB0_3: 41 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 42 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 43 LONG $0x06048842 // mov byte [rsi + r8], al 44 LONG $0x01c08349 // add r8, 1 45 WORD $0xc283; BYTE $0xff // add edx, -1 46 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 47 JG LBB0_3 48 49 LBB0_4: 50 RET 51 52 TEXT ·_transpose_int8_uint8_sse4(SB), $0-32 53 54 MOVQ src+0(FP), DI 55 MOVQ dest+8(FP), SI 56 MOVQ length+16(FP), DX 57 MOVQ transposeMap+24(FP), CX 58 59 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 60 JL LBB1_1 61 62 LBB1_5: 63 WORD $0xd089 // mov eax, edx 64 LONG $0x17be0f48 // movsx rdx, byte [rdi] 65 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 66 WORD $0x1688 // mov byte [rsi], dl 67 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 68 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 69 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 70 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 71 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 72 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 73 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 74 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 75 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 76 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 77 LONG $0x04c78348 // add rdi, 4 78 LONG $0x04c68348 // add rsi, 4 79 WORD $0xf883; BYTE $0x07 // cmp eax, 7 80 JG LBB1_5 81 82 LBB1_1: 83 WORD $0xd285 // test edx, edx 84 JLE LBB1_4 85 WORD $0xc283; BYTE $0x01 // add edx, 1 86 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 87 88 LBB1_3: 89 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 90 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 91 LONG $0x06048842 // mov byte [rsi + r8], al 92 LONG $0x01c08349 // add r8, 1 93 WORD $0xc283; BYTE $0xff // add edx, -1 94 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 95 JG LBB1_3 96 97 LBB1_4: 98 RET 99 100 TEXT ·_transpose_uint16_uint8_sse4(SB), $0-32 101 102 MOVQ src+0(FP), DI 103 MOVQ dest+8(FP), SI 104 MOVQ length+16(FP), DX 105 MOVQ transposeMap+24(FP), CX 106 107 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 108 JL LBB2_1 109 110 LBB2_5: 111 WORD $0xd089 // mov eax, edx 112 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 113 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 114 WORD $0x1688 // mov byte [rsi], dl 115 LONG $0x0257b70f // movzx edx, word [rdi + 2] 116 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 117 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 118 LONG $0x0457b70f // movzx edx, word [rdi + 4] 119 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 120 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 121 LONG $0x0657b70f // movzx edx, word [rdi + 6] 122 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 123 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 124 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 125 LONG $0x08c78348 // add rdi, 8 126 LONG $0x04c68348 // add rsi, 4 127 WORD $0xf883; BYTE $0x07 // cmp eax, 7 128 JG LBB2_5 129 130 LBB2_1: 131 WORD $0xd285 // test edx, edx 132 JLE LBB2_4 133 WORD $0xc283; BYTE $0x01 // add edx, 1 134 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 135 136 LBB2_3: 137 LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8] 138 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 139 LONG $0x06048842 // mov byte [rsi + r8], al 140 LONG $0x01c08349 // add r8, 1 141 WORD $0xc283; BYTE $0xff // add edx, -1 142 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 143 JG LBB2_3 144 145 LBB2_4: 146 RET 147 148 TEXT ·_transpose_int16_uint8_sse4(SB), $0-32 149 150 MOVQ src+0(FP), DI 151 MOVQ dest+8(FP), SI 152 MOVQ length+16(FP), DX 153 MOVQ transposeMap+24(FP), CX 154 155 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 156 JL LBB3_1 157 158 LBB3_5: 159 WORD $0xd089 // mov eax, edx 160 LONG $0x17bf0f48 // movsx rdx, word [rdi] 161 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 162 WORD $0x1688 // mov byte [rsi], dl 163 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 164 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 165 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 166 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 167 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 168 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 169 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 170 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 171 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 172 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 173 LONG $0x08c78348 // add rdi, 8 174 LONG $0x04c68348 // add rsi, 4 175 WORD $0xf883; BYTE $0x07 // cmp eax, 7 176 JG LBB3_5 177 178 LBB3_1: 179 WORD $0xd285 // test edx, edx 180 JLE LBB3_4 181 WORD $0xc283; BYTE $0x01 // add edx, 1 182 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 183 184 LBB3_3: 185 LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8] 186 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 187 LONG $0x06048842 // mov byte [rsi + r8], al 188 LONG $0x01c08349 // add r8, 1 189 WORD $0xc283; BYTE $0xff // add edx, -1 190 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 191 JG LBB3_3 192 193 LBB3_4: 194 RET 195 196 TEXT ·_transpose_uint32_uint8_sse4(SB), $0-32 197 198 MOVQ src+0(FP), DI 199 MOVQ dest+8(FP), SI 200 MOVQ length+16(FP), DX 201 MOVQ transposeMap+24(FP), CX 202 203 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 204 JL LBB4_1 205 206 LBB4_5: 207 WORD $0xd089 // mov eax, edx 208 WORD $0x178b // mov edx, dword [rdi] 209 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 210 WORD $0x1688 // mov byte [rsi], dl 211 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 212 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 213 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 214 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 215 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 216 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 217 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 218 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 219 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 220 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 221 LONG $0x10c78348 // add rdi, 16 222 LONG $0x04c68348 // add rsi, 4 223 WORD $0xf883; BYTE $0x07 // cmp eax, 7 224 JG LBB4_5 225 226 LBB4_1: 227 WORD $0xd285 // test edx, edx 228 JLE LBB4_4 229 WORD $0xc283; BYTE $0x01 // add edx, 1 230 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 231 232 LBB4_3: 233 LONG $0x87048b42 // mov eax, dword [rdi + 4*r8] 234 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 235 LONG $0x06048842 // mov byte [rsi + r8], al 236 LONG $0x01c08349 // add r8, 1 237 WORD $0xc283; BYTE $0xff // add edx, -1 238 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 239 JG LBB4_3 240 241 LBB4_4: 242 RET 243 244 TEXT ·_transpose_int32_uint8_sse4(SB), $0-32 245 246 MOVQ src+0(FP), DI 247 MOVQ dest+8(FP), SI 248 MOVQ length+16(FP), DX 249 MOVQ transposeMap+24(FP), CX 250 251 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 252 JL LBB5_1 253 254 LBB5_5: 255 WORD $0xd089 // mov eax, edx 256 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 257 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 258 WORD $0x1688 // mov byte [rsi], dl 259 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 260 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 261 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 262 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 263 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 264 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 265 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 266 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 267 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 268 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 269 LONG $0x10c78348 // add rdi, 16 270 LONG $0x04c68348 // add rsi, 4 271 WORD $0xf883; BYTE $0x07 // cmp eax, 7 272 JG LBB5_5 273 274 LBB5_1: 275 WORD $0xd285 // test edx, edx 276 JLE LBB5_4 277 WORD $0xc283; BYTE $0x01 // add edx, 1 278 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 279 280 LBB5_3: 281 LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8] 282 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 283 LONG $0x06048842 // mov byte [rsi + r8], al 284 LONG $0x01c08349 // add r8, 1 285 WORD $0xc283; BYTE $0xff // add edx, -1 286 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 287 JG LBB5_3 288 289 LBB5_4: 290 RET 291 292 TEXT ·_transpose_uint64_uint8_sse4(SB), $0-32 293 294 MOVQ src+0(FP), DI 295 MOVQ dest+8(FP), SI 296 MOVQ length+16(FP), DX 297 MOVQ transposeMap+24(FP), CX 298 299 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 300 JL LBB6_1 301 302 LBB6_5: 303 WORD $0xd089 // mov eax, edx 304 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 305 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 306 WORD $0x1688 // mov byte [rsi], dl 307 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 308 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 309 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 310 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 311 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 312 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 313 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 314 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 315 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 316 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 317 LONG $0x20c78348 // add rdi, 32 318 LONG $0x04c68348 // add rsi, 4 319 WORD $0xf883; BYTE $0x07 // cmp eax, 7 320 JG LBB6_5 321 322 LBB6_1: 323 WORD $0xd285 // test edx, edx 324 JLE LBB6_4 325 WORD $0xc283; BYTE $0x01 // add edx, 1 326 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 327 328 LBB6_3: 329 LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] 330 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 331 LONG $0x06048842 // mov byte [rsi + r8], al 332 LONG $0x01c08349 // add r8, 1 333 WORD $0xc283; BYTE $0xff // add edx, -1 334 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 335 JG LBB6_3 336 337 LBB6_4: 338 RET 339 340 TEXT ·_transpose_int64_uint8_sse4(SB), $0-32 341 342 MOVQ src+0(FP), DI 343 MOVQ dest+8(FP), SI 344 MOVQ length+16(FP), DX 345 MOVQ transposeMap+24(FP), CX 346 347 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 348 JL LBB7_1 349 350 LBB7_5: 351 WORD $0xd089 // mov eax, edx 352 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 353 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 354 WORD $0x1688 // mov byte [rsi], dl 355 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 356 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 357 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 358 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 359 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 360 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 361 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 362 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 363 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 364 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 365 LONG $0x20c78348 // add rdi, 32 366 LONG $0x04c68348 // add rsi, 4 367 WORD $0xf883; BYTE $0x07 // cmp eax, 7 368 JG LBB7_5 369 370 LBB7_1: 371 WORD $0xd285 // test edx, edx 372 JLE LBB7_4 373 WORD $0xc283; BYTE $0x01 // add edx, 1 374 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 375 376 LBB7_3: 377 LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] 378 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 379 LONG $0x06048842 // mov byte [rsi + r8], al 380 LONG $0x01c08349 // add r8, 1 381 WORD $0xc283; BYTE $0xff // add edx, -1 382 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 383 JG LBB7_3 384 385 LBB7_4: 386 RET 387 388 TEXT ·_transpose_uint8_int8_sse4(SB), $0-32 389 390 MOVQ src+0(FP), DI 391 MOVQ dest+8(FP), SI 392 MOVQ length+16(FP), DX 393 MOVQ transposeMap+24(FP), CX 394 395 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 396 JL LBB8_1 397 398 LBB8_5: 399 WORD $0xd089 // mov eax, edx 400 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 401 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 402 WORD $0x1688 // mov byte [rsi], dl 403 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 404 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 405 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 406 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 407 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 408 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 409 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 410 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 411 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 412 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 413 LONG $0x04c78348 // add rdi, 4 414 LONG $0x04c68348 // add rsi, 4 415 WORD $0xf883; BYTE $0x07 // cmp eax, 7 416 JG LBB8_5 417 418 LBB8_1: 419 WORD $0xd285 // test edx, edx 420 JLE LBB8_4 421 WORD $0xc283; BYTE $0x01 // add edx, 1 422 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 423 424 LBB8_3: 425 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 426 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 427 LONG $0x06048842 // mov byte [rsi + r8], al 428 LONG $0x01c08349 // add r8, 1 429 WORD $0xc283; BYTE $0xff // add edx, -1 430 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 431 JG LBB8_3 432 433 LBB8_4: 434 RET 435 436 TEXT ·_transpose_int8_int8_sse4(SB), $0-32 437 438 MOVQ src+0(FP), DI 439 MOVQ dest+8(FP), SI 440 MOVQ length+16(FP), DX 441 MOVQ transposeMap+24(FP), CX 442 443 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 444 JL LBB9_1 445 446 LBB9_5: 447 WORD $0xd089 // mov eax, edx 448 LONG $0x17be0f48 // movsx rdx, byte [rdi] 449 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 450 WORD $0x1688 // mov byte [rsi], dl 451 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 452 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 453 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 454 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 455 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 456 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 457 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 458 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 459 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 460 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 461 LONG $0x04c78348 // add rdi, 4 462 LONG $0x04c68348 // add rsi, 4 463 WORD $0xf883; BYTE $0x07 // cmp eax, 7 464 JG LBB9_5 465 466 LBB9_1: 467 WORD $0xd285 // test edx, edx 468 JLE LBB9_4 469 WORD $0xc283; BYTE $0x01 // add edx, 1 470 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 471 472 LBB9_3: 473 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 474 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 475 LONG $0x06048842 // mov byte [rsi + r8], al 476 LONG $0x01c08349 // add r8, 1 477 WORD $0xc283; BYTE $0xff // add edx, -1 478 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 479 JG LBB9_3 480 481 LBB9_4: 482 RET 483 484 TEXT ·_transpose_uint16_int8_sse4(SB), $0-32 485 486 MOVQ src+0(FP), DI 487 MOVQ dest+8(FP), SI 488 MOVQ length+16(FP), DX 489 MOVQ transposeMap+24(FP), CX 490 491 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 492 JL LBB10_1 493 494 LBB10_5: 495 WORD $0xd089 // mov eax, edx 496 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 497 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 498 WORD $0x1688 // mov byte [rsi], dl 499 LONG $0x0257b70f // movzx edx, word [rdi + 2] 500 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 501 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 502 LONG $0x0457b70f // movzx edx, word [rdi + 4] 503 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 504 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 505 LONG $0x0657b70f // movzx edx, word [rdi + 6] 506 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 507 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 508 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 509 LONG $0x08c78348 // add rdi, 8 510 LONG $0x04c68348 // add rsi, 4 511 WORD $0xf883; BYTE $0x07 // cmp eax, 7 512 JG LBB10_5 513 514 LBB10_1: 515 WORD $0xd285 // test edx, edx 516 JLE LBB10_4 517 WORD $0xc283; BYTE $0x01 // add edx, 1 518 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 519 520 LBB10_3: 521 LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8] 522 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 523 LONG $0x06048842 // mov byte [rsi + r8], al 524 LONG $0x01c08349 // add r8, 1 525 WORD $0xc283; BYTE $0xff // add edx, -1 526 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 527 JG LBB10_3 528 529 LBB10_4: 530 RET 531 532 TEXT ·_transpose_int16_int8_sse4(SB), $0-32 533 534 MOVQ src+0(FP), DI 535 MOVQ dest+8(FP), SI 536 MOVQ length+16(FP), DX 537 MOVQ transposeMap+24(FP), CX 538 539 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 540 JL LBB11_1 541 542 LBB11_5: 543 WORD $0xd089 // mov eax, edx 544 LONG $0x17bf0f48 // movsx rdx, word [rdi] 545 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 546 WORD $0x1688 // mov byte [rsi], dl 547 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 548 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 549 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 550 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 551 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 552 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 553 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 554 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 555 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 556 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 557 LONG $0x08c78348 // add rdi, 8 558 LONG $0x04c68348 // add rsi, 4 559 WORD $0xf883; BYTE $0x07 // cmp eax, 7 560 JG LBB11_5 561 562 LBB11_1: 563 WORD $0xd285 // test edx, edx 564 JLE LBB11_4 565 WORD $0xc283; BYTE $0x01 // add edx, 1 566 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 567 568 LBB11_3: 569 LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8] 570 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 571 LONG $0x06048842 // mov byte [rsi + r8], al 572 LONG $0x01c08349 // add r8, 1 573 WORD $0xc283; BYTE $0xff // add edx, -1 574 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 575 JG LBB11_3 576 577 LBB11_4: 578 RET 579 580 TEXT ·_transpose_uint32_int8_sse4(SB), $0-32 581 582 MOVQ src+0(FP), DI 583 MOVQ dest+8(FP), SI 584 MOVQ length+16(FP), DX 585 MOVQ transposeMap+24(FP), CX 586 587 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 588 JL LBB12_1 589 590 LBB12_5: 591 WORD $0xd089 // mov eax, edx 592 WORD $0x178b // mov edx, dword [rdi] 593 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 594 WORD $0x1688 // mov byte [rsi], dl 595 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 596 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 597 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 598 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 599 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 600 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 601 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 602 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 603 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 604 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 605 LONG $0x10c78348 // add rdi, 16 606 LONG $0x04c68348 // add rsi, 4 607 WORD $0xf883; BYTE $0x07 // cmp eax, 7 608 JG LBB12_5 609 610 LBB12_1: 611 WORD $0xd285 // test edx, edx 612 JLE LBB12_4 613 WORD $0xc283; BYTE $0x01 // add edx, 1 614 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 615 616 LBB12_3: 617 LONG $0x87048b42 // mov eax, dword [rdi + 4*r8] 618 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 619 LONG $0x06048842 // mov byte [rsi + r8], al 620 LONG $0x01c08349 // add r8, 1 621 WORD $0xc283; BYTE $0xff // add edx, -1 622 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 623 JG LBB12_3 624 625 LBB12_4: 626 RET 627 628 TEXT ·_transpose_int32_int8_sse4(SB), $0-32 629 630 MOVQ src+0(FP), DI 631 MOVQ dest+8(FP), SI 632 MOVQ length+16(FP), DX 633 MOVQ transposeMap+24(FP), CX 634 635 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 636 JL LBB13_1 637 638 LBB13_5: 639 WORD $0xd089 // mov eax, edx 640 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 641 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 642 WORD $0x1688 // mov byte [rsi], dl 643 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 644 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 645 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 646 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 647 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 648 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 649 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 650 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 651 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 652 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 653 LONG $0x10c78348 // add rdi, 16 654 LONG $0x04c68348 // add rsi, 4 655 WORD $0xf883; BYTE $0x07 // cmp eax, 7 656 JG LBB13_5 657 658 LBB13_1: 659 WORD $0xd285 // test edx, edx 660 JLE LBB13_4 661 WORD $0xc283; BYTE $0x01 // add edx, 1 662 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 663 664 LBB13_3: 665 LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8] 666 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 667 LONG $0x06048842 // mov byte [rsi + r8], al 668 LONG $0x01c08349 // add r8, 1 669 WORD $0xc283; BYTE $0xff // add edx, -1 670 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 671 JG LBB13_3 672 673 LBB13_4: 674 RET 675 676 TEXT ·_transpose_uint64_int8_sse4(SB), $0-32 677 678 MOVQ src+0(FP), DI 679 MOVQ dest+8(FP), SI 680 MOVQ length+16(FP), DX 681 MOVQ transposeMap+24(FP), CX 682 683 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 684 JL LBB14_1 685 686 LBB14_5: 687 WORD $0xd089 // mov eax, edx 688 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 689 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 690 WORD $0x1688 // mov byte [rsi], dl 691 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 692 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 693 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 694 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 695 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 696 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 697 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 698 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 699 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 700 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 701 LONG $0x20c78348 // add rdi, 32 702 LONG $0x04c68348 // add rsi, 4 703 WORD $0xf883; BYTE $0x07 // cmp eax, 7 704 JG LBB14_5 705 706 LBB14_1: 707 WORD $0xd285 // test edx, edx 708 JLE LBB14_4 709 WORD $0xc283; BYTE $0x01 // add edx, 1 710 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 711 712 LBB14_3: 713 LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] 714 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 715 LONG $0x06048842 // mov byte [rsi + r8], al 716 LONG $0x01c08349 // add r8, 1 717 WORD $0xc283; BYTE $0xff // add edx, -1 718 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 719 JG LBB14_3 720 721 LBB14_4: 722 RET 723 724 TEXT ·_transpose_int64_int8_sse4(SB), $0-32 725 726 MOVQ src+0(FP), DI 727 MOVQ dest+8(FP), SI 728 MOVQ length+16(FP), DX 729 MOVQ transposeMap+24(FP), CX 730 731 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 732 JL LBB15_1 733 734 LBB15_5: 735 WORD $0xd089 // mov eax, edx 736 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 737 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 738 WORD $0x1688 // mov byte [rsi], dl 739 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 740 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 741 WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl 742 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 743 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 744 WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl 745 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 746 LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] 747 WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl 748 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 749 LONG $0x20c78348 // add rdi, 32 750 LONG $0x04c68348 // add rsi, 4 751 WORD $0xf883; BYTE $0x07 // cmp eax, 7 752 JG LBB15_5 753 754 LBB15_1: 755 WORD $0xd285 // test edx, edx 756 JLE LBB15_4 757 WORD $0xc283; BYTE $0x01 // add edx, 1 758 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 759 760 LBB15_3: 761 LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] 762 LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] 763 LONG $0x06048842 // mov byte [rsi + r8], al 764 LONG $0x01c08349 // add r8, 1 765 WORD $0xc283; BYTE $0xff // add edx, -1 766 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 767 JG LBB15_3 768 769 LBB15_4: 770 RET 771 772 TEXT ·_transpose_uint8_uint16_sse4(SB), $0-32 773 774 MOVQ src+0(FP), DI 775 MOVQ dest+8(FP), SI 776 MOVQ length+16(FP), DX 777 MOVQ transposeMap+24(FP), CX 778 779 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 780 JL LBB16_1 781 782 LBB16_5: 783 WORD $0xd089 // mov eax, edx 784 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 785 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 786 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 787 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 788 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 789 LONG $0x02568966 // mov word [rsi + 2], dx 790 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 791 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 792 LONG $0x04568966 // mov word [rsi + 4], dx 793 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 794 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 795 LONG $0x06568966 // mov word [rsi + 6], dx 796 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 797 LONG $0x04c78348 // add rdi, 4 798 LONG $0x08c68348 // add rsi, 8 799 WORD $0xf883; BYTE $0x07 // cmp eax, 7 800 JG LBB16_5 801 802 LBB16_1: 803 WORD $0xd285 // test edx, edx 804 JLE LBB16_4 805 WORD $0xc283; BYTE $0x01 // add edx, 1 806 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 807 808 LBB16_3: 809 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 810 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 811 LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax 812 LONG $0x01c08349 // add r8, 1 813 WORD $0xc283; BYTE $0xff // add edx, -1 814 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 815 JG LBB16_3 816 817 LBB16_4: 818 RET 819 820 TEXT ·_transpose_int8_uint16_sse4(SB), $0-32 821 822 MOVQ src+0(FP), DI 823 MOVQ dest+8(FP), SI 824 MOVQ length+16(FP), DX 825 MOVQ transposeMap+24(FP), CX 826 827 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 828 JL LBB17_1 829 830 LBB17_5: 831 WORD $0xd089 // mov eax, edx 832 LONG $0x17be0f48 // movsx rdx, byte [rdi] 833 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 834 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 835 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 836 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 837 LONG $0x02568966 // mov word [rsi + 2], dx 838 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 839 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 840 LONG $0x04568966 // mov word [rsi + 4], dx 841 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 842 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 843 LONG $0x06568966 // mov word [rsi + 6], dx 844 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 845 LONG $0x04c78348 // add rdi, 4 846 LONG $0x08c68348 // add rsi, 8 847 WORD $0xf883; BYTE $0x07 // cmp eax, 7 848 JG LBB17_5 849 850 LBB17_1: 851 WORD $0xd285 // test edx, edx 852 JLE LBB17_4 853 WORD $0xc283; BYTE $0x01 // add edx, 1 854 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 855 856 LBB17_3: 857 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 858 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 859 LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax 860 LONG $0x01c08349 // add r8, 1 861 WORD $0xc283; BYTE $0xff // add edx, -1 862 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 863 JG LBB17_3 864 865 LBB17_4: 866 RET 867 868 TEXT ·_transpose_uint16_uint16_sse4(SB), $0-32 869 870 MOVQ src+0(FP), DI 871 MOVQ dest+8(FP), SI 872 MOVQ length+16(FP), DX 873 MOVQ transposeMap+24(FP), CX 874 875 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 876 JL LBB18_1 877 878 LBB18_5: 879 WORD $0xd089 // mov eax, edx 880 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 881 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 882 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 883 LONG $0x0257b70f // movzx edx, word [rdi + 2] 884 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 885 LONG $0x02568966 // mov word [rsi + 2], dx 886 LONG $0x0457b70f // movzx edx, word [rdi + 4] 887 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 888 LONG $0x04568966 // mov word [rsi + 4], dx 889 LONG $0x0657b70f // movzx edx, word [rdi + 6] 890 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 891 LONG $0x06568966 // mov word [rsi + 6], dx 892 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 893 LONG $0x08c78348 // add rdi, 8 894 LONG $0x08c68348 // add rsi, 8 895 WORD $0xf883; BYTE $0x07 // cmp eax, 7 896 JG LBB18_5 897 898 LBB18_1: 899 WORD $0xd285 // test edx, edx 900 JLE LBB18_4 901 WORD $0xc283; BYTE $0x01 // add edx, 1 902 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 903 904 LBB18_3: 905 LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] 906 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 907 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 908 LONG $0x02c08349 // add r8, 2 909 WORD $0xc283; BYTE $0xff // add edx, -1 910 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 911 JG LBB18_3 912 913 LBB18_4: 914 RET 915 916 TEXT ·_transpose_int16_uint16_sse4(SB), $0-32 917 918 MOVQ src+0(FP), DI 919 MOVQ dest+8(FP), SI 920 MOVQ length+16(FP), DX 921 MOVQ transposeMap+24(FP), CX 922 923 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 924 JL LBB19_1 925 926 LBB19_5: 927 WORD $0xd089 // mov eax, edx 928 LONG $0x17bf0f48 // movsx rdx, word [rdi] 929 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 930 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 931 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 932 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 933 LONG $0x02568966 // mov word [rsi + 2], dx 934 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 935 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 936 LONG $0x04568966 // mov word [rsi + 4], dx 937 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 938 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 939 LONG $0x06568966 // mov word [rsi + 6], dx 940 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 941 LONG $0x08c78348 // add rdi, 8 942 LONG $0x08c68348 // add rsi, 8 943 WORD $0xf883; BYTE $0x07 // cmp eax, 7 944 JG LBB19_5 945 946 LBB19_1: 947 WORD $0xd285 // test edx, edx 948 JLE LBB19_4 949 WORD $0xc283; BYTE $0x01 // add edx, 1 950 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 951 952 LBB19_3: 953 LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] 954 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 955 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 956 LONG $0x02c08349 // add r8, 2 957 WORD $0xc283; BYTE $0xff // add edx, -1 958 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 959 JG LBB19_3 960 961 LBB19_4: 962 RET 963 964 TEXT ·_transpose_uint32_uint16_sse4(SB), $0-32 965 966 MOVQ src+0(FP), DI 967 MOVQ dest+8(FP), SI 968 MOVQ length+16(FP), DX 969 MOVQ transposeMap+24(FP), CX 970 971 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 972 JL LBB20_1 973 974 LBB20_5: 975 WORD $0xd089 // mov eax, edx 976 WORD $0x178b // mov edx, dword [rdi] 977 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 978 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 979 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 980 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 981 LONG $0x02568966 // mov word [rsi + 2], dx 982 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 983 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 984 LONG $0x04568966 // mov word [rsi + 4], dx 985 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 986 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 987 LONG $0x06568966 // mov word [rsi + 6], dx 988 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 989 LONG $0x10c78348 // add rdi, 16 990 LONG $0x08c68348 // add rsi, 8 991 WORD $0xf883; BYTE $0x07 // cmp eax, 7 992 JG LBB20_5 993 994 LBB20_1: 995 WORD $0xd285 // test edx, edx 996 JLE LBB20_4 997 WORD $0xc283; BYTE $0x01 // add edx, 1 998 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 999 1000 LBB20_3: 1001 LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] 1002 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1003 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1004 LONG $0x02c08349 // add r8, 2 1005 WORD $0xc283; BYTE $0xff // add edx, -1 1006 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1007 JG LBB20_3 1008 1009 LBB20_4: 1010 RET 1011 1012 TEXT ·_transpose_int32_uint16_sse4(SB), $0-32 1013 1014 MOVQ src+0(FP), DI 1015 MOVQ dest+8(FP), SI 1016 MOVQ length+16(FP), DX 1017 MOVQ transposeMap+24(FP), CX 1018 1019 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1020 JL LBB21_1 1021 1022 LBB21_5: 1023 WORD $0xd089 // mov eax, edx 1024 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 1025 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1026 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1027 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 1028 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1029 LONG $0x02568966 // mov word [rsi + 2], dx 1030 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 1031 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1032 LONG $0x04568966 // mov word [rsi + 4], dx 1033 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 1034 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1035 LONG $0x06568966 // mov word [rsi + 6], dx 1036 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1037 LONG $0x10c78348 // add rdi, 16 1038 LONG $0x08c68348 // add rsi, 8 1039 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1040 JG LBB21_5 1041 1042 LBB21_1: 1043 WORD $0xd285 // test edx, edx 1044 JLE LBB21_4 1045 WORD $0xc283; BYTE $0x01 // add edx, 1 1046 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1047 1048 LBB21_3: 1049 LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] 1050 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1051 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1052 LONG $0x02c08349 // add r8, 2 1053 WORD $0xc283; BYTE $0xff // add edx, -1 1054 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1055 JG LBB21_3 1056 1057 LBB21_4: 1058 RET 1059 1060 TEXT ·_transpose_uint64_uint16_sse4(SB), $0-32 1061 1062 MOVQ src+0(FP), DI 1063 MOVQ dest+8(FP), SI 1064 MOVQ length+16(FP), DX 1065 MOVQ transposeMap+24(FP), CX 1066 1067 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1068 JL LBB22_1 1069 1070 LBB22_5: 1071 WORD $0xd089 // mov eax, edx 1072 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 1073 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1074 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1075 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 1076 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1077 LONG $0x02568966 // mov word [rsi + 2], dx 1078 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 1079 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1080 LONG $0x04568966 // mov word [rsi + 4], dx 1081 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 1082 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1083 LONG $0x06568966 // mov word [rsi + 6], dx 1084 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1085 LONG $0x20c78348 // add rdi, 32 1086 LONG $0x08c68348 // add rsi, 8 1087 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1088 JG LBB22_5 1089 1090 LBB22_1: 1091 WORD $0xd285 // test edx, edx 1092 JLE LBB22_4 1093 WORD $0xc283; BYTE $0x01 // add edx, 1 1094 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1095 1096 LBB22_3: 1097 LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] 1098 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1099 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1100 LONG $0x02c08349 // add r8, 2 1101 WORD $0xc283; BYTE $0xff // add edx, -1 1102 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1103 JG LBB22_3 1104 1105 LBB22_4: 1106 RET 1107 1108 TEXT ·_transpose_int64_uint16_sse4(SB), $0-32 1109 1110 MOVQ src+0(FP), DI 1111 MOVQ dest+8(FP), SI 1112 MOVQ length+16(FP), DX 1113 MOVQ transposeMap+24(FP), CX 1114 1115 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1116 JL LBB23_1 1117 1118 LBB23_5: 1119 WORD $0xd089 // mov eax, edx 1120 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 1121 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1122 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1123 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 1124 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1125 LONG $0x02568966 // mov word [rsi + 2], dx 1126 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 1127 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1128 LONG $0x04568966 // mov word [rsi + 4], dx 1129 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 1130 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1131 LONG $0x06568966 // mov word [rsi + 6], dx 1132 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1133 LONG $0x20c78348 // add rdi, 32 1134 LONG $0x08c68348 // add rsi, 8 1135 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1136 JG LBB23_5 1137 1138 LBB23_1: 1139 WORD $0xd285 // test edx, edx 1140 JLE LBB23_4 1141 WORD $0xc283; BYTE $0x01 // add edx, 1 1142 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1143 1144 LBB23_3: 1145 LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] 1146 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1147 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1148 LONG $0x02c08349 // add r8, 2 1149 WORD $0xc283; BYTE $0xff // add edx, -1 1150 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1151 JG LBB23_3 1152 1153 LBB23_4: 1154 RET 1155 1156 TEXT ·_transpose_uint8_int16_sse4(SB), $0-32 1157 1158 MOVQ src+0(FP), DI 1159 MOVQ dest+8(FP), SI 1160 MOVQ length+16(FP), DX 1161 MOVQ transposeMap+24(FP), CX 1162 1163 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1164 JL LBB24_1 1165 1166 LBB24_5: 1167 WORD $0xd089 // mov eax, edx 1168 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 1169 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1170 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1171 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 1172 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1173 LONG $0x02568966 // mov word [rsi + 2], dx 1174 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 1175 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1176 LONG $0x04568966 // mov word [rsi + 4], dx 1177 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 1178 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1179 LONG $0x06568966 // mov word [rsi + 6], dx 1180 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1181 LONG $0x04c78348 // add rdi, 4 1182 LONG $0x08c68348 // add rsi, 8 1183 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1184 JG LBB24_5 1185 1186 LBB24_1: 1187 WORD $0xd285 // test edx, edx 1188 JLE LBB24_4 1189 WORD $0xc283; BYTE $0x01 // add edx, 1 1190 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1191 1192 LBB24_3: 1193 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 1194 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1195 LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax 1196 LONG $0x01c08349 // add r8, 1 1197 WORD $0xc283; BYTE $0xff // add edx, -1 1198 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1199 JG LBB24_3 1200 1201 LBB24_4: 1202 RET 1203 1204 TEXT ·_transpose_int8_int16_sse4(SB), $0-32 1205 1206 MOVQ src+0(FP), DI 1207 MOVQ dest+8(FP), SI 1208 MOVQ length+16(FP), DX 1209 MOVQ transposeMap+24(FP), CX 1210 1211 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1212 JL LBB25_1 1213 1214 LBB25_5: 1215 WORD $0xd089 // mov eax, edx 1216 LONG $0x17be0f48 // movsx rdx, byte [rdi] 1217 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1218 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1219 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 1220 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1221 LONG $0x02568966 // mov word [rsi + 2], dx 1222 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 1223 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1224 LONG $0x04568966 // mov word [rsi + 4], dx 1225 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 1226 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1227 LONG $0x06568966 // mov word [rsi + 6], dx 1228 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1229 LONG $0x04c78348 // add rdi, 4 1230 LONG $0x08c68348 // add rsi, 8 1231 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1232 JG LBB25_5 1233 1234 LBB25_1: 1235 WORD $0xd285 // test edx, edx 1236 JLE LBB25_4 1237 WORD $0xc283; BYTE $0x01 // add edx, 1 1238 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1239 1240 LBB25_3: 1241 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 1242 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1243 LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax 1244 LONG $0x01c08349 // add r8, 1 1245 WORD $0xc283; BYTE $0xff // add edx, -1 1246 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1247 JG LBB25_3 1248 1249 LBB25_4: 1250 RET 1251 1252 TEXT ·_transpose_uint16_int16_sse4(SB), $0-32 1253 1254 MOVQ src+0(FP), DI 1255 MOVQ dest+8(FP), SI 1256 MOVQ length+16(FP), DX 1257 MOVQ transposeMap+24(FP), CX 1258 1259 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1260 JL LBB26_1 1261 1262 LBB26_5: 1263 WORD $0xd089 // mov eax, edx 1264 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 1265 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1266 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1267 LONG $0x0257b70f // movzx edx, word [rdi + 2] 1268 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1269 LONG $0x02568966 // mov word [rsi + 2], dx 1270 LONG $0x0457b70f // movzx edx, word [rdi + 4] 1271 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1272 LONG $0x04568966 // mov word [rsi + 4], dx 1273 LONG $0x0657b70f // movzx edx, word [rdi + 6] 1274 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1275 LONG $0x06568966 // mov word [rsi + 6], dx 1276 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1277 LONG $0x08c78348 // add rdi, 8 1278 LONG $0x08c68348 // add rsi, 8 1279 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1280 JG LBB26_5 1281 1282 LBB26_1: 1283 WORD $0xd285 // test edx, edx 1284 JLE LBB26_4 1285 WORD $0xc283; BYTE $0x01 // add edx, 1 1286 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1287 1288 LBB26_3: 1289 LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] 1290 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1291 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1292 LONG $0x02c08349 // add r8, 2 1293 WORD $0xc283; BYTE $0xff // add edx, -1 1294 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1295 JG LBB26_3 1296 1297 LBB26_4: 1298 RET 1299 1300 TEXT ·_transpose_int16_int16_sse4(SB), $0-32 1301 1302 MOVQ src+0(FP), DI 1303 MOVQ dest+8(FP), SI 1304 MOVQ length+16(FP), DX 1305 MOVQ transposeMap+24(FP), CX 1306 1307 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1308 JL LBB27_1 1309 1310 LBB27_5: 1311 WORD $0xd089 // mov eax, edx 1312 LONG $0x17bf0f48 // movsx rdx, word [rdi] 1313 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1314 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1315 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 1316 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1317 LONG $0x02568966 // mov word [rsi + 2], dx 1318 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 1319 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1320 LONG $0x04568966 // mov word [rsi + 4], dx 1321 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 1322 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1323 LONG $0x06568966 // mov word [rsi + 6], dx 1324 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1325 LONG $0x08c78348 // add rdi, 8 1326 LONG $0x08c68348 // add rsi, 8 1327 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1328 JG LBB27_5 1329 1330 LBB27_1: 1331 WORD $0xd285 // test edx, edx 1332 JLE LBB27_4 1333 WORD $0xc283; BYTE $0x01 // add edx, 1 1334 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1335 1336 LBB27_3: 1337 LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] 1338 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1339 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1340 LONG $0x02c08349 // add r8, 2 1341 WORD $0xc283; BYTE $0xff // add edx, -1 1342 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1343 JG LBB27_3 1344 1345 LBB27_4: 1346 RET 1347 1348 TEXT ·_transpose_uint32_int16_sse4(SB), $0-32 1349 1350 MOVQ src+0(FP), DI 1351 MOVQ dest+8(FP), SI 1352 MOVQ length+16(FP), DX 1353 MOVQ transposeMap+24(FP), CX 1354 1355 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1356 JL LBB28_1 1357 1358 LBB28_5: 1359 WORD $0xd089 // mov eax, edx 1360 WORD $0x178b // mov edx, dword [rdi] 1361 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1362 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1363 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 1364 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1365 LONG $0x02568966 // mov word [rsi + 2], dx 1366 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 1367 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1368 LONG $0x04568966 // mov word [rsi + 4], dx 1369 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 1370 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1371 LONG $0x06568966 // mov word [rsi + 6], dx 1372 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1373 LONG $0x10c78348 // add rdi, 16 1374 LONG $0x08c68348 // add rsi, 8 1375 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1376 JG LBB28_5 1377 1378 LBB28_1: 1379 WORD $0xd285 // test edx, edx 1380 JLE LBB28_4 1381 WORD $0xc283; BYTE $0x01 // add edx, 1 1382 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1383 1384 LBB28_3: 1385 LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] 1386 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1387 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1388 LONG $0x02c08349 // add r8, 2 1389 WORD $0xc283; BYTE $0xff // add edx, -1 1390 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1391 JG LBB28_3 1392 1393 LBB28_4: 1394 RET 1395 1396 TEXT ·_transpose_int32_int16_sse4(SB), $0-32 1397 1398 MOVQ src+0(FP), DI 1399 MOVQ dest+8(FP), SI 1400 MOVQ length+16(FP), DX 1401 MOVQ transposeMap+24(FP), CX 1402 1403 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1404 JL LBB29_1 1405 1406 LBB29_5: 1407 WORD $0xd089 // mov eax, edx 1408 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 1409 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1410 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1411 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 1412 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1413 LONG $0x02568966 // mov word [rsi + 2], dx 1414 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 1415 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1416 LONG $0x04568966 // mov word [rsi + 4], dx 1417 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 1418 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1419 LONG $0x06568966 // mov word [rsi + 6], dx 1420 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1421 LONG $0x10c78348 // add rdi, 16 1422 LONG $0x08c68348 // add rsi, 8 1423 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1424 JG LBB29_5 1425 1426 LBB29_1: 1427 WORD $0xd285 // test edx, edx 1428 JLE LBB29_4 1429 WORD $0xc283; BYTE $0x01 // add edx, 1 1430 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1431 1432 LBB29_3: 1433 LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] 1434 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1435 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1436 LONG $0x02c08349 // add r8, 2 1437 WORD $0xc283; BYTE $0xff // add edx, -1 1438 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1439 JG LBB29_3 1440 1441 LBB29_4: 1442 RET 1443 1444 TEXT ·_transpose_uint64_int16_sse4(SB), $0-32 1445 1446 MOVQ src+0(FP), DI 1447 MOVQ dest+8(FP), SI 1448 MOVQ length+16(FP), DX 1449 MOVQ transposeMap+24(FP), CX 1450 1451 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1452 JL LBB30_1 1453 1454 LBB30_5: 1455 WORD $0xd089 // mov eax, edx 1456 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 1457 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1458 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1459 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 1460 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1461 LONG $0x02568966 // mov word [rsi + 2], dx 1462 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 1463 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1464 LONG $0x04568966 // mov word [rsi + 4], dx 1465 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 1466 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1467 LONG $0x06568966 // mov word [rsi + 6], dx 1468 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1469 LONG $0x20c78348 // add rdi, 32 1470 LONG $0x08c68348 // add rsi, 8 1471 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1472 JG LBB30_5 1473 1474 LBB30_1: 1475 WORD $0xd285 // test edx, edx 1476 JLE LBB30_4 1477 WORD $0xc283; BYTE $0x01 // add edx, 1 1478 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1479 1480 LBB30_3: 1481 LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] 1482 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1483 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1484 LONG $0x02c08349 // add r8, 2 1485 WORD $0xc283; BYTE $0xff // add edx, -1 1486 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1487 JG LBB30_3 1488 1489 LBB30_4: 1490 RET 1491 1492 TEXT ·_transpose_int64_int16_sse4(SB), $0-32 1493 1494 MOVQ src+0(FP), DI 1495 MOVQ dest+8(FP), SI 1496 MOVQ length+16(FP), DX 1497 MOVQ transposeMap+24(FP), CX 1498 1499 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1500 JL LBB31_1 1501 1502 LBB31_5: 1503 WORD $0xd089 // mov eax, edx 1504 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 1505 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1506 WORD $0x8966; BYTE $0x16 // mov word [rsi], dx 1507 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 1508 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1509 LONG $0x02568966 // mov word [rsi + 2], dx 1510 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 1511 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1512 LONG $0x04568966 // mov word [rsi + 4], dx 1513 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 1514 LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] 1515 LONG $0x06568966 // mov word [rsi + 6], dx 1516 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1517 LONG $0x20c78348 // add rdi, 32 1518 LONG $0x08c68348 // add rsi, 8 1519 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1520 JG LBB31_5 1521 1522 LBB31_1: 1523 WORD $0xd285 // test edx, edx 1524 JLE LBB31_4 1525 WORD $0xc283; BYTE $0x01 // add edx, 1 1526 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1527 1528 LBB31_3: 1529 LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] 1530 LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] 1531 LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax 1532 LONG $0x02c08349 // add r8, 2 1533 WORD $0xc283; BYTE $0xff // add edx, -1 1534 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1535 JG LBB31_3 1536 1537 LBB31_4: 1538 RET 1539 1540 TEXT ·_transpose_uint8_uint32_sse4(SB), $0-32 1541 1542 MOVQ src+0(FP), DI 1543 MOVQ dest+8(FP), SI 1544 MOVQ length+16(FP), DX 1545 MOVQ transposeMap+24(FP), CX 1546 1547 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1548 JL LBB32_1 1549 1550 LBB32_5: 1551 WORD $0xd089 // mov eax, edx 1552 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 1553 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1554 WORD $0x1689 // mov dword [rsi], edx 1555 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 1556 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1557 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1558 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 1559 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1560 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1561 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 1562 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1563 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1564 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1565 LONG $0x04c78348 // add rdi, 4 1566 LONG $0x10c68348 // add rsi, 16 1567 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1568 JG LBB32_5 1569 1570 LBB32_1: 1571 WORD $0xd285 // test edx, edx 1572 JLE LBB32_4 1573 WORD $0xc283; BYTE $0x01 // add edx, 1 1574 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1575 1576 LBB32_3: 1577 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 1578 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1579 LONG $0x86048942 // mov dword [rsi + 4*r8], eax 1580 LONG $0x01c08349 // add r8, 1 1581 WORD $0xc283; BYTE $0xff // add edx, -1 1582 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1583 JG LBB32_3 1584 1585 LBB32_4: 1586 RET 1587 1588 TEXT ·_transpose_int8_uint32_sse4(SB), $0-32 1589 1590 MOVQ src+0(FP), DI 1591 MOVQ dest+8(FP), SI 1592 MOVQ length+16(FP), DX 1593 MOVQ transposeMap+24(FP), CX 1594 1595 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1596 JL LBB33_1 1597 1598 LBB33_5: 1599 WORD $0xd089 // mov eax, edx 1600 LONG $0x17be0f48 // movsx rdx, byte [rdi] 1601 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1602 WORD $0x1689 // mov dword [rsi], edx 1603 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 1604 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1605 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1606 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 1607 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1608 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1609 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 1610 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1611 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1612 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1613 LONG $0x04c78348 // add rdi, 4 1614 LONG $0x10c68348 // add rsi, 16 1615 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1616 JG LBB33_5 1617 1618 LBB33_1: 1619 WORD $0xd285 // test edx, edx 1620 JLE LBB33_4 1621 WORD $0xc283; BYTE $0x01 // add edx, 1 1622 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1623 1624 LBB33_3: 1625 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 1626 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1627 LONG $0x86048942 // mov dword [rsi + 4*r8], eax 1628 LONG $0x01c08349 // add r8, 1 1629 WORD $0xc283; BYTE $0xff // add edx, -1 1630 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1631 JG LBB33_3 1632 1633 LBB33_4: 1634 RET 1635 1636 TEXT ·_transpose_uint16_uint32_sse4(SB), $0-32 1637 1638 MOVQ src+0(FP), DI 1639 MOVQ dest+8(FP), SI 1640 MOVQ length+16(FP), DX 1641 MOVQ transposeMap+24(FP), CX 1642 1643 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1644 JL LBB34_1 1645 1646 LBB34_5: 1647 WORD $0xd089 // mov eax, edx 1648 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 1649 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1650 WORD $0x1689 // mov dword [rsi], edx 1651 LONG $0x0257b70f // movzx edx, word [rdi + 2] 1652 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1653 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1654 LONG $0x0457b70f // movzx edx, word [rdi + 4] 1655 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1656 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1657 LONG $0x0657b70f // movzx edx, word [rdi + 6] 1658 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1659 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1660 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1661 LONG $0x08c78348 // add rdi, 8 1662 LONG $0x10c68348 // add rsi, 16 1663 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1664 JG LBB34_5 1665 1666 LBB34_1: 1667 WORD $0xd285 // test edx, edx 1668 JLE LBB34_4 1669 WORD $0xc283; BYTE $0x01 // add edx, 1 1670 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1671 1672 LBB34_3: 1673 LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] 1674 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1675 LONG $0x46048942 // mov dword [rsi + 2*r8], eax 1676 LONG $0x02c08349 // add r8, 2 1677 WORD $0xc283; BYTE $0xff // add edx, -1 1678 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1679 JG LBB34_3 1680 1681 LBB34_4: 1682 RET 1683 1684 TEXT ·_transpose_int16_uint32_sse4(SB), $0-32 1685 1686 MOVQ src+0(FP), DI 1687 MOVQ dest+8(FP), SI 1688 MOVQ length+16(FP), DX 1689 MOVQ transposeMap+24(FP), CX 1690 1691 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1692 JL LBB35_1 1693 1694 LBB35_5: 1695 WORD $0xd089 // mov eax, edx 1696 LONG $0x17bf0f48 // movsx rdx, word [rdi] 1697 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1698 WORD $0x1689 // mov dword [rsi], edx 1699 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 1700 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1701 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1702 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 1703 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1704 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1705 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 1706 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1707 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1708 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1709 LONG $0x08c78348 // add rdi, 8 1710 LONG $0x10c68348 // add rsi, 16 1711 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1712 JG LBB35_5 1713 1714 LBB35_1: 1715 WORD $0xd285 // test edx, edx 1716 JLE LBB35_4 1717 WORD $0xc283; BYTE $0x01 // add edx, 1 1718 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1719 1720 LBB35_3: 1721 LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] 1722 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1723 LONG $0x46048942 // mov dword [rsi + 2*r8], eax 1724 LONG $0x02c08349 // add r8, 2 1725 WORD $0xc283; BYTE $0xff // add edx, -1 1726 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1727 JG LBB35_3 1728 1729 LBB35_4: 1730 RET 1731 1732 TEXT ·_transpose_uint32_uint32_sse4(SB), $0-32 1733 1734 MOVQ src+0(FP), DI 1735 MOVQ dest+8(FP), SI 1736 MOVQ length+16(FP), DX 1737 MOVQ transposeMap+24(FP), CX 1738 1739 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1740 JL LBB36_1 1741 1742 LBB36_5: 1743 WORD $0xd089 // mov eax, edx 1744 WORD $0x178b // mov edx, dword [rdi] 1745 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1746 WORD $0x1689 // mov dword [rsi], edx 1747 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 1748 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1749 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1750 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 1751 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1752 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1753 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 1754 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1755 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1756 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1757 LONG $0x10c78348 // add rdi, 16 1758 LONG $0x10c68348 // add rsi, 16 1759 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1760 JG LBB36_5 1761 1762 LBB36_1: 1763 WORD $0xd285 // test edx, edx 1764 JLE LBB36_4 1765 WORD $0xc283; BYTE $0x01 // add edx, 1 1766 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1767 1768 LBB36_3: 1769 LONG $0x07048b42 // mov eax, dword [rdi + r8] 1770 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1771 LONG $0x06048942 // mov dword [rsi + r8], eax 1772 LONG $0x04c08349 // add r8, 4 1773 WORD $0xc283; BYTE $0xff // add edx, -1 1774 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1775 JG LBB36_3 1776 1777 LBB36_4: 1778 RET 1779 1780 TEXT ·_transpose_int32_uint32_sse4(SB), $0-32 1781 1782 MOVQ src+0(FP), DI 1783 MOVQ dest+8(FP), SI 1784 MOVQ length+16(FP), DX 1785 MOVQ transposeMap+24(FP), CX 1786 1787 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1788 JL LBB37_1 1789 1790 LBB37_5: 1791 WORD $0xd089 // mov eax, edx 1792 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 1793 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1794 WORD $0x1689 // mov dword [rsi], edx 1795 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 1796 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1797 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1798 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 1799 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1800 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1801 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 1802 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1803 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1804 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1805 LONG $0x10c78348 // add rdi, 16 1806 LONG $0x10c68348 // add rsi, 16 1807 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1808 JG LBB37_5 1809 1810 LBB37_1: 1811 WORD $0xd285 // test edx, edx 1812 JLE LBB37_4 1813 WORD $0xc283; BYTE $0x01 // add edx, 1 1814 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1815 1816 LBB37_3: 1817 LONG $0x0704634a // movsxd rax, dword [rdi + r8] 1818 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1819 LONG $0x06048942 // mov dword [rsi + r8], eax 1820 LONG $0x04c08349 // add r8, 4 1821 WORD $0xc283; BYTE $0xff // add edx, -1 1822 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1823 JG LBB37_3 1824 1825 LBB37_4: 1826 RET 1827 1828 TEXT ·_transpose_uint64_uint32_sse4(SB), $0-32 1829 1830 MOVQ src+0(FP), DI 1831 MOVQ dest+8(FP), SI 1832 MOVQ length+16(FP), DX 1833 MOVQ transposeMap+24(FP), CX 1834 1835 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1836 JL LBB38_1 1837 1838 LBB38_5: 1839 WORD $0xd089 // mov eax, edx 1840 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 1841 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1842 WORD $0x1689 // mov dword [rsi], edx 1843 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 1844 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1845 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1846 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 1847 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1848 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1849 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 1850 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1851 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1852 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1853 LONG $0x20c78348 // add rdi, 32 1854 LONG $0x10c68348 // add rsi, 16 1855 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1856 JG LBB38_5 1857 1858 LBB38_1: 1859 WORD $0xd285 // test edx, edx 1860 JLE LBB38_4 1861 WORD $0xc283; BYTE $0x01 // add edx, 1 1862 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1863 1864 LBB38_3: 1865 LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] 1866 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1867 LONG $0x06048942 // mov dword [rsi + r8], eax 1868 LONG $0x04c08349 // add r8, 4 1869 WORD $0xc283; BYTE $0xff // add edx, -1 1870 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1871 JG LBB38_3 1872 1873 LBB38_4: 1874 RET 1875 1876 TEXT ·_transpose_int64_uint32_sse4(SB), $0-32 1877 1878 MOVQ src+0(FP), DI 1879 MOVQ dest+8(FP), SI 1880 MOVQ length+16(FP), DX 1881 MOVQ transposeMap+24(FP), CX 1882 1883 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1884 JL LBB39_1 1885 1886 LBB39_5: 1887 WORD $0xd089 // mov eax, edx 1888 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 1889 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1890 WORD $0x1689 // mov dword [rsi], edx 1891 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 1892 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1893 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1894 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 1895 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1896 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1897 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 1898 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1899 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1900 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1901 LONG $0x20c78348 // add rdi, 32 1902 LONG $0x10c68348 // add rsi, 16 1903 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1904 JG LBB39_5 1905 1906 LBB39_1: 1907 WORD $0xd285 // test edx, edx 1908 JLE LBB39_4 1909 WORD $0xc283; BYTE $0x01 // add edx, 1 1910 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1911 1912 LBB39_3: 1913 LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] 1914 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1915 LONG $0x06048942 // mov dword [rsi + r8], eax 1916 LONG $0x04c08349 // add r8, 4 1917 WORD $0xc283; BYTE $0xff // add edx, -1 1918 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1919 JG LBB39_3 1920 1921 LBB39_4: 1922 RET 1923 1924 TEXT ·_transpose_uint8_int32_sse4(SB), $0-32 1925 1926 MOVQ src+0(FP), DI 1927 MOVQ dest+8(FP), SI 1928 MOVQ length+16(FP), DX 1929 MOVQ transposeMap+24(FP), CX 1930 1931 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1932 JL LBB40_1 1933 1934 LBB40_5: 1935 WORD $0xd089 // mov eax, edx 1936 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 1937 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1938 WORD $0x1689 // mov dword [rsi], edx 1939 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 1940 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1941 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1942 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 1943 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1944 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1945 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 1946 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1947 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1948 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1949 LONG $0x04c78348 // add rdi, 4 1950 LONG $0x10c68348 // add rsi, 16 1951 WORD $0xf883; BYTE $0x07 // cmp eax, 7 1952 JG LBB40_5 1953 1954 LBB40_1: 1955 WORD $0xd285 // test edx, edx 1956 JLE LBB40_4 1957 WORD $0xc283; BYTE $0x01 // add edx, 1 1958 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 1959 1960 LBB40_3: 1961 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 1962 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 1963 LONG $0x86048942 // mov dword [rsi + 4*r8], eax 1964 LONG $0x01c08349 // add r8, 1 1965 WORD $0xc283; BYTE $0xff // add edx, -1 1966 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 1967 JG LBB40_3 1968 1969 LBB40_4: 1970 RET 1971 1972 TEXT ·_transpose_int8_int32_sse4(SB), $0-32 1973 1974 MOVQ src+0(FP), DI 1975 MOVQ dest+8(FP), SI 1976 MOVQ length+16(FP), DX 1977 MOVQ transposeMap+24(FP), CX 1978 1979 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 1980 JL LBB41_1 1981 1982 LBB41_5: 1983 WORD $0xd089 // mov eax, edx 1984 LONG $0x17be0f48 // movsx rdx, byte [rdi] 1985 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1986 WORD $0x1689 // mov dword [rsi], edx 1987 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 1988 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1989 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 1990 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 1991 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1992 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 1993 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 1994 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 1995 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 1996 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 1997 LONG $0x04c78348 // add rdi, 4 1998 LONG $0x10c68348 // add rsi, 16 1999 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2000 JG LBB41_5 2001 2002 LBB41_1: 2003 WORD $0xd285 // test edx, edx 2004 JLE LBB41_4 2005 WORD $0xc283; BYTE $0x01 // add edx, 1 2006 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2007 2008 LBB41_3: 2009 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 2010 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2011 LONG $0x86048942 // mov dword [rsi + 4*r8], eax 2012 LONG $0x01c08349 // add r8, 1 2013 WORD $0xc283; BYTE $0xff // add edx, -1 2014 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2015 JG LBB41_3 2016 2017 LBB41_4: 2018 RET 2019 2020 TEXT ·_transpose_uint16_int32_sse4(SB), $0-32 2021 2022 MOVQ src+0(FP), DI 2023 MOVQ dest+8(FP), SI 2024 MOVQ length+16(FP), DX 2025 MOVQ transposeMap+24(FP), CX 2026 2027 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2028 JL LBB42_1 2029 2030 LBB42_5: 2031 WORD $0xd089 // mov eax, edx 2032 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 2033 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2034 WORD $0x1689 // mov dword [rsi], edx 2035 LONG $0x0257b70f // movzx edx, word [rdi + 2] 2036 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2037 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 2038 LONG $0x0457b70f // movzx edx, word [rdi + 4] 2039 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2040 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 2041 LONG $0x0657b70f // movzx edx, word [rdi + 6] 2042 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2043 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 2044 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2045 LONG $0x08c78348 // add rdi, 8 2046 LONG $0x10c68348 // add rsi, 16 2047 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2048 JG LBB42_5 2049 2050 LBB42_1: 2051 WORD $0xd285 // test edx, edx 2052 JLE LBB42_4 2053 WORD $0xc283; BYTE $0x01 // add edx, 1 2054 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2055 2056 LBB42_3: 2057 LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] 2058 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2059 LONG $0x46048942 // mov dword [rsi + 2*r8], eax 2060 LONG $0x02c08349 // add r8, 2 2061 WORD $0xc283; BYTE $0xff // add edx, -1 2062 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2063 JG LBB42_3 2064 2065 LBB42_4: 2066 RET 2067 2068 TEXT ·_transpose_int16_int32_sse4(SB), $0-32 2069 2070 MOVQ src+0(FP), DI 2071 MOVQ dest+8(FP), SI 2072 MOVQ length+16(FP), DX 2073 MOVQ transposeMap+24(FP), CX 2074 2075 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2076 JL LBB43_1 2077 2078 LBB43_5: 2079 WORD $0xd089 // mov eax, edx 2080 LONG $0x17bf0f48 // movsx rdx, word [rdi] 2081 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2082 WORD $0x1689 // mov dword [rsi], edx 2083 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 2084 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2085 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 2086 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 2087 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2088 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 2089 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 2090 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2091 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 2092 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2093 LONG $0x08c78348 // add rdi, 8 2094 LONG $0x10c68348 // add rsi, 16 2095 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2096 JG LBB43_5 2097 2098 LBB43_1: 2099 WORD $0xd285 // test edx, edx 2100 JLE LBB43_4 2101 WORD $0xc283; BYTE $0x01 // add edx, 1 2102 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2103 2104 LBB43_3: 2105 LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] 2106 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2107 LONG $0x46048942 // mov dword [rsi + 2*r8], eax 2108 LONG $0x02c08349 // add r8, 2 2109 WORD $0xc283; BYTE $0xff // add edx, -1 2110 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2111 JG LBB43_3 2112 2113 LBB43_4: 2114 RET 2115 2116 TEXT ·_transpose_uint32_int32_sse4(SB), $0-32 2117 2118 MOVQ src+0(FP), DI 2119 MOVQ dest+8(FP), SI 2120 MOVQ length+16(FP), DX 2121 MOVQ transposeMap+24(FP), CX 2122 2123 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2124 JL LBB44_1 2125 2126 LBB44_5: 2127 WORD $0xd089 // mov eax, edx 2128 WORD $0x178b // mov edx, dword [rdi] 2129 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2130 WORD $0x1689 // mov dword [rsi], edx 2131 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 2132 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2133 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 2134 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 2135 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2136 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 2137 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 2138 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2139 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 2140 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2141 LONG $0x10c78348 // add rdi, 16 2142 LONG $0x10c68348 // add rsi, 16 2143 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2144 JG LBB44_5 2145 2146 LBB44_1: 2147 WORD $0xd285 // test edx, edx 2148 JLE LBB44_4 2149 WORD $0xc283; BYTE $0x01 // add edx, 1 2150 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2151 2152 LBB44_3: 2153 LONG $0x07048b42 // mov eax, dword [rdi + r8] 2154 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2155 LONG $0x06048942 // mov dword [rsi + r8], eax 2156 LONG $0x04c08349 // add r8, 4 2157 WORD $0xc283; BYTE $0xff // add edx, -1 2158 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2159 JG LBB44_3 2160 2161 LBB44_4: 2162 RET 2163 2164 TEXT ·_transpose_int32_int32_sse4(SB), $0-32 2165 2166 MOVQ src+0(FP), DI 2167 MOVQ dest+8(FP), SI 2168 MOVQ length+16(FP), DX 2169 MOVQ transposeMap+24(FP), CX 2170 2171 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2172 JL LBB45_1 2173 2174 LBB45_5: 2175 WORD $0xd089 // mov eax, edx 2176 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 2177 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2178 WORD $0x1689 // mov dword [rsi], edx 2179 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 2180 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2181 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 2182 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 2183 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2184 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 2185 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 2186 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2187 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 2188 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2189 LONG $0x10c78348 // add rdi, 16 2190 LONG $0x10c68348 // add rsi, 16 2191 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2192 JG LBB45_5 2193 2194 LBB45_1: 2195 WORD $0xd285 // test edx, edx 2196 JLE LBB45_4 2197 WORD $0xc283; BYTE $0x01 // add edx, 1 2198 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2199 2200 LBB45_3: 2201 LONG $0x0704634a // movsxd rax, dword [rdi + r8] 2202 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2203 LONG $0x06048942 // mov dword [rsi + r8], eax 2204 LONG $0x04c08349 // add r8, 4 2205 WORD $0xc283; BYTE $0xff // add edx, -1 2206 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2207 JG LBB45_3 2208 2209 LBB45_4: 2210 RET 2211 2212 TEXT ·_transpose_uint64_int32_sse4(SB), $0-32 2213 2214 MOVQ src+0(FP), DI 2215 MOVQ dest+8(FP), SI 2216 MOVQ length+16(FP), DX 2217 MOVQ transposeMap+24(FP), CX 2218 2219 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2220 JL LBB46_1 2221 2222 LBB46_5: 2223 WORD $0xd089 // mov eax, edx 2224 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 2225 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2226 WORD $0x1689 // mov dword [rsi], edx 2227 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 2228 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2229 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 2230 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 2231 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2232 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 2233 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 2234 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2235 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 2236 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2237 LONG $0x20c78348 // add rdi, 32 2238 LONG $0x10c68348 // add rsi, 16 2239 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2240 JG LBB46_5 2241 2242 LBB46_1: 2243 WORD $0xd285 // test edx, edx 2244 JLE LBB46_4 2245 WORD $0xc283; BYTE $0x01 // add edx, 1 2246 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2247 2248 LBB46_3: 2249 LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] 2250 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2251 LONG $0x06048942 // mov dword [rsi + r8], eax 2252 LONG $0x04c08349 // add r8, 4 2253 WORD $0xc283; BYTE $0xff // add edx, -1 2254 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2255 JG LBB46_3 2256 2257 LBB46_4: 2258 RET 2259 2260 TEXT ·_transpose_int64_int32_sse4(SB), $0-32 2261 2262 MOVQ src+0(FP), DI 2263 MOVQ dest+8(FP), SI 2264 MOVQ length+16(FP), DX 2265 MOVQ transposeMap+24(FP), CX 2266 2267 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2268 JL LBB47_1 2269 2270 LBB47_5: 2271 WORD $0xd089 // mov eax, edx 2272 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 2273 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2274 WORD $0x1689 // mov dword [rsi], edx 2275 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 2276 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2277 WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx 2278 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 2279 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2280 WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx 2281 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 2282 WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] 2283 WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx 2284 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2285 LONG $0x20c78348 // add rdi, 32 2286 LONG $0x10c68348 // add rsi, 16 2287 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2288 JG LBB47_5 2289 2290 LBB47_1: 2291 WORD $0xd285 // test edx, edx 2292 JLE LBB47_4 2293 WORD $0xc283; BYTE $0x01 // add edx, 1 2294 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2295 2296 LBB47_3: 2297 LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] 2298 WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] 2299 LONG $0x06048942 // mov dword [rsi + r8], eax 2300 LONG $0x04c08349 // add r8, 4 2301 WORD $0xc283; BYTE $0xff // add edx, -1 2302 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2303 JG LBB47_3 2304 2305 LBB47_4: 2306 RET 2307 2308 TEXT ·_transpose_uint8_uint64_sse4(SB), $0-32 2309 2310 MOVQ src+0(FP), DI 2311 MOVQ dest+8(FP), SI 2312 MOVQ length+16(FP), DX 2313 MOVQ transposeMap+24(FP), CX 2314 2315 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2316 JL LBB48_1 2317 2318 LBB48_5: 2319 WORD $0xd089 // mov eax, edx 2320 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 2321 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2322 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2323 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 2324 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2325 LONG $0x08568948 // mov qword [rsi + 8], rdx 2326 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 2327 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2328 LONG $0x10568948 // mov qword [rsi + 16], rdx 2329 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 2330 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2331 LONG $0x18568948 // mov qword [rsi + 24], rdx 2332 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2333 LONG $0x04c78348 // add rdi, 4 2334 LONG $0x20c68348 // add rsi, 32 2335 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2336 JG LBB48_5 2337 2338 LBB48_1: 2339 WORD $0xd285 // test edx, edx 2340 JLE LBB48_4 2341 WORD $0xc283; BYTE $0x01 // add edx, 1 2342 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2343 2344 LBB48_3: 2345 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 2346 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2347 LONG $0xc604894a // mov qword [rsi + 8*r8], rax 2348 LONG $0x01c08349 // add r8, 1 2349 WORD $0xc283; BYTE $0xff // add edx, -1 2350 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2351 JG LBB48_3 2352 2353 LBB48_4: 2354 RET 2355 2356 TEXT ·_transpose_int8_uint64_sse4(SB), $0-32 2357 2358 MOVQ src+0(FP), DI 2359 MOVQ dest+8(FP), SI 2360 MOVQ length+16(FP), DX 2361 MOVQ transposeMap+24(FP), CX 2362 2363 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2364 JL LBB49_1 2365 2366 LBB49_5: 2367 WORD $0xd089 // mov eax, edx 2368 LONG $0x17be0f48 // movsx rdx, byte [rdi] 2369 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2370 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2371 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 2372 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2373 LONG $0x08568948 // mov qword [rsi + 8], rdx 2374 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 2375 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2376 LONG $0x10568948 // mov qword [rsi + 16], rdx 2377 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 2378 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2379 LONG $0x18568948 // mov qword [rsi + 24], rdx 2380 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2381 LONG $0x04c78348 // add rdi, 4 2382 LONG $0x20c68348 // add rsi, 32 2383 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2384 JG LBB49_5 2385 2386 LBB49_1: 2387 WORD $0xd285 // test edx, edx 2388 JLE LBB49_4 2389 WORD $0xc283; BYTE $0x01 // add edx, 1 2390 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2391 2392 LBB49_3: 2393 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 2394 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2395 LONG $0xc604894a // mov qword [rsi + 8*r8], rax 2396 LONG $0x01c08349 // add r8, 1 2397 WORD $0xc283; BYTE $0xff // add edx, -1 2398 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2399 JG LBB49_3 2400 2401 LBB49_4: 2402 RET 2403 2404 TEXT ·_transpose_uint16_uint64_sse4(SB), $0-32 2405 2406 MOVQ src+0(FP), DI 2407 MOVQ dest+8(FP), SI 2408 MOVQ length+16(FP), DX 2409 MOVQ transposeMap+24(FP), CX 2410 2411 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2412 JL LBB50_1 2413 2414 LBB50_5: 2415 WORD $0xd089 // mov eax, edx 2416 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 2417 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2418 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2419 LONG $0x0257b70f // movzx edx, word [rdi + 2] 2420 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2421 LONG $0x08568948 // mov qword [rsi + 8], rdx 2422 LONG $0x0457b70f // movzx edx, word [rdi + 4] 2423 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2424 LONG $0x10568948 // mov qword [rsi + 16], rdx 2425 LONG $0x0657b70f // movzx edx, word [rdi + 6] 2426 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2427 LONG $0x18568948 // mov qword [rsi + 24], rdx 2428 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2429 LONG $0x08c78348 // add rdi, 8 2430 LONG $0x20c68348 // add rsi, 32 2431 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2432 JG LBB50_5 2433 2434 LBB50_1: 2435 WORD $0xd285 // test edx, edx 2436 JLE LBB50_4 2437 WORD $0xc283; BYTE $0x01 // add edx, 1 2438 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2439 2440 LBB50_3: 2441 LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] 2442 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2443 LONG $0x8604894a // mov qword [rsi + 4*r8], rax 2444 LONG $0x02c08349 // add r8, 2 2445 WORD $0xc283; BYTE $0xff // add edx, -1 2446 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2447 JG LBB50_3 2448 2449 LBB50_4: 2450 RET 2451 2452 TEXT ·_transpose_int16_uint64_sse4(SB), $0-32 2453 2454 MOVQ src+0(FP), DI 2455 MOVQ dest+8(FP), SI 2456 MOVQ length+16(FP), DX 2457 MOVQ transposeMap+24(FP), CX 2458 2459 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2460 JL LBB51_1 2461 2462 LBB51_5: 2463 WORD $0xd089 // mov eax, edx 2464 LONG $0x17bf0f48 // movsx rdx, word [rdi] 2465 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2466 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2467 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 2468 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2469 LONG $0x08568948 // mov qword [rsi + 8], rdx 2470 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 2471 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2472 LONG $0x10568948 // mov qword [rsi + 16], rdx 2473 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 2474 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2475 LONG $0x18568948 // mov qword [rsi + 24], rdx 2476 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2477 LONG $0x08c78348 // add rdi, 8 2478 LONG $0x20c68348 // add rsi, 32 2479 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2480 JG LBB51_5 2481 2482 LBB51_1: 2483 WORD $0xd285 // test edx, edx 2484 JLE LBB51_4 2485 WORD $0xc283; BYTE $0x01 // add edx, 1 2486 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2487 2488 LBB51_3: 2489 LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] 2490 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2491 LONG $0x8604894a // mov qword [rsi + 4*r8], rax 2492 LONG $0x02c08349 // add r8, 2 2493 WORD $0xc283; BYTE $0xff // add edx, -1 2494 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2495 JG LBB51_3 2496 2497 LBB51_4: 2498 RET 2499 2500 TEXT ·_transpose_uint32_uint64_sse4(SB), $0-32 2501 2502 MOVQ src+0(FP), DI 2503 MOVQ dest+8(FP), SI 2504 MOVQ length+16(FP), DX 2505 MOVQ transposeMap+24(FP), CX 2506 2507 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2508 JL LBB52_1 2509 2510 LBB52_5: 2511 WORD $0xd089 // mov eax, edx 2512 WORD $0x178b // mov edx, dword [rdi] 2513 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2514 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2515 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 2516 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2517 LONG $0x08568948 // mov qword [rsi + 8], rdx 2518 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 2519 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2520 LONG $0x10568948 // mov qword [rsi + 16], rdx 2521 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 2522 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2523 LONG $0x18568948 // mov qword [rsi + 24], rdx 2524 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2525 LONG $0x10c78348 // add rdi, 16 2526 LONG $0x20c68348 // add rsi, 32 2527 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2528 JG LBB52_5 2529 2530 LBB52_1: 2531 WORD $0xd285 // test edx, edx 2532 JLE LBB52_4 2533 WORD $0xc283; BYTE $0x01 // add edx, 1 2534 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2535 2536 LBB52_3: 2537 LONG $0x07048b42 // mov eax, dword [rdi + r8] 2538 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2539 LONG $0x4604894a // mov qword [rsi + 2*r8], rax 2540 LONG $0x04c08349 // add r8, 4 2541 WORD $0xc283; BYTE $0xff // add edx, -1 2542 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2543 JG LBB52_3 2544 2545 LBB52_4: 2546 RET 2547 2548 TEXT ·_transpose_int32_uint64_sse4(SB), $0-32 2549 2550 MOVQ src+0(FP), DI 2551 MOVQ dest+8(FP), SI 2552 MOVQ length+16(FP), DX 2553 MOVQ transposeMap+24(FP), CX 2554 2555 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2556 JL LBB53_1 2557 2558 LBB53_5: 2559 WORD $0xd089 // mov eax, edx 2560 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 2561 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2562 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2563 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 2564 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2565 LONG $0x08568948 // mov qword [rsi + 8], rdx 2566 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 2567 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2568 LONG $0x10568948 // mov qword [rsi + 16], rdx 2569 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 2570 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2571 LONG $0x18568948 // mov qword [rsi + 24], rdx 2572 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2573 LONG $0x10c78348 // add rdi, 16 2574 LONG $0x20c68348 // add rsi, 32 2575 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2576 JG LBB53_5 2577 2578 LBB53_1: 2579 WORD $0xd285 // test edx, edx 2580 JLE LBB53_4 2581 WORD $0xc283; BYTE $0x01 // add edx, 1 2582 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2583 2584 LBB53_3: 2585 LONG $0x0704634a // movsxd rax, dword [rdi + r8] 2586 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2587 LONG $0x4604894a // mov qword [rsi + 2*r8], rax 2588 LONG $0x04c08349 // add r8, 4 2589 WORD $0xc283; BYTE $0xff // add edx, -1 2590 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2591 JG LBB53_3 2592 2593 LBB53_4: 2594 RET 2595 2596 TEXT ·_transpose_uint64_uint64_sse4(SB), $0-32 2597 2598 MOVQ src+0(FP), DI 2599 MOVQ dest+8(FP), SI 2600 MOVQ length+16(FP), DX 2601 MOVQ transposeMap+24(FP), CX 2602 2603 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2604 JL LBB54_1 2605 2606 LBB54_5: 2607 WORD $0xd089 // mov eax, edx 2608 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 2609 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2610 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2611 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 2612 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2613 LONG $0x08568948 // mov qword [rsi + 8], rdx 2614 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 2615 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2616 LONG $0x10568948 // mov qword [rsi + 16], rdx 2617 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 2618 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2619 LONG $0x18568948 // mov qword [rsi + 24], rdx 2620 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2621 LONG $0x20c78348 // add rdi, 32 2622 LONG $0x20c68348 // add rsi, 32 2623 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2624 JG LBB54_5 2625 2626 LBB54_1: 2627 WORD $0xd285 // test edx, edx 2628 JLE LBB54_4 2629 WORD $0xc283; BYTE $0x01 // add edx, 1 2630 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2631 2632 LBB54_3: 2633 LONG $0x07048b4a // mov rax, qword [rdi + r8] 2634 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2635 LONG $0x0604894a // mov qword [rsi + r8], rax 2636 LONG $0x08c08349 // add r8, 8 2637 WORD $0xc283; BYTE $0xff // add edx, -1 2638 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2639 JG LBB54_3 2640 2641 LBB54_4: 2642 RET 2643 2644 TEXT ·_transpose_int64_uint64_sse4(SB), $0-32 2645 2646 MOVQ src+0(FP), DI 2647 MOVQ dest+8(FP), SI 2648 MOVQ length+16(FP), DX 2649 MOVQ transposeMap+24(FP), CX 2650 2651 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2652 JL LBB55_1 2653 2654 LBB55_5: 2655 WORD $0xd089 // mov eax, edx 2656 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 2657 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2658 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2659 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 2660 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2661 LONG $0x08568948 // mov qword [rsi + 8], rdx 2662 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 2663 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2664 LONG $0x10568948 // mov qword [rsi + 16], rdx 2665 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 2666 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2667 LONG $0x18568948 // mov qword [rsi + 24], rdx 2668 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2669 LONG $0x20c78348 // add rdi, 32 2670 LONG $0x20c68348 // add rsi, 32 2671 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2672 JG LBB55_5 2673 2674 LBB55_1: 2675 WORD $0xd285 // test edx, edx 2676 JLE LBB55_4 2677 WORD $0xc283; BYTE $0x01 // add edx, 1 2678 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2679 2680 LBB55_3: 2681 LONG $0x07048b4a // mov rax, qword [rdi + r8] 2682 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2683 LONG $0x0604894a // mov qword [rsi + r8], rax 2684 LONG $0x08c08349 // add r8, 8 2685 WORD $0xc283; BYTE $0xff // add edx, -1 2686 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2687 JG LBB55_3 2688 2689 LBB55_4: 2690 RET 2691 2692 TEXT ·_transpose_uint8_int64_sse4(SB), $0-32 2693 2694 MOVQ src+0(FP), DI 2695 MOVQ dest+8(FP), SI 2696 MOVQ length+16(FP), DX 2697 MOVQ transposeMap+24(FP), CX 2698 2699 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2700 JL LBB56_1 2701 2702 LBB56_5: 2703 WORD $0xd089 // mov eax, edx 2704 WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] 2705 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2706 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2707 LONG $0x0157b60f // movzx edx, byte [rdi + 1] 2708 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2709 LONG $0x08568948 // mov qword [rsi + 8], rdx 2710 LONG $0x0257b60f // movzx edx, byte [rdi + 2] 2711 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2712 LONG $0x10568948 // mov qword [rsi + 16], rdx 2713 LONG $0x0357b60f // movzx edx, byte [rdi + 3] 2714 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2715 LONG $0x18568948 // mov qword [rsi + 24], rdx 2716 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2717 LONG $0x04c78348 // add rdi, 4 2718 LONG $0x20c68348 // add rsi, 32 2719 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2720 JG LBB56_5 2721 2722 LBB56_1: 2723 WORD $0xd285 // test edx, edx 2724 JLE LBB56_4 2725 WORD $0xc283; BYTE $0x01 // add edx, 1 2726 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2727 2728 LBB56_3: 2729 LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] 2730 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2731 LONG $0xc604894a // mov qword [rsi + 8*r8], rax 2732 LONG $0x01c08349 // add r8, 1 2733 WORD $0xc283; BYTE $0xff // add edx, -1 2734 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2735 JG LBB56_3 2736 2737 LBB56_4: 2738 RET 2739 2740 TEXT ·_transpose_int8_int64_sse4(SB), $0-32 2741 2742 MOVQ src+0(FP), DI 2743 MOVQ dest+8(FP), SI 2744 MOVQ length+16(FP), DX 2745 MOVQ transposeMap+24(FP), CX 2746 2747 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2748 JL LBB57_1 2749 2750 LBB57_5: 2751 WORD $0xd089 // mov eax, edx 2752 LONG $0x17be0f48 // movsx rdx, byte [rdi] 2753 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2754 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2755 LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] 2756 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2757 LONG $0x08568948 // mov qword [rsi + 8], rdx 2758 LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] 2759 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2760 LONG $0x10568948 // mov qword [rsi + 16], rdx 2761 LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] 2762 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2763 LONG $0x18568948 // mov qword [rsi + 24], rdx 2764 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2765 LONG $0x04c78348 // add rdi, 4 2766 LONG $0x20c68348 // add rsi, 32 2767 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2768 JG LBB57_5 2769 2770 LBB57_1: 2771 WORD $0xd285 // test edx, edx 2772 JLE LBB57_4 2773 WORD $0xc283; BYTE $0x01 // add edx, 1 2774 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2775 2776 LBB57_3: 2777 LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] 2778 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2779 LONG $0xc604894a // mov qword [rsi + 8*r8], rax 2780 LONG $0x01c08349 // add r8, 1 2781 WORD $0xc283; BYTE $0xff // add edx, -1 2782 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2783 JG LBB57_3 2784 2785 LBB57_4: 2786 RET 2787 2788 TEXT ·_transpose_uint16_int64_sse4(SB), $0-32 2789 2790 MOVQ src+0(FP), DI 2791 MOVQ dest+8(FP), SI 2792 MOVQ length+16(FP), DX 2793 MOVQ transposeMap+24(FP), CX 2794 2795 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2796 JL LBB58_1 2797 2798 LBB58_5: 2799 WORD $0xd089 // mov eax, edx 2800 WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] 2801 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2802 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2803 LONG $0x0257b70f // movzx edx, word [rdi + 2] 2804 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2805 LONG $0x08568948 // mov qword [rsi + 8], rdx 2806 LONG $0x0457b70f // movzx edx, word [rdi + 4] 2807 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2808 LONG $0x10568948 // mov qword [rsi + 16], rdx 2809 LONG $0x0657b70f // movzx edx, word [rdi + 6] 2810 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2811 LONG $0x18568948 // mov qword [rsi + 24], rdx 2812 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2813 LONG $0x08c78348 // add rdi, 8 2814 LONG $0x20c68348 // add rsi, 32 2815 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2816 JG LBB58_5 2817 2818 LBB58_1: 2819 WORD $0xd285 // test edx, edx 2820 JLE LBB58_4 2821 WORD $0xc283; BYTE $0x01 // add edx, 1 2822 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2823 2824 LBB58_3: 2825 LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] 2826 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2827 LONG $0x8604894a // mov qword [rsi + 4*r8], rax 2828 LONG $0x02c08349 // add r8, 2 2829 WORD $0xc283; BYTE $0xff // add edx, -1 2830 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2831 JG LBB58_3 2832 2833 LBB58_4: 2834 RET 2835 2836 TEXT ·_transpose_int16_int64_sse4(SB), $0-32 2837 2838 MOVQ src+0(FP), DI 2839 MOVQ dest+8(FP), SI 2840 MOVQ length+16(FP), DX 2841 MOVQ transposeMap+24(FP), CX 2842 2843 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2844 JL LBB59_1 2845 2846 LBB59_5: 2847 WORD $0xd089 // mov eax, edx 2848 LONG $0x17bf0f48 // movsx rdx, word [rdi] 2849 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2850 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2851 LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] 2852 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2853 LONG $0x08568948 // mov qword [rsi + 8], rdx 2854 LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] 2855 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2856 LONG $0x10568948 // mov qword [rsi + 16], rdx 2857 LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] 2858 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2859 LONG $0x18568948 // mov qword [rsi + 24], rdx 2860 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2861 LONG $0x08c78348 // add rdi, 8 2862 LONG $0x20c68348 // add rsi, 32 2863 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2864 JG LBB59_5 2865 2866 LBB59_1: 2867 WORD $0xd285 // test edx, edx 2868 JLE LBB59_4 2869 WORD $0xc283; BYTE $0x01 // add edx, 1 2870 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2871 2872 LBB59_3: 2873 LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] 2874 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2875 LONG $0x8604894a // mov qword [rsi + 4*r8], rax 2876 LONG $0x02c08349 // add r8, 2 2877 WORD $0xc283; BYTE $0xff // add edx, -1 2878 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2879 JG LBB59_3 2880 2881 LBB59_4: 2882 RET 2883 2884 TEXT ·_transpose_uint32_int64_sse4(SB), $0-32 2885 2886 MOVQ src+0(FP), DI 2887 MOVQ dest+8(FP), SI 2888 MOVQ length+16(FP), DX 2889 MOVQ transposeMap+24(FP), CX 2890 2891 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2892 JL LBB60_1 2893 2894 LBB60_5: 2895 WORD $0xd089 // mov eax, edx 2896 WORD $0x178b // mov edx, dword [rdi] 2897 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2898 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2899 WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] 2900 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2901 LONG $0x08568948 // mov qword [rsi + 8], rdx 2902 WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] 2903 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2904 LONG $0x10568948 // mov qword [rsi + 16], rdx 2905 WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] 2906 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2907 LONG $0x18568948 // mov qword [rsi + 24], rdx 2908 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2909 LONG $0x10c78348 // add rdi, 16 2910 LONG $0x20c68348 // add rsi, 32 2911 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2912 JG LBB60_5 2913 2914 LBB60_1: 2915 WORD $0xd285 // test edx, edx 2916 JLE LBB60_4 2917 WORD $0xc283; BYTE $0x01 // add edx, 1 2918 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2919 2920 LBB60_3: 2921 LONG $0x07048b42 // mov eax, dword [rdi + r8] 2922 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2923 LONG $0x4604894a // mov qword [rsi + 2*r8], rax 2924 LONG $0x04c08349 // add r8, 4 2925 WORD $0xc283; BYTE $0xff // add edx, -1 2926 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2927 JG LBB60_3 2928 2929 LBB60_4: 2930 RET 2931 2932 TEXT ·_transpose_int32_int64_sse4(SB), $0-32 2933 2934 MOVQ src+0(FP), DI 2935 MOVQ dest+8(FP), SI 2936 MOVQ length+16(FP), DX 2937 MOVQ transposeMap+24(FP), CX 2938 2939 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2940 JL LBB61_1 2941 2942 LBB61_5: 2943 WORD $0xd089 // mov eax, edx 2944 WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] 2945 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2946 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2947 LONG $0x04576348 // movsxd rdx, dword [rdi + 4] 2948 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2949 LONG $0x08568948 // mov qword [rsi + 8], rdx 2950 LONG $0x08576348 // movsxd rdx, dword [rdi + 8] 2951 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2952 LONG $0x10568948 // mov qword [rsi + 16], rdx 2953 LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] 2954 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2955 LONG $0x18568948 // mov qword [rsi + 24], rdx 2956 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 2957 LONG $0x10c78348 // add rdi, 16 2958 LONG $0x20c68348 // add rsi, 32 2959 WORD $0xf883; BYTE $0x07 // cmp eax, 7 2960 JG LBB61_5 2961 2962 LBB61_1: 2963 WORD $0xd285 // test edx, edx 2964 JLE LBB61_4 2965 WORD $0xc283; BYTE $0x01 // add edx, 1 2966 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 2967 2968 LBB61_3: 2969 LONG $0x0704634a // movsxd rax, dword [rdi + r8] 2970 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 2971 LONG $0x4604894a // mov qword [rsi + 2*r8], rax 2972 LONG $0x04c08349 // add r8, 4 2973 WORD $0xc283; BYTE $0xff // add edx, -1 2974 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 2975 JG LBB61_3 2976 2977 LBB61_4: 2978 RET 2979 2980 TEXT ·_transpose_uint64_int64_sse4(SB), $0-32 2981 2982 MOVQ src+0(FP), DI 2983 MOVQ dest+8(FP), SI 2984 MOVQ length+16(FP), DX 2985 MOVQ transposeMap+24(FP), CX 2986 2987 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 2988 JL LBB62_1 2989 2990 LBB62_5: 2991 WORD $0xd089 // mov eax, edx 2992 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 2993 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2994 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 2995 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 2996 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 2997 LONG $0x08568948 // mov qword [rsi + 8], rdx 2998 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 2999 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 3000 LONG $0x10568948 // mov qword [rsi + 16], rdx 3001 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 3002 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 3003 LONG $0x18568948 // mov qword [rsi + 24], rdx 3004 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 3005 LONG $0x20c78348 // add rdi, 32 3006 LONG $0x20c68348 // add rsi, 32 3007 WORD $0xf883; BYTE $0x07 // cmp eax, 7 3008 JG LBB62_5 3009 3010 LBB62_1: 3011 WORD $0xd285 // test edx, edx 3012 JLE LBB62_4 3013 WORD $0xc283; BYTE $0x01 // add edx, 1 3014 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 3015 3016 LBB62_3: 3017 LONG $0x07048b4a // mov rax, qword [rdi + r8] 3018 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 3019 LONG $0x0604894a // mov qword [rsi + r8], rax 3020 LONG $0x08c08349 // add r8, 8 3021 WORD $0xc283; BYTE $0xff // add edx, -1 3022 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 3023 JG LBB62_3 3024 3025 LBB62_4: 3026 RET 3027 3028 TEXT ·_transpose_int64_int64_sse4(SB), $0-32 3029 3030 MOVQ src+0(FP), DI 3031 MOVQ dest+8(FP), SI 3032 MOVQ length+16(FP), DX 3033 MOVQ transposeMap+24(FP), CX 3034 3035 WORD $0xfa83; BYTE $0x04 // cmp edx, 4 3036 JL LBB63_1 3037 3038 LBB63_5: 3039 WORD $0xd089 // mov eax, edx 3040 WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] 3041 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 3042 WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 3043 LONG $0x08578b48 // mov rdx, qword [rdi + 8] 3044 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 3045 LONG $0x08568948 // mov qword [rsi + 8], rdx 3046 LONG $0x10578b48 // mov rdx, qword [rdi + 16] 3047 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 3048 LONG $0x10568948 // mov qword [rsi + 16], rdx 3049 LONG $0x18578b48 // mov rdx, qword [rdi + 24] 3050 LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] 3051 LONG $0x18568948 // mov qword [rsi + 24], rdx 3052 WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] 3053 LONG $0x20c78348 // add rdi, 32 3054 LONG $0x20c68348 // add rsi, 32 3055 WORD $0xf883; BYTE $0x07 // cmp eax, 7 3056 JG LBB63_5 3057 3058 LBB63_1: 3059 WORD $0xd285 // test edx, edx 3060 JLE LBB63_4 3061 WORD $0xc283; BYTE $0x01 // add edx, 1 3062 WORD $0x3145; BYTE $0xc0 // xor r8d, r8d 3063 3064 LBB63_3: 3065 LONG $0x07048b4a // mov rax, qword [rdi + r8] 3066 LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] 3067 LONG $0x0604894a // mov qword [rsi + r8], rax 3068 LONG $0x08c08349 // add r8, 8 3069 WORD $0xc283; BYTE $0xff // add edx, -1 3070 WORD $0xfa83; BYTE $0x01 // cmp edx, 1 3071 JG LBB63_3 3072 3073 LBB63_4: 3074 RET