# Listing path (taken from the page header):
#   github.com/apache/arrow/go/v14@v14.0.2/internal/utils/_lib/transpose_ints_sse4_amd64.s
#
# Compiler-generated x86-64 assembly, GNU as with Intel operand order.  The
# `.file` directive below names transpose_ints.c as the C source.
#
# All routines reviewed here are instances of one template:
#
#     for each element i:  dest[i] = (narrowed) table[src[i]]
#
# Register roles, as used by the code (they coincide with the first four
# integer argument registers of the System V AMD64 convention):
#     rdi   pointer to the index (source) elements
#     rsi   pointer to the output (destination) elements
#     edx   element count, tested as a signed 32-bit value; <= 0 stores nothing
#     rcx   base of the lookup table; entries are addressed 4 bytes apart
#
# Index loads are zero-extended for unsigned source types and sign-extended
# (movsx / movsxd) for signed ones; 64-bit indices are used as loaded.
# Only the lowest-addressed 1 or 2 bytes of each table entry are read and
# stored, depending on the destination width.  No index is range-checked.
#
# Shape of every routine: a 4x unrolled loop while at least 4 elements
# remain, followed by a one-element-per-pass tail loop.
# Modified: rax, rdx, rdi, rsi, r8, flags.  rbp is pushed and restored.
# Note: none of the routines shown uses a vector instruction, despite the
# _sse4 suffix in their names.

        .text
        .intel_syntax noprefix
        .file   "transpose_ints.c"

# uint8 indices -> 8-bit results (zero-extended byte index loads).
        .globl  transpose_uint8_uint8_sse4      # -- Begin function transpose_uint8_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_uint8_uint8_sse4,@function
transpose_uint8_uint8_sse4:             # @transpose_uint8_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB0_1                 # fewer than 4 elements: tail loop only
        .p2align 4, 0x90
.LBB0_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx                # eax = count on entry to this pass
        movzx   edx, byte ptr [rdi]     # edx is reused as the index scratch
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movzx   edx, byte ptr [rdi + 1]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movzx   edx, byte ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movzx   edx, byte ptr [rdi + 3]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]          # count -= 4 (flags untouched)
        add     rdi, 4
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB0_5                 # old count > 7  <=>  new count >= 4
.LBB0_1:
        test    edx, edx
        jle     .LBB0_4                 # nothing left to do
# %bb.2:
        add     edx, 1                  # bias by one so the loop test is `> 1`
        xor     r8d, r8d                # r8 = running offset
        .p2align 4, 0x90
.LBB0_3:                                # tail loop, one element per pass
        movzx   eax, byte ptr [rdi + r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB0_3
.LBB0_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end0:
        .size   transpose_uint8_uint8_sse4, .Lfunc_end0-transpose_uint8_uint8_sse4
                                        # -- End function

# int8 indices -> 8-bit results (sign-extended byte index loads).
        .globl  transpose_int8_uint8_sse4       # -- Begin function transpose_int8_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_int8_uint8_sse4,@function
transpose_int8_uint8_sse4:              # @transpose_int8_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB1_1
        .p2align 4, 0x90
.LBB1_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsx   rdx, byte ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movsx   rdx, byte ptr [rdi + 1]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movsx   rdx, byte ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movsx   rdx, byte ptr [rdi + 3]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB1_5
.LBB1_1:
        test    edx, edx
        jle     .LBB1_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB1_3:                                # tail loop, one element per pass
        movsx   rax, byte ptr [rdi + r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB1_3
.LBB1_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end1:
        .size   transpose_int8_uint8_sse4, .Lfunc_end1-transpose_int8_uint8_sse4
                                        # -- End function

# uint16 indices -> 8-bit results (zero-extended word index loads).
        .globl  transpose_uint16_uint8_sse4     # -- Begin function transpose_uint16_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_uint16_uint8_sse4,@function
transpose_uint16_uint8_sse4:            # @transpose_uint16_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB2_1
        .p2align 4, 0x90
.LBB2_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        movzx   edx, word ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movzx   edx, word ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movzx   edx, word ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movzx   edx, word ptr [rdi + 6]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 8
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB2_5
.LBB2_1:
        test    edx, edx
        jle     .LBB2_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB2_3:                                # tail loop, one element per pass
        movzx   eax, word ptr [rdi + 2*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB2_3
.LBB2_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end2:
        .size   transpose_uint16_uint8_sse4, .Lfunc_end2-transpose_uint16_uint8_sse4
                                        # -- End function

# int16 indices -> 8-bit results (sign-extended word index loads).
        .globl  transpose_int16_uint8_sse4      # -- Begin function transpose_int16_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_int16_uint8_sse4,@function
transpose_int16_uint8_sse4:             # @transpose_int16_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB3_1
        .p2align 4, 0x90
.LBB3_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsx   rdx, word ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movsx   rdx, word ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movsx   rdx, word ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movsx   rdx, word ptr [rdi + 6]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 8
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB3_5
.LBB3_1:
        test    edx, edx
        jle     .LBB3_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB3_3:                                # tail loop, one element per pass
        movsx   rax, word ptr [rdi + 2*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB3_3
.LBB3_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end3:
        .size   transpose_int16_uint8_sse4, .Lfunc_end3-transpose_int16_uint8_sse4
                                        # -- End function

# uint32 indices -> 8-bit results (32-bit loads, implicitly zero-extended).
        .globl  transpose_uint32_uint8_sse4     # -- Begin function transpose_uint32_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_uint32_uint8_sse4,@function
transpose_uint32_uint8_sse4:            # @transpose_uint32_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB4_1
        .p2align 4, 0x90
.LBB4_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     edx, dword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        mov     edx, dword ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        mov     edx, dword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        mov     edx, dword ptr [rdi + 12]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 16
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB4_5
.LBB4_1:
        test    edx, edx
        jle     .LBB4_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB4_3:                                # tail loop, one element per pass
        mov     eax, dword ptr [rdi + 4*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB4_3
.LBB4_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end4:
        .size   transpose_uint32_uint8_sse4, .Lfunc_end4-transpose_uint32_uint8_sse4
                                        # -- End function

# int32 indices -> 8-bit results (movsxd sign-extending index loads).
        .globl  transpose_int32_uint8_sse4      # -- Begin function transpose_int32_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_int32_uint8_sse4,@function
transpose_int32_uint8_sse4:             # @transpose_int32_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB5_1
        .p2align 4, 0x90
.LBB5_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsxd  rdx, dword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movsxd  rdx, dword ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movsxd  rdx, dword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movsxd  rdx, dword ptr [rdi + 12]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 16
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB5_5
.LBB5_1:
        test    edx, edx
        jle     .LBB5_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB5_3:                                # tail loop, one element per pass
        movsxd  rax, dword ptr [rdi + 4*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB5_3
.LBB5_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end5:
        .size   transpose_int32_uint8_sse4, .Lfunc_end5-transpose_int32_uint8_sse4
                                        # -- End function

# uint64 indices -> 8-bit results (full 64-bit index loads).
        .globl  transpose_uint64_uint8_sse4     # -- Begin function transpose_uint64_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_uint64_uint8_sse4,@function
transpose_uint64_uint8_sse4:            # @transpose_uint64_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB6_1
        .p2align 4, 0x90
.LBB6_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     rdx, qword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        mov     rdx, qword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        mov     rdx, qword ptr [rdi + 16]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        mov     rdx, qword ptr [rdi + 24]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 32
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB6_5
.LBB6_1:
        test    edx, edx
        jle     .LBB6_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB6_3:                                # tail loop, one element per pass
        mov     rax, qword ptr [rdi + 8*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB6_3
.LBB6_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end6:
        .size   transpose_uint64_uint8_sse4, .Lfunc_end6-transpose_uint64_uint8_sse4
                                        # -- End function

# int64 indices -> 8-bit results (same instruction sequence as the uint64 variant).
        .globl  transpose_int64_uint8_sse4      # -- Begin function transpose_int64_uint8_sse4
        .p2align 4, 0x90
        .type   transpose_int64_uint8_sse4,@function
transpose_int64_uint8_sse4:             # @transpose_int64_uint8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB7_1
        .p2align 4, 0x90
.LBB7_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     rdx, qword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        mov     rdx, qword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        mov     rdx, qword ptr [rdi + 16]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        mov     rdx, qword ptr [rdi + 24]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 32
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB7_5
.LBB7_1:
        test    edx, edx
        jle     .LBB7_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB7_3:                                # tail loop, one element per pass
        mov     rax, qword ptr [rdi + 8*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB7_3
.LBB7_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end7:
        .size   transpose_int64_uint8_sse4, .Lfunc_end7-transpose_int64_uint8_sse4
                                        # -- End function

# uint8 indices -> 8-bit results; instruction sequence matches the *_uint8 destination variant.
        .globl  transpose_uint8_int8_sse4       # -- Begin function transpose_uint8_int8_sse4
        .p2align 4, 0x90
        .type   transpose_uint8_int8_sse4,@function
transpose_uint8_int8_sse4:              # @transpose_uint8_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB8_1
        .p2align 4, 0x90
.LBB8_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        movzx   edx, byte ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movzx   edx, byte ptr [rdi + 1]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movzx   edx, byte ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movzx   edx, byte ptr [rdi + 3]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB8_5
.LBB8_1:
        test    edx, edx
        jle     .LBB8_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB8_3:                                # tail loop, one element per pass
        movzx   eax, byte ptr [rdi + r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB8_3
.LBB8_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end8:
        .size   transpose_uint8_int8_sse4, .Lfunc_end8-transpose_uint8_int8_sse4
                                        # -- End function

# int8 indices -> 8-bit results (sign-extended byte index loads).
        .globl  transpose_int8_int8_sse4        # -- Begin function transpose_int8_int8_sse4
        .p2align 4, 0x90
        .type   transpose_int8_int8_sse4,@function
transpose_int8_int8_sse4:               # @transpose_int8_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB9_1
        .p2align 4, 0x90
.LBB9_5:                                # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsx   rdx, byte ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movsx   rdx, byte ptr [rdi + 1]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movsx   rdx, byte ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movsx   rdx, byte ptr [rdi + 3]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB9_5
.LBB9_1:
        test    edx, edx
        jle     .LBB9_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB9_3:                                # tail loop, one element per pass
        movsx   rax, byte ptr [rdi + r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB9_3
.LBB9_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end9:
        .size   transpose_int8_int8_sse4, .Lfunc_end9-transpose_int8_int8_sse4
                                        # -- End function

# uint16 indices -> 8-bit results (zero-extended word index loads).
        .globl  transpose_uint16_int8_sse4      # -- Begin function transpose_uint16_int8_sse4
        .p2align 4, 0x90
        .type   transpose_uint16_int8_sse4,@function
transpose_uint16_int8_sse4:             # @transpose_uint16_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB10_1
        .p2align 4, 0x90
.LBB10_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movzx   edx, word ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movzx   edx, word ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movzx   edx, word ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movzx   edx, word ptr [rdi + 6]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 8
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB10_5
.LBB10_1:
        test    edx, edx
        jle     .LBB10_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB10_3:                               # tail loop, one element per pass
        movzx   eax, word ptr [rdi + 2*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB10_3
.LBB10_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end10:
        .size   transpose_uint16_int8_sse4, .Lfunc_end10-transpose_uint16_int8_sse4
                                        # -- End function

# int16 indices -> 8-bit results (sign-extended word index loads).
        .globl  transpose_int16_int8_sse4       # -- Begin function transpose_int16_int8_sse4
        .p2align 4, 0x90
        .type   transpose_int16_int8_sse4,@function
transpose_int16_int8_sse4:              # @transpose_int16_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB11_1
        .p2align 4, 0x90
.LBB11_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsx   rdx, word ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movsx   rdx, word ptr [rdi + 2]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movsx   rdx, word ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movsx   rdx, word ptr [rdi + 6]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 8
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB11_5
.LBB11_1:
        test    edx, edx
        jle     .LBB11_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB11_3:                               # tail loop, one element per pass
        movsx   rax, word ptr [rdi + 2*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB11_3
.LBB11_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end11:
        .size   transpose_int16_int8_sse4, .Lfunc_end11-transpose_int16_int8_sse4
                                        # -- End function

# uint32 indices -> 8-bit results (32-bit loads, implicitly zero-extended).
        .globl  transpose_uint32_int8_sse4      # -- Begin function transpose_uint32_int8_sse4
        .p2align 4, 0x90
        .type   transpose_uint32_int8_sse4,@function
transpose_uint32_int8_sse4:             # @transpose_uint32_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB12_1
        .p2align 4, 0x90
.LBB12_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     edx, dword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        mov     edx, dword ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        mov     edx, dword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        mov     edx, dword ptr [rdi + 12]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 16
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB12_5
.LBB12_1:
        test    edx, edx
        jle     .LBB12_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB12_3:                               # tail loop, one element per pass
        mov     eax, dword ptr [rdi + 4*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB12_3
.LBB12_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end12:
        .size   transpose_uint32_int8_sse4, .Lfunc_end12-transpose_uint32_int8_sse4
                                        # -- End function

# int32 indices -> 8-bit results (movsxd sign-extending index loads).
        .globl  transpose_int32_int8_sse4       # -- Begin function transpose_int32_int8_sse4
        .p2align 4, 0x90
        .type   transpose_int32_int8_sse4,@function
transpose_int32_int8_sse4:              # @transpose_int32_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB13_1
        .p2align 4, 0x90
.LBB13_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsxd  rdx, dword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        movsxd  rdx, dword ptr [rdi + 4]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        movsxd  rdx, dword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        movsxd  rdx, dword ptr [rdi + 12]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 16
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB13_5
.LBB13_1:
        test    edx, edx
        jle     .LBB13_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB13_3:                               # tail loop, one element per pass
        movsxd  rax, dword ptr [rdi + 4*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB13_3
.LBB13_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end13:
        .size   transpose_int32_int8_sse4, .Lfunc_end13-transpose_int32_int8_sse4
                                        # -- End function

# uint64 indices -> 8-bit results (full 64-bit index loads).
        .globl  transpose_uint64_int8_sse4      # -- Begin function transpose_uint64_int8_sse4
        .p2align 4, 0x90
        .type   transpose_uint64_int8_sse4,@function
transpose_uint64_int8_sse4:             # @transpose_uint64_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB14_1
        .p2align 4, 0x90
.LBB14_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     rdx, qword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        mov     rdx, qword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        mov     rdx, qword ptr [rdi + 16]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        mov     rdx, qword ptr [rdi + 24]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 32
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB14_5
.LBB14_1:
        test    edx, edx
        jle     .LBB14_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB14_3:                               # tail loop, one element per pass
        mov     rax, qword ptr [rdi + 8*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB14_3
.LBB14_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end14:
        .size   transpose_uint64_int8_sse4, .Lfunc_end14-transpose_uint64_int8_sse4
                                        # -- End function

# int64 indices -> 8-bit results (same instruction sequence as the uint64 variant).
        .globl  transpose_int64_int8_sse4       # -- Begin function transpose_int64_int8_sse4
        .p2align 4, 0x90
        .type   transpose_int64_int8_sse4,@function
transpose_int64_int8_sse4:              # @transpose_int64_int8_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB15_1
        .p2align 4, 0x90
.LBB15_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     rdx, qword ptr [rdi]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi], dl
        mov     rdx, qword ptr [rdi + 8]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 1], dl
        mov     rdx, qword ptr [rdi + 16]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 2], dl
        mov     rdx, qword ptr [rdi + 24]
        movzx   edx, byte ptr [rcx + 4*rdx]
        mov     byte ptr [rsi + 3], dl
        lea     edx, [rax - 4]
        add     rdi, 32
        add     rsi, 4
        cmp     eax, 7
        jg      .LBB15_5
.LBB15_1:
        test    edx, edx
        jle     .LBB15_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB15_3:                               # tail loop, one element per pass
        mov     rax, qword ptr [rdi + 8*r8]
        movzx   eax, byte ptr [rcx + 4*rax]
        mov     byte ptr [rsi + r8], al
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB15_3
.LBB15_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end15:
        .size   transpose_int64_int8_sse4, .Lfunc_end15-transpose_int64_int8_sse4
                                        # -- End function

# uint8 indices -> 16-bit results (word loads from the table, word stores).
        .globl  transpose_uint8_uint16_sse4     # -- Begin function transpose_uint8_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_uint8_uint16_sse4,@function
transpose_uint8_uint16_sse4:            # @transpose_uint8_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB16_1
        .p2align 4, 0x90
.LBB16_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movzx   edx, byte ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movzx   edx, byte ptr [rdi + 1]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movzx   edx, byte ptr [rdi + 2]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movzx   edx, byte ptr [rdi + 3]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB16_5
.LBB16_1:
        test    edx, edx
        jle     .LBB16_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB16_3:                               # tail loop; r8 counts elements
        movzx   eax, byte ptr [rdi + r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + 2*r8], ax
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB16_3
.LBB16_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end16:
        .size   transpose_uint8_uint16_sse4, .Lfunc_end16-transpose_uint8_uint16_sse4
                                        # -- End function

# int8 indices -> 16-bit results (sign-extended byte index loads).
        .globl  transpose_int8_uint16_sse4      # -- Begin function transpose_int8_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_int8_uint16_sse4,@function
transpose_int8_uint16_sse4:             # @transpose_int8_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB17_1
        .p2align 4, 0x90
.LBB17_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsx   rdx, byte ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movsx   rdx, byte ptr [rdi + 1]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movsx   rdx, byte ptr [rdi + 2]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movsx   rdx, byte ptr [rdi + 3]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB17_5
.LBB17_1:
        test    edx, edx
        jle     .LBB17_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB17_3:                               # tail loop; r8 counts elements
        movsx   rax, byte ptr [rdi + r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + 2*r8], ax
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB17_3
.LBB17_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end17:
        .size   transpose_int8_uint16_sse4, .Lfunc_end17-transpose_int8_uint16_sse4
                                        # -- End function

# uint16 indices -> 16-bit results (zero-extended word index loads).
        .globl  transpose_uint16_uint16_sse4    # -- Begin function transpose_uint16_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_uint16_uint16_sse4,@function
transpose_uint16_uint16_sse4:           # @transpose_uint16_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB18_1
        .p2align 4, 0x90
.LBB18_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movzx   edx, word ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movzx   edx, word ptr [rdi + 2]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movzx   edx, word ptr [rdi + 4]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movzx   edx, word ptr [rdi + 6]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 8
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB18_5
.LBB18_1:
        test    edx, edx
        jle     .LBB18_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB18_3:                               # tail loop; r8 is a byte offset (step 2)
        movzx   eax, word ptr [rdi + r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + r8], ax
        add     r8, 2
        add     edx, -1
        cmp     edx, 1
        jg      .LBB18_3
.LBB18_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end18:
        .size   transpose_uint16_uint16_sse4, .Lfunc_end18-transpose_uint16_uint16_sse4
                                        # -- End function

# int16 indices -> 16-bit results (sign-extended word index loads).
        .globl  transpose_int16_uint16_sse4     # -- Begin function transpose_int16_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_int16_uint16_sse4,@function
transpose_int16_uint16_sse4:            # @transpose_int16_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB19_1
        .p2align 4, 0x90
.LBB19_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsx   rdx, word ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movsx   rdx, word ptr [rdi + 2]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movsx   rdx, word ptr [rdi + 4]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movsx   rdx, word ptr [rdi + 6]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 8
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB19_5
.LBB19_1:
        test    edx, edx
        jle     .LBB19_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB19_3:                               # tail loop; r8 is a byte offset (step 2)
        movsx   rax, word ptr [rdi + r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + r8], ax
        add     r8, 2
        add     edx, -1
        cmp     edx, 1
        jg      .LBB19_3
.LBB19_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end19:
        .size   transpose_int16_uint16_sse4, .Lfunc_end19-transpose_int16_uint16_sse4
                                        # -- End function

# uint32 indices -> 16-bit results (32-bit loads, implicitly zero-extended).
        .globl  transpose_uint32_uint16_sse4    # -- Begin function transpose_uint32_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_uint32_uint16_sse4,@function
transpose_uint32_uint16_sse4:           # @transpose_uint32_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB20_1
        .p2align 4, 0x90
.LBB20_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     edx, dword ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        mov     edx, dword ptr [rdi + 4]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        mov     edx, dword ptr [rdi + 8]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        mov     edx, dword ptr [rdi + 12]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 16
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB20_5
.LBB20_1:
        test    edx, edx
        jle     .LBB20_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB20_3:                               # tail loop; r8 = dest byte offset, src uses 2*r8
        mov     eax, dword ptr [rdi + 2*r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + r8], ax
        add     r8, 2
        add     edx, -1
        cmp     edx, 1
        jg      .LBB20_3
.LBB20_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end20:
        .size   transpose_uint32_uint16_sse4, .Lfunc_end20-transpose_uint32_uint16_sse4
                                        # -- End function

# int32 indices -> 16-bit results (movsxd sign-extending index loads).
        .globl  transpose_int32_uint16_sse4     # -- Begin function transpose_int32_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_int32_uint16_sse4,@function
transpose_int32_uint16_sse4:            # @transpose_int32_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB21_1
        .p2align 4, 0x90
.LBB21_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movsxd  rdx, dword ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movsxd  rdx, dword ptr [rdi + 4]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movsxd  rdx, dword ptr [rdi + 8]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movsxd  rdx, dword ptr [rdi + 12]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 16
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB21_5
.LBB21_1:
        test    edx, edx
        jle     .LBB21_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB21_3:                               # tail loop; r8 = dest byte offset, src uses 2*r8
        movsxd  rax, dword ptr [rdi + 2*r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + r8], ax
        add     r8, 2
        add     edx, -1
        cmp     edx, 1
        jg      .LBB21_3
.LBB21_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end21:
        .size   transpose_int32_uint16_sse4, .Lfunc_end21-transpose_int32_uint16_sse4
                                        # -- End function

# uint64 indices -> 16-bit results (full 64-bit index loads).
        .globl  transpose_uint64_uint16_sse4    # -- Begin function transpose_uint64_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_uint64_uint16_sse4,@function
transpose_uint64_uint16_sse4:           # @transpose_uint64_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB22_1
        .p2align 4, 0x90
.LBB22_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     rdx, qword ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        mov     rdx, qword ptr [rdi + 8]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        mov     rdx, qword ptr [rdi + 16]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        mov     rdx, qword ptr [rdi + 24]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 32
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB22_5
.LBB22_1:
        test    edx, edx
        jle     .LBB22_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB22_3:                               # tail loop; r8 = dest byte offset, src uses 4*r8
        mov     rax, qword ptr [rdi + 4*r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + r8], ax
        add     r8, 2
        add     edx, -1
        cmp     edx, 1
        jg      .LBB22_3
.LBB22_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end22:
        .size   transpose_uint64_uint16_sse4, .Lfunc_end22-transpose_uint64_uint16_sse4
                                        # -- End function

# int64 indices -> 16-bit results (same instruction sequence as the uint64 variant).
        .globl  transpose_int64_uint16_sse4     # -- Begin function transpose_int64_uint16_sse4
        .p2align 4, 0x90
        .type   transpose_int64_uint16_sse4,@function
transpose_int64_uint16_sse4:            # @transpose_int64_uint16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB23_1
        .p2align 4, 0x90
.LBB23_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        mov     rdx, qword ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        mov     rdx, qword ptr [rdi + 8]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        mov     rdx, qword ptr [rdi + 16]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        mov     rdx, qword ptr [rdi + 24]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 32
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB23_5
.LBB23_1:
        test    edx, edx
        jle     .LBB23_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB23_3:                               # tail loop; r8 = dest byte offset, src uses 4*r8
        mov     rax, qword ptr [rdi + 4*r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + r8], ax
        add     r8, 2
        add     edx, -1
        cmp     edx, 1
        jg      .LBB23_3
.LBB23_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end23:
        .size   transpose_int64_uint16_sse4, .Lfunc_end23-transpose_int64_uint16_sse4
                                        # -- End function

# uint8 indices -> 16-bit results; instruction sequence matches the *_uint16 destination variant.
        .globl  transpose_uint8_int16_sse4      # -- Begin function transpose_uint8_int16_sse4
        .p2align 4, 0x90
        .type   transpose_uint8_int16_sse4,@function
transpose_uint8_int16_sse4:             # @transpose_uint8_int16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB24_1
        .p2align 4, 0x90
.LBB24_5:                               # unrolled loop, 4 elements per pass
        mov     eax, edx
        movzx   edx, byte ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movzx   edx, byte ptr [rdi + 1]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movzx   edx, byte ptr [rdi + 2]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movzx   edx, byte ptr [rdi + 3]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB24_5
.LBB24_1:
        test    edx, edx
        jle     .LBB24_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB24_3:                               # tail loop; r8 counts elements
        movzx   eax, byte ptr [rdi + r8]
        movzx   eax, word ptr [rcx + 4*rax]
        mov     word ptr [rsi + 2*r8], ax
        add     r8, 1
        add     edx, -1
        cmp     edx, 1
        jg      .LBB24_3
.LBB24_4:
        mov     rsp, rbp
        pop     rbp
        ret
.Lfunc_end24:
        .size   transpose_uint8_int16_sse4, .Lfunc_end24-transpose_uint8_int16_sse4
                                        # -- End function

        .globl  transpose_int8_int16_sse4       # -- Begin function transpose_int8_int16_sse4
        .p2align 4, 0x90
        .type   transpose_int8_int16_sse4,@function
transpose_int8_int16_sse4:              # @transpose_int8_int16_sse4
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        cmp     edx, 4
        jl      .LBB25_1
        .p2align 4, 0x90
.LBB25_5:                               # =>This Inner Loop Header: Depth=1
        mov     eax, edx
        movsx   rdx, byte ptr [rdi]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi], dx
        movsx   rdx, byte ptr [rdi + 1]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 2], dx
        movsx   rdx, byte ptr [rdi + 2]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 4], dx
        movsx   rdx, byte ptr [rdi + 3]
        movzx   edx, word ptr [rcx + 4*rdx]
        mov     word ptr [rsi + 6], dx
        lea     edx, [rax - 4]
        add     rdi, 4
        add     rsi, 8
        cmp     eax, 7
        jg      .LBB25_5
.LBB25_1:
        test    edx, edx
        jle     .LBB25_4
# %bb.2:
        add     edx, 1
        xor     r8d, r8d
        .p2align 4, 0x90
.LBB25_3:                               # =>This Inner
Loop Header: Depth=1 1342 movsx rax, byte ptr [rdi + r8] 1343 movzx eax, word ptr [rcx + 4*rax] 1344 mov word ptr [rsi + 2*r8], ax 1345 add r8, 1 1346 add edx, -1 1347 cmp edx, 1 1348 jg .LBB25_3 1349 .LBB25_4: 1350 mov rsp, rbp 1351 pop rbp 1352 ret 1353 .Lfunc_end25: 1354 .size transpose_int8_int16_sse4, .Lfunc_end25-transpose_int8_int16_sse4 1355 # -- End function 1356 .globl transpose_uint16_int16_sse4 # -- Begin function transpose_uint16_int16_sse4 1357 .p2align 4, 0x90 1358 .type transpose_uint16_int16_sse4,@function 1359 transpose_uint16_int16_sse4: # @transpose_uint16_int16_sse4 1360 # %bb.0: 1361 push rbp 1362 mov rbp, rsp 1363 and rsp, -8 1364 cmp edx, 4 1365 jl .LBB26_1 1366 .p2align 4, 0x90 1367 .LBB26_5: # =>This Inner Loop Header: Depth=1 1368 mov eax, edx 1369 movzx edx, word ptr [rdi] 1370 movzx edx, word ptr [rcx + 4*rdx] 1371 mov word ptr [rsi], dx 1372 movzx edx, word ptr [rdi + 2] 1373 movzx edx, word ptr [rcx + 4*rdx] 1374 mov word ptr [rsi + 2], dx 1375 movzx edx, word ptr [rdi + 4] 1376 movzx edx, word ptr [rcx + 4*rdx] 1377 mov word ptr [rsi + 4], dx 1378 movzx edx, word ptr [rdi + 6] 1379 movzx edx, word ptr [rcx + 4*rdx] 1380 mov word ptr [rsi + 6], dx 1381 lea edx, [rax - 4] 1382 add rdi, 8 1383 add rsi, 8 1384 cmp eax, 7 1385 jg .LBB26_5 1386 .LBB26_1: 1387 test edx, edx 1388 jle .LBB26_4 1389 # %bb.2: 1390 add edx, 1 1391 xor r8d, r8d 1392 .p2align 4, 0x90 1393 .LBB26_3: # =>This Inner Loop Header: Depth=1 1394 movzx eax, word ptr [rdi + r8] 1395 movzx eax, word ptr [rcx + 4*rax] 1396 mov word ptr [rsi + r8], ax 1397 add r8, 2 1398 add edx, -1 1399 cmp edx, 1 1400 jg .LBB26_3 1401 .LBB26_4: 1402 mov rsp, rbp 1403 pop rbp 1404 ret 1405 .Lfunc_end26: 1406 .size transpose_uint16_int16_sse4, .Lfunc_end26-transpose_uint16_int16_sse4 1407 # -- End function 1408 .globl transpose_int16_int16_sse4 # -- Begin function transpose_int16_int16_sse4 1409 .p2align 4, 0x90 1410 .type transpose_int16_int16_sse4,@function 1411 transpose_int16_int16_sse4: 
# @transpose_int16_int16_sse4 1412 # %bb.0: 1413 push rbp 1414 mov rbp, rsp 1415 and rsp, -8 1416 cmp edx, 4 1417 jl .LBB27_1 1418 .p2align 4, 0x90 1419 .LBB27_5: # =>This Inner Loop Header: Depth=1 1420 mov eax, edx 1421 movsx rdx, word ptr [rdi] 1422 movzx edx, word ptr [rcx + 4*rdx] 1423 mov word ptr [rsi], dx 1424 movsx rdx, word ptr [rdi + 2] 1425 movzx edx, word ptr [rcx + 4*rdx] 1426 mov word ptr [rsi + 2], dx 1427 movsx rdx, word ptr [rdi + 4] 1428 movzx edx, word ptr [rcx + 4*rdx] 1429 mov word ptr [rsi + 4], dx 1430 movsx rdx, word ptr [rdi + 6] 1431 movzx edx, word ptr [rcx + 4*rdx] 1432 mov word ptr [rsi + 6], dx 1433 lea edx, [rax - 4] 1434 add rdi, 8 1435 add rsi, 8 1436 cmp eax, 7 1437 jg .LBB27_5 1438 .LBB27_1: 1439 test edx, edx 1440 jle .LBB27_4 1441 # %bb.2: 1442 add edx, 1 1443 xor r8d, r8d 1444 .p2align 4, 0x90 1445 .LBB27_3: # =>This Inner Loop Header: Depth=1 1446 movsx rax, word ptr [rdi + r8] 1447 movzx eax, word ptr [rcx + 4*rax] 1448 mov word ptr [rsi + r8], ax 1449 add r8, 2 1450 add edx, -1 1451 cmp edx, 1 1452 jg .LBB27_3 1453 .LBB27_4: 1454 mov rsp, rbp 1455 pop rbp 1456 ret 1457 .Lfunc_end27: 1458 .size transpose_int16_int16_sse4, .Lfunc_end27-transpose_int16_int16_sse4 1459 # -- End function 1460 .globl transpose_uint32_int16_sse4 # -- Begin function transpose_uint32_int16_sse4 1461 .p2align 4, 0x90 1462 .type transpose_uint32_int16_sse4,@function 1463 transpose_uint32_int16_sse4: # @transpose_uint32_int16_sse4 1464 # %bb.0: 1465 push rbp 1466 mov rbp, rsp 1467 and rsp, -8 1468 cmp edx, 4 1469 jl .LBB28_1 1470 .p2align 4, 0x90 1471 .LBB28_5: # =>This Inner Loop Header: Depth=1 1472 mov eax, edx 1473 mov edx, dword ptr [rdi] 1474 movzx edx, word ptr [rcx + 4*rdx] 1475 mov word ptr [rsi], dx 1476 mov edx, dword ptr [rdi + 4] 1477 movzx edx, word ptr [rcx + 4*rdx] 1478 mov word ptr [rsi + 2], dx 1479 mov edx, dword ptr [rdi + 8] 1480 movzx edx, word ptr [rcx + 4*rdx] 1481 mov word ptr [rsi + 4], dx 1482 mov edx, dword ptr [rdi + 12] 
1483 movzx edx, word ptr [rcx + 4*rdx] 1484 mov word ptr [rsi + 6], dx 1485 lea edx, [rax - 4] 1486 add rdi, 16 1487 add rsi, 8 1488 cmp eax, 7 1489 jg .LBB28_5 1490 .LBB28_1: 1491 test edx, edx 1492 jle .LBB28_4 1493 # %bb.2: 1494 add edx, 1 1495 xor r8d, r8d 1496 .p2align 4, 0x90 1497 .LBB28_3: # =>This Inner Loop Header: Depth=1 1498 mov eax, dword ptr [rdi + 2*r8] 1499 movzx eax, word ptr [rcx + 4*rax] 1500 mov word ptr [rsi + r8], ax 1501 add r8, 2 1502 add edx, -1 1503 cmp edx, 1 1504 jg .LBB28_3 1505 .LBB28_4: 1506 mov rsp, rbp 1507 pop rbp 1508 ret 1509 .Lfunc_end28: 1510 .size transpose_uint32_int16_sse4, .Lfunc_end28-transpose_uint32_int16_sse4 1511 # -- End function 1512 .globl transpose_int32_int16_sse4 # -- Begin function transpose_int32_int16_sse4 1513 .p2align 4, 0x90 1514 .type transpose_int32_int16_sse4,@function 1515 transpose_int32_int16_sse4: # @transpose_int32_int16_sse4 1516 # %bb.0: 1517 push rbp 1518 mov rbp, rsp 1519 and rsp, -8 1520 cmp edx, 4 1521 jl .LBB29_1 1522 .p2align 4, 0x90 1523 .LBB29_5: # =>This Inner Loop Header: Depth=1 1524 mov eax, edx 1525 movsxd rdx, dword ptr [rdi] 1526 movzx edx, word ptr [rcx + 4*rdx] 1527 mov word ptr [rsi], dx 1528 movsxd rdx, dword ptr [rdi + 4] 1529 movzx edx, word ptr [rcx + 4*rdx] 1530 mov word ptr [rsi + 2], dx 1531 movsxd rdx, dword ptr [rdi + 8] 1532 movzx edx, word ptr [rcx + 4*rdx] 1533 mov word ptr [rsi + 4], dx 1534 movsxd rdx, dword ptr [rdi + 12] 1535 movzx edx, word ptr [rcx + 4*rdx] 1536 mov word ptr [rsi + 6], dx 1537 lea edx, [rax - 4] 1538 add rdi, 16 1539 add rsi, 8 1540 cmp eax, 7 1541 jg .LBB29_5 1542 .LBB29_1: 1543 test edx, edx 1544 jle .LBB29_4 1545 # %bb.2: 1546 add edx, 1 1547 xor r8d, r8d 1548 .p2align 4, 0x90 1549 .LBB29_3: # =>This Inner Loop Header: Depth=1 1550 movsxd rax, dword ptr [rdi + 2*r8] 1551 movzx eax, word ptr [rcx + 4*rax] 1552 mov word ptr [rsi + r8], ax 1553 add r8, 2 1554 add edx, -1 1555 cmp edx, 1 1556 jg .LBB29_3 1557 .LBB29_4: 1558 mov rsp, rbp 1559 pop 
rbp 1560 ret 1561 .Lfunc_end29: 1562 .size transpose_int32_int16_sse4, .Lfunc_end29-transpose_int32_int16_sse4 1563 # -- End function 1564 .globl transpose_uint64_int16_sse4 # -- Begin function transpose_uint64_int16_sse4 1565 .p2align 4, 0x90 1566 .type transpose_uint64_int16_sse4,@function 1567 transpose_uint64_int16_sse4: # @transpose_uint64_int16_sse4 1568 # %bb.0: 1569 push rbp 1570 mov rbp, rsp 1571 and rsp, -8 1572 cmp edx, 4 1573 jl .LBB30_1 1574 .p2align 4, 0x90 1575 .LBB30_5: # =>This Inner Loop Header: Depth=1 1576 mov eax, edx 1577 mov rdx, qword ptr [rdi] 1578 movzx edx, word ptr [rcx + 4*rdx] 1579 mov word ptr [rsi], dx 1580 mov rdx, qword ptr [rdi + 8] 1581 movzx edx, word ptr [rcx + 4*rdx] 1582 mov word ptr [rsi + 2], dx 1583 mov rdx, qword ptr [rdi + 16] 1584 movzx edx, word ptr [rcx + 4*rdx] 1585 mov word ptr [rsi + 4], dx 1586 mov rdx, qword ptr [rdi + 24] 1587 movzx edx, word ptr [rcx + 4*rdx] 1588 mov word ptr [rsi + 6], dx 1589 lea edx, [rax - 4] 1590 add rdi, 32 1591 add rsi, 8 1592 cmp eax, 7 1593 jg .LBB30_5 1594 .LBB30_1: 1595 test edx, edx 1596 jle .LBB30_4 1597 # %bb.2: 1598 add edx, 1 1599 xor r8d, r8d 1600 .p2align 4, 0x90 1601 .LBB30_3: # =>This Inner Loop Header: Depth=1 1602 mov rax, qword ptr [rdi + 4*r8] 1603 movzx eax, word ptr [rcx + 4*rax] 1604 mov word ptr [rsi + r8], ax 1605 add r8, 2 1606 add edx, -1 1607 cmp edx, 1 1608 jg .LBB30_3 1609 .LBB30_4: 1610 mov rsp, rbp 1611 pop rbp 1612 ret 1613 .Lfunc_end30: 1614 .size transpose_uint64_int16_sse4, .Lfunc_end30-transpose_uint64_int16_sse4 1615 # -- End function 1616 .globl transpose_int64_int16_sse4 # -- Begin function transpose_int64_int16_sse4 1617 .p2align 4, 0x90 1618 .type transpose_int64_int16_sse4,@function 1619 transpose_int64_int16_sse4: # @transpose_int64_int16_sse4 1620 # %bb.0: 1621 push rbp 1622 mov rbp, rsp 1623 and rsp, -8 1624 cmp edx, 4 1625 jl .LBB31_1 1626 .p2align 4, 0x90 1627 .LBB31_5: # =>This Inner Loop Header: Depth=1 1628 mov eax, edx 1629 mov rdx, qword 
ptr [rdi] 1630 movzx edx, word ptr [rcx + 4*rdx] 1631 mov word ptr [rsi], dx 1632 mov rdx, qword ptr [rdi + 8] 1633 movzx edx, word ptr [rcx + 4*rdx] 1634 mov word ptr [rsi + 2], dx 1635 mov rdx, qword ptr [rdi + 16] 1636 movzx edx, word ptr [rcx + 4*rdx] 1637 mov word ptr [rsi + 4], dx 1638 mov rdx, qword ptr [rdi + 24] 1639 movzx edx, word ptr [rcx + 4*rdx] 1640 mov word ptr [rsi + 6], dx 1641 lea edx, [rax - 4] 1642 add rdi, 32 1643 add rsi, 8 1644 cmp eax, 7 1645 jg .LBB31_5 1646 .LBB31_1: 1647 test edx, edx 1648 jle .LBB31_4 1649 # %bb.2: 1650 add edx, 1 1651 xor r8d, r8d 1652 .p2align 4, 0x90 1653 .LBB31_3: # =>This Inner Loop Header: Depth=1 1654 mov rax, qword ptr [rdi + 4*r8] 1655 movzx eax, word ptr [rcx + 4*rax] 1656 mov word ptr [rsi + r8], ax 1657 add r8, 2 1658 add edx, -1 1659 cmp edx, 1 1660 jg .LBB31_3 1661 .LBB31_4: 1662 mov rsp, rbp 1663 pop rbp 1664 ret 1665 .Lfunc_end31: 1666 .size transpose_int64_int16_sse4, .Lfunc_end31-transpose_int64_int16_sse4 1667 # -- End function 1668 .globl transpose_uint8_uint32_sse4 # -- Begin function transpose_uint8_uint32_sse4 1669 .p2align 4, 0x90 1670 .type transpose_uint8_uint32_sse4,@function 1671 transpose_uint8_uint32_sse4: # @transpose_uint8_uint32_sse4 1672 # %bb.0: 1673 push rbp 1674 mov rbp, rsp 1675 and rsp, -8 1676 cmp edx, 4 1677 jl .LBB32_1 1678 .p2align 4, 0x90 1679 .LBB32_5: # =>This Inner Loop Header: Depth=1 1680 mov eax, edx 1681 movzx edx, byte ptr [rdi] 1682 mov edx, dword ptr [rcx + 4*rdx] 1683 mov dword ptr [rsi], edx 1684 movzx edx, byte ptr [rdi + 1] 1685 mov edx, dword ptr [rcx + 4*rdx] 1686 mov dword ptr [rsi + 4], edx 1687 movzx edx, byte ptr [rdi + 2] 1688 mov edx, dword ptr [rcx + 4*rdx] 1689 mov dword ptr [rsi + 8], edx 1690 movzx edx, byte ptr [rdi + 3] 1691 mov edx, dword ptr [rcx + 4*rdx] 1692 mov dword ptr [rsi + 12], edx 1693 lea edx, [rax - 4] 1694 add rdi, 4 1695 add rsi, 16 1696 cmp eax, 7 1697 jg .LBB32_5 1698 .LBB32_1: 1699 test edx, edx 1700 jle .LBB32_4 1701 # %bb.2: 1702 
add edx, 1 1703 xor r8d, r8d 1704 .p2align 4, 0x90 1705 .LBB32_3: # =>This Inner Loop Header: Depth=1 1706 movzx eax, byte ptr [rdi + r8] 1707 mov eax, dword ptr [rcx + 4*rax] 1708 mov dword ptr [rsi + 4*r8], eax 1709 add r8, 1 1710 add edx, -1 1711 cmp edx, 1 1712 jg .LBB32_3 1713 .LBB32_4: 1714 mov rsp, rbp 1715 pop rbp 1716 ret 1717 .Lfunc_end32: 1718 .size transpose_uint8_uint32_sse4, .Lfunc_end32-transpose_uint8_uint32_sse4 1719 # -- End function 1720 .globl transpose_int8_uint32_sse4 # -- Begin function transpose_int8_uint32_sse4 1721 .p2align 4, 0x90 1722 .type transpose_int8_uint32_sse4,@function 1723 transpose_int8_uint32_sse4: # @transpose_int8_uint32_sse4 1724 # %bb.0: 1725 push rbp 1726 mov rbp, rsp 1727 and rsp, -8 1728 cmp edx, 4 1729 jl .LBB33_1 1730 .p2align 4, 0x90 1731 .LBB33_5: # =>This Inner Loop Header: Depth=1 1732 mov eax, edx 1733 movsx rdx, byte ptr [rdi] 1734 mov edx, dword ptr [rcx + 4*rdx] 1735 mov dword ptr [rsi], edx 1736 movsx rdx, byte ptr [rdi + 1] 1737 mov edx, dword ptr [rcx + 4*rdx] 1738 mov dword ptr [rsi + 4], edx 1739 movsx rdx, byte ptr [rdi + 2] 1740 mov edx, dword ptr [rcx + 4*rdx] 1741 mov dword ptr [rsi + 8], edx 1742 movsx rdx, byte ptr [rdi + 3] 1743 mov edx, dword ptr [rcx + 4*rdx] 1744 mov dword ptr [rsi + 12], edx 1745 lea edx, [rax - 4] 1746 add rdi, 4 1747 add rsi, 16 1748 cmp eax, 7 1749 jg .LBB33_5 1750 .LBB33_1: 1751 test edx, edx 1752 jle .LBB33_4 1753 # %bb.2: 1754 add edx, 1 1755 xor r8d, r8d 1756 .p2align 4, 0x90 1757 .LBB33_3: # =>This Inner Loop Header: Depth=1 1758 movsx rax, byte ptr [rdi + r8] 1759 mov eax, dword ptr [rcx + 4*rax] 1760 mov dword ptr [rsi + 4*r8], eax 1761 add r8, 1 1762 add edx, -1 1763 cmp edx, 1 1764 jg .LBB33_3 1765 .LBB33_4: 1766 mov rsp, rbp 1767 pop rbp 1768 ret 1769 .Lfunc_end33: 1770 .size transpose_int8_uint32_sse4, .Lfunc_end33-transpose_int8_uint32_sse4 1771 # -- End function 1772 .globl transpose_uint16_uint32_sse4 # -- Begin function transpose_uint16_uint32_sse4 1773 
.p2align 4, 0x90 1774 .type transpose_uint16_uint32_sse4,@function 1775 transpose_uint16_uint32_sse4: # @transpose_uint16_uint32_sse4 1776 # %bb.0: 1777 push rbp 1778 mov rbp, rsp 1779 and rsp, -8 1780 cmp edx, 4 1781 jl .LBB34_1 1782 .p2align 4, 0x90 1783 .LBB34_5: # =>This Inner Loop Header: Depth=1 1784 mov eax, edx 1785 movzx edx, word ptr [rdi] 1786 mov edx, dword ptr [rcx + 4*rdx] 1787 mov dword ptr [rsi], edx 1788 movzx edx, word ptr [rdi + 2] 1789 mov edx, dword ptr [rcx + 4*rdx] 1790 mov dword ptr [rsi + 4], edx 1791 movzx edx, word ptr [rdi + 4] 1792 mov edx, dword ptr [rcx + 4*rdx] 1793 mov dword ptr [rsi + 8], edx 1794 movzx edx, word ptr [rdi + 6] 1795 mov edx, dword ptr [rcx + 4*rdx] 1796 mov dword ptr [rsi + 12], edx 1797 lea edx, [rax - 4] 1798 add rdi, 8 1799 add rsi, 16 1800 cmp eax, 7 1801 jg .LBB34_5 1802 .LBB34_1: 1803 test edx, edx 1804 jle .LBB34_4 1805 # %bb.2: 1806 add edx, 1 1807 xor r8d, r8d 1808 .p2align 4, 0x90 1809 .LBB34_3: # =>This Inner Loop Header: Depth=1 1810 movzx eax, word ptr [rdi + r8] 1811 mov eax, dword ptr [rcx + 4*rax] 1812 mov dword ptr [rsi + 2*r8], eax 1813 add r8, 2 1814 add edx, -1 1815 cmp edx, 1 1816 jg .LBB34_3 1817 .LBB34_4: 1818 mov rsp, rbp 1819 pop rbp 1820 ret 1821 .Lfunc_end34: 1822 .size transpose_uint16_uint32_sse4, .Lfunc_end34-transpose_uint16_uint32_sse4 1823 # -- End function 1824 .globl transpose_int16_uint32_sse4 # -- Begin function transpose_int16_uint32_sse4 1825 .p2align 4, 0x90 1826 .type transpose_int16_uint32_sse4,@function 1827 transpose_int16_uint32_sse4: # @transpose_int16_uint32_sse4 1828 # %bb.0: 1829 push rbp 1830 mov rbp, rsp 1831 and rsp, -8 1832 cmp edx, 4 1833 jl .LBB35_1 1834 .p2align 4, 0x90 1835 .LBB35_5: # =>This Inner Loop Header: Depth=1 1836 mov eax, edx 1837 movsx rdx, word ptr [rdi] 1838 mov edx, dword ptr [rcx + 4*rdx] 1839 mov dword ptr [rsi], edx 1840 movsx rdx, word ptr [rdi + 2] 1841 mov edx, dword ptr [rcx + 4*rdx] 1842 mov dword ptr [rsi + 4], edx 1843 movsx rdx, word 
ptr [rdi + 4] 1844 mov edx, dword ptr [rcx + 4*rdx] 1845 mov dword ptr [rsi + 8], edx 1846 movsx rdx, word ptr [rdi + 6] 1847 mov edx, dword ptr [rcx + 4*rdx] 1848 mov dword ptr [rsi + 12], edx 1849 lea edx, [rax - 4] 1850 add rdi, 8 1851 add rsi, 16 1852 cmp eax, 7 1853 jg .LBB35_5 1854 .LBB35_1: 1855 test edx, edx 1856 jle .LBB35_4 1857 # %bb.2: 1858 add edx, 1 1859 xor r8d, r8d 1860 .p2align 4, 0x90 1861 .LBB35_3: # =>This Inner Loop Header: Depth=1 1862 movsx rax, word ptr [rdi + r8] 1863 mov eax, dword ptr [rcx + 4*rax] 1864 mov dword ptr [rsi + 2*r8], eax 1865 add r8, 2 1866 add edx, -1 1867 cmp edx, 1 1868 jg .LBB35_3 1869 .LBB35_4: 1870 mov rsp, rbp 1871 pop rbp 1872 ret 1873 .Lfunc_end35: 1874 .size transpose_int16_uint32_sse4, .Lfunc_end35-transpose_int16_uint32_sse4 1875 # -- End function 1876 .globl transpose_uint32_uint32_sse4 # -- Begin function transpose_uint32_uint32_sse4 1877 .p2align 4, 0x90 1878 .type transpose_uint32_uint32_sse4,@function 1879 transpose_uint32_uint32_sse4: # @transpose_uint32_uint32_sse4 1880 # %bb.0: 1881 push rbp 1882 mov rbp, rsp 1883 and rsp, -8 1884 cmp edx, 4 1885 jl .LBB36_1 1886 .p2align 4, 0x90 1887 .LBB36_5: # =>This Inner Loop Header: Depth=1 1888 mov eax, edx 1889 mov edx, dword ptr [rdi] 1890 mov edx, dword ptr [rcx + 4*rdx] 1891 mov dword ptr [rsi], edx 1892 mov edx, dword ptr [rdi + 4] 1893 mov edx, dword ptr [rcx + 4*rdx] 1894 mov dword ptr [rsi + 4], edx 1895 mov edx, dword ptr [rdi + 8] 1896 mov edx, dword ptr [rcx + 4*rdx] 1897 mov dword ptr [rsi + 8], edx 1898 mov edx, dword ptr [rdi + 12] 1899 mov edx, dword ptr [rcx + 4*rdx] 1900 mov dword ptr [rsi + 12], edx 1901 lea edx, [rax - 4] 1902 add rdi, 16 1903 add rsi, 16 1904 cmp eax, 7 1905 jg .LBB36_5 1906 .LBB36_1: 1907 test edx, edx 1908 jle .LBB36_4 1909 # %bb.2: 1910 add edx, 1 1911 xor r8d, r8d 1912 .p2align 4, 0x90 1913 .LBB36_3: # =>This Inner Loop Header: Depth=1 1914 mov eax, dword ptr [rdi + r8] 1915 mov eax, dword ptr [rcx + 4*rax] 1916 mov dword 
ptr [rsi + r8], eax 1917 add r8, 4 1918 add edx, -1 1919 cmp edx, 1 1920 jg .LBB36_3 1921 .LBB36_4: 1922 mov rsp, rbp 1923 pop rbp 1924 ret 1925 .Lfunc_end36: 1926 .size transpose_uint32_uint32_sse4, .Lfunc_end36-transpose_uint32_uint32_sse4 1927 # -- End function 1928 .globl transpose_int32_uint32_sse4 # -- Begin function transpose_int32_uint32_sse4 1929 .p2align 4, 0x90 1930 .type transpose_int32_uint32_sse4,@function 1931 transpose_int32_uint32_sse4: # @transpose_int32_uint32_sse4 1932 # %bb.0: 1933 push rbp 1934 mov rbp, rsp 1935 and rsp, -8 1936 cmp edx, 4 1937 jl .LBB37_1 1938 .p2align 4, 0x90 1939 .LBB37_5: # =>This Inner Loop Header: Depth=1 1940 mov eax, edx 1941 movsxd rdx, dword ptr [rdi] 1942 mov edx, dword ptr [rcx + 4*rdx] 1943 mov dword ptr [rsi], edx 1944 movsxd rdx, dword ptr [rdi + 4] 1945 mov edx, dword ptr [rcx + 4*rdx] 1946 mov dword ptr [rsi + 4], edx 1947 movsxd rdx, dword ptr [rdi + 8] 1948 mov edx, dword ptr [rcx + 4*rdx] 1949 mov dword ptr [rsi + 8], edx 1950 movsxd rdx, dword ptr [rdi + 12] 1951 mov edx, dword ptr [rcx + 4*rdx] 1952 mov dword ptr [rsi + 12], edx 1953 lea edx, [rax - 4] 1954 add rdi, 16 1955 add rsi, 16 1956 cmp eax, 7 1957 jg .LBB37_5 1958 .LBB37_1: 1959 test edx, edx 1960 jle .LBB37_4 1961 # %bb.2: 1962 add edx, 1 1963 xor r8d, r8d 1964 .p2align 4, 0x90 1965 .LBB37_3: # =>This Inner Loop Header: Depth=1 1966 movsxd rax, dword ptr [rdi + r8] 1967 mov eax, dword ptr [rcx + 4*rax] 1968 mov dword ptr [rsi + r8], eax 1969 add r8, 4 1970 add edx, -1 1971 cmp edx, 1 1972 jg .LBB37_3 1973 .LBB37_4: 1974 mov rsp, rbp 1975 pop rbp 1976 ret 1977 .Lfunc_end37: 1978 .size transpose_int32_uint32_sse4, .Lfunc_end37-transpose_int32_uint32_sse4 1979 # -- End function 1980 .globl transpose_uint64_uint32_sse4 # -- Begin function transpose_uint64_uint32_sse4 1981 .p2align 4, 0x90 1982 .type transpose_uint64_uint32_sse4,@function 1983 transpose_uint64_uint32_sse4: # @transpose_uint64_uint32_sse4 1984 # %bb.0: 1985 push rbp 1986 mov rbp, rsp 
1987 and rsp, -8 1988 cmp edx, 4 1989 jl .LBB38_1 1990 .p2align 4, 0x90 1991 .LBB38_5: # =>This Inner Loop Header: Depth=1 1992 mov eax, edx 1993 mov rdx, qword ptr [rdi] 1994 mov edx, dword ptr [rcx + 4*rdx] 1995 mov dword ptr [rsi], edx 1996 mov rdx, qword ptr [rdi + 8] 1997 mov edx, dword ptr [rcx + 4*rdx] 1998 mov dword ptr [rsi + 4], edx 1999 mov rdx, qword ptr [rdi + 16] 2000 mov edx, dword ptr [rcx + 4*rdx] 2001 mov dword ptr [rsi + 8], edx 2002 mov rdx, qword ptr [rdi + 24] 2003 mov edx, dword ptr [rcx + 4*rdx] 2004 mov dword ptr [rsi + 12], edx 2005 lea edx, [rax - 4] 2006 add rdi, 32 2007 add rsi, 16 2008 cmp eax, 7 2009 jg .LBB38_5 2010 .LBB38_1: 2011 test edx, edx 2012 jle .LBB38_4 2013 # %bb.2: 2014 add edx, 1 2015 xor r8d, r8d 2016 .p2align 4, 0x90 2017 .LBB38_3: # =>This Inner Loop Header: Depth=1 2018 mov rax, qword ptr [rdi + 2*r8] 2019 mov eax, dword ptr [rcx + 4*rax] 2020 mov dword ptr [rsi + r8], eax 2021 add r8, 4 2022 add edx, -1 2023 cmp edx, 1 2024 jg .LBB38_3 2025 .LBB38_4: 2026 mov rsp, rbp 2027 pop rbp 2028 ret 2029 .Lfunc_end38: 2030 .size transpose_uint64_uint32_sse4, .Lfunc_end38-transpose_uint64_uint32_sse4 2031 # -- End function 2032 .globl transpose_int64_uint32_sse4 # -- Begin function transpose_int64_uint32_sse4 2033 .p2align 4, 0x90 2034 .type transpose_int64_uint32_sse4,@function 2035 transpose_int64_uint32_sse4: # @transpose_int64_uint32_sse4 2036 # %bb.0: 2037 push rbp 2038 mov rbp, rsp 2039 and rsp, -8 2040 cmp edx, 4 2041 jl .LBB39_1 2042 .p2align 4, 0x90 2043 .LBB39_5: # =>This Inner Loop Header: Depth=1 2044 mov eax, edx 2045 mov rdx, qword ptr [rdi] 2046 mov edx, dword ptr [rcx + 4*rdx] 2047 mov dword ptr [rsi], edx 2048 mov rdx, qword ptr [rdi + 8] 2049 mov edx, dword ptr [rcx + 4*rdx] 2050 mov dword ptr [rsi + 4], edx 2051 mov rdx, qword ptr [rdi + 16] 2052 mov edx, dword ptr [rcx + 4*rdx] 2053 mov dword ptr [rsi + 8], edx 2054 mov rdx, qword ptr [rdi + 24] 2055 mov edx, dword ptr [rcx + 4*rdx] 2056 mov dword ptr [rsi + 
12], edx 2057 lea edx, [rax - 4] 2058 add rdi, 32 2059 add rsi, 16 2060 cmp eax, 7 2061 jg .LBB39_5 2062 .LBB39_1: 2063 test edx, edx 2064 jle .LBB39_4 2065 # %bb.2: 2066 add edx, 1 2067 xor r8d, r8d 2068 .p2align 4, 0x90 2069 .LBB39_3: # =>This Inner Loop Header: Depth=1 2070 mov rax, qword ptr [rdi + 2*r8] 2071 mov eax, dword ptr [rcx + 4*rax] 2072 mov dword ptr [rsi + r8], eax 2073 add r8, 4 2074 add edx, -1 2075 cmp edx, 1 2076 jg .LBB39_3 2077 .LBB39_4: 2078 mov rsp, rbp 2079 pop rbp 2080 ret 2081 .Lfunc_end39: 2082 .size transpose_int64_uint32_sse4, .Lfunc_end39-transpose_int64_uint32_sse4 2083 # -- End function 2084 .globl transpose_uint8_int32_sse4 # -- Begin function transpose_uint8_int32_sse4 2085 .p2align 4, 0x90 2086 .type transpose_uint8_int32_sse4,@function 2087 transpose_uint8_int32_sse4: # @transpose_uint8_int32_sse4 2088 # %bb.0: 2089 push rbp 2090 mov rbp, rsp 2091 and rsp, -8 2092 cmp edx, 4 2093 jl .LBB40_1 2094 .p2align 4, 0x90 2095 .LBB40_5: # =>This Inner Loop Header: Depth=1 2096 mov eax, edx 2097 movzx edx, byte ptr [rdi] 2098 mov edx, dword ptr [rcx + 4*rdx] 2099 mov dword ptr [rsi], edx 2100 movzx edx, byte ptr [rdi + 1] 2101 mov edx, dword ptr [rcx + 4*rdx] 2102 mov dword ptr [rsi + 4], edx 2103 movzx edx, byte ptr [rdi + 2] 2104 mov edx, dword ptr [rcx + 4*rdx] 2105 mov dword ptr [rsi + 8], edx 2106 movzx edx, byte ptr [rdi + 3] 2107 mov edx, dword ptr [rcx + 4*rdx] 2108 mov dword ptr [rsi + 12], edx 2109 lea edx, [rax - 4] 2110 add rdi, 4 2111 add rsi, 16 2112 cmp eax, 7 2113 jg .LBB40_5 2114 .LBB40_1: 2115 test edx, edx 2116 jle .LBB40_4 2117 # %bb.2: 2118 add edx, 1 2119 xor r8d, r8d 2120 .p2align 4, 0x90 2121 .LBB40_3: # =>This Inner Loop Header: Depth=1 2122 movzx eax, byte ptr [rdi + r8] 2123 mov eax, dword ptr [rcx + 4*rax] 2124 mov dword ptr [rsi + 4*r8], eax 2125 add r8, 1 2126 add edx, -1 2127 cmp edx, 1 2128 jg .LBB40_3 2129 .LBB40_4: 2130 mov rsp, rbp 2131 pop rbp 2132 ret 2133 .Lfunc_end40: 2134 .size 
transpose_uint8_int32_sse4, .Lfunc_end40-transpose_uint8_int32_sse4 2135 # -- End function 2136 .globl transpose_int8_int32_sse4 # -- Begin function transpose_int8_int32_sse4 2137 .p2align 4, 0x90 2138 .type transpose_int8_int32_sse4,@function 2139 transpose_int8_int32_sse4: # @transpose_int8_int32_sse4 2140 # %bb.0: 2141 push rbp 2142 mov rbp, rsp 2143 and rsp, -8 2144 cmp edx, 4 2145 jl .LBB41_1 2146 .p2align 4, 0x90 2147 .LBB41_5: # =>This Inner Loop Header: Depth=1 2148 mov eax, edx 2149 movsx rdx, byte ptr [rdi] 2150 mov edx, dword ptr [rcx + 4*rdx] 2151 mov dword ptr [rsi], edx 2152 movsx rdx, byte ptr [rdi + 1] 2153 mov edx, dword ptr [rcx + 4*rdx] 2154 mov dword ptr [rsi + 4], edx 2155 movsx rdx, byte ptr [rdi + 2] 2156 mov edx, dword ptr [rcx + 4*rdx] 2157 mov dword ptr [rsi + 8], edx 2158 movsx rdx, byte ptr [rdi + 3] 2159 mov edx, dword ptr [rcx + 4*rdx] 2160 mov dword ptr [rsi + 12], edx 2161 lea edx, [rax - 4] 2162 add rdi, 4 2163 add rsi, 16 2164 cmp eax, 7 2165 jg .LBB41_5 2166 .LBB41_1: 2167 test edx, edx 2168 jle .LBB41_4 2169 # %bb.2: 2170 add edx, 1 2171 xor r8d, r8d 2172 .p2align 4, 0x90 2173 .LBB41_3: # =>This Inner Loop Header: Depth=1 2174 movsx rax, byte ptr [rdi + r8] 2175 mov eax, dword ptr [rcx + 4*rax] 2176 mov dword ptr [rsi + 4*r8], eax 2177 add r8, 1 2178 add edx, -1 2179 cmp edx, 1 2180 jg .LBB41_3 2181 .LBB41_4: 2182 mov rsp, rbp 2183 pop rbp 2184 ret 2185 .Lfunc_end41: 2186 .size transpose_int8_int32_sse4, .Lfunc_end41-transpose_int8_int32_sse4 2187 # -- End function 2188 .globl transpose_uint16_int32_sse4 # -- Begin function transpose_uint16_int32_sse4 2189 .p2align 4, 0x90 2190 .type transpose_uint16_int32_sse4,@function 2191 transpose_uint16_int32_sse4: # @transpose_uint16_int32_sse4 2192 # %bb.0: 2193 push rbp 2194 mov rbp, rsp 2195 and rsp, -8 2196 cmp edx, 4 2197 jl .LBB42_1 2198 .p2align 4, 0x90 2199 .LBB42_5: # =>This Inner Loop Header: Depth=1 2200 mov eax, edx 2201 movzx edx, word ptr [rdi] 2202 mov edx, dword ptr [rcx + 
4*rdx] 2203 mov dword ptr [rsi], edx 2204 movzx edx, word ptr [rdi + 2] 2205 mov edx, dword ptr [rcx + 4*rdx] 2206 mov dword ptr [rsi + 4], edx 2207 movzx edx, word ptr [rdi + 4] 2208 mov edx, dword ptr [rcx + 4*rdx] 2209 mov dword ptr [rsi + 8], edx 2210 movzx edx, word ptr [rdi + 6] 2211 mov edx, dword ptr [rcx + 4*rdx] 2212 mov dword ptr [rsi + 12], edx 2213 lea edx, [rax - 4] 2214 add rdi, 8 2215 add rsi, 16 2216 cmp eax, 7 2217 jg .LBB42_5 2218 .LBB42_1: 2219 test edx, edx 2220 jle .LBB42_4 2221 # %bb.2: 2222 add edx, 1 2223 xor r8d, r8d 2224 .p2align 4, 0x90 2225 .LBB42_3: # =>This Inner Loop Header: Depth=1 2226 movzx eax, word ptr [rdi + r8] 2227 mov eax, dword ptr [rcx + 4*rax] 2228 mov dword ptr [rsi + 2*r8], eax 2229 add r8, 2 2230 add edx, -1 2231 cmp edx, 1 2232 jg .LBB42_3 2233 .LBB42_4: 2234 mov rsp, rbp 2235 pop rbp 2236 ret 2237 .Lfunc_end42: 2238 .size transpose_uint16_int32_sse4, .Lfunc_end42-transpose_uint16_int32_sse4 2239 # -- End function 2240 .globl transpose_int16_int32_sse4 # -- Begin function transpose_int16_int32_sse4 2241 .p2align 4, 0x90 2242 .type transpose_int16_int32_sse4,@function 2243 transpose_int16_int32_sse4: # @transpose_int16_int32_sse4 2244 # %bb.0: 2245 push rbp 2246 mov rbp, rsp 2247 and rsp, -8 2248 cmp edx, 4 2249 jl .LBB43_1 2250 .p2align 4, 0x90 2251 .LBB43_5: # =>This Inner Loop Header: Depth=1 2252 mov eax, edx 2253 movsx rdx, word ptr [rdi] 2254 mov edx, dword ptr [rcx + 4*rdx] 2255 mov dword ptr [rsi], edx 2256 movsx rdx, word ptr [rdi + 2] 2257 mov edx, dword ptr [rcx + 4*rdx] 2258 mov dword ptr [rsi + 4], edx 2259 movsx rdx, word ptr [rdi + 4] 2260 mov edx, dword ptr [rcx + 4*rdx] 2261 mov dword ptr [rsi + 8], edx 2262 movsx rdx, word ptr [rdi + 6] 2263 mov edx, dword ptr [rcx + 4*rdx] 2264 mov dword ptr [rsi + 12], edx 2265 lea edx, [rax - 4] 2266 add rdi, 8 2267 add rsi, 16 2268 cmp eax, 7 2269 jg .LBB43_5 2270 .LBB43_1: 2271 test edx, edx 2272 jle .LBB43_4 2273 # %bb.2: 2274 add edx, 1 2275 xor r8d, r8d 2276 
.p2align 4, 0x90 2277 .LBB43_3: # =>This Inner Loop Header: Depth=1 2278 movsx rax, word ptr [rdi + r8] 2279 mov eax, dword ptr [rcx + 4*rax] 2280 mov dword ptr [rsi + 2*r8], eax 2281 add r8, 2 2282 add edx, -1 2283 cmp edx, 1 2284 jg .LBB43_3 2285 .LBB43_4: 2286 mov rsp, rbp 2287 pop rbp 2288 ret 2289 .Lfunc_end43: 2290 .size transpose_int16_int32_sse4, .Lfunc_end43-transpose_int16_int32_sse4 2291 # -- End function 2292 .globl transpose_uint32_int32_sse4 # -- Begin function transpose_uint32_int32_sse4 2293 .p2align 4, 0x90 2294 .type transpose_uint32_int32_sse4,@function 2295 transpose_uint32_int32_sse4: # @transpose_uint32_int32_sse4 2296 # %bb.0: 2297 push rbp 2298 mov rbp, rsp 2299 and rsp, -8 2300 cmp edx, 4 2301 jl .LBB44_1 2302 .p2align 4, 0x90 2303 .LBB44_5: # =>This Inner Loop Header: Depth=1 2304 mov eax, edx 2305 mov edx, dword ptr [rdi] 2306 mov edx, dword ptr [rcx + 4*rdx] 2307 mov dword ptr [rsi], edx 2308 mov edx, dword ptr [rdi + 4] 2309 mov edx, dword ptr [rcx + 4*rdx] 2310 mov dword ptr [rsi + 4], edx 2311 mov edx, dword ptr [rdi + 8] 2312 mov edx, dword ptr [rcx + 4*rdx] 2313 mov dword ptr [rsi + 8], edx 2314 mov edx, dword ptr [rdi + 12] 2315 mov edx, dword ptr [rcx + 4*rdx] 2316 mov dword ptr [rsi + 12], edx 2317 lea edx, [rax - 4] 2318 add rdi, 16 2319 add rsi, 16 2320 cmp eax, 7 2321 jg .LBB44_5 2322 .LBB44_1: 2323 test edx, edx 2324 jle .LBB44_4 2325 # %bb.2: 2326 add edx, 1 2327 xor r8d, r8d 2328 .p2align 4, 0x90 2329 .LBB44_3: # =>This Inner Loop Header: Depth=1 2330 mov eax, dword ptr [rdi + r8] 2331 mov eax, dword ptr [rcx + 4*rax] 2332 mov dword ptr [rsi + r8], eax 2333 add r8, 4 2334 add edx, -1 2335 cmp edx, 1 2336 jg .LBB44_3 2337 .LBB44_4: 2338 mov rsp, rbp 2339 pop rbp 2340 ret 2341 .Lfunc_end44: 2342 .size transpose_uint32_int32_sse4, .Lfunc_end44-transpose_uint32_int32_sse4 2343 # -- End function 2344 .globl transpose_int32_int32_sse4 # -- Begin function transpose_int32_int32_sse4 2345 .p2align 4, 0x90 2346 .type 
transpose_int32_int32_sse4,@function
# transpose_int32_int32_sse4
# For each of the edx elements: read a 32-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, and write that entry as 32 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int32_int32_sse4: # @transpose_int32_int32_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB45_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB45_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsxd rdx, dword ptr [rdi]  # rdx = index: source element 0, sign-extended
    mov edx, dword ptr [rcx + 4*rdx]  # edx = 32-bit table entry at that index
    mov dword ptr [rsi], edx  # destination element 0
    # Elements 1..3: same three steps.
    movsxd rdx, dword ptr [rdi + 4]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 4], edx
    movsxd rdx, dword ptr [rdi + 8]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 8], edx
    movsxd rdx, dword ptr [rdi + 12]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 12], edx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 16  # 4 source elements * 4 bytes
    add rsi, 16  # 4 destination elements * 4 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB45_5  # more than 7 then means at least 4 are left
.LBB45_1:
    test edx, edx
    jle .LBB45_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: byte offset shared by source and destination
    .p2align 4, 0x90
.LBB45_3: # =>This Inner Loop Header: Depth=1
    movsxd rax, dword ptr [rdi + r8]  # rax = index, sign-extended
    mov eax, dword ptr [rcx + 4*rax]  # eax = table entry
    mov dword ptr [rsi + r8], eax
    add r8, 4  # both sides use 4-byte elements
    add edx, -1
    cmp edx, 1
    jg .LBB45_3  # one iteration per remaining element
.LBB45_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end45:
    .size transpose_int32_int32_sse4, .Lfunc_end45-transpose_int32_int32_sse4
    # -- End function
    .globl transpose_uint64_int32_sse4 # -- Begin function transpose_uint64_int32_sse4
    .p2align 4, 0x90
    .type transpose_uint64_int32_sse4,@function
# transpose_uint64_int32_sse4
# For each of the edx elements: read a 64-bit source value from [rdi], use it
# unmodified as an index into the table of 32-bit entries at rcx, and write
# that entry as 32 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint64_int32_sse4: # @transpose_uint64_int32_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB46_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB46_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov rdx, qword ptr [rdi]  # rdx = index: source element 0, full 64 bits
    mov edx, dword ptr [rcx + 4*rdx]  # edx = 32-bit table entry at that index
    mov dword ptr [rsi], edx  # destination element 0
    # Elements 1..3: same three steps.
    mov rdx, qword ptr [rdi + 8]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 4], edx
    mov rdx, qword ptr [rdi + 16]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 8], edx
    mov rdx, qword ptr [rdi + 24]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 12], edx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 32  # 4 source elements * 8 bytes
    add rsi, 16  # 4 destination elements * 4 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB46_5  # more than 7 then means at least 4 are left
.LBB46_1:
    test edx, edx
    jle .LBB46_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: destination byte offset; the source offset is 2*r8
    .p2align 4, 0x90
.LBB46_3: # =>This Inner Loop Header: Depth=1
    mov rax, qword ptr [rdi + 2*r8]  # 8-byte source element = 2 * (4-byte step)
    mov eax, dword ptr [rcx + 4*rax]  # eax = table entry
    mov dword ptr [rsi + r8], eax
    add r8, 4  # one 4-byte destination element
    add edx, -1
    cmp edx, 1
    jg .LBB46_3  # one iteration per remaining element
.LBB46_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end46:
    .size transpose_uint64_int32_sse4, .Lfunc_end46-transpose_uint64_int32_sse4
    # -- End function
    .globl transpose_int64_int32_sse4 # -- Begin function transpose_int64_int32_sse4
    .p2align 4, 0x90
    .type transpose_int64_int32_sse4,@function
# transpose_int64_int32_sse4
# For each of the edx elements: read a 64-bit source value from [rdi], use it
# unmodified as an index into the table of 32-bit entries at rcx, and write
# that entry as 32 bits to [rsi]. The instruction sequence is the same as in
# transpose_uint64_int32_sse4; only the symbol and label names differ.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int64_int32_sse4: # @transpose_int64_int32_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB47_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB47_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov rdx, qword ptr [rdi]  # rdx = index: source element 0, full 64 bits
    mov edx, dword ptr [rcx + 4*rdx]  # edx = 32-bit table entry at that index
    mov dword ptr [rsi], edx  # destination element 0
    # Elements 1..3: same three steps.
    mov rdx, qword ptr [rdi + 8]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 4], edx
    mov rdx, qword ptr [rdi + 16]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 8], edx
    mov rdx, qword ptr [rdi + 24]
    mov edx, dword ptr [rcx + 4*rdx]
    mov dword ptr [rsi + 12], edx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 32  # 4 source elements * 8 bytes
    add rsi, 16  # 4 destination elements * 4 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB47_5  # more than 7 then means at least 4 are left
.LBB47_1:
    test edx, edx
    jle .LBB47_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: destination byte offset; the source offset is 2*r8
    .p2align 4, 0x90
.LBB47_3: # =>This Inner Loop Header: Depth=1
    mov rax, qword ptr [rdi + 2*r8]  # 8-byte source element = 2 * (4-byte step)
    mov eax, dword ptr [rcx + 4*rax]  # eax = table entry
    mov dword ptr [rsi + r8], eax
    add r8, 4  # one 4-byte destination element
    add edx, -1
    cmp edx, 1
    jg .LBB47_3  # one iteration per remaining element
.LBB47_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end47:
    .size transpose_int64_int32_sse4, .Lfunc_end47-transpose_int64_int32_sse4
    # -- End function
    .globl transpose_uint8_uint64_sse4 # -- Begin function transpose_uint8_uint64_sse4
    .p2align 4, 0x90
    .type transpose_uint8_uint64_sse4,@function
# transpose_uint8_uint64_sse4
# For each of the edx elements: read an 8-bit source value from [rdi]
# (zero-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# The entry is sign-extended (movsxd) even though the destination is named
# uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint8_uint64_sse4: # @transpose_uint8_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB48_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB48_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movzx edx, byte ptr [rdi]  # rdx = index: source element 0, zero-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movzx edx, byte ptr [rdi + 1]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movzx edx, byte ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movzx edx, byte ptr [rdi + 3]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 4  # 4 source elements * 1 byte
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB48_5  # more than 7 then means at least 4 are left
.LBB48_1:
    test edx, edx
    jle .LBB48_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: element index; the destination offset is 8*r8
    .p2align 4, 0x90
.LBB48_3: # =>This Inner Loop Header: Depth=1
    movzx eax, byte ptr [rdi + r8]  # rax = index, zero-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 8*r8], rax  # 8-byte destination element
    add r8, 1  # one 1-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB48_3  # one iteration per remaining element
.LBB48_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end48:
    .size transpose_uint8_uint64_sse4, .Lfunc_end48-transpose_uint8_uint64_sse4
    # -- End function
    .globl transpose_int8_uint64_sse4 # -- Begin function transpose_int8_uint64_sse4
    .p2align 4, 0x90
    .type transpose_int8_uint64_sse4,@function
# transpose_int8_uint64_sse4
# For each of the edx elements: read an 8-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# The entry is sign-extended (movsxd) even though the destination is named
# uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int8_uint64_sse4: # @transpose_int8_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB49_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB49_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsx rdx, byte ptr [rdi]  # rdx = index: source element 0, sign-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movsx rdx, byte ptr [rdi + 1]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movsx rdx, byte ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movsx rdx, byte ptr [rdi + 3]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 4  # 4 source elements * 1 byte
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB49_5  # more than 7 then means at least 4 are left
.LBB49_1:
    test edx, edx
    jle .LBB49_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: element index; the destination offset is 8*r8
    .p2align 4, 0x90
.LBB49_3: # =>This Inner Loop Header: Depth=1
    movsx rax, byte ptr [rdi + r8]  # rax = index, sign-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 8*r8], rax  # 8-byte destination element
    add r8, 1  # one 1-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB49_3  # one iteration per remaining element
.LBB49_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end49:
    .size transpose_int8_uint64_sse4, .Lfunc_end49-transpose_int8_uint64_sse4
    # -- End function
    .globl transpose_uint16_uint64_sse4 # -- Begin function transpose_uint16_uint64_sse4
    .p2align 4, 0x90
    .type transpose_uint16_uint64_sse4,@function
# transpose_uint16_uint64_sse4
# For each of the edx elements: read a 16-bit source value from [rdi]
# (zero-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# The entry is sign-extended (movsxd) even though the destination is named
# uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint16_uint64_sse4: # @transpose_uint16_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB50_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB50_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movzx edx, word ptr [rdi]  # rdx = index: source element 0, zero-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movzx edx, word ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movzx edx, word ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movzx edx, word ptr [rdi + 6]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 8  # 4 source elements * 2 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB50_5  # more than 7 then means at least 4 are left
.LBB50_1:
    test edx, edx
    jle .LBB50_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 4*r8
    .p2align 4, 0x90
.LBB50_3: # =>This Inner Loop Header: Depth=1
    movzx eax, word ptr [rdi + r8]  # rax = index, zero-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 4*r8], rax  # 8-byte destination element = 4 * (2-byte step)
    add r8, 2  # one 2-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB50_3  # one iteration per remaining element
.LBB50_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end50:
    .size transpose_uint16_uint64_sse4, .Lfunc_end50-transpose_uint16_uint64_sse4
    # -- End function
    .globl transpose_int16_uint64_sse4 # -- Begin function transpose_int16_uint64_sse4
    .p2align 4, 0x90
    .type transpose_int16_uint64_sse4,@function
# transpose_int16_uint64_sse4
# For each of the edx elements: read a 16-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# The entry is sign-extended (movsxd) even though the destination is named
# uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int16_uint64_sse4: # @transpose_int16_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB51_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB51_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsx rdx, word ptr [rdi]  # rdx = index: source element 0, sign-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movsx rdx, word ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movsx rdx, word ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movsx rdx, word ptr [rdi + 6]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 8  # 4 source elements * 2 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB51_5  # more than 7 then means at least 4 are left
.LBB51_1:
    test edx, edx
    jle .LBB51_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 4*r8
    .p2align 4, 0x90
.LBB51_3: # =>This Inner Loop Header: Depth=1
    movsx rax, word ptr [rdi + r8]  # rax = index, sign-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 4*r8], rax  # 8-byte destination element = 4 * (2-byte step)
    add r8, 2  # one 2-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB51_3  # one iteration per remaining element
.LBB51_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end51:
    .size transpose_int16_uint64_sse4, .Lfunc_end51-transpose_int16_uint64_sse4
    # -- End function
    .globl transpose_uint32_uint64_sse4 # -- Begin function transpose_uint32_uint64_sse4
    .p2align 4, 0x90
    .type transpose_uint32_uint64_sse4,@function
# transpose_uint32_uint64_sse4
# For each of the edx elements: read a 32-bit source value from [rdi]
# (zero-extended by the 32-bit register write), use it as an index into the
# table of 32-bit entries at rcx, sign-extend that entry to 64 bits and write
# it as 64 bits to [rsi]. The entry is sign-extended (movsxd) even though the
# destination is named uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint32_uint64_sse4: # @transpose_uint32_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB52_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB52_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov edx, dword ptr [rdi]  # rdx = index: source element 0, upper half zeroed
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    mov edx, dword ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    mov edx, dword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    mov edx, dword ptr [rdi + 12]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 16  # 4 source elements * 4 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB52_5  # more than 7 then means at least 4 are left
.LBB52_1:
    test edx, edx
    jle .LBB52_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 2*r8
    .p2align 4, 0x90
.LBB52_3: # =>This Inner Loop Header: Depth=1
    mov eax, dword ptr [rdi + r8]  # rax = index, upper half zeroed
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 2*r8], rax  # 8-byte destination element = 2 * (4-byte step)
    add r8, 4  # one 4-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB52_3  # one iteration per remaining element
.LBB52_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end52:
    .size transpose_uint32_uint64_sse4, .Lfunc_end52-transpose_uint32_uint64_sse4
    # -- End function
    .globl transpose_int32_uint64_sse4 # -- Begin function transpose_int32_uint64_sse4
    .p2align 4, 0x90
    .type transpose_int32_uint64_sse4,@function
# transpose_int32_uint64_sse4
# For each of the edx elements: read a 32-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# The entry is sign-extended (movsxd) even though the destination is named
# uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int32_uint64_sse4: # @transpose_int32_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB53_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB53_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsxd rdx, dword ptr [rdi]  # rdx = index: source element 0, sign-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movsxd rdx, dword ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movsxd rdx, dword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movsxd rdx, dword ptr [rdi + 12]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 16  # 4 source elements * 4 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB53_5  # more than 7 then means at least 4 are left
.LBB53_1:
    test edx, edx
    jle .LBB53_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 2*r8
    .p2align 4, 0x90
.LBB53_3: # =>This Inner Loop Header: Depth=1
    movsxd rax, dword ptr [rdi + r8]  # rax = index, sign-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 2*r8], rax  # 8-byte destination element = 2 * (4-byte step)
    add r8, 4  # one 4-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB53_3  # one iteration per remaining element
.LBB53_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end53:
    .size transpose_int32_uint64_sse4, .Lfunc_end53-transpose_int32_uint64_sse4
    # -- End function
    .globl transpose_uint64_uint64_sse4 # -- Begin function transpose_uint64_uint64_sse4
    .p2align 4, 0x90
    .type transpose_uint64_uint64_sse4,@function
# transpose_uint64_uint64_sse4
# For each of the edx elements: read a 64-bit source value from [rdi], use it
# unmodified as an index into the table of 32-bit entries at rcx, sign-extend
# that entry to 64 bits and write it as 64 bits to [rsi]. The entry is
# sign-extended (movsxd) even though the destination is named uint64.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint64_uint64_sse4: # @transpose_uint64_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB54_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB54_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov rdx, qword ptr [rdi]  # rdx = index: source element 0, full 64 bits
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    mov rdx, qword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    mov rdx, qword ptr [rdi + 16]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    mov rdx, qword ptr [rdi + 24]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 32  # 4 source elements * 8 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB54_5  # more than 7 then means at least 4 are left
.LBB54_1:
    test edx, edx
    jle .LBB54_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: byte offset shared by source and destination
    .p2align 4, 0x90
.LBB54_3: # =>This Inner Loop Header: Depth=1
    mov rax, qword ptr [rdi + r8]  # rax = index, full 64 bits
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + r8], rax
    add r8, 8  # both sides use 8-byte elements
    add edx, -1
    cmp edx, 1
    jg .LBB54_3  # one iteration per remaining element
.LBB54_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end54:
    .size transpose_uint64_uint64_sse4, .Lfunc_end54-transpose_uint64_uint64_sse4
    # -- End function
    .globl transpose_int64_uint64_sse4 # -- Begin function transpose_int64_uint64_sse4
    .p2align 4, 0x90
    .type transpose_int64_uint64_sse4,@function
# transpose_int64_uint64_sse4
# For each of the edx elements: read a 64-bit source value from [rdi], use it
# unmodified as an index into the table of 32-bit entries at rcx, sign-extend
# that entry to 64 bits and write it as 64 bits to [rsi]. The instruction
# sequence is the same as in transpose_uint64_uint64_sse4; only the symbol
# and label names differ.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int64_uint64_sse4: # @transpose_int64_uint64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB55_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB55_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov rdx, qword ptr [rdi]  # rdx = index: source element 0, full 64 bits
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    mov rdx, qword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    mov rdx, qword ptr [rdi + 16]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    mov rdx, qword ptr [rdi + 24]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 32  # 4 source elements * 8 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB55_5  # more than 7 then means at least 4 are left
.LBB55_1:
    test edx, edx
    jle .LBB55_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: byte offset shared by source and destination
    .p2align 4, 0x90
.LBB55_3: # =>This Inner Loop Header: Depth=1
    mov rax, qword ptr [rdi + r8]  # rax = index, full 64 bits
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + r8], rax
    add r8, 8  # both sides use 8-byte elements
    add edx, -1
    cmp edx, 1
    jg .LBB55_3  # one iteration per remaining element
.LBB55_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end55:
    .size transpose_int64_uint64_sse4, .Lfunc_end55-transpose_int64_uint64_sse4
    # -- End function
    .globl transpose_uint8_int64_sse4 # -- Begin function transpose_uint8_int64_sse4
    .p2align 4, 0x90
    .type transpose_uint8_int64_sse4,@function
# transpose_uint8_int64_sse4
# For each of the edx elements: read an 8-bit source value from [rdi]
# (zero-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint8_int64_sse4: # @transpose_uint8_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB56_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB56_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movzx edx, byte ptr [rdi]  # rdx = index: source element 0, zero-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movzx edx, byte ptr [rdi + 1]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movzx edx, byte ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movzx edx, byte ptr [rdi + 3]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 4  # 4 source elements * 1 byte
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB56_5  # more than 7 then means at least 4 are left
.LBB56_1:
    test edx, edx
    jle .LBB56_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: element index; the destination offset is 8*r8
    .p2align 4, 0x90
.LBB56_3: # =>This Inner Loop Header: Depth=1
    movzx eax, byte ptr [rdi + r8]  # rax = index, zero-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 8*r8], rax  # 8-byte destination element
    add r8, 1  # one 1-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB56_3  # one iteration per remaining element
.LBB56_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end56:
    .size transpose_uint8_int64_sse4, .Lfunc_end56-transpose_uint8_int64_sse4
    # -- End function
    .globl transpose_int8_int64_sse4 # -- Begin function transpose_int8_int64_sse4
    .p2align 4, 0x90
    .type transpose_int8_int64_sse4,@function
# transpose_int8_int64_sse4
# For each of the edx elements: read an 8-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int8_int64_sse4: # @transpose_int8_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB57_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB57_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsx rdx, byte ptr [rdi]  # rdx = index: source element 0, sign-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movsx rdx, byte ptr [rdi + 1]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movsx rdx, byte ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movsx rdx, byte ptr [rdi + 3]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 4  # 4 source elements * 1 byte
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB57_5  # more than 7 then means at least 4 are left
.LBB57_1:
    test edx, edx
    jle .LBB57_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: element index; the destination offset is 8*r8
    .p2align 4, 0x90
.LBB57_3: # =>This Inner Loop Header: Depth=1
    movsx rax, byte ptr [rdi + r8]  # rax = index, sign-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 8*r8], rax  # 8-byte destination element
    add r8, 1  # one 1-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB57_3  # one iteration per remaining element
.LBB57_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end57:
    .size transpose_int8_int64_sse4, .Lfunc_end57-transpose_int8_int64_sse4
    # -- End function
    .globl transpose_uint16_int64_sse4 # -- Begin function transpose_uint16_int64_sse4
    .p2align 4, 0x90
    .type transpose_uint16_int64_sse4,@function
# transpose_uint16_int64_sse4
# For each of the edx elements: read a 16-bit source value from [rdi]
# (zero-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint16_int64_sse4: # @transpose_uint16_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB58_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB58_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movzx edx, word ptr [rdi]  # rdx = index: source element 0, zero-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movzx edx, word ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movzx edx, word ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movzx edx, word ptr [rdi + 6]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 8  # 4 source elements * 2 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB58_5  # more than 7 then means at least 4 are left
.LBB58_1:
    test edx, edx
    jle .LBB58_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 4*r8
    .p2align 4, 0x90
.LBB58_3: # =>This Inner Loop Header: Depth=1
    movzx eax, word ptr [rdi + r8]  # rax = index, zero-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 4*r8], rax  # 8-byte destination element = 4 * (2-byte step)
    add r8, 2  # one 2-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB58_3  # one iteration per remaining element
.LBB58_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end58:
    .size transpose_uint16_int64_sse4, .Lfunc_end58-transpose_uint16_int64_sse4
    # -- End function
    .globl transpose_int16_int64_sse4 # -- Begin function transpose_int16_int64_sse4
    .p2align 4, 0x90
    .type transpose_int16_int64_sse4,@function
# transpose_int16_int64_sse4
# For each of the edx elements: read a 16-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int16_int64_sse4: # @transpose_int16_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB59_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB59_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsx rdx, word ptr [rdi]  # rdx = index: source element 0, sign-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movsx rdx, word ptr [rdi + 2]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movsx rdx, word ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movsx rdx, word ptr [rdi + 6]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 8  # 4 source elements * 2 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB59_5  # more than 7 then means at least 4 are left
.LBB59_1:
    test edx, edx
    jle .LBB59_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 4*r8
    .p2align 4, 0x90
.LBB59_3: # =>This Inner Loop Header: Depth=1
    movsx rax, word ptr [rdi + r8]  # rax = index, sign-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 4*r8], rax  # 8-byte destination element = 4 * (2-byte step)
    add r8, 2  # one 2-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB59_3  # one iteration per remaining element
.LBB59_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end59:
    .size transpose_int16_int64_sse4, .Lfunc_end59-transpose_int16_int64_sse4
    # -- End function
    .globl transpose_uint32_int64_sse4 # -- Begin function transpose_uint32_int64_sse4
    .p2align 4, 0x90
    .type transpose_uint32_int64_sse4,@function
# transpose_uint32_int64_sse4
# For each of the edx elements: read a 32-bit source value from [rdi]
# (zero-extended by the 32-bit register write), use it as an index into the
# table of 32-bit entries at rcx, sign-extend that entry to 64 bits and write
# it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint32_int64_sse4: # @transpose_uint32_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB60_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB60_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov edx, dword ptr [rdi]  # rdx = index: source element 0, upper half zeroed
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    mov edx, dword ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    mov edx, dword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    mov edx, dword ptr [rdi + 12]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 16  # 4 source elements * 4 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB60_5  # more than 7 then means at least 4 are left
.LBB60_1:
    test edx, edx
    jle .LBB60_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 2*r8
    .p2align 4, 0x90
.LBB60_3: # =>This Inner Loop Header: Depth=1
    mov eax, dword ptr [rdi + r8]  # rax = index, upper half zeroed
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 2*r8], rax  # 8-byte destination element = 2 * (4-byte step)
    add r8, 4  # one 4-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB60_3  # one iteration per remaining element
.LBB60_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end60:
    .size transpose_uint32_int64_sse4, .Lfunc_end60-transpose_uint32_int64_sse4
    # -- End function
    .globl transpose_int32_int64_sse4 # -- Begin function transpose_int32_int64_sse4
    .p2align 4, 0x90
    .type transpose_int32_int64_sse4,@function
# transpose_int32_int64_sse4
# For each of the edx elements: read a 32-bit source value from [rdi]
# (sign-extended), use it as an index into the table of 32-bit entries at
# rcx, sign-extend that entry to 64 bits and write it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int32_int64_sse4: # @transpose_int32_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB61_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB61_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    movsxd rdx, dword ptr [rdi]  # rdx = index: source element 0, sign-extended
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    movsxd rdx, dword ptr [rdi + 4]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    movsxd rdx, dword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    movsxd rdx, dword ptr [rdi + 12]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 16  # 4 source elements * 4 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB61_5  # more than 7 then means at least 4 are left
.LBB61_1:
    test edx, edx
    jle .LBB61_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: source byte offset; the destination offset is 2*r8
    .p2align 4, 0x90
.LBB61_3: # =>This Inner Loop Header: Depth=1
    movsxd rax, dword ptr [rdi + r8]  # rax = index, sign-extended
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + 2*r8], rax  # 8-byte destination element = 2 * (4-byte step)
    add r8, 4  # one 4-byte source element
    add edx, -1
    cmp edx, 1
    jg .LBB61_3  # one iteration per remaining element
.LBB61_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end61:
    .size transpose_int32_int64_sse4, .Lfunc_end61-transpose_int32_int64_sse4
    # -- End function
    .globl transpose_uint64_int64_sse4 # -- Begin function transpose_uint64_int64_sse4
    .p2align 4, 0x90
    .type transpose_uint64_int64_sse4,@function
# transpose_uint64_int64_sse4
# For each of the edx elements: read a 64-bit source value from [rdi], use it
# unmodified as an index into the table of 32-bit entries at rcx, sign-extend
# that entry to 64 bits and write it as 64 bits to [rsi].
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
transpose_uint64_int64_sse4: # @transpose_uint64_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB62_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB62_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov rdx, qword ptr [rdi]  # rdx = index: source element 0, full 64 bits
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    mov rdx, qword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    mov rdx, qword ptr [rdi + 16]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    mov rdx, qword ptr [rdi + 24]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 32  # 4 source elements * 8 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB62_5  # more than 7 then means at least 4 are left
.LBB62_1:
    test edx, edx
    jle .LBB62_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: byte offset shared by source and destination
    .p2align 4, 0x90
.LBB62_3: # =>This Inner Loop Header: Depth=1
    mov rax, qword ptr [rdi + r8]  # rax = index, full 64 bits
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + r8], rax
    add r8, 8  # both sides use 8-byte elements
    add edx, -1
    cmp edx, 1
    jg .LBB62_3  # one iteration per remaining element
.LBB62_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end62:
    .size transpose_uint64_int64_sse4, .Lfunc_end62-transpose_uint64_int64_sse4
    # -- End function
    .globl transpose_int64_int64_sse4 # -- Begin function transpose_int64_int64_sse4
    .p2align 4, 0x90
    .type transpose_int64_int64_sse4,@function
# transpose_int64_int64_sse4
# For each of the edx elements: read a 64-bit source value from [rdi], use it
# unmodified as an index into the table of 32-bit entries at rcx, sign-extend
# that entry to 64 bits and write it as 64 bits to [rsi]. The instruction
# sequence is the same as in transpose_uint64_int64_sse4; only the symbol and
# label names differ.
# Register use: rdi = source, rsi = destination, edx = element count (tested
# as signed 32-bit), rcx = table base; rax, rdx and r8 are scratch.
# NOTE(review): parameter meaning is inferred from register use alone; the C
# prototype is not visible here -- confirm against transpose_ints.c.
# NOTE(review): a negative source value indexes below rcx; presumably callers
# never pass one -- confirm.
transpose_int64_int64_sse4: # @transpose_int64_int64_sse4
# %bb.0:
    push rbp
    mov rbp, rsp
    and rsp, -8  # round rsp down to a multiple of 8; rbp restores it on exit
    cmp edx, 4
    jl .LBB63_1  # fewer than 4 elements: go straight to the one-at-a-time loop
    # Unrolled loop: 4 elements per iteration.
    .p2align 4, 0x90
.LBB63_5: # =>This Inner Loop Header: Depth=1
    mov eax, edx  # eax = elements left; edx is reused as scratch below
    mov rdx, qword ptr [rdi]  # rdx = index: source element 0, full 64 bits
    movsxd rdx, dword ptr [rcx + 4*rdx]  # rdx = table entry, sign-extended to 64 bits
    mov qword ptr [rsi], rdx  # destination element 0
    # Elements 1..3: same three steps.
    mov rdx, qword ptr [rdi + 8]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 8], rdx
    mov rdx, qword ptr [rdi + 16]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 16], rdx
    mov rdx, qword ptr [rdi + 24]
    movsxd rdx, dword ptr [rcx + 4*rdx]
    mov qword ptr [rsi + 24], rdx
    lea edx, [rax - 4]  # edx = elements left after this group of 4
    add rdi, 32  # 4 source elements * 8 bytes
    add rsi, 32  # 4 destination elements * 8 bytes
    cmp eax, 7  # eax is still the count from before this group
    jg .LBB63_5  # more than 7 then means at least 4 are left
.LBB63_1:
    test edx, edx
    jle .LBB63_4  # nothing left, or the count was <= 0
# %bb.2:
    add edx, 1  # bias by 1 to match the "cmp edx, 1" exit test below
    xor r8d, r8d  # r8 = 0: byte offset shared by source and destination
    .p2align 4, 0x90
.LBB63_3: # =>This Inner Loop Header: Depth=1
    mov rax, qword ptr [rdi + r8]  # rax = index, full 64 bits
    movsxd rax, dword ptr [rcx + 4*rax]  # rax = table entry, sign-extended
    mov qword ptr [rsi + r8], rax
    add r8, 8  # both sides use 8-byte elements
    add edx, -1
    cmp edx, 1
    jg .LBB63_3  # one iteration per remaining element
.LBB63_4:
    mov rsp, rbp  # undo the alignment done in the prologue
    pop rbp
    ret
.Lfunc_end63:
    .size transpose_int64_int64_sse4, .Lfunc_end63-transpose_int64_int64_sse4
    # -- End function
    .ident "Ubuntu clang version 11.0.0-2~ubuntu20.04.1"
    .section ".note.GNU-stack","",@progbits
    .addrsig