github.com/apache/arrow/go/v14@v14.0.2/internal/utils/transpose_ints_sse4_amd64.s (about)

     1  //+build !noasm !appengine
     2  // AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
     3  
     4  TEXT ·_transpose_uint8_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
     5  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are bytes, zero-extended (movzx) before indexing. Scalar code only; no SSE instructions appear despite the name.
     6  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
     7  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
     8  	MOVQ length+16(FP), DX       // DX (rdx) = element count
     9  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
    10  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
    11  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
    12  	JL   LBB0_1
    13  // Unrolled loop, 4 elements per pass: eax keeps the count on entry to the pass while edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
    14  LBB0_5:
    15  	WORD $0xd089             // mov    eax, edx
    16  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
    17  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
    18  	WORD $0x1688             // mov    byte [rsi], dl
    19  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
    20  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
    21  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
    22  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
    23  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
    24  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
    25  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
    26  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
    27  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
    28  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
    29  	LONG $0x04c78348         // add    rdi, 4
    30  	LONG $0x04c68348         // add    rsi, 4
    31  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
    32  	JG   LBB0_5
    33  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
    34  LBB0_1:
    35  	WORD $0xd285             // test    edx, edx
    36  	JLE  LBB0_4
    37  	WORD $0xc283; BYTE $0x01 // add    edx, 1
    38  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
    39  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
    40  LBB0_3:
    41  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
    42  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
    43  	LONG $0x06048842             // mov    byte [rsi + r8], al
    44  	LONG $0x01c08349             // add    r8, 1
    45  	WORD $0xc283; BYTE $0xff     // add    edx, -1
    46  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
    47  	JG   LBB0_3
    48  // Common exit; no result slot is written, the output is the bytes stored through dest.
    49  LBB0_4:
    50  	RET
    51  
    52  TEXT ·_transpose_int8_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
    53  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are bytes, sign-extended (movsx) before indexing, so a byte >= 0x80 addresses memory below transposeMap (presumably callers rule that out; confirm). Scalar code only; no SSE instructions appear despite the name.
    54  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
    55  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
    56  	MOVQ length+16(FP), DX       // DX (rdx) = element count
    57  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
    58  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
    59  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
    60  	JL   LBB1_1
    61  // Unrolled loop, 4 elements per pass: eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
    62  LBB1_5:
    63  	WORD $0xd089                 // mov    eax, edx
    64  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
    65  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
    66  	WORD $0x1688                 // mov    byte [rsi], dl
    67  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
    68  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
    69  	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
    70  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
    71  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
    72  	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
    73  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
    74  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
    75  	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
    76  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
    77  	LONG $0x04c78348             // add    rdi, 4
    78  	LONG $0x04c68348             // add    rsi, 4
    79  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
    80  	JG   LBB1_5
    81  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
    82  LBB1_1:
    83  	WORD $0xd285             // test    edx, edx
    84  	JLE  LBB1_4
    85  	WORD $0xc283; BYTE $0x01 // add    edx, 1
    86  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
    87  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
    88  LBB1_3:
    89  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
    90  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
    91  	LONG $0x06048842             // mov    byte [rsi + r8], al
    92  	LONG $0x01c08349             // add    r8, 1
    93  	WORD $0xc283; BYTE $0xff     // add    edx, -1
    94  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
    95  	JG   LBB1_3
    96  // Common exit; no result slot is written, the output is the bytes stored through dest.
    97  LBB1_4:
    98  	RET
    99  
   100  TEXT ·_transpose_uint16_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   101  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 16-bit words, zero-extended (movzx) before indexing. Scalar code only; no SSE instructions appear despite the name.
   102  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   103  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   104  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   105  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   106  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   107  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   108  	JL   LBB2_1
   109  // Unrolled loop, 4 elements per pass (8 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   110  LBB2_5:
   111  	WORD $0xd089             // mov    eax, edx
   112  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
   113  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   114  	WORD $0x1688             // mov    byte [rsi], dl
   115  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
   116  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   117  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   118  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
   119  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   120  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   121  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
   122  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   123  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   124  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   125  	LONG $0x08c78348         // add    rdi, 8
   126  	LONG $0x04c68348         // add    rsi, 4
   127  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   128  	JG   LBB2_5
   129  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   130  LBB2_1:
   131  	WORD $0xd285             // test    edx, edx
   132  	JLE  LBB2_4
   133  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   134  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   135  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   136  LBB2_3:
   137  	LONG $0x04b70f42; BYTE $0x47 // movzx    eax, word [rdi + 2*r8]
   138  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
   139  	LONG $0x06048842             // mov    byte [rsi + r8], al
   140  	LONG $0x01c08349             // add    r8, 1
   141  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   142  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   143  	JG   LBB2_3
   144  // Common exit; no result slot is written, the output is the bytes stored through dest.
   145  LBB2_4:
   146  	RET
   147  
   148  TEXT ·_transpose_int16_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   149  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 16-bit words, sign-extended (movsx) before indexing, so a negative word addresses memory below transposeMap (presumably callers rule that out; confirm). Scalar code only; no SSE instructions appear despite the name.
   150  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   151  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   152  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   153  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   154  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   155  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   156  	JL   LBB3_1
   157  // Unrolled loop, 4 elements per pass (8 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   158  LBB3_5:
   159  	WORD $0xd089                 // mov    eax, edx
   160  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
   161  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   162  	WORD $0x1688                 // mov    byte [rsi], dl
   163  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
   164  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   165  	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
   166  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
   167  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   168  	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
   169  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
   170  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   171  	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
   172  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
   173  	LONG $0x08c78348             // add    rdi, 8
   174  	LONG $0x04c68348             // add    rsi, 4
   175  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
   176  	JG   LBB3_5
   177  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   178  LBB3_1:
   179  	WORD $0xd285             // test    edx, edx
   180  	JLE  LBB3_4
   181  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   182  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   183  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   184  LBB3_3:
   185  	LONG $0x04bf0f4a; BYTE $0x47 // movsx    rax, word [rdi + 2*r8]
   186  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
   187  	LONG $0x06048842             // mov    byte [rsi + r8], al
   188  	LONG $0x01c08349             // add    r8, 1
   189  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   190  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   191  	JG   LBB3_3
   192  // Common exit; no result slot is written, the output is the bytes stored through dest.
   193  LBB3_4:
   194  	RET
   195  
   196  TEXT ·_transpose_uint32_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   197  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 32-bit dwords; the 32-bit mov zero-extends them before indexing. Scalar code only; no SSE instructions appear despite the name.
   198  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   199  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   200  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   201  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   202  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   203  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   204  	JL   LBB4_1
   205  // Unrolled loop, 4 elements per pass (16 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   206  LBB4_5:
   207  	WORD $0xd089             // mov    eax, edx
   208  	WORD $0x178b             // mov    edx, dword [rdi]
   209  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   210  	WORD $0x1688             // mov    byte [rsi], dl
   211  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
   212  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   213  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   214  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
   215  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   216  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   217  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
   218  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   219  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   220  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   221  	LONG $0x10c78348         // add    rdi, 16
   222  	LONG $0x04c68348         // add    rsi, 4
   223  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   224  	JG   LBB4_5
   225  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   226  LBB4_1:
   227  	WORD $0xd285             // test    edx, edx
   228  	JLE  LBB4_4
   229  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   230  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   231  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   232  LBB4_3:
   233  	LONG $0x87048b42         // mov    eax, dword [rdi + 4*r8]
   234  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   235  	LONG $0x06048842         // mov    byte [rsi + r8], al
   236  	LONG $0x01c08349         // add    r8, 1
   237  	WORD $0xc283; BYTE $0xff // add    edx, -1
   238  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   239  	JG   LBB4_3
   240  // Common exit; no result slot is written, the output is the bytes stored through dest.
   241  LBB4_4:
   242  	RET
   243  
   244  TEXT ·_transpose_int32_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   245  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 32-bit dwords, sign-extended (movsxd) before indexing, so a negative dword addresses memory below transposeMap (presumably callers rule that out; confirm). Scalar code only; no SSE instructions appear despite the name.
   246  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   247  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   248  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   249  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   250  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   251  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   252  	JL   LBB5_1
   253  // Unrolled loop, 4 elements per pass (16 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   254  LBB5_5:
   255  	WORD $0xd089             // mov    eax, edx
   256  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
   257  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   258  	WORD $0x1688             // mov    byte [rsi], dl
   259  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
   260  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   261  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   262  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
   263  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   264  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   265  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
   266  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   267  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   268  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   269  	LONG $0x10c78348         // add    rdi, 16
   270  	LONG $0x04c68348         // add    rsi, 4
   271  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   272  	JG   LBB5_5
   273  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   274  LBB5_1:
   275  	WORD $0xd285             // test    edx, edx
   276  	JLE  LBB5_4
   277  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   278  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   279  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   280  LBB5_3:
   281  	LONG $0x8704634a         // movsxd    rax, dword [rdi + 4*r8]
   282  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   283  	LONG $0x06048842         // mov    byte [rsi + r8], al
   284  	LONG $0x01c08349         // add    r8, 1
   285  	WORD $0xc283; BYTE $0xff // add    edx, -1
   286  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   287  	JG   LBB5_3
   288  // Common exit; no result slot is written, the output is the bytes stored through dest.
   289  LBB5_4:
   290  	RET
   291  
   292  TEXT ·_transpose_uint64_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   293  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 64-bit qwords, used as the index exactly as loaded. Scalar code only; no SSE instructions appear despite the name.
   294  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   295  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   296  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   297  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   298  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   299  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   300  	JL   LBB6_1
   301  // Unrolled loop, 4 elements per pass (32 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   302  LBB6_5:
   303  	WORD $0xd089             // mov    eax, edx
   304  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
   305  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   306  	WORD $0x1688             // mov    byte [rsi], dl
   307  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
   308  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   309  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   310  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
   311  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   312  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   313  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
   314  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   315  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   316  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   317  	LONG $0x20c78348         // add    rdi, 32
   318  	LONG $0x04c68348         // add    rsi, 4
   319  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   320  	JG   LBB6_5
   321  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   322  LBB6_1:
   323  	WORD $0xd285             // test    edx, edx
   324  	JLE  LBB6_4
   325  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   326  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   327  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   328  LBB6_3:
   329  	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
   330  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   331  	LONG $0x06048842         // mov    byte [rsi + r8], al
   332  	LONG $0x01c08349         // add    r8, 1
   333  	WORD $0xc283; BYTE $0xff // add    edx, -1
   334  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   335  	JG   LBB6_3
   336  // Common exit; no result slot is written, the output is the bytes stored through dest.
   337  LBB6_4:
   338  	RET
   339  
   340  TEXT ·_transpose_int64_uint8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   341  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 64-bit qwords, used as the index exactly as loaded (a plain 64-bit mov, no extension step). Scalar code only; no SSE instructions appear despite the name.
   342  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   343  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   344  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   345  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   346  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   347  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   348  	JL   LBB7_1
   349  // Unrolled loop, 4 elements per pass (32 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   350  LBB7_5:
   351  	WORD $0xd089             // mov    eax, edx
   352  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
   353  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   354  	WORD $0x1688             // mov    byte [rsi], dl
   355  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
   356  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   357  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   358  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
   359  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   360  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   361  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
   362  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   363  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   364  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   365  	LONG $0x20c78348         // add    rdi, 32
   366  	LONG $0x04c68348         // add    rsi, 4
   367  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   368  	JG   LBB7_5
   369  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   370  LBB7_1:
   371  	WORD $0xd285             // test    edx, edx
   372  	JLE  LBB7_4
   373  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   374  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   375  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   376  LBB7_3:
   377  	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
   378  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   379  	LONG $0x06048842         // mov    byte [rsi + r8], al
   380  	LONG $0x01c08349         // add    r8, 1
   381  	WORD $0xc283; BYTE $0xff // add    edx, -1
   382  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   383  	JG   LBB7_3
   384  // Common exit; no result slot is written, the output is the bytes stored through dest.
   385  LBB7_4:
   386  	RET
   387  
   388  TEXT ·_transpose_uint8_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   389  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are bytes, zero-extended (movzx) before indexing. Scalar code only; no SSE instructions appear despite the name.
   390  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   391  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   392  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   393  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   394  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   395  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   396  	JL   LBB8_1
   397  // Unrolled loop, 4 elements per pass: eax keeps the count on entry to the pass while edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   398  LBB8_5:
   399  	WORD $0xd089             // mov    eax, edx
   400  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
   401  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   402  	WORD $0x1688             // mov    byte [rsi], dl
   403  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
   404  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   405  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   406  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
   407  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   408  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   409  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
   410  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   411  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   412  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   413  	LONG $0x04c78348         // add    rdi, 4
   414  	LONG $0x04c68348         // add    rsi, 4
   415  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   416  	JG   LBB8_5
   417  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   418  LBB8_1:
   419  	WORD $0xd285             // test    edx, edx
   420  	JLE  LBB8_4
   421  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   422  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   423  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   424  LBB8_3:
   425  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
   426  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
   427  	LONG $0x06048842             // mov    byte [rsi + r8], al
   428  	LONG $0x01c08349             // add    r8, 1
   429  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   430  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   431  	JG   LBB8_3
   432  // Common exit; no result slot is written, the output is the bytes stored through dest.
   433  LBB8_4:
   434  	RET
   435  
   436  TEXT ·_transpose_int8_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   437  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are bytes, sign-extended (movsx) before indexing, so a byte >= 0x80 addresses memory below transposeMap (presumably callers rule that out; confirm). Scalar code only; no SSE instructions appear despite the name.
   438  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   439  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   440  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   441  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   442  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   443  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   444  	JL   LBB9_1
   445  // Unrolled loop, 4 elements per pass: eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   446  LBB9_5:
   447  	WORD $0xd089                 // mov    eax, edx
   448  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
   449  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   450  	WORD $0x1688                 // mov    byte [rsi], dl
   451  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
   452  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   453  	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
   454  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
   455  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   456  	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
   457  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
   458  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   459  	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
   460  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
   461  	LONG $0x04c78348             // add    rdi, 4
   462  	LONG $0x04c68348             // add    rsi, 4
   463  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
   464  	JG   LBB9_5
   465  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   466  LBB9_1:
   467  	WORD $0xd285             // test    edx, edx
   468  	JLE  LBB9_4
   469  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   470  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   471  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   472  LBB9_3:
   473  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
   474  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
   475  	LONG $0x06048842             // mov    byte [rsi + r8], al
   476  	LONG $0x01c08349             // add    r8, 1
   477  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   478  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   479  	JG   LBB9_3
   480  // Common exit; no result slot is written, the output is the bytes stored through dest.
   481  LBB9_4:
   482  	RET
   483  
   484  TEXT ·_transpose_uint16_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   485  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 16-bit words, zero-extended (movzx) before indexing. Scalar code only; no SSE instructions appear despite the name.
   486  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   487  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   488  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   489  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   490  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   491  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   492  	JL   LBB10_1
   493  // Unrolled loop, 4 elements per pass (8 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   494  LBB10_5:
   495  	WORD $0xd089             // mov    eax, edx
   496  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
   497  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   498  	WORD $0x1688             // mov    byte [rsi], dl
   499  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
   500  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   501  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   502  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
   503  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   504  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   505  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
   506  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   507  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   508  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   509  	LONG $0x08c78348         // add    rdi, 8
   510  	LONG $0x04c68348         // add    rsi, 4
   511  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   512  	JG   LBB10_5
   513  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   514  LBB10_1:
   515  	WORD $0xd285             // test    edx, edx
   516  	JLE  LBB10_4
   517  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   518  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   519  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   520  LBB10_3:
   521  	LONG $0x04b70f42; BYTE $0x47 // movzx    eax, word [rdi + 2*r8]
   522  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
   523  	LONG $0x06048842             // mov    byte [rsi + r8], al
   524  	LONG $0x01c08349             // add    r8, 1
   525  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   526  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   527  	JG   LBB10_3
   528  // Common exit; no result slot is written, the output is the bytes stored through dest.
   529  LBB10_4:
   530  	RET
   531  
   532  TEXT ·_transpose_int16_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   533  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 16-bit words, sign-extended (movsx) before indexing, so a negative word addresses memory below transposeMap (presumably callers rule that out; confirm). Scalar code only; no SSE instructions appear despite the name.
   534  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   535  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   536  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   537  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   538  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   539  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   540  	JL   LBB11_1
   541  // Unrolled loop, 4 elements per pass (8 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   542  LBB11_5:
   543  	WORD $0xd089                 // mov    eax, edx
   544  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
   545  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   546  	WORD $0x1688                 // mov    byte [rsi], dl
   547  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
   548  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   549  	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
   550  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
   551  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   552  	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
   553  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
   554  	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
   555  	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
   556  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
   557  	LONG $0x08c78348             // add    rdi, 8
   558  	LONG $0x04c68348             // add    rsi, 4
   559  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
   560  	JG   LBB11_5
   561  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   562  LBB11_1:
   563  	WORD $0xd285             // test    edx, edx
   564  	JLE  LBB11_4
   565  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   566  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   567  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   568  LBB11_3:
   569  	LONG $0x04bf0f4a; BYTE $0x47 // movsx    rax, word [rdi + 2*r8]
   570  	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
   571  	LONG $0x06048842             // mov    byte [rsi + r8], al
   572  	LONG $0x01c08349             // add    r8, 1
   573  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   574  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   575  	JG   LBB11_3
   576  // Common exit; no result slot is written, the output is the bytes stored through dest.
   577  LBB11_4:
   578  	RET
   579  
   580  TEXT ·_transpose_uint32_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   581  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 32-bit dwords; the 32-bit mov zero-extends them before indexing. Scalar code only; no SSE instructions appear despite the name.
   582  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   583  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   584  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   585  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   586  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   587  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   588  	JL   LBB12_1
   589  // Unrolled loop, 4 elements per pass (16 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   590  LBB12_5:
   591  	WORD $0xd089             // mov    eax, edx
   592  	WORD $0x178b             // mov    edx, dword [rdi]
   593  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   594  	WORD $0x1688             // mov    byte [rsi], dl
   595  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
   596  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   597  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   598  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
   599  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   600  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   601  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
   602  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   603  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   604  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   605  	LONG $0x10c78348         // add    rdi, 16
   606  	LONG $0x04c68348         // add    rsi, 4
   607  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   608  	JG   LBB12_5
   609  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   610  LBB12_1:
   611  	WORD $0xd285             // test    edx, edx
   612  	JLE  LBB12_4
   613  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   614  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   615  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   616  LBB12_3:
   617  	LONG $0x87048b42         // mov    eax, dword [rdi + 4*r8]
   618  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   619  	LONG $0x06048842         // mov    byte [rsi + r8], al
   620  	LONG $0x01c08349         // add    r8, 1
   621  	WORD $0xc283; BYTE $0xff // add    edx, -1
   622  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   623  	JG   LBB12_3
   624  // Common exit; no result slot is written, the output is the bytes stored through dest.
   625  LBB12_4:
   626  	RET
   627  
   628  TEXT ·_transpose_int32_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   629  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 32-bit dwords, sign-extended (movsxd) before indexing, so a negative dword addresses memory below transposeMap (presumably callers rule that out; confirm). Scalar code only; no SSE instructions appear despite the name.
   630  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   631  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   632  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   633  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   634  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   635  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   636  	JL   LBB13_1
   637  // Unrolled loop, 4 elements per pass (16 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   638  LBB13_5:
   639  	WORD $0xd089             // mov    eax, edx
   640  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
   641  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   642  	WORD $0x1688             // mov    byte [rsi], dl
   643  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
   644  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   645  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   646  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
   647  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   648  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   649  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
   650  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   651  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   652  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   653  	LONG $0x10c78348         // add    rdi, 16
   654  	LONG $0x04c68348         // add    rsi, 4
   655  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   656  	JG   LBB13_5
   657  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   658  LBB13_1:
   659  	WORD $0xd285             // test    edx, edx
   660  	JLE  LBB13_4
   661  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   662  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   663  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   664  LBB13_3:
   665  	LONG $0x8704634a         // movsxd    rax, dword [rdi + 4*r8]
   666  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   667  	LONG $0x06048842         // mov    byte [rsi + r8], al
   668  	LONG $0x01c08349         // add    r8, 1
   669  	WORD $0xc283; BYTE $0xff // add    edx, -1
   670  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   671  	JG   LBB13_3
   672  // Common exit; no result slot is written, the output is the bytes stored through dest.
   673  LBB13_4:
   674  	RET
   675  
   676  TEXT ·_transpose_uint64_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   677  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 64-bit qwords, used as the index exactly as loaded. Scalar code only; no SSE instructions appear despite the name.
   678  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   679  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   680  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   681  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   682  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   683  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   684  	JL   LBB14_1
   685  // Unrolled loop, 4 elements per pass (32 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   686  LBB14_5:
   687  	WORD $0xd089             // mov    eax, edx
   688  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
   689  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   690  	WORD $0x1688             // mov    byte [rsi], dl
   691  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
   692  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   693  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   694  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
   695  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   696  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   697  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
   698  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   699  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   700  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   701  	LONG $0x20c78348         // add    rdi, 32
   702  	LONG $0x04c68348         // add    rsi, 4
   703  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   704  	JG   LBB14_5
   705  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   706  LBB14_1:
   707  	WORD $0xd285             // test    edx, edx
   708  	JLE  LBB14_4
   709  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   710  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   711  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   712  LBB14_3:
   713  	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
   714  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   715  	LONG $0x06048842         // mov    byte [rsi + r8], al
   716  	LONG $0x01c08349         // add    r8, 1
   717  	WORD $0xc283; BYTE $0xff // add    edx, -1
   718  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   719  	JG   LBB14_3
   720  // Common exit; no result slot is written, the output is the bytes stored through dest.
   721  LBB14_4:
   722  	RET
   723  
   724  TEXT ·_transpose_int64_int8_sse4(SB), $0-32 // frame size 0, 32 bytes of arguments
   725  // For i in [0, length): dest[i] (1 byte) = low byte of the 4-byte transposeMap entry at index src[i]. src elements are 64-bit qwords, used as the index exactly as loaded (a plain 64-bit mov, no extension step). Scalar code only; no SSE instructions appear despite the name.
   726  	MOVQ src+0(FP), DI           // DI (rdi) = source cursor
   727  	MOVQ dest+8(FP), SI          // SI (rsi) = destination cursor
   728  	MOVQ length+16(FP), DX       // DX (rdx) = element count
   729  	MOVQ transposeMap+24(FP), CX // CX (rcx) = base of the lookup table
   730  // Only the low 32 bits of the count (edx) are tested, as a signed value; with fewer than 4 elements the unrolled loop is skipped.
   731  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   732  	JL   LBB15_1
   733  // Unrolled loop, 4 elements per pass (32 source bytes, 4 destination bytes): eax keeps the count on entry to the pass while rdx/edx is reused as lookup scratch, then edx = eax - 4; the pass repeats while eax > 7 (at least 4 elements still left).
   734  LBB15_5:
   735  	WORD $0xd089             // mov    eax, edx
   736  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
   737  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   738  	WORD $0x1688             // mov    byte [rsi], dl
   739  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
   740  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   741  	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
   742  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
   743  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   744  	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
   745  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
   746  	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
   747  	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
   748  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   749  	LONG $0x20c78348         // add    rdi, 32
   750  	LONG $0x04c68348         // add    rsi, 4
   751  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   752  	JG   LBB15_5
   753  // Tail: edx = elements left (always below 4 here); skip to the exit when it is <= 0.
   754  LBB15_1:
   755  	WORD $0xd285             // test    edx, edx
   756  	JLE  LBB15_4
   757  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   758  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   759  // One element per pass, element index in r8; edx was biased by +1 above, so the loop stops once it has counted down to 1.
   760  LBB15_3:
   761  	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
   762  	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
   763  	LONG $0x06048842         // mov    byte [rsi + r8], al
   764  	LONG $0x01c08349         // add    r8, 1
   765  	WORD $0xc283; BYTE $0xff // add    edx, -1
   766  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
   767  	JG   LBB15_3
   768  // Common exit; no result slot is written, the output is the bytes stored through dest.
   769  LBB15_4:
   770  	RET
   771  
   772  TEXT ·_transpose_uint8_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a zero-extended byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
   773  
   774  	MOVQ src+0(FP), DI
   775  	MOVQ dest+8(FP), SI
   776  	MOVQ length+16(FP), DX
   777  	MOVQ transposeMap+24(FP), CX
   778  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
   779  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   780  	JL   LBB16_1
   781  
   782  LBB16_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
   783  	WORD $0xd089             // mov    eax, edx
   784  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
   785  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   786  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
   787  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
   788  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   789  	LONG $0x02568966         // mov    word [rsi + 2], dx
   790  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
   791  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   792  	LONG $0x04568966         // mov    word [rsi + 4], dx
   793  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
   794  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   795  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*1 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
   796  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   797  	LONG $0x04c78348         // add    rdi, 4
   798  	LONG $0x08c68348         // add    rsi, 8
   799  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   800  	JG   LBB16_5
   801  
   802  LBB16_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
   803  	WORD $0xd285             // test    edx, edx
   804  	JLE  LBB16_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
   805  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   806  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   807  
   808  LBB16_3:
        	// One element per pass. R8 counts elements: src is addressed at R8,
        	// dest at 2*R8.
   809  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
   810  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
   811  	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
   812  	LONG $0x01c08349             // add    r8, 1
   813  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   814  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   815  	JG   LBB16_3
   816  
   817  LBB16_4:
   818  	RET
   819  
   820  TEXT ·_transpose_int8_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a sign-extended byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made;
        	// because of the sign extension a negative source byte addresses memory
        	// below transposeMap.
        	// NOTE(review): presumably callers only pass non-negative indices - confirm.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
   821  
   822  	MOVQ src+0(FP), DI
   823  	MOVQ dest+8(FP), SI
   824  	MOVQ length+16(FP), DX
   825  	MOVQ transposeMap+24(FP), CX
   826  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
   827  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   828  	JL   LBB17_1
   829  
   830  LBB17_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
   831  	WORD $0xd089                 // mov    eax, edx
   832  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
   833  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   834  	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
   835  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
   836  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   837  	LONG $0x02568966             // mov    word [rsi + 2], dx
   838  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
   839  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   840  	LONG $0x04568966             // mov    word [rsi + 4], dx
   841  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
   842  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   843  	LONG $0x06568966             // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*1 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
   844  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
   845  	LONG $0x04c78348             // add    rdi, 4
   846  	LONG $0x08c68348             // add    rsi, 8
   847  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
   848  	JG   LBB17_5
   849  
   850  LBB17_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
   851  	WORD $0xd285             // test    edx, edx
   852  	JLE  LBB17_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
   853  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   854  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   855  
   856  LBB17_3:
        	// One element per pass. R8 counts elements: src is addressed at R8,
        	// dest at 2*R8.
   857  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
   858  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
   859  	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
   860  	LONG $0x01c08349             // add    r8, 1
   861  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   862  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   863  	JG   LBB17_3
   864  
   865  LBB17_4:
   866  	RET
   867  
   868  TEXT ·_transpose_uint16_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a zero-extended 2-byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
   869  
   870  	MOVQ src+0(FP), DI
   871  	MOVQ dest+8(FP), SI
   872  	MOVQ length+16(FP), DX
   873  	MOVQ transposeMap+24(FP), CX
   874  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
   875  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   876  	JL   LBB18_1
   877  
   878  LBB18_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
   879  	WORD $0xd089             // mov    eax, edx
   880  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
   881  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   882  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
   883  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
   884  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   885  	LONG $0x02568966         // mov    word [rsi + 2], dx
   886  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
   887  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   888  	LONG $0x04568966         // mov    word [rsi + 4], dx
   889  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
   890  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   891  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src and dest by 4*2 bytes each.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
   892  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   893  	LONG $0x08c78348         // add    rdi, 8
   894  	LONG $0x08c68348         // add    rsi, 8
   895  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   896  	JG   LBB18_5
   897  
   898  LBB18_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
   899  	WORD $0xd285             // test    edx, edx
   900  	JLE  LBB18_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
   901  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   902  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   903  
   904  LBB18_3:
        	// One element per pass. R8 is a byte offset shared by src and dest
        	// (both 2 bytes per element), stepped by 2.
   905  	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
   906  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
   907  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
   908  	LONG $0x02c08349             // add    r8, 2
   909  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   910  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   911  	JG   LBB18_3
   912  
   913  LBB18_4:
   914  	RET
   915  
   916  TEXT ·_transpose_int16_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a sign-extended 2-byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made;
        	// because of the sign extension a negative source value addresses memory
        	// below transposeMap.
        	// NOTE(review): presumably callers only pass non-negative indices - confirm.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
   917  
   918  	MOVQ src+0(FP), DI
   919  	MOVQ dest+8(FP), SI
   920  	MOVQ length+16(FP), DX
   921  	MOVQ transposeMap+24(FP), CX
   922  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
   923  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   924  	JL   LBB19_1
   925  
   926  LBB19_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
   927  	WORD $0xd089                 // mov    eax, edx
   928  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
   929  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   930  	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
   931  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
   932  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   933  	LONG $0x02568966             // mov    word [rsi + 2], dx
   934  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
   935  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   936  	LONG $0x04568966             // mov    word [rsi + 4], dx
   937  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
   938  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
   939  	LONG $0x06568966             // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src and dest by 4*2 bytes each.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
   940  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
   941  	LONG $0x08c78348             // add    rdi, 8
   942  	LONG $0x08c68348             // add    rsi, 8
   943  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
   944  	JG   LBB19_5
   945  
   946  LBB19_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
   947  	WORD $0xd285             // test    edx, edx
   948  	JLE  LBB19_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
   949  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   950  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   951  
   952  LBB19_3:
        	// One element per pass. R8 is a byte offset shared by src and dest
        	// (both 2 bytes per element), stepped by 2.
   953  	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
   954  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
   955  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
   956  	LONG $0x02c08349             // add    r8, 2
   957  	WORD $0xc283; BYTE $0xff     // add    edx, -1
   958  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
   959  	JG   LBB19_3
   960  
   961  LBB19_4:
   962  	RET
   963  
   964  TEXT ·_transpose_uint32_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a 4-byte index from src
        	// (the 32-bit load zero-extends it), load the low 16 bits of the 4-byte
        	// transposeMap entry at that index, and store them as a 2-byte value to
        	// dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
   965  
   966  	MOVQ src+0(FP), DI
   967  	MOVQ dest+8(FP), SI
   968  	MOVQ length+16(FP), DX
   969  	MOVQ transposeMap+24(FP), CX
   970  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
   971  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
   972  	JL   LBB20_1
   973  
   974  LBB20_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
   975  	WORD $0xd089             // mov    eax, edx
   976  	WORD $0x178b             // mov    edx, dword [rdi]
   977  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   978  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
   979  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
   980  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   981  	LONG $0x02568966         // mov    word [rsi + 2], dx
   982  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
   983  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   984  	LONG $0x04568966         // mov    word [rsi + 4], dx
   985  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
   986  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
   987  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*4 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
   988  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
   989  	LONG $0x10c78348         // add    rdi, 16
   990  	LONG $0x08c68348         // add    rsi, 8
   991  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
   992  	JG   LBB20_5
   993  
   994  LBB20_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
   995  	WORD $0xd285             // test    edx, edx
   996  	JLE  LBB20_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
   997  	WORD $0xc283; BYTE $0x01 // add    edx, 1
   998  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
   999  
  1000  LBB20_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 2*R8 (4 bytes per element).
  1001  	LONG $0x47048b42             // mov    eax, dword [rdi + 2*r8]
  1002  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1003  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1004  	LONG $0x02c08349             // add    r8, 2
  1005  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1006  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1007  	JG   LBB20_3
  1008  
  1009  LBB20_4:
  1010  	RET
  1011  
  1012  TEXT ·_transpose_int32_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a sign-extended 4-byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made;
        	// because of the sign extension a negative source value addresses memory
        	// below transposeMap.
        	// NOTE(review): presumably callers only pass non-negative indices - confirm.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1013  
  1014  	MOVQ src+0(FP), DI
  1015  	MOVQ dest+8(FP), SI
  1016  	MOVQ length+16(FP), DX
  1017  	MOVQ transposeMap+24(FP), CX
  1018  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1019  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1020  	JL   LBB21_1
  1021  
  1022  LBB21_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1023  	WORD $0xd089             // mov    eax, edx
  1024  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
  1025  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1026  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1027  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
  1028  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1029  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1030  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
  1031  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1032  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1033  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
  1034  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1035  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*4 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1036  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1037  	LONG $0x10c78348         // add    rdi, 16
  1038  	LONG $0x08c68348         // add    rsi, 8
  1039  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1040  	JG   LBB21_5
  1041  
  1042  LBB21_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1043  	WORD $0xd285             // test    edx, edx
  1044  	JLE  LBB21_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1045  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1046  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1047  
  1048  LBB21_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 2*R8 (4 bytes per element).
  1049  	LONG $0x4704634a             // movsxd    rax, dword [rdi + 2*r8]
  1050  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1051  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1052  	LONG $0x02c08349             // add    r8, 2
  1053  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1054  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1055  	JG   LBB21_3
  1056  
  1057  LBB21_4:
  1058  	RET
  1059  
  1060  TEXT ·_transpose_uint64_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read an 8-byte index from src,
        	// load the low 16 bits of the 4-byte transposeMap entry at that index,
        	// and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1061  
  1062  	MOVQ src+0(FP), DI
  1063  	MOVQ dest+8(FP), SI
  1064  	MOVQ length+16(FP), DX
  1065  	MOVQ transposeMap+24(FP), CX
  1066  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1067  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1068  	JL   LBB22_1
  1069  
  1070  LBB22_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1071  	WORD $0xd089             // mov    eax, edx
  1072  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  1073  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1074  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1075  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  1076  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1077  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1078  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  1079  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1080  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1081  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  1082  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1083  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*8 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1084  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1085  	LONG $0x20c78348         // add    rdi, 32
  1086  	LONG $0x08c68348         // add    rsi, 8
  1087  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1088  	JG   LBB22_5
  1089  
  1090  LBB22_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1091  	WORD $0xd285             // test    edx, edx
  1092  	JLE  LBB22_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1093  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1094  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1095  
  1096  LBB22_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 4*R8 (8 bytes per element).
  1097  	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
  1098  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1099  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1100  	LONG $0x02c08349             // add    r8, 2
  1101  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1102  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1103  	JG   LBB22_3
  1104  
  1105  LBB22_4:
  1106  	RET
  1107  
  1108  TEXT ·_transpose_int64_uint16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read an 8-byte index from src,
        	// load the low 16 bits of the 4-byte transposeMap entry at that index,
        	// and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1109  
  1110  	MOVQ src+0(FP), DI
  1111  	MOVQ dest+8(FP), SI
  1112  	MOVQ length+16(FP), DX
  1113  	MOVQ transposeMap+24(FP), CX
  1114  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1115  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1116  	JL   LBB23_1
  1117  
  1118  LBB23_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1119  	WORD $0xd089             // mov    eax, edx
  1120  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  1121  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1122  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1123  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  1124  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1125  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1126  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  1127  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1128  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1129  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  1130  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1131  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*8 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1132  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1133  	LONG $0x20c78348         // add    rdi, 32
  1134  	LONG $0x08c68348         // add    rsi, 8
  1135  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1136  	JG   LBB23_5
  1137  
  1138  LBB23_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1139  	WORD $0xd285             // test    edx, edx
  1140  	JLE  LBB23_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1141  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1142  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1143  
  1144  LBB23_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 4*R8 (8 bytes per element).
  1145  	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
  1146  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1147  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1148  	LONG $0x02c08349             // add    r8, 2
  1149  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1150  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1151  	JG   LBB23_3
  1152  
  1153  LBB23_4:
  1154  	RET
  1155  
  1156  TEXT ·_transpose_uint8_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a zero-extended byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1157  
  1158  	MOVQ src+0(FP), DI
  1159  	MOVQ dest+8(FP), SI
  1160  	MOVQ length+16(FP), DX
  1161  	MOVQ transposeMap+24(FP), CX
  1162  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1163  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1164  	JL   LBB24_1
  1165  
  1166  LBB24_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1167  	WORD $0xd089             // mov    eax, edx
  1168  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
  1169  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1170  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1171  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
  1172  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1173  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1174  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
  1175  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1176  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1177  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
  1178  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1179  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*1 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1180  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1181  	LONG $0x04c78348         // add    rdi, 4
  1182  	LONG $0x08c68348         // add    rsi, 8
  1183  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1184  	JG   LBB24_5
  1185  
  1186  LBB24_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1187  	WORD $0xd285             // test    edx, edx
  1188  	JLE  LBB24_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1189  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1190  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1191  
  1192  LBB24_3:
        	// One element per pass. R8 counts elements: src is addressed at R8,
        	// dest at 2*R8.
  1193  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
  1194  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1195  	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
  1196  	LONG $0x01c08349             // add    r8, 1
  1197  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1198  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1199  	JG   LBB24_3
  1200  
  1201  LBB24_4:
  1202  	RET
  1203  
  1204  TEXT ·_transpose_int8_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a sign-extended byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made;
        	// because of the sign extension a negative source byte addresses memory
        	// below transposeMap.
        	// NOTE(review): presumably callers only pass non-negative indices - confirm.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1205  
  1206  	MOVQ src+0(FP), DI
  1207  	MOVQ dest+8(FP), SI
  1208  	MOVQ length+16(FP), DX
  1209  	MOVQ transposeMap+24(FP), CX
  1210  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1211  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1212  	JL   LBB25_1
  1213  
  1214  LBB25_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1215  	WORD $0xd089                 // mov    eax, edx
  1216  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
  1217  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1218  	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
  1219  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
  1220  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1221  	LONG $0x02568966             // mov    word [rsi + 2], dx
  1222  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
  1223  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1224  	LONG $0x04568966             // mov    word [rsi + 4], dx
  1225  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
  1226  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1227  	LONG $0x06568966             // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*1 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1228  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  1229  	LONG $0x04c78348             // add    rdi, 4
  1230  	LONG $0x08c68348             // add    rsi, 8
  1231  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  1232  	JG   LBB25_5
  1233  
  1234  LBB25_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1235  	WORD $0xd285             // test    edx, edx
  1236  	JLE  LBB25_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1237  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1238  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1239  
  1240  LBB25_3:
        	// One element per pass. R8 counts elements: src is addressed at R8,
        	// dest at 2*R8.
  1241  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
  1242  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1243  	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
  1244  	LONG $0x01c08349             // add    r8, 1
  1245  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1246  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1247  	JG   LBB25_3
  1248  
  1249  LBB25_4:
  1250  	RET
  1251  
  1252  TEXT ·_transpose_uint16_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a zero-extended 2-byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1253  
  1254  	MOVQ src+0(FP), DI
  1255  	MOVQ dest+8(FP), SI
  1256  	MOVQ length+16(FP), DX
  1257  	MOVQ transposeMap+24(FP), CX
  1258  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1259  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1260  	JL   LBB26_1
  1261  
  1262  LBB26_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1263  	WORD $0xd089             // mov    eax, edx
  1264  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
  1265  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1266  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1267  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
  1268  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1269  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1270  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
  1271  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1272  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1273  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
  1274  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1275  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src and dest by 4*2 bytes each.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1276  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1277  	LONG $0x08c78348         // add    rdi, 8
  1278  	LONG $0x08c68348         // add    rsi, 8
  1279  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1280  	JG   LBB26_5
  1281  
  1282  LBB26_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1283  	WORD $0xd285             // test    edx, edx
  1284  	JLE  LBB26_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1285  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1286  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1287  
  1288  LBB26_3:
        	// One element per pass. R8 is a byte offset shared by src and dest
        	// (both 2 bytes per element), stepped by 2.
  1289  	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
  1290  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1291  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1292  	LONG $0x02c08349             // add    r8, 2
  1293  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1294  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1295  	JG   LBB26_3
  1296  
  1297  LBB26_4:
  1298  	RET
  1299  
  1300  TEXT ·_transpose_int16_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a sign-extended 2-byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made;
        	// because of the sign extension a negative source value addresses memory
        	// below transposeMap.
        	// NOTE(review): presumably callers only pass non-negative indices - confirm.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1301  
  1302  	MOVQ src+0(FP), DI
  1303  	MOVQ dest+8(FP), SI
  1304  	MOVQ length+16(FP), DX
  1305  	MOVQ transposeMap+24(FP), CX
  1306  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1307  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1308  	JL   LBB27_1
  1309  
  1310  LBB27_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1311  	WORD $0xd089                 // mov    eax, edx
  1312  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
  1313  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1314  	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
  1315  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
  1316  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1317  	LONG $0x02568966             // mov    word [rsi + 2], dx
  1318  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
  1319  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1320  	LONG $0x04568966             // mov    word [rsi + 4], dx
  1321  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
  1322  	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
  1323  	LONG $0x06568966             // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src and dest by 4*2 bytes each.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1324  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  1325  	LONG $0x08c78348             // add    rdi, 8
  1326  	LONG $0x08c68348             // add    rsi, 8
  1327  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  1328  	JG   LBB27_5
  1329  
  1330  LBB27_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1331  	WORD $0xd285             // test    edx, edx
  1332  	JLE  LBB27_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1333  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1334  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1335  
  1336  LBB27_3:
        	// One element per pass. R8 is a byte offset shared by src and dest
        	// (both 2 bytes per element), stepped by 2.
  1337  	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
  1338  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1339  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1340  	LONG $0x02c08349             // add    r8, 2
  1341  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1342  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1343  	JG   LBB27_3
  1344  
  1345  LBB27_4:
  1346  	RET
  1347  
  1348  TEXT ·_transpose_uint32_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a 4-byte index from src
        	// (the 32-bit load zero-extends it), load the low 16 bits of the 4-byte
        	// transposeMap entry at that index, and store them as a 2-byte value to
        	// dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1349  
  1350  	MOVQ src+0(FP), DI
  1351  	MOVQ dest+8(FP), SI
  1352  	MOVQ length+16(FP), DX
  1353  	MOVQ transposeMap+24(FP), CX
  1354  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1355  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1356  	JL   LBB28_1
  1357  
  1358  LBB28_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1359  	WORD $0xd089             // mov    eax, edx
  1360  	WORD $0x178b             // mov    edx, dword [rdi]
  1361  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1362  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1363  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
  1364  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1365  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1366  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
  1367  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1368  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1369  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
  1370  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1371  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*4 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1372  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1373  	LONG $0x10c78348         // add    rdi, 16
  1374  	LONG $0x08c68348         // add    rsi, 8
  1375  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1376  	JG   LBB28_5
  1377  
  1378  LBB28_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1379  	WORD $0xd285             // test    edx, edx
  1380  	JLE  LBB28_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1381  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1382  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1383  
  1384  LBB28_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 2*R8 (4 bytes per element).
  1385  	LONG $0x47048b42             // mov    eax, dword [rdi + 2*r8]
  1386  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1387  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1388  	LONG $0x02c08349             // add    r8, 2
  1389  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1390  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1391  	JG   LBB28_3
  1392  
  1393  LBB28_4:
  1394  	RET
  1395  
  1396  TEXT ·_transpose_int32_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read a sign-extended 4-byte index
        	// from src, load the low 16 bits of the 4-byte transposeMap entry at that
        	// index, and store them as a 2-byte value to dest. No range check is made;
        	// because of the sign extension a negative source value addresses memory
        	// below transposeMap.
        	// NOTE(review): presumably callers only pass non-negative indices - confirm.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1397  
  1398  	MOVQ src+0(FP), DI
  1399  	MOVQ dest+8(FP), SI
  1400  	MOVQ length+16(FP), DX
  1401  	MOVQ transposeMap+24(FP), CX
  1402  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1403  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1404  	JL   LBB29_1
  1405  
  1406  LBB29_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1407  	WORD $0xd089             // mov    eax, edx
  1408  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
  1409  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1410  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1411  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
  1412  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1413  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1414  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
  1415  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1416  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1417  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
  1418  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1419  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*4 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1420  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1421  	LONG $0x10c78348         // add    rdi, 16
  1422  	LONG $0x08c68348         // add    rsi, 8
  1423  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1424  	JG   LBB29_5
  1425  
  1426  LBB29_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1427  	WORD $0xd285             // test    edx, edx
  1428  	JLE  LBB29_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1429  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1430  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1431  
  1432  LBB29_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 2*R8 (4 bytes per element).
  1433  	LONG $0x4704634a             // movsxd    rax, dword [rdi + 2*r8]
  1434  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1435  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1436  	LONG $0x02c08349             // add    r8, 2
  1437  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1438  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1439  	JG   LBB29_3
  1440  
  1441  LBB29_4:
  1442  	RET
  1443  
  1444  TEXT ·_transpose_uint64_int16_sse4(SB), $0-32
        	// Lookup-table remap: for each element, read an 8-byte index from src,
        	// load the low 16 bits of the 4-byte transposeMap entry at that index,
        	// and store them as a 2-byte value to dest. No range check is made.
        	// Registers: DI = src cursor, SI = dest cursor, RCX = transposeMap base,
        	// EDX = remaining element count (only the low 32 bits of length are
        	// examined, using signed comparisons).
        	// Frame $0-32: no locals, four 8-byte arguments, no result is stored.
  1445  
  1446  	MOVQ src+0(FP), DI
  1447  	MOVQ dest+8(FP), SI
  1448  	MOVQ length+16(FP), DX
  1449  	MOVQ transposeMap+24(FP), CX
  1450  
        	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
  1451  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1452  	JL   LBB30_1
  1453  
  1454  LBB30_5:
        	// Unrolled loop, 4 elements per pass. EDX is reused as scratch for the
        	// index and the looked-up value, so the count is parked in EAX first.
  1455  	WORD $0xd089             // mov    eax, edx
  1456  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  1457  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1458  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1459  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  1460  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1461  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1462  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  1463  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1464  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1465  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  1466  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1467  	LONG $0x06568966         // mov    word [rsi + 6], dx
        	// Count = previous count - 4; advance src by 4*8 bytes and dest by 4*2.
        	// Repeat only if the previous count was > 7, i.e. at least 4 remain.
  1468  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1469  	LONG $0x20c78348         // add    rdi, 32
  1470  	LONG $0x08c68348         // add    rsi, 8
  1471  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1472  	JG   LBB30_5
  1473  
  1474  LBB30_1:
        	// Tail: handles whatever count is left (fewer than 4); exit if it is <= 0.
  1475  	WORD $0xd285             // test    edx, edx
  1476  	JLE  LBB30_4
        	// Bias the count by +1 so the loop below can stop on "edx > 1" after
        	// its decrement; R8 starts at 0.
  1477  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1478  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1479  
  1480  LBB30_3:
        	// One element per pass. R8 is the dest byte offset, stepped by 2;
        	// src is addressed at 4*R8 (8 bytes per element).
  1481  	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
  1482  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1483  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1484  	LONG $0x02c08349             // add    r8, 2
  1485  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1486  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1487  	JG   LBB30_3
  1488  
  1489  LBB30_4:
  1490  	RET
  1491  
  1492  TEXT ·_transpose_int64_int16_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = low 16 bits of transposeMap[src[i]].
        //   src          -> DI, read as 8-byte elements; the 64-bit value is used as the
        //                   index as-is, with no range or sign check
        //   dest         -> SI, written as 2-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  1493  
  1494  	MOVQ src+0(FP), DI
  1495  	MOVQ dest+8(FP), SI
  1496  	MOVQ length+16(FP), DX
  1497  	MOVQ transposeMap+24(FP), CX
  1498  
  1499  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1500  	JL   LBB31_1
  1501  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1502  LBB31_5:
  1503  	WORD $0xd089             // mov    eax, edx
  1504  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  1505  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1506  	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
  1507  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  1508  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1509  	LONG $0x02568966         // mov    word [rsi + 2], dx
  1510  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  1511  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1512  	LONG $0x04568966         // mov    word [rsi + 4], dx
  1513  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  1514  	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
  1515  	LONG $0x06568966         // mov    word [rsi + 6], dx
  1516  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1517  	LONG $0x20c78348         // add    rdi, 32
  1518  	LONG $0x08c68348         // add    rsi, 8
  1519  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1520  	JG   LBB31_5
  1521  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1522  LBB31_1:
  1523  	WORD $0xd285             // test    edx, edx
  1524  	JLE  LBB31_4
  1525  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1526  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1527  
        // One element per pass. r8 is the byte offset into dest (+2 per element) and
        // 4*r8 is the matching source offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  1528  LBB31_3:
  1529  	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
  1530  	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
  1531  	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
  1532  	LONG $0x02c08349             // add    r8, 2
  1533  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1534  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1535  	JG   LBB31_3
  1536  
  1537  LBB31_4:
  1538  	RET
  1539  
  1540  TEXT ·_transpose_uint8_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as single bytes, zero-extended (movzx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  1541  
  1542  	MOVQ src+0(FP), DI
  1543  	MOVQ dest+8(FP), SI
  1544  	MOVQ length+16(FP), DX
  1545  	MOVQ transposeMap+24(FP), CX
  1546  
  1547  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1548  	JL   LBB32_1
  1549  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1550  LBB32_5:
  1551  	WORD $0xd089             // mov    eax, edx
  1552  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
  1553  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1554  	WORD $0x1689             // mov    dword [rsi], edx
  1555  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
  1556  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1557  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1558  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
  1559  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1560  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1561  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
  1562  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1563  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1564  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1565  	LONG $0x04c78348         // add    rdi, 4
  1566  	LONG $0x10c68348         // add    rsi, 16
  1567  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1568  	JG   LBB32_5
  1569  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1570  LBB32_1:
  1571  	WORD $0xd285             // test    edx, edx
  1572  	JLE  LBB32_4
  1573  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1574  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1575  
        // One element per pass. r8 is the element index (+1 per element): it is the
        // source byte offset directly and is scaled by 4 for dest. edx was biased by
        // +1 above, so "cmp edx, 1 / jg" runs exactly the remaining count of passes.
  1576  LBB32_3:
  1577  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
  1578  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  1579  	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
  1580  	LONG $0x01c08349             // add    r8, 1
  1581  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1582  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1583  	JG   LBB32_3
  1584  
  1585  LBB32_4:
  1586  	RET
  1587  
  1588  TEXT ·_transpose_int8_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as single bytes, sign-extended (movsx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  1589  
  1590  	MOVQ src+0(FP), DI
  1591  	MOVQ dest+8(FP), SI
  1592  	MOVQ length+16(FP), DX
  1593  	MOVQ transposeMap+24(FP), CX
  1594  
  1595  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1596  	JL   LBB33_1
  1597  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1598  LBB33_5:
  1599  	WORD $0xd089                 // mov    eax, edx
  1600  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
  1601  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1602  	WORD $0x1689                 // mov    dword [rsi], edx
  1603  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
  1604  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1605  	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
  1606  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
  1607  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1608  	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
  1609  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
  1610  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1611  	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
  1612  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  1613  	LONG $0x04c78348             // add    rdi, 4
  1614  	LONG $0x10c68348             // add    rsi, 16
  1615  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  1616  	JG   LBB33_5
  1617  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1618  LBB33_1:
  1619  	WORD $0xd285             // test    edx, edx
  1620  	JLE  LBB33_4
  1621  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1622  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1623  
        // One element per pass. r8 is the element index (+1 per element): it is the
        // source byte offset directly and is scaled by 4 for dest. edx was biased by
        // +1 above, so "cmp edx, 1 / jg" runs exactly the remaining count of passes.
  1624  LBB33_3:
  1625  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
  1626  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  1627  	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
  1628  	LONG $0x01c08349             // add    r8, 1
  1629  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1630  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1631  	JG   LBB33_3
  1632  
  1633  LBB33_4:
  1634  	RET
  1635  
  1636  TEXT ·_transpose_uint16_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 2-byte elements, zero-extended (movzx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  1637  
  1638  	MOVQ src+0(FP), DI
  1639  	MOVQ dest+8(FP), SI
  1640  	MOVQ length+16(FP), DX
  1641  	MOVQ transposeMap+24(FP), CX
  1642  
  1643  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1644  	JL   LBB34_1
  1645  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1646  LBB34_5:
  1647  	WORD $0xd089             // mov    eax, edx
  1648  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
  1649  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1650  	WORD $0x1689             // mov    dword [rsi], edx
  1651  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
  1652  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1653  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1654  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
  1655  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1656  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1657  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
  1658  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1659  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1660  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1661  	LONG $0x08c78348         // add    rdi, 8
  1662  	LONG $0x10c68348         // add    rsi, 16
  1663  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1664  	JG   LBB34_5
  1665  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1666  LBB34_1:
  1667  	WORD $0xd285             // test    edx, edx
  1668  	JLE  LBB34_4
  1669  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1670  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1671  
        // One element per pass. r8 is the byte offset into src (+2 per element) and
        // 2*r8 is the matching dest offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  1672  LBB34_3:
  1673  	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
  1674  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  1675  	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
  1676  	LONG $0x02c08349             // add    r8, 2
  1677  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1678  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1679  	JG   LBB34_3
  1680  
  1681  LBB34_4:
  1682  	RET
  1683  
  1684  TEXT ·_transpose_int16_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 2-byte elements, sign-extended (movsx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  1685  
  1686  	MOVQ src+0(FP), DI
  1687  	MOVQ dest+8(FP), SI
  1688  	MOVQ length+16(FP), DX
  1689  	MOVQ transposeMap+24(FP), CX
  1690  
  1691  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1692  	JL   LBB35_1
  1693  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1694  LBB35_5:
  1695  	WORD $0xd089                 // mov    eax, edx
  1696  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
  1697  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1698  	WORD $0x1689                 // mov    dword [rsi], edx
  1699  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
  1700  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1701  	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
  1702  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
  1703  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1704  	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
  1705  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
  1706  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1707  	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
  1708  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  1709  	LONG $0x08c78348             // add    rdi, 8
  1710  	LONG $0x10c68348             // add    rsi, 16
  1711  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  1712  	JG   LBB35_5
  1713  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1714  LBB35_1:
  1715  	WORD $0xd285             // test    edx, edx
  1716  	JLE  LBB35_4
  1717  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1718  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1719  
        // One element per pass. r8 is the byte offset into src (+2 per element) and
        // 2*r8 is the matching dest offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  1720  LBB35_3:
  1721  	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
  1722  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  1723  	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
  1724  	LONG $0x02c08349             // add    r8, 2
  1725  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1726  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1727  	JG   LBB35_3
  1728  
  1729  LBB35_4:
  1730  	RET
  1731  
  1732  TEXT ·_transpose_uint32_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 4-byte elements; the 32-bit load into edx/eax
        //                   leaves the upper half of the index register zero
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  1733  
  1734  	MOVQ src+0(FP), DI
  1735  	MOVQ dest+8(FP), SI
  1736  	MOVQ length+16(FP), DX
  1737  	MOVQ transposeMap+24(FP), CX
  1738  
  1739  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1740  	JL   LBB36_1
  1741  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1742  LBB36_5:
  1743  	WORD $0xd089             // mov    eax, edx
  1744  	WORD $0x178b             // mov    edx, dword [rdi]
  1745  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1746  	WORD $0x1689             // mov    dword [rsi], edx
  1747  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
  1748  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1749  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1750  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
  1751  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1752  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1753  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
  1754  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1755  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1756  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1757  	LONG $0x10c78348         // add    rdi, 16
  1758  	LONG $0x10c68348         // add    rsi, 16
  1759  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1760  	JG   LBB36_5
  1761  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1762  LBB36_1:
  1763  	WORD $0xd285             // test    edx, edx
  1764  	JLE  LBB36_4
  1765  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1766  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1767  
        // One element per pass. r8 is a byte offset (+4 per element) shared by src
        // and dest. edx was biased by +1 above, so the "cmp edx, 1 / jg" test runs
        // exactly the remaining count of passes.
  1768  LBB36_3:
  1769  	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
  1770  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  1771  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  1772  	LONG $0x04c08349         // add    r8, 4
  1773  	WORD $0xc283; BYTE $0xff // add    edx, -1
  1774  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  1775  	JG   LBB36_3
  1776  
  1777  LBB36_4:
  1778  	RET
  1779  
  1780  TEXT ·_transpose_int32_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 4-byte elements, sign-extended (movsxd) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  1781  
  1782  	MOVQ src+0(FP), DI
  1783  	MOVQ dest+8(FP), SI
  1784  	MOVQ length+16(FP), DX
  1785  	MOVQ transposeMap+24(FP), CX
  1786  
  1787  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1788  	JL   LBB37_1
  1789  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1790  LBB37_5:
  1791  	WORD $0xd089             // mov    eax, edx
  1792  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
  1793  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1794  	WORD $0x1689             // mov    dword [rsi], edx
  1795  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
  1796  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1797  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1798  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
  1799  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1800  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1801  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
  1802  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1803  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1804  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1805  	LONG $0x10c78348         // add    rdi, 16
  1806  	LONG $0x10c68348         // add    rsi, 16
  1807  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1808  	JG   LBB37_5
  1809  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1810  LBB37_1:
  1811  	WORD $0xd285             // test    edx, edx
  1812  	JLE  LBB37_4
  1813  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1814  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1815  
        // One element per pass. r8 is a byte offset (+4 per element) shared by src
        // and dest. edx was biased by +1 above, so the "cmp edx, 1 / jg" test runs
        // exactly the remaining count of passes.
  1816  LBB37_3:
  1817  	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
  1818  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  1819  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  1820  	LONG $0x04c08349         // add    r8, 4
  1821  	WORD $0xc283; BYTE $0xff // add    edx, -1
  1822  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  1823  	JG   LBB37_3
  1824  
  1825  LBB37_4:
  1826  	RET
  1827  
  1828  TEXT ·_transpose_uint64_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 8-byte elements; the full 64-bit value is the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  1829  
  1830  	MOVQ src+0(FP), DI
  1831  	MOVQ dest+8(FP), SI
  1832  	MOVQ length+16(FP), DX
  1833  	MOVQ transposeMap+24(FP), CX
  1834  
  1835  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1836  	JL   LBB38_1
  1837  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1838  LBB38_5:
  1839  	WORD $0xd089             // mov    eax, edx
  1840  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  1841  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1842  	WORD $0x1689             // mov    dword [rsi], edx
  1843  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  1844  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1845  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1846  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  1847  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1848  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1849  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  1850  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1851  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1852  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1853  	LONG $0x20c78348         // add    rdi, 32
  1854  	LONG $0x10c68348         // add    rsi, 16
  1855  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1856  	JG   LBB38_5
  1857  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1858  LBB38_1:
  1859  	WORD $0xd285             // test    edx, edx
  1860  	JLE  LBB38_4
  1861  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1862  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1863  
        // One element per pass. r8 is the byte offset into dest (+4 per element) and
        // 2*r8 is the matching source offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  1864  LBB38_3:
  1865  	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
  1866  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  1867  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  1868  	LONG $0x04c08349         // add    r8, 4
  1869  	WORD $0xc283; BYTE $0xff // add    edx, -1
  1870  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  1871  	JG   LBB38_3
  1872  
  1873  LBB38_4:
  1874  	RET
  1875  
  1876  TEXT ·_transpose_int64_uint32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 8-byte elements; the 64-bit value is used as the
        //                   index as-is, with no range or sign check
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  1877  
  1878  	MOVQ src+0(FP), DI
  1879  	MOVQ dest+8(FP), SI
  1880  	MOVQ length+16(FP), DX
  1881  	MOVQ transposeMap+24(FP), CX
  1882  
  1883  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1884  	JL   LBB39_1
  1885  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1886  LBB39_5:
  1887  	WORD $0xd089             // mov    eax, edx
  1888  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  1889  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1890  	WORD $0x1689             // mov    dword [rsi], edx
  1891  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  1892  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1893  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1894  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  1895  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1896  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1897  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  1898  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1899  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1900  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1901  	LONG $0x20c78348         // add    rdi, 32
  1902  	LONG $0x10c68348         // add    rsi, 16
  1903  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1904  	JG   LBB39_5
  1905  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1906  LBB39_1:
  1907  	WORD $0xd285             // test    edx, edx
  1908  	JLE  LBB39_4
  1909  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1910  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1911  
        // One element per pass. r8 is the byte offset into dest (+4 per element) and
        // 2*r8 is the matching source offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  1912  LBB39_3:
  1913  	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
  1914  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  1915  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  1916  	LONG $0x04c08349         // add    r8, 4
  1917  	WORD $0xc283; BYTE $0xff // add    edx, -1
  1918  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  1919  	JG   LBB39_3
  1920  
  1921  LBB39_4:
  1922  	RET
  1923  
  1924  TEXT ·_transpose_uint8_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as single bytes, zero-extended (movzx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  1925  
  1926  	MOVQ src+0(FP), DI
  1927  	MOVQ dest+8(FP), SI
  1928  	MOVQ length+16(FP), DX
  1929  	MOVQ transposeMap+24(FP), CX
  1930  
  1931  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1932  	JL   LBB40_1
  1933  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1934  LBB40_5:
  1935  	WORD $0xd089             // mov    eax, edx
  1936  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
  1937  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1938  	WORD $0x1689             // mov    dword [rsi], edx
  1939  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
  1940  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1941  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  1942  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
  1943  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1944  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  1945  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
  1946  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  1947  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  1948  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  1949  	LONG $0x04c78348         // add    rdi, 4
  1950  	LONG $0x10c68348         // add    rsi, 16
  1951  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  1952  	JG   LBB40_5
  1953  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  1954  LBB40_1:
  1955  	WORD $0xd285             // test    edx, edx
  1956  	JLE  LBB40_4
  1957  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  1958  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  1959  
        // One element per pass. r8 is the element index (+1 per element): it is the
        // source byte offset directly and is scaled by 4 for dest. edx was biased by
        // +1 above, so "cmp edx, 1 / jg" runs exactly the remaining count of passes.
  1960  LBB40_3:
  1961  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
  1962  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  1963  	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
  1964  	LONG $0x01c08349             // add    r8, 1
  1965  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  1966  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  1967  	JG   LBB40_3
  1968  
  1969  LBB40_4:
  1970  	RET
  1971  
  1972  TEXT ·_transpose_int8_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as single bytes, sign-extended (movsx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  1973  
  1974  	MOVQ src+0(FP), DI
  1975  	MOVQ dest+8(FP), SI
  1976  	MOVQ length+16(FP), DX
  1977  	MOVQ transposeMap+24(FP), CX
  1978  
  1979  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  1980  	JL   LBB41_1
  1981  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  1982  LBB41_5:
  1983  	WORD $0xd089                 // mov    eax, edx
  1984  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
  1985  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1986  	WORD $0x1689                 // mov    dword [rsi], edx
  1987  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
  1988  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1989  	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
  1990  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
  1991  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1992  	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
  1993  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
  1994  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  1995  	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
  1996  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  1997  	LONG $0x04c78348             // add    rdi, 4
  1998  	LONG $0x10c68348             // add    rsi, 16
  1999  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  2000  	JG   LBB41_5
  2001  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  2002  LBB41_1:
  2003  	WORD $0xd285             // test    edx, edx
  2004  	JLE  LBB41_4
  2005  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2006  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2007  
        // One element per pass. r8 is the element index (+1 per element): it is the
        // source byte offset directly and is scaled by 4 for dest. edx was biased by
        // +1 above, so "cmp edx, 1 / jg" runs exactly the remaining count of passes.
  2008  LBB41_3:
  2009  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
  2010  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  2011  	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
  2012  	LONG $0x01c08349             // add    r8, 1
  2013  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2014  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2015  	JG   LBB41_3
  2016  
  2017  LBB41_4:
  2018  	RET
  2019  
  2020  TEXT ·_transpose_uint16_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 2-byte elements, zero-extended (movzx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  2021  
  2022  	MOVQ src+0(FP), DI
  2023  	MOVQ dest+8(FP), SI
  2024  	MOVQ length+16(FP), DX
  2025  	MOVQ transposeMap+24(FP), CX
  2026  
  2027  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2028  	JL   LBB42_1
  2029  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  2030  LBB42_5:
  2031  	WORD $0xd089             // mov    eax, edx
  2032  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
  2033  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2034  	WORD $0x1689             // mov    dword [rsi], edx
  2035  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
  2036  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2037  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  2038  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
  2039  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2040  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  2041  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
  2042  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2043  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  2044  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2045  	LONG $0x08c78348         // add    rdi, 8
  2046  	LONG $0x10c68348         // add    rsi, 16
  2047  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2048  	JG   LBB42_5
  2049  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  2050  LBB42_1:
  2051  	WORD $0xd285             // test    edx, edx
  2052  	JLE  LBB42_4
  2053  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2054  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2055  
        // One element per pass. r8 is the byte offset into src (+2 per element) and
        // 2*r8 is the matching dest offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  2056  LBB42_3:
  2057  	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
  2058  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  2059  	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
  2060  	LONG $0x02c08349             // add    r8, 2
  2061  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2062  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2063  	JG   LBB42_3
  2064  
  2065  LBB42_4:
  2066  	RET
  2067  
  2068  TEXT ·_transpose_int16_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 2-byte elements, sign-extended (movsx) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  2069  
  2070  	MOVQ src+0(FP), DI
  2071  	MOVQ dest+8(FP), SI
  2072  	MOVQ length+16(FP), DX
  2073  	MOVQ transposeMap+24(FP), CX
  2074  
  2075  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2076  	JL   LBB43_1
  2077  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  2078  LBB43_5:
  2079  	WORD $0xd089                 // mov    eax, edx
  2080  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
  2081  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  2082  	WORD $0x1689                 // mov    dword [rsi], edx
  2083  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
  2084  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  2085  	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
  2086  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
  2087  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  2088  	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
  2089  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
  2090  	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
  2091  	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
  2092  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  2093  	LONG $0x08c78348             // add    rdi, 8
  2094  	LONG $0x10c68348             // add    rsi, 16
  2095  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  2096  	JG   LBB43_5
  2097  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  2098  LBB43_1:
  2099  	WORD $0xd285             // test    edx, edx
  2100  	JLE  LBB43_4
  2101  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2102  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2103  
        // One element per pass. r8 is the byte offset into src (+2 per element) and
        // 2*r8 is the matching dest offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  2104  LBB43_3:
  2105  	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
  2106  	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
  2107  	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
  2108  	LONG $0x02c08349             // add    r8, 2
  2109  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2110  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2111  	JG   LBB43_3
  2112  
  2113  LBB43_4:
  2114  	RET
  2115  
  2116  TEXT ·_transpose_uint32_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 4-byte elements; the 32-bit load into edx/eax
        //                   leaves the upper half of the index register zero
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  2117  
  2118  	MOVQ src+0(FP), DI
  2119  	MOVQ dest+8(FP), SI
  2120  	MOVQ length+16(FP), DX
  2121  	MOVQ transposeMap+24(FP), CX
  2122  
  2123  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2124  	JL   LBB44_1
  2125  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  2126  LBB44_5:
  2127  	WORD $0xd089             // mov    eax, edx
  2128  	WORD $0x178b             // mov    edx, dword [rdi]
  2129  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2130  	WORD $0x1689             // mov    dword [rsi], edx
  2131  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
  2132  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2133  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  2134  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
  2135  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2136  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  2137  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
  2138  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2139  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  2140  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2141  	LONG $0x10c78348         // add    rdi, 16
  2142  	LONG $0x10c68348         // add    rsi, 16
  2143  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2144  	JG   LBB44_5
  2145  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  2146  LBB44_1:
  2147  	WORD $0xd285             // test    edx, edx
  2148  	JLE  LBB44_4
  2149  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2150  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2151  
        // One element per pass. r8 is a byte offset (+4 per element) shared by src
        // and dest. edx was biased by +1 above, so the "cmp edx, 1 / jg" test runs
        // exactly the remaining count of passes.
  2152  LBB44_3:
  2153  	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
  2154  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  2155  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  2156  	LONG $0x04c08349         // add    r8, 4
  2157  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2158  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2159  	JG   LBB44_3
  2160  
  2161  LBB44_4:
  2162  	RET
  2163  
  2164  TEXT ·_transpose_int32_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 4-byte elements, sign-extended (movsxd) into the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled (upper half of length is
        // ignored), and a negative source value addresses memory below transposeMap -
        // confirm callers rule both out.
  2165  
  2166  	MOVQ src+0(FP), DI
  2167  	MOVQ dest+8(FP), SI
  2168  	MOVQ length+16(FP), DX
  2169  	MOVQ transposeMap+24(FP), CX
  2170  
  2171  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2172  	JL   LBB45_1
  2173  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  2174  LBB45_5:
  2175  	WORD $0xd089             // mov    eax, edx
  2176  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
  2177  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2178  	WORD $0x1689             // mov    dword [rsi], edx
  2179  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
  2180  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2181  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  2182  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
  2183  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2184  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  2185  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
  2186  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2187  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  2188  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2189  	LONG $0x10c78348         // add    rdi, 16
  2190  	LONG $0x10c68348         // add    rsi, 16
  2191  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2192  	JG   LBB45_5
  2193  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  2194  LBB45_1:
  2195  	WORD $0xd285             // test    edx, edx
  2196  	JLE  LBB45_4
  2197  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2198  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2199  
        // One element per pass. r8 is a byte offset (+4 per element) shared by src
        // and dest. edx was biased by +1 above, so the "cmp edx, 1 / jg" test runs
        // exactly the remaining count of passes.
  2200  LBB45_3:
  2201  	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
  2202  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  2203  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  2204  	LONG $0x04c08349         // add    r8, 4
  2205  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2206  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2207  	JG   LBB45_3
  2208  
  2209  LBB45_4:
  2210  	RET
  2211  
  2212  TEXT ·_transpose_uint64_int32_sse4(SB), $0-32
        // For each of `length` elements: dest[i] = transposeMap[src[i]] (4-byte copy).
        //   src          -> DI, read as 8-byte elements; the full 64-bit value is the index
        //   dest         -> SI, written as 4-byte elements
        //   length       -> DX, but only edx is tested, as a signed 32-bit count
        //   transposeMap -> CX, table with a 4-byte stride (index scaled by 4)
        // Frame $0-32: no locals, 32 bytes of arguments. Instructions are emitted as
        // raw bytes (WORD/LONG/BYTE); the trailing comments show the disassembly.
        // NOTE(review): counts >= 2^31 are not handled because the upper half of
        // length is ignored - confirm callers never pass such lengths.
  2213  
  2214  	MOVQ src+0(FP), DI
  2215  	MOVQ dest+8(FP), SI
  2216  	MOVQ length+16(FP), DX
  2217  	MOVQ transposeMap+24(FP), CX
  2218  
  2219  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2220  	JL   LBB46_1
  2221  
        // Unrolled loop, 4 elements per pass. edx doubles as scratch, so the count is
        // parked in eax and restored as eax-4; the pass repeats while eax > 7.
  2222  LBB46_5:
  2223  	WORD $0xd089             // mov    eax, edx
  2224  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  2225  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2226  	WORD $0x1689             // mov    dword [rsi], edx
  2227  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  2228  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2229  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  2230  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  2231  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2232  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  2233  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  2234  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2235  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
  2236  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2237  	LONG $0x20c78348         // add    rdi, 32
  2238  	LONG $0x10c68348         // add    rsi, 16
  2239  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2240  	JG   LBB46_5
  2241  
        // Fewer than 4 elements remain in edx; return when the count is <= 0.
  2242  LBB46_1:
  2243  	WORD $0xd285             // test    edx, edx
  2244  	JLE  LBB46_4
  2245  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2246  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2247  
        // One element per pass. r8 is the byte offset into dest (+4 per element) and
        // 2*r8 is the matching source offset. edx was biased by +1 above, so the
        // "cmp edx, 1 / jg" test runs exactly the remaining count of passes.
  2248  LBB46_3:
  2249  	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
  2250  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  2251  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  2252  	LONG $0x04c08349         // add    r8, 4
  2253  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2254  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2255  	JG   LBB46_3
  2256  
  2257  LBB46_4:
  2258  	RET
  2259  
  2260  TEXT ·_transpose_int64_int32_sse4(SB), $0-32
        // For each of `length` elements: read an 8-byte value from src, use all 64 bits
        // of it as an index into transposeMap (4-byte entries), and store the 4-byte
        // entry to dest. No bounds check is made on the index in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2261  
  2262  	MOVQ src+0(FP), DI // DI = src
  2263  	MOVQ dest+8(FP), SI // SI = dest
  2264  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2265  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2266  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2267  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2268  	JL   LBB47_1
  2269  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2270  LBB47_5:
  2271  	WORD $0xd089             // mov    eax, edx
  2272  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  2273  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2274  	WORD $0x1689             // mov    dword [rsi], edx
  2275  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  2276  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2277  	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
  2278  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  2279  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2280  	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
  2281  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  2282  	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
  2283  	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
        	// edx = remaining count; advance src by 4*8 bytes and dest by 4*4 bytes.
  2284  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2285  	LONG $0x20c78348         // add    rdi, 32
  2286  	LONG $0x10c68348         // add    rsi, 16
        	// The pre-decrement count being > 7 means at least four elements are left.
  2287  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2288  	JG   LBB47_5
  2289  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2290  LBB47_1:
  2291  	WORD $0xd285             // test    edx, edx
  2292  	JLE  LBB47_4
  2293  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2294  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2295  
        // One lookup per pass. r8 advances by 4 (one dest element), so the 8-byte
        // source element is addressed at rdi + 2*r8.
  2296  LBB47_3:
  2297  	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
  2298  	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
  2299  	LONG $0x06048942         // mov    dword [rsi + r8], eax
  2300  	LONG $0x04c08349         // add    r8, 4
  2301  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2302  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2303  	JG   LBB47_3
  2304  
  2305  LBB47_4:
  2306  	RET
  2307  
  2308  TEXT ·_transpose_uint8_uint64_sse4(SB), $0-32
        // For each of `length` elements: read one byte from src, zero-extend it, use it
        // as an index into transposeMap (4-byte entries), sign-extend the 32-bit entry
        // to 64 bits (movsxd) and store the 8-byte result to dest.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2309  
  2310  	MOVQ src+0(FP), DI // DI = src
  2311  	MOVQ dest+8(FP), SI // SI = dest
  2312  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2313  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2314  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2315  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2316  	JL   LBB48_1
  2317  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2318  LBB48_5:
  2319  	WORD $0xd089             // mov    eax, edx
  2320  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
  2321  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2322  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2323  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
  2324  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2325  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2326  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
  2327  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2328  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2329  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
  2330  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2331  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*1 bytes and dest by 4*8 bytes.
  2332  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2333  	LONG $0x04c78348         // add    rdi, 4
  2334  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2335  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2336  	JG   LBB48_5
  2337  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2338  LBB48_1:
  2339  	WORD $0xd285             // test    edx, edx
  2340  	JLE  LBB48_4
  2341  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2342  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2343  
        // One lookup per pass. r8 advances by 1 (one source byte), so the 8-byte
        // destination element is addressed at rsi + 8*r8.
  2344  LBB48_3:
  2345  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
  2346  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2347  	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
  2348  	LONG $0x01c08349             // add    r8, 1
  2349  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2350  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2351  	JG   LBB48_3
  2352  
  2353  LBB48_4:
  2354  	RET
  2355  
  2356  TEXT ·_transpose_int8_uint64_sse4(SB), $0-32
        // For each of `length` elements: read one byte from src, sign-extend it to 64
        // bits, use it as an index into transposeMap (4-byte entries), sign-extend the
        // 32-bit entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // A source byte with the top bit set therefore yields a negative index; no
        // bounds check is made in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2357  
  2358  	MOVQ src+0(FP), DI // DI = src
  2359  	MOVQ dest+8(FP), SI // SI = dest
  2360  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2361  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2362  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2363  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2364  	JL   LBB49_1
  2365  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2366  LBB49_5:
  2367  	WORD $0xd089                 // mov    eax, edx
  2368  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
  2369  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2370  	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
  2371  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
  2372  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2373  	LONG $0x08568948             // mov    qword [rsi + 8], rdx
  2374  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
  2375  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2376  	LONG $0x10568948             // mov    qword [rsi + 16], rdx
  2377  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
  2378  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2379  	LONG $0x18568948             // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*1 bytes and dest by 4*8 bytes.
  2380  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  2381  	LONG $0x04c78348             // add    rdi, 4
  2382  	LONG $0x20c68348             // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2383  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  2384  	JG   LBB49_5
  2385  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2386  LBB49_1:
  2387  	WORD $0xd285             // test    edx, edx
  2388  	JLE  LBB49_4
  2389  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2390  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2391  
        // One lookup per pass. r8 advances by 1 (one source byte), so the 8-byte
        // destination element is addressed at rsi + 8*r8.
  2392  LBB49_3:
  2393  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
  2394  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2395  	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
  2396  	LONG $0x01c08349             // add    r8, 1
  2397  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2398  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2399  	JG   LBB49_3
  2400  
  2401  LBB49_4:
  2402  	RET
  2403  
  2404  TEXT ·_transpose_uint16_uint64_sse4(SB), $0-32
        // For each of `length` elements: read a 2-byte value from src, zero-extend it,
        // use it as an index into transposeMap (4-byte entries), sign-extend the 32-bit
        // entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2405  
  2406  	MOVQ src+0(FP), DI // DI = src
  2407  	MOVQ dest+8(FP), SI // SI = dest
  2408  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2409  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2410  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2411  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2412  	JL   LBB50_1
  2413  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2414  LBB50_5:
  2415  	WORD $0xd089             // mov    eax, edx
  2416  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
  2417  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2418  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2419  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
  2420  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2421  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2422  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
  2423  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2424  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2425  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
  2426  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2427  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*2 bytes and dest by 4*8 bytes.
  2428  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2429  	LONG $0x08c78348         // add    rdi, 8
  2430  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2431  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2432  	JG   LBB50_5
  2433  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2434  LBB50_1:
  2435  	WORD $0xd285             // test    edx, edx
  2436  	JLE  LBB50_4
  2437  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2438  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2439  
        // One lookup per pass. r8 advances by 2 (one source element), so the 8-byte
        // destination element is addressed at rsi + 4*r8.
  2440  LBB50_3:
  2441  	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
  2442  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2443  	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
  2444  	LONG $0x02c08349             // add    r8, 2
  2445  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2446  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2447  	JG   LBB50_3
  2448  
  2449  LBB50_4:
  2450  	RET
  2451  
  2452  TEXT ·_transpose_int16_uint64_sse4(SB), $0-32
        // For each of `length` elements: read a 2-byte value from src, sign-extend it to
        // 64 bits, use it as an index into transposeMap (4-byte entries), sign-extend the
        // 32-bit entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // A source value with the top bit set therefore yields a negative index; no
        // bounds check is made in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2453  
  2454  	MOVQ src+0(FP), DI // DI = src
  2455  	MOVQ dest+8(FP), SI // SI = dest
  2456  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2457  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2458  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2459  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2460  	JL   LBB51_1
  2461  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2462  LBB51_5:
  2463  	WORD $0xd089                 // mov    eax, edx
  2464  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
  2465  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2466  	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
  2467  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
  2468  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2469  	LONG $0x08568948             // mov    qword [rsi + 8], rdx
  2470  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
  2471  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2472  	LONG $0x10568948             // mov    qword [rsi + 16], rdx
  2473  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
  2474  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2475  	LONG $0x18568948             // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*2 bytes and dest by 4*8 bytes.
  2476  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  2477  	LONG $0x08c78348             // add    rdi, 8
  2478  	LONG $0x20c68348             // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2479  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  2480  	JG   LBB51_5
  2481  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2482  LBB51_1:
  2483  	WORD $0xd285             // test    edx, edx
  2484  	JLE  LBB51_4
  2485  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2486  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2487  
        // One lookup per pass. r8 advances by 2 (one source element), so the 8-byte
        // destination element is addressed at rsi + 4*r8.
  2488  LBB51_3:
  2489  	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
  2490  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2491  	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
  2492  	LONG $0x02c08349             // add    r8, 2
  2493  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2494  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2495  	JG   LBB51_3
  2496  
  2497  LBB51_4:
  2498  	RET
  2499  
  2500  TEXT ·_transpose_uint32_uint64_sse4(SB), $0-32
        // For each of `length` elements: read a 4-byte value from src with a plain
        // 32-bit load (which zero-extends into rdx/rax), use it as an index into
        // transposeMap (4-byte entries), sign-extend the 32-bit entry to 64 bits
        // (movsxd) and store the 8-byte result to dest.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2501  
  2502  	MOVQ src+0(FP), DI // DI = src
  2503  	MOVQ dest+8(FP), SI // SI = dest
  2504  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2505  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2506  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2507  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2508  	JL   LBB52_1
  2509  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2510  LBB52_5:
  2511  	WORD $0xd089             // mov    eax, edx
  2512  	WORD $0x178b             // mov    edx, dword [rdi]
  2513  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2514  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2515  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
  2516  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2517  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2518  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
  2519  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2520  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2521  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
  2522  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2523  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*4 bytes and dest by 4*8 bytes.
  2524  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2525  	LONG $0x10c78348         // add    rdi, 16
  2526  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2527  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2528  	JG   LBB52_5
  2529  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2530  LBB52_1:
  2531  	WORD $0xd285             // test    edx, edx
  2532  	JLE  LBB52_4
  2533  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2534  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2535  
        // One lookup per pass. r8 advances by 4 (one source element), so the 8-byte
        // destination element is addressed at rsi + 2*r8.
  2536  LBB52_3:
  2537  	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
  2538  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  2539  	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
  2540  	LONG $0x04c08349         // add    r8, 4
  2541  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2542  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2543  	JG   LBB52_3
  2544  
  2545  LBB52_4:
  2546  	RET
  2547  
  2548  TEXT ·_transpose_int32_uint64_sse4(SB), $0-32
        // For each of `length` elements: read a 4-byte value from src, sign-extend it to
        // 64 bits, use it as an index into transposeMap (4-byte entries), sign-extend the
        // 32-bit entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // A source value with the top bit set therefore yields a negative index; no
        // bounds check is made in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2549  
  2550  	MOVQ src+0(FP), DI // DI = src
  2551  	MOVQ dest+8(FP), SI // SI = dest
  2552  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2553  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2554  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2555  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2556  	JL   LBB53_1
  2557  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2558  LBB53_5:
  2559  	WORD $0xd089             // mov    eax, edx
  2560  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
  2561  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2562  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2563  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
  2564  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2565  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2566  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
  2567  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2568  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2569  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
  2570  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2571  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*4 bytes and dest by 4*8 bytes.
  2572  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2573  	LONG $0x10c78348         // add    rdi, 16
  2574  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2575  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2576  	JG   LBB53_5
  2577  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2578  LBB53_1:
  2579  	WORD $0xd285             // test    edx, edx
  2580  	JLE  LBB53_4
  2581  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2582  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2583  
        // One lookup per pass. r8 advances by 4 (one source element), so the 8-byte
        // destination element is addressed at rsi + 2*r8.
  2584  LBB53_3:
  2585  	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
  2586  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  2587  	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
  2588  	LONG $0x04c08349         // add    r8, 4
  2589  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2590  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2591  	JG   LBB53_3
  2592  
  2593  LBB53_4:
  2594  	RET
  2595  
  2596  TEXT ·_transpose_uint64_uint64_sse4(SB), $0-32
        // For each of `length` elements: read an 8-byte value from src, use all 64 bits
        // of it as an index into transposeMap (4-byte entries), sign-extend the 32-bit
        // entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // No bounds check is made on the index in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2597  
  2598  	MOVQ src+0(FP), DI // DI = src
  2599  	MOVQ dest+8(FP), SI // SI = dest
  2600  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2601  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2602  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2603  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2604  	JL   LBB54_1
  2605  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2606  LBB54_5:
  2607  	WORD $0xd089             // mov    eax, edx
  2608  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  2609  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2610  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2611  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  2612  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2613  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2614  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  2615  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2616  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2617  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  2618  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2619  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src and dest by 4*8 bytes each.
  2620  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2621  	LONG $0x20c78348         // add    rdi, 32
  2622  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2623  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2624  	JG   LBB54_5
  2625  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2626  LBB54_1:
  2627  	WORD $0xd285             // test    edx, edx
  2628  	JLE  LBB54_4
  2629  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2630  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2631  
        // One lookup per pass. Source and destination elements are both 8 bytes, so
        // the same byte offset r8 (advancing by 8) addresses both.
  2632  LBB54_3:
  2633  	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
  2634  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  2635  	LONG $0x0604894a         // mov    qword [rsi + r8], rax
  2636  	LONG $0x08c08349         // add    r8, 8
  2637  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2638  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2639  	JG   LBB54_3
  2640  
  2641  LBB54_4:
  2642  	RET
  2643  
  2644  TEXT ·_transpose_int64_uint64_sse4(SB), $0-32
        // For each of `length` elements: read an 8-byte value from src, use all 64 bits
        // of it as an index into transposeMap (4-byte entries), sign-extend the 32-bit
        // entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // No bounds check is made on the index in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2645  
  2646  	MOVQ src+0(FP), DI // DI = src
  2647  	MOVQ dest+8(FP), SI // SI = dest
  2648  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2649  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2650  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2651  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2652  	JL   LBB55_1
  2653  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2654  LBB55_5:
  2655  	WORD $0xd089             // mov    eax, edx
  2656  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  2657  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2658  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2659  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  2660  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2661  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2662  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  2663  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2664  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2665  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  2666  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2667  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src and dest by 4*8 bytes each.
  2668  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2669  	LONG $0x20c78348         // add    rdi, 32
  2670  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2671  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2672  	JG   LBB55_5
  2673  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2674  LBB55_1:
  2675  	WORD $0xd285             // test    edx, edx
  2676  	JLE  LBB55_4
  2677  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2678  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2679  
        // One lookup per pass. Source and destination elements are both 8 bytes, so
        // the same byte offset r8 (advancing by 8) addresses both.
  2680  LBB55_3:
  2681  	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
  2682  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  2683  	LONG $0x0604894a         // mov    qword [rsi + r8], rax
  2684  	LONG $0x08c08349         // add    r8, 8
  2685  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2686  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2687  	JG   LBB55_3
  2688  
  2689  LBB55_4:
  2690  	RET
  2691  
  2692  TEXT ·_transpose_uint8_int64_sse4(SB), $0-32
        // For each of `length` elements: read one byte from src, zero-extend it, use it
        // as an index into transposeMap (4-byte entries), sign-extend the 32-bit entry
        // to 64 bits (movsxd) and store the 8-byte result to dest.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2693  
  2694  	MOVQ src+0(FP), DI // DI = src
  2695  	MOVQ dest+8(FP), SI // SI = dest
  2696  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2697  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2698  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2699  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2700  	JL   LBB56_1
  2701  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2702  LBB56_5:
  2703  	WORD $0xd089             // mov    eax, edx
  2704  	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
  2705  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2706  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2707  	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
  2708  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2709  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2710  	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
  2711  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2712  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2713  	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
  2714  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2715  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*1 bytes and dest by 4*8 bytes.
  2716  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2717  	LONG $0x04c78348         // add    rdi, 4
  2718  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2719  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2720  	JG   LBB56_5
  2721  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2722  LBB56_1:
  2723  	WORD $0xd285             // test    edx, edx
  2724  	JLE  LBB56_4
  2725  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2726  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2727  
        // One lookup per pass. r8 advances by 1 (one source byte), so the 8-byte
        // destination element is addressed at rsi + 8*r8.
  2728  LBB56_3:
  2729  	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
  2730  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2731  	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
  2732  	LONG $0x01c08349             // add    r8, 1
  2733  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2734  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2735  	JG   LBB56_3
  2736  
  2737  LBB56_4:
  2738  	RET
  2739  
  2740  TEXT ·_transpose_int8_int64_sse4(SB), $0-32
        // For each of `length` elements: read one byte from src, sign-extend it to 64
        // bits, use it as an index into transposeMap (4-byte entries), sign-extend the
        // 32-bit entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // A source byte with the top bit set therefore yields a negative index; no
        // bounds check is made in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2741  
  2742  	MOVQ src+0(FP), DI // DI = src
  2743  	MOVQ dest+8(FP), SI // SI = dest
  2744  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2745  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2746  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2747  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2748  	JL   LBB57_1
  2749  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2750  LBB57_5:
  2751  	WORD $0xd089                 // mov    eax, edx
  2752  	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
  2753  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2754  	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
  2755  	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
  2756  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2757  	LONG $0x08568948             // mov    qword [rsi + 8], rdx
  2758  	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
  2759  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2760  	LONG $0x10568948             // mov    qword [rsi + 16], rdx
  2761  	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
  2762  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2763  	LONG $0x18568948             // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*1 bytes and dest by 4*8 bytes.
  2764  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  2765  	LONG $0x04c78348             // add    rdi, 4
  2766  	LONG $0x20c68348             // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2767  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  2768  	JG   LBB57_5
  2769  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2770  LBB57_1:
  2771  	WORD $0xd285             // test    edx, edx
  2772  	JLE  LBB57_4
  2773  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2774  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2775  
        // One lookup per pass. r8 advances by 1 (one source byte), so the 8-byte
        // destination element is addressed at rsi + 8*r8.
  2776  LBB57_3:
  2777  	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
  2778  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2779  	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
  2780  	LONG $0x01c08349             // add    r8, 1
  2781  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2782  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2783  	JG   LBB57_3
  2784  
  2785  LBB57_4:
  2786  	RET
  2787  
  2788  TEXT ·_transpose_uint16_int64_sse4(SB), $0-32
        // For each of `length` elements: read a 2-byte value from src, zero-extend it,
        // use it as an index into transposeMap (4-byte entries), sign-extend the 32-bit
        // entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2789  
  2790  	MOVQ src+0(FP), DI // DI = src
  2791  	MOVQ dest+8(FP), SI // SI = dest
  2792  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2793  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2794  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2795  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2796  	JL   LBB58_1
  2797  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2798  LBB58_5:
  2799  	WORD $0xd089             // mov    eax, edx
  2800  	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
  2801  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2802  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2803  	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
  2804  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2805  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2806  	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
  2807  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2808  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2809  	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
  2810  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2811  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*2 bytes and dest by 4*8 bytes.
  2812  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2813  	LONG $0x08c78348         // add    rdi, 8
  2814  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2815  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2816  	JG   LBB58_5
  2817  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2818  LBB58_1:
  2819  	WORD $0xd285             // test    edx, edx
  2820  	JLE  LBB58_4
  2821  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2822  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2823  
        // One lookup per pass. r8 advances by 2 (one source element), so the 8-byte
        // destination element is addressed at rsi + 4*r8.
  2824  LBB58_3:
  2825  	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
  2826  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2827  	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
  2828  	LONG $0x02c08349             // add    r8, 2
  2829  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2830  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2831  	JG   LBB58_3
  2832  
  2833  LBB58_4:
  2834  	RET
  2835  
  2836  TEXT ·_transpose_int16_int64_sse4(SB), $0-32
        // For each of `length` elements: read a 2-byte value from src, sign-extend it to
        // 64 bits, use it as an index into transposeMap (4-byte entries), sign-extend the
        // 32-bit entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // A source value with the top bit set therefore yields a negative index; no
        // bounds check is made in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2837  
  2838  	MOVQ src+0(FP), DI // DI = src
  2839  	MOVQ dest+8(FP), SI // SI = dest
  2840  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2841  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2842  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2843  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2844  	JL   LBB59_1
  2845  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2846  LBB59_5:
  2847  	WORD $0xd089                 // mov    eax, edx
  2848  	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
  2849  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2850  	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
  2851  	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
  2852  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2853  	LONG $0x08568948             // mov    qword [rsi + 8], rdx
  2854  	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
  2855  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2856  	LONG $0x10568948             // mov    qword [rsi + 16], rdx
  2857  	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
  2858  	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
  2859  	LONG $0x18568948             // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*2 bytes and dest by 4*8 bytes.
  2860  	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
  2861  	LONG $0x08c78348             // add    rdi, 8
  2862  	LONG $0x20c68348             // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2863  	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
  2864  	JG   LBB59_5
  2865  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2866  LBB59_1:
  2867  	WORD $0xd285             // test    edx, edx
  2868  	JLE  LBB59_4
  2869  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2870  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2871  
        // One lookup per pass. r8 advances by 2 (one source element), so the 8-byte
        // destination element is addressed at rsi + 4*r8.
  2872  LBB59_3:
  2873  	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
  2874  	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
  2875  	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
  2876  	LONG $0x02c08349             // add    r8, 2
  2877  	WORD $0xc283; BYTE $0xff     // add    edx, -1
  2878  	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
  2879  	JG   LBB59_3
  2880  
  2881  LBB59_4:
  2882  	RET
  2883  
  2884  TEXT ·_transpose_uint32_int64_sse4(SB), $0-32
        // For each of `length` elements: read a 4-byte value from src with a plain
        // 32-bit load (which zero-extends into rdx/rax), use it as an index into
        // transposeMap (4-byte entries), sign-extend the 32-bit entry to 64 bits
        // (movsxd) and store the 8-byte result to dest.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2885  
  2886  	MOVQ src+0(FP), DI // DI = src
  2887  	MOVQ dest+8(FP), SI // SI = dest
  2888  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2889  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2890  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2891  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2892  	JL   LBB60_1
  2893  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2894  LBB60_5:
  2895  	WORD $0xd089             // mov    eax, edx
  2896  	WORD $0x178b             // mov    edx, dword [rdi]
  2897  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2898  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2899  	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
  2900  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2901  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2902  	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
  2903  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2904  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2905  	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
  2906  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2907  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*4 bytes and dest by 4*8 bytes.
  2908  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2909  	LONG $0x10c78348         // add    rdi, 16
  2910  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2911  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2912  	JG   LBB60_5
  2913  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2914  LBB60_1:
  2915  	WORD $0xd285             // test    edx, edx
  2916  	JLE  LBB60_4
  2917  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2918  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2919  
        // One lookup per pass. r8 advances by 4 (one source element), so the 8-byte
        // destination element is addressed at rsi + 2*r8.
  2920  LBB60_3:
  2921  	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
  2922  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  2923  	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
  2924  	LONG $0x04c08349         // add    r8, 4
  2925  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2926  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2927  	JG   LBB60_3
  2928  
  2929  LBB60_4:
  2930  	RET
  2931  
  2932  TEXT ·_transpose_int32_int64_sse4(SB), $0-32
        // For each of `length` elements: read a 4-byte value from src, sign-extend it to
        // 64 bits, use it as an index into transposeMap (4-byte entries), sign-extend the
        // 32-bit entry to 64 bits (movsxd) and store the 8-byte result to dest.
        // A source value with the top bit set therefore yields a negative index; no
        // bounds check is made in this routine.
        // The count is handled as a signed 32-bit value (edx); a count <= 0 stores nothing.
        // Instructions are emitted as raw WORD/LONG/BYTE data; trailing comments show the disassembly.
  2933  
  2934  	MOVQ src+0(FP), DI // DI = src
  2935  	MOVQ dest+8(FP), SI // SI = dest
  2936  	MOVQ length+16(FP), DX // DX = length (only edx, the low 32 bits, is tested below)
  2937  	MOVQ transposeMap+24(FP), CX // CX = transposeMap, always indexed with scale 4
  2938  
        	// Fewer than four elements (signed compare): skip the unrolled loop.
  2939  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2940  	JL   LBB61_1
  2941  
        // Unrolled loop: four lookups per pass. eax keeps the count seen at the top of
        // the pass because rdx is reused as the scratch register for each lookup.
  2942  LBB61_5:
  2943  	WORD $0xd089             // mov    eax, edx
  2944  	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
  2945  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2946  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2947  	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
  2948  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2949  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2950  	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
  2951  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2952  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  2953  	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
  2954  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2955  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = remaining count; advance src by 4*4 bytes and dest by 4*8 bytes.
  2956  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  2957  	LONG $0x10c78348         // add    rdi, 16
  2958  	LONG $0x20c68348         // add    rsi, 32
        	// The pre-decrement count being > 7 means at least four elements are left.
  2959  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  2960  	JG   LBB61_5
  2961  
        // Tail: return if nothing is left; otherwise bias the counter by +1 so the
        // loop below can exit on "counter <= 1", and zero the byte offset r8.
  2962  LBB61_1:
  2963  	WORD $0xd285             // test    edx, edx
  2964  	JLE  LBB61_4
  2965  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  2966  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  2967  
        // One lookup per pass. r8 advances by 4 (one source element), so the 8-byte
        // destination element is addressed at rsi + 2*r8.
  2968  LBB61_3:
  2969  	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
  2970  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  2971  	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
  2972  	LONG $0x04c08349         // add    r8, 4
  2973  	WORD $0xc283; BYTE $0xff // add    edx, -1
  2974  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  2975  	JG   LBB61_3
  2976  
  2977  LBB61_4:
  2978  	RET
  2979  
  2980  TEXT ·_transpose_uint64_int64_sse4(SB), $0-32
        // Maps every source element through transposeMap:
        //   dest[i] = transposeMap[src[i]]
        // Source elements are read as raw 8-byte values and used directly as
        // the index, map entries are read as 4-byte values and sign-extended
        // to 64 bits, and each result is stored as an 8-byte destination
        // element.
        //
        // Arguments (frame $0-32: no locals, 32 bytes of arguments):
        //   src+0(FP)           -> DI  pointer to the source elements
        //   dest+8(FP)          -> SI  pointer to the destination elements
        //   length+16(FP)       -> DX  element count; only the low 32 bits (EDX)
        //                              are examined, as a signed value
        //   transposeMap+24(FP) -> CX  pointer to the map entries
        //
        // No bounds check is made on the value used to index transposeMap.
        // Writes AX, DX, DI, SI, R8 and the flags; nothing is returned.
        // The encoded instructions are the same as those of
        // ·_transpose_int64_int64_sse4.
  2981  
  2982  	MOVQ src+0(FP), DI
  2983  	MOVQ dest+8(FP), SI
  2984  	MOVQ length+16(FP), DX
  2985  	MOVQ transposeMap+24(FP), CX
  2986  
        	// Fewer than 4 elements: skip the unrolled loop, go straight to the tail.
  2987  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  2988  	JL   LBB62_1
  2989  
        // Main loop, unrolled 4x. RDX is reused as the scratch register for the
        // loaded values, so the remaining count is parked in EAX on entry and
        // restored (minus 4) by the lea near the bottom.
  2990  LBB62_5:
  2991  	WORD $0xd089             // mov    eax, edx
  2992  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  2993  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2994  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  2995  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  2996  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  2997  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  2998  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  2999  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  3000  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  3001  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  3002  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  3003  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = count left after this pass; src and dest both advance by
        	// 4 elements of 8 bytes.
  3004  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  3005  	LONG $0x20c78348         // add    rdi, 32
  3006  	LONG $0x20c68348         // add    rsi, 32
        	// EAX still holds the count from before this pass: > 7 means at
        	// least 4 elements remain, so run another unrolled pass.
  3007  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  3008  	JG   LBB62_5
  3009  
        // Tail: process the leftover elements one at a time; return at once if
        // the remaining count is <= 0.
  3010  LBB62_1:
  3011  	WORD $0xd285             // test    edx, edx
  3012  	JLE  LBB62_4
        	// The counter is biased by +1 because the loop below decrements and
        	// then continues only while the result is > 1. R8 is the shared byte
        	// offset into src and dest, starting at 0.
  3013  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  3014  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  3015  
        // R8 advances 8 bytes per element and indexes src and dest unscaled,
        // since both use 8-byte elements.
  3016  LBB62_3:
  3017  	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
  3018  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  3019  	LONG $0x0604894a         // mov    qword [rsi + r8], rax
  3020  	LONG $0x08c08349         // add    r8, 8
  3021  	WORD $0xc283; BYTE $0xff // add    edx, -1
  3022  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  3023  	JG   LBB62_3
  3024  
  3025  LBB62_4:
  3026  	RET
  3027  
  3028  TEXT ·_transpose_int64_int64_sse4(SB), $0-32
        // Maps every source element through transposeMap:
        //   dest[i] = transposeMap[src[i]]
        // Source elements are read as raw 8-byte values and used directly as
        // the index, map entries are read as 4-byte values and sign-extended
        // to 64 bits, and each result is stored as an 8-byte destination
        // element.
        //
        // Arguments (frame $0-32: no locals, 32 bytes of arguments):
        //   src+0(FP)           -> DI  pointer to the source elements
        //   dest+8(FP)          -> SI  pointer to the destination elements
        //   length+16(FP)       -> DX  element count; only the low 32 bits (EDX)
        //                              are examined, as a signed value
        //   transposeMap+24(FP) -> CX  pointer to the map entries
        //
        // No bounds check is made on the value used to index transposeMap.
        // Writes AX, DX, DI, SI, R8 and the flags; nothing is returned.
        // The encoded instructions are the same as those of
        // ·_transpose_uint64_int64_sse4.
  3029  
  3030  	MOVQ src+0(FP), DI
  3031  	MOVQ dest+8(FP), SI
  3032  	MOVQ length+16(FP), DX
  3033  	MOVQ transposeMap+24(FP), CX
  3034  
        	// Fewer than 4 elements: skip the unrolled loop, go straight to the tail.
  3035  	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
  3036  	JL   LBB63_1
  3037  
        // Main loop, unrolled 4x. RDX is reused as the scratch register for the
        // loaded values, so the remaining count is parked in EAX on entry and
        // restored (minus 4) by the lea near the bottom.
  3038  LBB63_5:
  3039  	WORD $0xd089             // mov    eax, edx
  3040  	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
  3041  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  3042  	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
  3043  	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
  3044  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  3045  	LONG $0x08568948         // mov    qword [rsi + 8], rdx
  3046  	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
  3047  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  3048  	LONG $0x10568948         // mov    qword [rsi + 16], rdx
  3049  	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
  3050  	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
  3051  	LONG $0x18568948         // mov    qword [rsi + 24], rdx
        	// edx = count left after this pass; src and dest both advance by
        	// 4 elements of 8 bytes.
  3052  	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
  3053  	LONG $0x20c78348         // add    rdi, 32
  3054  	LONG $0x20c68348         // add    rsi, 32
        	// EAX still holds the count from before this pass: > 7 means at
        	// least 4 elements remain, so run another unrolled pass.
  3055  	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
  3056  	JG   LBB63_5
  3057  
        // Tail: process the leftover elements one at a time; return at once if
        // the remaining count is <= 0.
  3058  LBB63_1:
  3059  	WORD $0xd285             // test    edx, edx
  3060  	JLE  LBB63_4
        	// The counter is biased by +1 because the loop below decrements and
        	// then continues only while the result is > 1. R8 is the shared byte
        	// offset into src and dest, starting at 0.
  3061  	WORD $0xc283; BYTE $0x01 // add    edx, 1
  3062  	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
  3063  
        // R8 advances 8 bytes per element and indexes src and dest unscaled,
        // since both use 8-byte elements.
  3064  LBB63_3:
  3065  	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
  3066  	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
  3067  	LONG $0x0604894a         // mov    qword [rsi + r8], rax
  3068  	LONG $0x08c08349         // add    r8, 8
  3069  	WORD $0xc283; BYTE $0xff // add    edx, -1
  3070  	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
  3071  	JG   LBB63_3
  3072  
  3073  LBB63_4:
  3074  	RET