gitee.com/quant1x/num@v0.3.2/asm/floats_avx_amd64.s (about)

     1  //+build !noasm !appengine
     2  // AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
     3  
     4  TEXT ·___mm256_mul_const_add_to(SB), $0-32
            // ___mm256_mul_const_add_to(a, b, c, n)
            //
            // For every index i this routine stores c[i] = a[i]*b[0] + c[i], using fused
            // multiply-add on 4-byte single-precision elements (ps/ss instructions, index scale 4).
            //
            // Arguments (all read from the Go argument frame, 32 bytes in total):
            //   a+0(FP)  -> DI (rdi)  input vector, only ever loaded from
            //   b+8(FP)  -> SI (rsi)  address of the scalar multiplier; only the dword at [rsi] is read
            //   c+16(FP) -> DX (rdx)  accumulator vector, loaded and stored
            //   n+24(FP) -> CX (rcx)  element count
            // Nothing is written back to the argument frame (no result slot).
            //
            // Registers written: AX, CX, DX, DI, R8-R11, BX (only on the wide-tail path), Y0-Y4, flags.
            // The frame size is $0, so no stack locals are used.
            //
            // The instructions are emitted as raw opcode bytes; the trailing comment on each
            // line is the disassembly (Intel operand order) of those bytes.
     5  
     6      MOVQ a+0(FP), DI
     7      MOVQ b+8(FP), SI
     8      MOVQ c+16(FP), DX
     9      MOVQ n+24(FP), CX
    10  
            // Split n into whole 8-element blocks and a remainder with signed division that
            // rounds toward zero: the +7 bias is kept only when n is negative (cmovns replaces
            // it with n otherwise). Afterwards rax = n / 8 and rcx = n - 8*(n / 8).
    11      LONG $0x07418d4c             // lea    r8, [rcx + 7]
    12      WORD $0x8548; BYTE $0xc9     // test    rcx, rcx
    13      LONG $0xc1490f4c             // cmovns    r8, rcx
    14      WORD $0x894c; BYTE $0xc0     // mov    rax, r8
    15      LONG $0x03f8c148             // sar    rax, 3
    16      LONG $0xf8e08349             // and    r8, -8
    17      WORD $0x294c; BYTE $0xc1     // sub    rcx, r8
            // The block count is examined as a signed 32-bit value (eax); a count <= 0 skips
            // both block loops, and a count of exactly 1 skips the two-block loop.
    18      WORD $0xc085                 // test    eax, eax
    19  	JLE LBB0_6
    20      WORD $0xf883; BYTE $0x01     // cmp    eax, 1
    21  	JE LBB0_4
    22      WORD $0x8941; BYTE $0xc0     // mov    r8d, eax
    23      LONG $0xfee08341             // and    r8d, -2
            // Two 32-byte blocks (16 elements) per iteration. r8d holds the block count rounded
            // down to an even number and is decremented by 2 until it reaches zero.
            // vfmadd213ps computes ymm1 = ymm0*ymm1 + [mem], i.e. a*b[0] + c.
            // The scalar is re-broadcast from [rsi] for every block.
    24  LBB0_3:
    25      LONG $0x0710fcc5             // vmovups    ymm0, yword [rdi]
    26      LONG $0x187de2c4; BYTE $0x0e // vbroadcastss    ymm1, dword [rsi]
    27      LONG $0xa87de2c4; BYTE $0x0a // vfmadd213ps    ymm1, ymm0, yword [rdx]
    28      LONG $0x0a11fcc5             // vmovups    yword [rdx], ymm1
    29      LONG $0x4710fcc5; BYTE $0x20 // vmovups    ymm0, yword [rdi + 32]
    30      LONG $0x187de2c4; BYTE $0x0e // vbroadcastss    ymm1, dword [rsi]
    31      LONG $0xa87de2c4; WORD $0x204a // vfmadd213ps    ymm1, ymm0, yword [rdx + 32]
    32      LONG $0x4a11fcc5; BYTE $0x20 // vmovups    yword [rdx + 32], ymm1
    33      LONG $0x40c78348             // add    rdi, 64
    34      LONG $0x40c28348             // add    rdx, 64
    35      LONG $0xfec08341             // add    r8d, -2
    36  	JNE LBB0_3
            // If the block count is odd (low bit of al set), handle the one leftover block.
    37  LBB0_4:
    38      WORD $0x01a8                 // test    al, 1
    39  	JE LBB0_6
    40      LONG $0x0710fcc5             // vmovups    ymm0, yword [rdi]
    41      LONG $0x187de2c4; BYTE $0x0e // vbroadcastss    ymm1, dword [rsi]
    42      LONG $0xa87de2c4; BYTE $0x0a // vfmadd213ps    ymm1, ymm0, yword [rdx]
    43      LONG $0x0a11fcc5             // vmovups    yword [rdx], ymm1
    44      LONG $0x20c78348             // add    rdi, 32
    45      LONG $0x20c28348             // add    rdx, 32
            // Element tail. rdi/rdx now point just past the processed blocks and ecx holds the
            // remainder; nothing is left to do when it is <= 0. rax becomes the zero-extended
            // tail length.
            // NOTE(review): the remainder computed above lies in [-7, 7], so the
            // "tail >= 32" branch to LBB0_9 looks unreachable with this prologue — confirm.
    46  LBB0_6:
    47      WORD $0xc985                 // test    ecx, ecx
    48  	JLE LBB0_18
    49      WORD $0xc889                 // mov    eax, ecx
    50      LONG $0x20f88348             // cmp    rax, 32
    51  	JAE LBB0_9
    52      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
    53  	JMP LBB0_14
            // Wide-tail path (tail length >= 32). Before hoisting the broadcast of b[0] out of
            // the loop, check at run time that the tail of c does not overlap the tail of a
            // (r11b & bl) and does not overlap the 4 bytes at b (r9b & r10b). On any overlap
            // fall back to the scalar loop with start index r8 = 0.
            // BX is written here (setb bl) and is not saved or restored.
    54  LBB0_9:
    55      LONG $0x82048d4c             // lea    r8, [rdx + 4*rax]
    56      LONG $0x870c8d4c             // lea    r9, [rdi + 4*rax]
    57      LONG $0x04568d4c             // lea    r10, [rsi + 4]
    58      WORD $0x394c; BYTE $0xca     // cmp    rdx, r9
    59      LONG $0xd3920f41             // setb    r11b
    60      WORD $0x394c; BYTE $0xc7     // cmp    rdi, r8
    61      WORD $0x920f; BYTE $0xd3     // setb    bl
    62      WORD $0x394c; BYTE $0xd2     // cmp    rdx, r10
    63      LONG $0xd1920f41             // setb    r9b
    64      WORD $0x3949; BYTE $0xf0     // cmp    r8, rsi
    65      LONG $0xd2970f41             // seta    r10b
    66      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
    67      WORD $0x8441; BYTE $0xdb     // test    r11b, bl
    68  	JNE LBB0_14
    69      WORD $0x2045; BYTE $0xd1     // and    r9b, r10b
    70  	JNE LBB0_14
            // r9 = tail length mod 32 (elements left for the scalar loop),
            // r8 = tail length rounded down to a multiple of 32 (vector loop bound),
            // ymm0 = b[0] broadcast once, r10 = running element index.
    71      WORD $0x8941; BYTE $0xc9     // mov    r9d, ecx
    72      LONG $0x1fe18341             // and    r9d, 31
    73      WORD $0x8949; BYTE $0xc0     // mov    r8, rax
    74      WORD $0x294d; BYTE $0xc8     // sub    r8, r9
    75      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
    76      WORD $0x3145; BYTE $0xd2     // xor    r10d, r10d
            // 32 elements (four YMM registers) per iteration: load a, fused multiply-add
            // with c from memory, store back to c.
    77  LBB0_12:
    78      LONG $0x107ca1c4; WORD $0x970c // vmovups    ymm1, yword [rdi + 4*r10]
    79      LONG $0x107ca1c4; WORD $0x9754; BYTE $0x20 // vmovups    ymm2, yword [rdi + 4*r10 + 32]
    80      LONG $0x107ca1c4; WORD $0x975c; BYTE $0x40 // vmovups    ymm3, yword [rdi + 4*r10 + 64]
    81      LONG $0x107ca1c4; WORD $0x9764; BYTE $0x60 // vmovups    ymm4, yword [rdi + 4*r10 + 96]
    82      LONG $0xa87da2c4; WORD $0x920c // vfmadd213ps    ymm1, ymm0, yword [rdx + 4*r10]
    83      LONG $0xa87da2c4; WORD $0x9254; BYTE $0x20 // vfmadd213ps    ymm2, ymm0, yword [rdx + 4*r10 + 32]
    84      LONG $0xa87da2c4; WORD $0x925c; BYTE $0x40 // vfmadd213ps    ymm3, ymm0, yword [rdx + 4*r10 + 64]
    85      LONG $0xa87da2c4; WORD $0x9264; BYTE $0x60 // vfmadd213ps    ymm4, ymm0, yword [rdx + 4*r10 + 96]
    86      LONG $0x117ca1c4; WORD $0x920c // vmovups    yword [rdx + 4*r10], ymm1
    87      LONG $0x117ca1c4; WORD $0x9254; BYTE $0x20 // vmovups    yword [rdx + 4*r10 + 32], ymm2
    88      LONG $0x117ca1c4; WORD $0x925c; BYTE $0x40 // vmovups    yword [rdx + 4*r10 + 64], ymm3
    89      LONG $0x117ca1c4; WORD $0x9264; BYTE $0x60 // vmovups    yword [rdx + 4*r10 + 96], ymm4
    90      LONG $0x20c28349             // add    r10, 32
    91      WORD $0x394d; BYTE $0xd0     // cmp    r8, r10
    92  	JNE LBB0_12
    93      WORD $0x854d; BYTE $0xc9     // test    r9, r9
    94  	JE LBB0_18
            // Scalar tail. On entry r8 = index of the first unprocessed tail element and
            // rax = tail length. ecx becomes the number of elements still to do; if that
            // number is odd, one element is handled here so the loop below can go by twos.
            // r9 = r8 + 1 is the index that follows that single element.
    95  LBB0_14:
    96      WORD $0x2944; BYTE $0xc1     // sub    ecx, r8d
    97      LONG $0x01488d4d             // lea    r9, [r8 + 1]
    98      WORD $0xc1f6; BYTE $0x01     // test    cl, 1
    99  	JE LBB0_16
   100      LONG $0x107aa1c4; WORD $0x8704 // vmovss    xmm0, dword [rdi + 4*r8]
   101      LONG $0x0e10fac5             // vmovss    xmm1, dword [rsi]
   102      LONG $0xa979a2c4; WORD $0x820c // vfmadd213ss    xmm1, xmm0, dword [rdx + 4*r8]
   103      LONG $0x117aa1c4; WORD $0x820c // vmovss    dword [rdx + 4*r8], xmm1
   104      WORD $0x894d; BYTE $0xc8     // mov    r8, r9
            // Finished if the tail ended at index r9 (exactly one element was left).
   105  LBB0_16:
   106      WORD $0x394c; BYTE $0xc8     // cmp    rax, r9
   107  	JE LBB0_18
            // Two scalar elements per iteration until the index r8 reaches the tail length.
   108  LBB0_17:
   109      LONG $0x107aa1c4; WORD $0x8704 // vmovss    xmm0, dword [rdi + 4*r8]
   110      LONG $0x0e10fac5             // vmovss    xmm1, dword [rsi]
   111      LONG $0xa979a2c4; WORD $0x820c // vfmadd213ss    xmm1, xmm0, dword [rdx + 4*r8]
   112      LONG $0x117aa1c4; WORD $0x820c // vmovss    dword [rdx + 4*r8], xmm1
   113      LONG $0x107aa1c4; WORD $0x8744; BYTE $0x04 // vmovss    xmm0, dword [rdi + 4*r8 + 4]
   114      LONG $0x0e10fac5             // vmovss    xmm1, dword [rsi]
   115      LONG $0xa979a2c4; WORD $0x824c; BYTE $0x04 // vfmadd213ss    xmm1, xmm0, dword [rdx + 4*r8 + 4]
   116      LONG $0x117aa1c4; WORD $0x824c; BYTE $0x04 // vmovss    dword [rdx + 4*r8 + 4], xmm1
   117      LONG $0x02c08349             // add    r8, 2
   118      WORD $0x394c; BYTE $0xc0     // cmp    rax, r8
   119  	JNE LBB0_17
            // Common exit: zero the upper halves of the YMM registers before returning to
            // the caller.
   120  LBB0_18:
   121      VZEROUPPER
   122      RET
   123  
   124  
   125  
   126  
   127  TEXT ·___mm256_mul_const_to(SB), $0-32
            // ___mm256_mul_const_to(a, b, c, n)
            //
            // For every index i this routine stores c[i] = b[0] * a[i] on 4-byte
            // single-precision elements (ps/ss instructions, index scale 4).
            //
            // Arguments (all read from the Go argument frame, 32 bytes in total):
            //   a+0(FP)  -> DI (rdi)  input vector, only ever loaded from
            //   b+8(FP)  -> SI (rsi)  address of the scalar multiplier; only the dword at [rsi] is read
            //   c+16(FP) -> DX (rdx)  output vector, only ever stored to
            //   n+24(FP) -> CX (rcx)  element count
            // Nothing is written back to the argument frame (no result slot).
            //
            // Registers written: AX, CX, DX, DI, R8-R11, BX (only on the wide-tail path), Y0-Y4, flags.
            // The frame size is $0, so no stack locals are used.
            //
            // The instructions are emitted as raw opcode bytes; the trailing comment on each
            // line is the disassembly (Intel operand order) of those bytes.
   128  
   129      MOVQ a+0(FP), DI
   130      MOVQ b+8(FP), SI
   131      MOVQ c+16(FP), DX
   132      MOVQ n+24(FP), CX
   133  
            // Signed split of n, rounding toward zero (the +7 bias survives only for negative
            // n): rax = n / 8 whole 8-element blocks, rcx = n - 8*(n / 8) leftover elements.
   134      LONG $0x07418d4c             // lea    r8, [rcx + 7]
   135      WORD $0x8548; BYTE $0xc9     // test    rcx, rcx
   136      LONG $0xc1490f4c             // cmovns    r8, rcx
   137      WORD $0x894c; BYTE $0xc0     // mov    rax, r8
   138      LONG $0x03f8c148             // sar    rax, 3
   139      LONG $0xf8e08349             // and    r8, -8
   140      WORD $0x294c; BYTE $0xc1     // sub    rcx, r8
            // The block count is examined as a signed 32-bit value (eax); a count <= 0 skips
            // both block loops, and a count of exactly 1 skips the two-block loop.
   141      WORD $0xc085                 // test    eax, eax
   142  	JLE LBB1_6
   143      WORD $0xf883; BYTE $0x01     // cmp    eax, 1
   144  	JE LBB1_4
   145      WORD $0x8941; BYTE $0xc0     // mov    r8d, eax
   146      LONG $0xfee08341             // and    r8d, -2
            // Two 32-byte blocks (16 elements) per iteration; r8d is the block count rounded
            // down to an even number and counts down by 2. The scalar is re-broadcast from
            // [rsi] for every block.
   147  LBB1_3:
   148      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   149      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   150      LONG $0x0211fcc5             // vmovups    yword [rdx], ymm0
   151      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   152      LONG $0x4759fcc5; BYTE $0x20 // vmulps    ymm0, ymm0, yword [rdi + 32]
   153      LONG $0x4211fcc5; BYTE $0x20 // vmovups    yword [rdx + 32], ymm0
   154      LONG $0x40c78348             // add    rdi, 64
   155      LONG $0x40c28348             // add    rdx, 64
   156      LONG $0xfec08341             // add    r8d, -2
   157  	JNE LBB1_3
            // If the block count is odd (low bit of al set), handle the one leftover block.
   158  LBB1_4:
   159      WORD $0x01a8                 // test    al, 1
   160  	JE LBB1_6
   161      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   162      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   163      LONG $0x0211fcc5             // vmovups    yword [rdx], ymm0
   164      LONG $0x20c78348             // add    rdi, 32
   165      LONG $0x20c28348             // add    rdx, 32
            // Element tail. rdi/rdx point just past the processed blocks and ecx holds the
            // remainder; nothing is left to do when it is <= 0. rax becomes the zero-extended
            // tail length.
            // NOTE(review): the remainder computed above lies in [-7, 7], so the
            // "tail >= 32" branch to LBB1_9 looks unreachable with this prologue — confirm.
   166  LBB1_6:
   167      WORD $0xc985                 // test    ecx, ecx
   168  	JLE LBB1_18
   169      WORD $0xc889                 // mov    eax, ecx
   170      LONG $0x20f88348             // cmp    rax, 32
   171  	JAE LBB1_9
   172      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
   173  	JMP LBB1_14
            // Wide-tail path (tail length >= 32). Run-time overlap checks: the tail of c must
            // not overlap the tail of a (r11b & bl) nor the 4 bytes at b (r9b & r10b);
            // otherwise fall back to the scalar loops with start index r8 = 0.
            // BX is written here (setb bl) and is not saved or restored.
   174  LBB1_9:
   175      LONG $0x82048d4c             // lea    r8, [rdx + 4*rax]
   176      LONG $0x870c8d4c             // lea    r9, [rdi + 4*rax]
   177      LONG $0x04568d4c             // lea    r10, [rsi + 4]
   178      WORD $0x394c; BYTE $0xca     // cmp    rdx, r9
   179      LONG $0xd3920f41             // setb    r11b
   180      WORD $0x394c; BYTE $0xc7     // cmp    rdi, r8
   181      WORD $0x920f; BYTE $0xd3     // setb    bl
   182      WORD $0x394c; BYTE $0xd2     // cmp    rdx, r10
   183      LONG $0xd1920f41             // setb    r9b
   184      WORD $0x3949; BYTE $0xf0     // cmp    r8, rsi
   185      LONG $0xd2970f41             // seta    r10b
   186      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
   187      WORD $0x8441; BYTE $0xdb     // test    r11b, bl
   188  	JNE LBB1_14
   189      WORD $0x2045; BYTE $0xd1     // and    r9b, r10b
   190  	JNE LBB1_14
            // r9 = tail length mod 32 (left for the scalar loops),
            // r8 = tail length rounded down to a multiple of 32 (vector loop bound),
            // ymm0 = b[0] broadcast once, r10 = running element index.
   191      WORD $0x8941; BYTE $0xc9     // mov    r9d, ecx
   192      LONG $0x1fe18341             // and    r9d, 31
   193      WORD $0x8949; BYTE $0xc0     // mov    r8, rax
   194      WORD $0x294d; BYTE $0xc8     // sub    r8, r9
   195      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   196      WORD $0x3145; BYTE $0xd2     // xor    r10d, r10d
            // 32 elements (four YMM registers) per iteration.
   197  LBB1_12:
   198      LONG $0x597ca1c4; WORD $0x970c // vmulps    ymm1, ymm0, yword [rdi + 4*r10]
   199      LONG $0x597ca1c4; WORD $0x9754; BYTE $0x20 // vmulps    ymm2, ymm0, yword [rdi + 4*r10 + 32]
   200      LONG $0x597ca1c4; WORD $0x975c; BYTE $0x40 // vmulps    ymm3, ymm0, yword [rdi + 4*r10 + 64]
   201      LONG $0x597ca1c4; WORD $0x9764; BYTE $0x60 // vmulps    ymm4, ymm0, yword [rdi + 4*r10 + 96]
   202      LONG $0x117ca1c4; WORD $0x920c // vmovups    yword [rdx + 4*r10], ymm1
   203      LONG $0x117ca1c4; WORD $0x9254; BYTE $0x20 // vmovups    yword [rdx + 4*r10 + 32], ymm2
   204      LONG $0x117ca1c4; WORD $0x925c; BYTE $0x40 // vmovups    yword [rdx + 4*r10 + 64], ymm3
   205      LONG $0x117ca1c4; WORD $0x9264; BYTE $0x60 // vmovups    yword [rdx + 4*r10 + 96], ymm4
   206      LONG $0x20c28349             // add    r10, 32
   207      WORD $0x394d; BYTE $0xd0     // cmp    r8, r10
   208  	JNE LBB1_12
   209      WORD $0x854d; BYTE $0xc9     // test    r9, r9
   210  	JE LBB1_18
            // Scalar tail. On entry r8 = index of the first unprocessed tail element and
            // rax = tail length. r9 = rax - r8 - 1 (remaining elements minus one, via not/add)
            // and rcx = remaining elements mod 4, which is how many single steps LBB1_15
            // runs before the four-at-a-time loop.
   211  LBB1_14:
   212      WORD $0x2944; BYTE $0xc1     // sub    ecx, r8d
   213      WORD $0x894d; BYTE $0xc1     // mov    r9, r8
   214      WORD $0xf749; BYTE $0xd1     // not    r9
   215      WORD $0x0149; BYTE $0xc1     // add    r9, rax
   216      LONG $0x03e18348             // and    rcx, 3
   217  	JE LBB1_16
            // One element per iteration, rcx times.
   218  LBB1_15:
   219      LONG $0x0610fac5             // vmovss    xmm0, dword [rsi]
   220      LONG $0x597aa1c4; WORD $0x8704 // vmulss    xmm0, xmm0, dword [rdi + 4*r8]
   221      LONG $0x117aa1c4; WORD $0x8204 // vmovss    dword [rdx + 4*r8], xmm0
   222      WORD $0xff49; BYTE $0xc0     // inc    r8
   223      WORD $0xff48; BYTE $0xc9     // dec    rcx
   224  	JNE LBB1_15
            // Fewer than four elements were remaining in total: the single steps covered them.
   225  LBB1_16:
   226      LONG $0x03f98349             // cmp    r9, 3
   227  	JB LBB1_18
            // Four scalar elements per iteration until the index r8 reaches the tail length.
   228  LBB1_17:
   229      LONG $0x0610fac5             // vmovss    xmm0, dword [rsi]
   230      LONG $0x597aa1c4; WORD $0x8704 // vmulss    xmm0, xmm0, dword [rdi + 4*r8]
   231      LONG $0x117aa1c4; WORD $0x8204 // vmovss    dword [rdx + 4*r8], xmm0
   232      LONG $0x0610fac5             // vmovss    xmm0, dword [rsi]
   233      LONG $0x597aa1c4; WORD $0x8744; BYTE $0x04 // vmulss    xmm0, xmm0, dword [rdi + 4*r8 + 4]
   234      LONG $0x117aa1c4; WORD $0x8244; BYTE $0x04 // vmovss    dword [rdx + 4*r8 + 4], xmm0
   235      LONG $0x0610fac5             // vmovss    xmm0, dword [rsi]
   236      LONG $0x597aa1c4; WORD $0x8744; BYTE $0x08 // vmulss    xmm0, xmm0, dword [rdi + 4*r8 + 8]
   237      LONG $0x117aa1c4; WORD $0x8244; BYTE $0x08 // vmovss    dword [rdx + 4*r8 + 8], xmm0
   238      LONG $0x0610fac5             // vmovss    xmm0, dword [rsi]
   239      LONG $0x597aa1c4; WORD $0x8744; BYTE $0x0c // vmulss    xmm0, xmm0, dword [rdi + 4*r8 + 12]
   240      LONG $0x117aa1c4; WORD $0x8244; BYTE $0x0c // vmovss    dword [rdx + 4*r8 + 12], xmm0
   241      LONG $0x04c08349             // add    r8, 4
   242      WORD $0x394c; BYTE $0xc0     // cmp    rax, r8
   243  	JNE LBB1_17
            // Common exit: zero the upper halves of the YMM registers before returning to
            // the caller.
   244  LBB1_18:
   245      VZEROUPPER
   246      RET
   247  
   248  
   249  
   250  
   251  TEXT ·___mm256_mul_const(SB), $0-24
            // ___mm256_mul_const(a, b, n)
            //
            // Scales a vector in place: for every index i, a[i] = a[i] * b[0], on 4-byte
            // single-precision elements (ps/ss instructions, index scale 4).
            //
            // Arguments (all read from the Go argument frame, 24 bytes in total):
            //   a+0(FP)  -> DI (rdi)  vector that is both loaded from and stored to
            //   b+8(FP)  -> SI (rsi)  address of the scalar multiplier; only the dword at [rsi] is read
            //   n+16(FP) -> DX (rdx)  element count
            // Nothing is written back to the argument frame (no result slot).
            //
            // Registers written: AX, CX, DX, DI, R8, R9, Y0-Y4, flags.
            // The frame size is $0, so no stack locals are used.
            //
            // The instructions are emitted as raw opcode bytes; the trailing comment on each
            // line is the disassembly (Intel operand order) of those bytes.
   252  
   253      MOVQ a+0(FP), DI
   254      MOVQ b+8(FP), SI
   255      MOVQ n+16(FP), DX
   256  
            // Signed split of n, rounding toward zero (the +7 bias survives only for negative
            // n): rax = n / 8 whole 8-element blocks, rdx = n - 8*(n / 8) leftover elements.
   257      LONG $0x074a8d48             // lea    rcx, [rdx + 7]
   258      WORD $0x8548; BYTE $0xd2     // test    rdx, rdx
   259      LONG $0xca490f48             // cmovns    rcx, rdx
   260      WORD $0x8948; BYTE $0xc8     // mov    rax, rcx
   261      LONG $0x03f8c148             // sar    rax, 3
   262      LONG $0xf8e18348             // and    rcx, -8
   263      WORD $0x2948; BYTE $0xca     // sub    rdx, rcx
            // The block count is examined as a signed 32-bit value (eax); a count <= 0 skips
            // both block loops, and a count of exactly 1 skips the two-block loop.
   264      WORD $0xc085                 // test    eax, eax
   265  	JLE LBB2_6
   266      WORD $0xf883; BYTE $0x01     // cmp    eax, 1
   267  	JE LBB2_4
   268      WORD $0xc189                 // mov    ecx, eax
   269      WORD $0xe183; BYTE $0xfe     // and    ecx, -2
            // Two 32-byte blocks (16 elements) per iteration; ecx is the block count rounded
            // down to an even number and counts down by 2. The scalar is re-broadcast from
            // [rsi] for every block.
   270  LBB2_3:
   271      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   272      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   273      LONG $0x0711fcc5             // vmovups    yword [rdi], ymm0
   274      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   275      LONG $0x4759fcc5; BYTE $0x20 // vmulps    ymm0, ymm0, yword [rdi + 32]
   276      LONG $0x4711fcc5; BYTE $0x20 // vmovups    yword [rdi + 32], ymm0
   277      LONG $0x40c78348             // add    rdi, 64
   278      WORD $0xc183; BYTE $0xfe     // add    ecx, -2
   279  	JNE LBB2_3
            // If the block count is odd (low bit of al set), handle the one leftover block.
   280  LBB2_4:
   281      WORD $0x01a8                 // test    al, 1
   282  	JE LBB2_6
   283      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   284      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   285      LONG $0x0711fcc5             // vmovups    yword [rdi], ymm0
   286      LONG $0x20c78348             // add    rdi, 32
            // Element tail. rdi points just past the processed blocks and edx holds the
            // remainder; nothing is left to do when it is <= 0. rax becomes the zero-extended
            // tail length. A tail shorter than 32 goes straight to the scalar code; otherwise
            // the wide path at LBB2_12 is taken only if the tail of a does not overlap the
            // 4 bytes at b (a >= b+4, or a+4*len <= b).
            // NOTE(review): the remainder computed above lies in [-7, 7], so the wide path
            // looks unreachable with this prologue — confirm.
   287  LBB2_6:
   288      WORD $0xd285                 // test    edx, edx
   289  	JLE LBB2_19
   290      WORD $0xd089                 // mov    eax, edx
   291      LONG $0x20f88348             // cmp    rax, 32
   292  	JB LBB2_8
   293      LONG $0x044e8d48             // lea    rcx, [rsi + 4]
   294      WORD $0x3948; BYTE $0xcf     // cmp    rdi, rcx
   295  	JAE LBB2_12
   296      LONG $0x870c8d48             // lea    rcx, [rdi + 4*rax]
   297      WORD $0x3948; BYTE $0xf1     // cmp    rcx, rsi
   298  	JBE LBB2_12
            // Scalar tail starting from element index 0.
   299  LBB2_8:
   300      WORD $0xc931                 // xor    ecx, ecx
            // Scalar tail entry. rcx = index of the first unprocessed tail element and
            // rax = tail length. r8 = rax - rcx - 1 (remaining elements minus one, via
            // not/add) and rdx = remaining elements mod 4, which is how many single steps
            // LBB2_16 runs before the four-at-a-time loop.
   301  LBB2_15:
   302      WORD $0xca29                 // sub    edx, ecx
   303      WORD $0x8949; BYTE $0xc8     // mov    r8, rcx
   304      WORD $0xf749; BYTE $0xd0     // not    r8
   305      WORD $0x0149; BYTE $0xc0     // add    r8, rax
   306      LONG $0x03e28348             // and    rdx, 3
   307  	JE LBB2_17
            // One element per iteration, rdx times.
   308  LBB2_16:
   309      LONG $0x0410fac5; BYTE $0x8f // vmovss    xmm0, dword [rdi + 4*rcx]
   310      LONG $0x0659fac5             // vmulss    xmm0, xmm0, dword [rsi]
   311      LONG $0x0411fac5; BYTE $0x8f // vmovss    dword [rdi + 4*rcx], xmm0
   312      WORD $0xff48; BYTE $0xc1     // inc    rcx
   313      WORD $0xff48; BYTE $0xca     // dec    rdx
   314  	JNE LBB2_16
            // Fewer than four elements were remaining in total: the single steps covered them.
   315  LBB2_17:
   316      LONG $0x03f88349             // cmp    r8, 3
   317  	JB LBB2_19
            // Four scalar elements per iteration until the index rcx reaches the tail length.
            // The scalar at [rsi] is re-read for every multiply.
   318  LBB2_18:
   319      LONG $0x0410fac5; BYTE $0x8f // vmovss    xmm0, dword [rdi + 4*rcx]
   320      LONG $0x0659fac5             // vmulss    xmm0, xmm0, dword [rsi]
   321      LONG $0x4c10fac5; WORD $0x048f // vmovss    xmm1, dword [rdi + 4*rcx + 4]
   322      LONG $0x0411fac5; BYTE $0x8f // vmovss    dword [rdi + 4*rcx], xmm0
   323      LONG $0x0659f2c5             // vmulss    xmm0, xmm1, dword [rsi]
   324      LONG $0x4411fac5; WORD $0x048f // vmovss    dword [rdi + 4*rcx + 4], xmm0
   325      LONG $0x4410fac5; WORD $0x088f // vmovss    xmm0, dword [rdi + 4*rcx + 8]
   326      LONG $0x0659fac5             // vmulss    xmm0, xmm0, dword [rsi]
   327      LONG $0x4411fac5; WORD $0x088f // vmovss    dword [rdi + 4*rcx + 8], xmm0
   328      LONG $0x4410fac5; WORD $0x0c8f // vmovss    xmm0, dword [rdi + 4*rcx + 12]
   329      LONG $0x0659fac5             // vmulss    xmm0, xmm0, dword [rsi]
   330      LONG $0x4411fac5; WORD $0x0c8f // vmovss    dword [rdi + 4*rcx + 12], xmm0
   331      LONG $0x04c18348             // add    rcx, 4
   332      WORD $0x3948; BYTE $0xc8     // cmp    rax, rcx
   333  	JNE LBB2_18
   334  	JMP LBB2_19
            // Wide-tail path: r8 = tail length mod 32 (left for the scalar code),
            // rcx = tail length rounded down to a multiple of 32 (vector loop bound and the
            // scalar start index afterwards), ymm0 = b[0] broadcast once, r9 = running index.
   335  LBB2_12:
   336      WORD $0x8941; BYTE $0xd0     // mov    r8d, edx
   337      LONG $0x1fe08341             // and    r8d, 31
   338      WORD $0x8948; BYTE $0xc1     // mov    rcx, rax
   339      WORD $0x294c; BYTE $0xc1     // sub    rcx, r8
   340      LONG $0x187de2c4; BYTE $0x06 // vbroadcastss    ymm0, dword [rsi]
   341      WORD $0x3145; BYTE $0xc9     // xor    r9d, r9d
            // 32 elements (four YMM registers) per iteration, multiplied in place.
   342  LBB2_13:
   343      LONG $0x597ca1c4; WORD $0x8f0c // vmulps    ymm1, ymm0, yword [rdi + 4*r9]
   344      LONG $0x597ca1c4; WORD $0x8f54; BYTE $0x20 // vmulps    ymm2, ymm0, yword [rdi + 4*r9 + 32]
   345      LONG $0x597ca1c4; WORD $0x8f5c; BYTE $0x40 // vmulps    ymm3, ymm0, yword [rdi + 4*r9 + 64]
   346      LONG $0x597ca1c4; WORD $0x8f64; BYTE $0x60 // vmulps    ymm4, ymm0, yword [rdi + 4*r9 + 96]
   347      LONG $0x117ca1c4; WORD $0x8f0c // vmovups    yword [rdi + 4*r9], ymm1
   348      LONG $0x117ca1c4; WORD $0x8f54; BYTE $0x20 // vmovups    yword [rdi + 4*r9 + 32], ymm2
   349      LONG $0x117ca1c4; WORD $0x8f5c; BYTE $0x40 // vmovups    yword [rdi + 4*r9 + 64], ymm3
   350      LONG $0x117ca1c4; WORD $0x8f64; BYTE $0x60 // vmovups    yword [rdi + 4*r9 + 96], ymm4
   351      LONG $0x20c18349             // add    r9, 32
   352      WORD $0x394c; BYTE $0xc9     // cmp    rcx, r9
   353  	JNE LBB2_13
            // Leftover elements after the vector loop go back through the scalar code.
   354      WORD $0x854d; BYTE $0xc0     // test    r8, r8
   355  	JNE LBB2_15
            // Common exit: zero the upper halves of the YMM registers before returning to
            // the caller.
   356  LBB2_19:
   357      VZEROUPPER
   358      RET
   359  
   360  
   361  
   362  
   363  TEXT ·___mm256_mul_to(SB), $0-32
            // ___mm256_mul_to(a, b, c, n)
            //
            // Element-wise product: for every index i, c[i] = b[i] * a[i], on 4-byte
            // single-precision elements (ps/ss instructions, index scale 4).
            //
            // Arguments (all read from the Go argument frame, 32 bytes in total):
            //   a+0(FP)  -> DI (rdi)  first input vector, only ever loaded from
            //   b+8(FP)  -> SI (rsi)  second input vector, only ever loaded from
            //   c+16(FP) -> DX (rdx)  output vector, only ever stored to
            //   n+24(FP) -> CX (rcx)  element count
            // Nothing is written back to the argument frame (no result slot).
            //
            // Registers written: AX, CX, DX, SI, DI, R8-R10, Y0-Y3, flags.
            // The frame size is $0, so no stack locals are used.
            //
            // The instructions are emitted as raw opcode bytes; the trailing comment on each
            // line is the disassembly (Intel operand order) of those bytes.
   364  
   365      MOVQ a+0(FP), DI
   366      MOVQ b+8(FP), SI
   367      MOVQ c+16(FP), DX
   368      MOVQ n+24(FP), CX
   369  
            // Signed split of n, rounding toward zero (the +7 bias survives only for negative
            // n): r8 = n / 8 whole 8-element blocks, rcx = n - 8*(n / 8) leftover elements.
   370      LONG $0x07418d48             // lea    rax, [rcx + 7]
   371      WORD $0x8548; BYTE $0xc9     // test    rcx, rcx
   372      LONG $0xc1490f48             // cmovns    rax, rcx
   373      WORD $0x8949; BYTE $0xc0     // mov    r8, rax
   374      LONG $0x03f8c149             // sar    r8, 3
   375      LONG $0xf8e08348             // and    rax, -8
   376      WORD $0x2948; BYTE $0xc1     // sub    rcx, rax
            // The block count is examined as a signed 32-bit value (r8d); a count <= 0 skips
            // the block loops. eax = block count mod 4 (blocks left for the one-at-a-time
            // loop); fewer than 4 blocks skips the four-block loop.
   377      WORD $0x8545; BYTE $0xc0     // test    r8d, r8d
   378  	JLE LBB3_6
   379      WORD $0x8944; BYTE $0xc0     // mov    eax, r8d
   380      WORD $0xe083; BYTE $0x03     // and    eax, 3
   381      LONG $0x04f88341             // cmp    r8d, 4
   382  	JB LBB3_4
   383      LONG $0xfce08341             // and    r8d, -4
            // Four 32-byte blocks (32 elements) per iteration; r8d is the block count rounded
            // down to a multiple of 4 and counts down by 4. The pointers advance by 128 bytes,
            // written as "sub reg, -128" because -128 fits a sign-extended 8-bit immediate
            // while +128 does not.
   384  LBB3_3:
   385      LONG $0x0610fcc5             // vmovups    ymm0, yword [rsi]
   386      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   387      LONG $0x0211fcc5             // vmovups    yword [rdx], ymm0
   388      LONG $0x4610fcc5; BYTE $0x20 // vmovups    ymm0, yword [rsi + 32]
   389      LONG $0x4759fcc5; BYTE $0x20 // vmulps    ymm0, ymm0, yword [rdi + 32]
   390      LONG $0x4211fcc5; BYTE $0x20 // vmovups    yword [rdx + 32], ymm0
   391      LONG $0x4610fcc5; BYTE $0x40 // vmovups    ymm0, yword [rsi + 64]
   392      LONG $0x4759fcc5; BYTE $0x40 // vmulps    ymm0, ymm0, yword [rdi + 64]
   393      LONG $0x4211fcc5; BYTE $0x40 // vmovups    yword [rdx + 64], ymm0
   394      LONG $0x4610fcc5; BYTE $0x60 // vmovups    ymm0, yword [rsi + 96]
   395      LONG $0x4759fcc5; BYTE $0x60 // vmulps    ymm0, ymm0, yword [rdi + 96]
   396      LONG $0x4211fcc5; BYTE $0x60 // vmovups    yword [rdx + 96], ymm0
   397      LONG $0x80ef8348             // sub    rdi, -128
   398      LONG $0x80ee8348             // sub    rsi, -128
   399      LONG $0x80ea8348             // sub    rdx, -128
   400      LONG $0xfcc08341             // add    r8d, -4
   401  	JNE LBB3_3
            // Remaining 0..3 blocks (eax), one 32-byte block per iteration.
   402  LBB3_4:
   403      WORD $0xc085                 // test    eax, eax
   404  	JE LBB3_6
   405  LBB3_5:
   406      LONG $0x0610fcc5             // vmovups    ymm0, yword [rsi]
   407      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   408      LONG $0x0211fcc5             // vmovups    yword [rdx], ymm0
   409      LONG $0x20c78348             // add    rdi, 32
   410      LONG $0x20c68348             // add    rsi, 32
   411      LONG $0x20c28348             // add    rdx, 32
   412      WORD $0xc8ff                 // dec    eax
   413  	JNE LBB3_5
            // Element tail. rdi/rsi/rdx point just past the processed blocks and ecx holds the
            // remainder; nothing is left to do when it is <= 0. rax becomes the zero-extended
            // tail length.
            // NOTE(review): the remainder computed above lies in [-7, 7], so the
            // "tail >= 32" branch to LBB3_9 looks unreachable with this prologue — confirm.
   414  LBB3_6:
   415      WORD $0xc985                 // test    ecx, ecx
   416  	JLE LBB3_18
   417      WORD $0xc889                 // mov    eax, ecx
   418      LONG $0x20f88348             // cmp    rax, 32
   419  	JAE LBB3_9
   420      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
   421  	JMP LBB3_14
            // Wide-tail path (tail length >= 32). The vector loop is used only when the
            // unsigned distances c - a and c - b are both at least 128 bytes (one loop
            // iteration); otherwise fall back to the scalar loops with start index r8 = 0.
   422  LBB3_9:
   423      WORD $0x8949; BYTE $0xd1     // mov    r9, rdx
   424      WORD $0x2949; BYTE $0xf9     // sub    r9, rdi
   425      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
   426      LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp    r9, 128
   427  	JB LBB3_14
   428      WORD $0x8949; BYTE $0xd1     // mov    r9, rdx
   429      WORD $0x2949; BYTE $0xf1     // sub    r9, rsi
   430      LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp    r9, 128
   431  	JB LBB3_14
            // r9 = tail length mod 32 (left for the scalar loops),
            // r8 = tail length rounded down to a multiple of 32 (vector loop bound),
            // r10 = running element index.
   432      WORD $0x8941; BYTE $0xc9     // mov    r9d, ecx
   433      LONG $0x1fe18341             // and    r9d, 31
   434      WORD $0x8949; BYTE $0xc0     // mov    r8, rax
   435      WORD $0x294d; BYTE $0xc8     // sub    r8, r9
   436      WORD $0x3145; BYTE $0xd2     // xor    r10d, r10d
            // 32 elements (four YMM registers) per iteration: load b, multiply by a, store c.
   437  LBB3_12:
   438      LONG $0x107ca1c4; WORD $0x9604 // vmovups    ymm0, yword [rsi + 4*r10]
   439      LONG $0x107ca1c4; WORD $0x964c; BYTE $0x20 // vmovups    ymm1, yword [rsi + 4*r10 + 32]
   440      LONG $0x107ca1c4; WORD $0x9654; BYTE $0x40 // vmovups    ymm2, yword [rsi + 4*r10 + 64]
   441      LONG $0x107ca1c4; WORD $0x965c; BYTE $0x60 // vmovups    ymm3, yword [rsi + 4*r10 + 96]
   442      LONG $0x597ca1c4; WORD $0x9704 // vmulps    ymm0, ymm0, yword [rdi + 4*r10]
   443      LONG $0x5974a1c4; WORD $0x974c; BYTE $0x20 // vmulps    ymm1, ymm1, yword [rdi + 4*r10 + 32]
   444      LONG $0x596ca1c4; WORD $0x9754; BYTE $0x40 // vmulps    ymm2, ymm2, yword [rdi + 4*r10 + 64]
   445      LONG $0x5964a1c4; WORD $0x975c; BYTE $0x60 // vmulps    ymm3, ymm3, yword [rdi + 4*r10 + 96]
   446      LONG $0x117ca1c4; WORD $0x9204 // vmovups    yword [rdx + 4*r10], ymm0
   447      LONG $0x117ca1c4; WORD $0x924c; BYTE $0x20 // vmovups    yword [rdx + 4*r10 + 32], ymm1
   448      LONG $0x117ca1c4; WORD $0x9254; BYTE $0x40 // vmovups    yword [rdx + 4*r10 + 64], ymm2
   449      LONG $0x117ca1c4; WORD $0x925c; BYTE $0x60 // vmovups    yword [rdx + 4*r10 + 96], ymm3
   450      LONG $0x20c28349             // add    r10, 32
   451      WORD $0x394d; BYTE $0xd0     // cmp    r8, r10
   452  	JNE LBB3_12
   453      WORD $0x854d; BYTE $0xc9     // test    r9, r9
   454  	JE LBB3_18
            // Scalar tail. On entry r8 = index of the first unprocessed tail element and
            // rax = tail length. r9 = rax - r8 - 1 (remaining elements minus one, via not/add)
            // and rcx = remaining elements mod 4, which is how many single steps LBB3_15
            // runs before the four-at-a-time loop.
   455  LBB3_14:
   456      WORD $0x2944; BYTE $0xc1     // sub    ecx, r8d
   457      WORD $0x894d; BYTE $0xc1     // mov    r9, r8
   458      WORD $0xf749; BYTE $0xd1     // not    r9
   459      WORD $0x0149; BYTE $0xc1     // add    r9, rax
   460      LONG $0x03e18348             // and    rcx, 3
   461  	JE LBB3_16
            // One element per iteration, rcx times.
   462  LBB3_15:
   463      LONG $0x107aa1c4; WORD $0x8604 // vmovss    xmm0, dword [rsi + 4*r8]
   464      LONG $0x597aa1c4; WORD $0x8704 // vmulss    xmm0, xmm0, dword [rdi + 4*r8]
   465      LONG $0x117aa1c4; WORD $0x8204 // vmovss    dword [rdx + 4*r8], xmm0
   466      WORD $0xff49; BYTE $0xc0     // inc    r8
   467      WORD $0xff48; BYTE $0xc9     // dec    rcx
   468  	JNE LBB3_15
            // Fewer than four elements were remaining in total: the single steps covered them.
   469  LBB3_16:
   470      LONG $0x03f98349             // cmp    r9, 3
   471  	JB LBB3_18
            // Four scalar elements per iteration until the index r8 reaches the tail length.
   472  LBB3_17:
   473      LONG $0x107aa1c4; WORD $0x8604 // vmovss    xmm0, dword [rsi + 4*r8]
   474      LONG $0x597aa1c4; WORD $0x8704 // vmulss    xmm0, xmm0, dword [rdi + 4*r8]
   475      LONG $0x117aa1c4; WORD $0x8204 // vmovss    dword [rdx + 4*r8], xmm0
   476      LONG $0x107aa1c4; WORD $0x8644; BYTE $0x04 // vmovss    xmm0, dword [rsi + 4*r8 + 4]
   477      LONG $0x597aa1c4; WORD $0x8744; BYTE $0x04 // vmulss    xmm0, xmm0, dword [rdi + 4*r8 + 4]
   478      LONG $0x117aa1c4; WORD $0x8244; BYTE $0x04 // vmovss    dword [rdx + 4*r8 + 4], xmm0
   479      LONG $0x107aa1c4; WORD $0x8644; BYTE $0x08 // vmovss    xmm0, dword [rsi + 4*r8 + 8]
   480      LONG $0x597aa1c4; WORD $0x8744; BYTE $0x08 // vmulss    xmm0, xmm0, dword [rdi + 4*r8 + 8]
   481      LONG $0x117aa1c4; WORD $0x8244; BYTE $0x08 // vmovss    dword [rdx + 4*r8 + 8], xmm0
   482      LONG $0x107aa1c4; WORD $0x8644; BYTE $0x0c // vmovss    xmm0, dword [rsi + 4*r8 + 12]
   483      LONG $0x597aa1c4; WORD $0x8744; BYTE $0x0c // vmulss    xmm0, xmm0, dword [rdi + 4*r8 + 12]
   484      LONG $0x117aa1c4; WORD $0x8244; BYTE $0x0c // vmovss    dword [rdx + 4*r8 + 12], xmm0
   485      LONG $0x04c08349             // add    r8, 4
   486      WORD $0x394c; BYTE $0xc0     // cmp    rax, r8
   487  	JNE LBB3_17
            // Common exit: zero the upper halves of the YMM registers before returning to
            // the caller.
   488  LBB3_18:
   489      VZEROUPPER
   490      RET
   491  
   492  
   493  
   494  
   495  TEXT ·___mm256_dot(SB), $0-32
   496  
   497      MOVQ a+0(FP), DI
   498      MOVQ b+8(FP), SI
   499      MOVQ n+16(FP), DX
   500      MOVQ ret+24(FP), CX
   501  
   502      LONG $0x07428d48             // lea    rax, [rdx + 7]
   503      WORD $0x8548; BYTE $0xd2     // test    rdx, rdx
   504      LONG $0xc2490f48             // cmovns    rax, rdx
   505      WORD $0x8949; BYTE $0xc1     // mov    r9, rax
   506      LONG $0x03f9c149             // sar    r9, 3
   507      LONG $0xf8e08348             // and    rax, -8
   508      WORD $0x2948; BYTE $0xc2     // sub    rdx, rax
   509      WORD $0x8545; BYTE $0xc9     // test    r9d, r9d
   510  	JLE LBB4_1
   511      LONG $0x0610fcc5             // vmovups    ymm0, yword [rsi]
   512      LONG $0x0759fcc5             // vmulps    ymm0, ymm0, yword [rdi]
   513      LONG $0x20c78348             // add    rdi, 32
   514      LONG $0x20c68348             // add    rsi, 32
   515      LONG $0x01f98341             // cmp    r9d, 1
   516  	JE LBB4_8
   517      LONG $0xff418d45             // lea    r8d, [r9 - 1]
   518      LONG $0xfec18341             // add    r9d, -2
   519      WORD $0x8944; BYTE $0xc0     // mov    eax, r8d
   520      WORD $0xe083; BYTE $0x03     // and    eax, 3
   521      LONG $0x03f98341             // cmp    r9d, 3
   522  	JB LBB4_6
   523      LONG $0xfce08341             // and    r8d, -4
   524  LBB4_5:
   525      LONG $0x0e10fcc5             // vmovups    ymm1, yword [rsi]
   526      LONG $0x5610fcc5; BYTE $0x20 // vmovups    ymm2, yword [rsi + 32]
   527      LONG $0x5e10fcc5; BYTE $0x40 // vmovups    ymm3, yword [rsi + 64]
   528      LONG $0x987de2c4; BYTE $0x0f // vfmadd132ps    ymm1, ymm0, yword [rdi]
   529      LONG $0xb86de2c4; WORD $0x204f // vfmadd231ps    ymm1, ymm2, yword [rdi + 32]
   530      LONG $0xb865e2c4; WORD $0x404f // vfmadd231ps    ymm1, ymm3, yword [rdi + 64]
   531      LONG $0x5610fcc5; BYTE $0x60 // vmovups    ymm2, yword [rsi + 96]
   532      LONG $0xc128fcc5             // vmovaps    ymm0, ymm1
   533      LONG $0xb86de2c4; WORD $0x6047 // vfmadd231ps    ymm0, ymm2, yword [rdi + 96]
   534      LONG $0x80ef8348             // sub    rdi, -128
   535      LONG $0x80ee8348             // sub    rsi, -128
   536      LONG $0xfcc08341             // add    r8d, -4
   537  	JNE LBB4_5
   538  LBB4_6:
   539      WORD $0xc085                 // test    eax, eax
   540  	JE LBB4_8
   541  LBB4_7:
   542      LONG $0x0e10fcc5             // vmovups    ymm1, yword [rsi]
   543      LONG $0xb875e2c4; BYTE $0x07 // vfmadd231ps    ymm0, ymm1, yword [rdi]
   544      LONG $0x20c78348             // add    rdi, 32
   545      LONG $0x20c68348             // add    rsi, 32
   546      WORD $0xc8ff                 // dec    eax
   547  	JNE LBB4_7
   548  	JMP LBB4_8
   549  LBB4_1:
   550  LBB4_8:
   551      LONG $0x197de3c4; WORD $0x01c1 // vextractf128    xmm1, ymm0, 1
   552      LONG $0xc058f0c5             // vaddps    xmm0, xmm1, xmm0
   553      LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd    xmm1, xmm0, 1
   554      LONG $0xc058f0c5             // vaddps    xmm0, xmm1, xmm0
   555      LONG $0xc816fac5             // vmovshdup    xmm1, xmm0
   556      LONG $0xc158fac5             // vaddss    xmm0, xmm0, xmm1
   557      LONG $0x0111fac5             // vmovss    dword [rcx], xmm0
   558      WORD $0xd285                 // test    edx, edx
   559  	JLE LBB4_20
   560      WORD $0xd089                 // mov    eax, edx
   561      LONG $0x20f88348             // cmp    rax, 32
   562  	JAE LBB4_11
   563      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
   564  	JMP LBB4_16
   565  LBB4_11:
   566      LONG $0x04418d4c             // lea    r8, [rcx + 4]
   567      LONG $0x870c8d4c             // lea    r9, [rdi + 4*rax]
   568      LONG $0x86148d4c             // lea    r10, [rsi + 4*rax]
   569      WORD $0x3949; BYTE $0xc9     // cmp    r9, rcx
   570      LONG $0xd3970f41             // seta    r11b
   571      WORD $0x394c; BYTE $0xc7     // cmp    rdi, r8
   572      WORD $0x920f; BYTE $0xd3     // setb    bl
   573      WORD $0x3949; BYTE $0xca     // cmp    r10, rcx
   574      LONG $0xd1970f41             // seta    r9b
   575      WORD $0x394c; BYTE $0xc6     // cmp    rsi, r8
   576      LONG $0xd2920f41             // setb    r10b
   577      WORD $0x3145; BYTE $0xc0     // xor    r8d, r8d
   578      WORD $0x8441; BYTE $0xdb     // test    r11b, bl
   579  	JNE LBB4_16
   580      WORD $0x2045; BYTE $0xd1     // and    r9b, r10b
   581  	JNE LBB4_16
   582      WORD $0x8941; BYTE $0xd1     // mov    r9d, edx
   583      LONG $0x1fe18341             // and    r9d, 31
   584      WORD $0x8949; BYTE $0xc0     // mov    r8, rax
   585      WORD $0x294d; BYTE $0xc8     // sub    r8, r9
   586      LONG $0xc957f0c5             // vxorps    xmm1, xmm1, xmm1
   587      LONG $0x0c71e3c4; WORD $0x01c0 // vblendps    xmm0, xmm1, xmm0, 1
   588      LONG $0xc957f0c5             // vxorps    xmm1, xmm1, xmm1
   589      WORD $0x3145; BYTE $0xd2     // xor    r10d, r10d
   590      LONG $0xd257e8c5             // vxorps    xmm2, xmm2, xmm2
   591      LONG $0xdb57e0c5             // vxorps    xmm3, xmm3, xmm3
   592  LBB4_14:
   593      LONG $0x107ca1c4; WORD $0x9624 // vmovups    ymm4, yword [rsi + 4*r10]
   594      LONG $0x107ca1c4; WORD $0x966c; BYTE $0x20 // vmovups    ymm5, yword [rsi + 4*r10 + 32]
   595      LONG $0x107ca1c4; WORD $0x9674; BYTE $0x40 // vmovups    ymm6, yword [rsi + 4*r10 + 64]
   596      LONG $0x107ca1c4; WORD $0x967c; BYTE $0x60 // vmovups    ymm7, yword [rsi + 4*r10 + 96]
   597      LONG $0xb85da2c4; WORD $0x9704 // vfmadd231ps    ymm0, ymm4, yword [rdi + 4*r10]
   598      LONG $0xb855a2c4; WORD $0x974c; BYTE $0x20 // vfmadd231ps    ymm1, ymm5, yword [rdi + 4*r10 + 32]
   599      LONG $0xb84da2c4; WORD $0x9754; BYTE $0x40 // vfmadd231ps    ymm2, ymm6, yword [rdi + 4*r10 + 64]
   600      LONG $0xb845a2c4; WORD $0x975c; BYTE $0x60 // vfmadd231ps    ymm3, ymm7, yword [rdi + 4*r10 + 96]
   601      LONG $0x20c28349             // add    r10, 32
   602      WORD $0x394d; BYTE $0xd0     // cmp    r8, r10
   603  	JNE LBB4_14
   604      LONG $0xc058f4c5             // vaddps    ymm0, ymm1, ymm0
   605      LONG $0xc058ecc5             // vaddps    ymm0, ymm2, ymm0
   606      LONG $0xc058e4c5             // vaddps    ymm0, ymm3, ymm0
   607      LONG $0x197de3c4; WORD $0x01c1 // vextractf128    xmm1, ymm0, 1
   608      LONG $0xc158f8c5             // vaddps    xmm0, xmm0, xmm1
   609      LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd    xmm1, xmm0, 1
   610      LONG $0xc158f8c5             // vaddps    xmm0, xmm0, xmm1
   611      LONG $0xc816fac5             // vmovshdup    xmm1, xmm0
   612      LONG $0xc158fac5             // vaddss    xmm0, xmm0, xmm1
   613      LONG $0x0111fac5             // vmovss    dword [rcx], xmm0
   614      WORD $0x854d; BYTE $0xc9     // test    r9, r9
   615  	JE LBB4_20
   616  LBB4_16:
   617      WORD $0x2944; BYTE $0xc2     // sub    edx, r8d
   618      WORD $0x894d; BYTE $0xc1     // mov    r9, r8
   619      WORD $0xf749; BYTE $0xd1     // not    r9
   620      WORD $0x0149; BYTE $0xc1     // add    r9, rax
   621      LONG $0x03e28348             // and    rdx, 3
   622  	JE LBB4_18
   623  LBB4_17:
   624      LONG $0x107aa1c4; WORD $0x860c // vmovss    xmm1, dword [rsi + 4*r8]
   625      LONG $0xb971a2c4; WORD $0x8704 // vfmadd231ss    xmm0, xmm1, dword [rdi + 4*r8]
   626      LONG $0x0111fac5             // vmovss    dword [rcx], xmm0
   627      WORD $0xff49; BYTE $0xc0     // inc    r8
   628      WORD $0xff48; BYTE $0xca     // dec    rdx
   629  	JNE LBB4_17
   630  LBB4_18:
   631      LONG $0x03f98349             // cmp    r9, 3
   632  	JB LBB4_20
   633  LBB4_19:
   634      LONG $0x107aa1c4; WORD $0x860c // vmovss    xmm1, dword [rsi + 4*r8]
   635      LONG $0x9979a2c4; WORD $0x870c // vfmadd132ss    xmm1, xmm0, dword [rdi + 4*r8]
   636      LONG $0x0911fac5             // vmovss    dword [rcx], xmm1
   637      LONG $0x107aa1c4; WORD $0x8644; BYTE $0x04 // vmovss    xmm0, dword [rsi + 4*r8 + 4]
   638      LONG $0x9971a2c4; WORD $0x8744; BYTE $0x04 // vfmadd132ss    xmm0, xmm1, dword [rdi + 4*r8 + 4]
   639      LONG $0x0111fac5             // vmovss    dword [rcx], xmm0
   640      LONG $0x107aa1c4; WORD $0x864c; BYTE $0x08 // vmovss    xmm1, dword [rsi + 4*r8 + 8]
   641      LONG $0x9979a2c4; WORD $0x874c; BYTE $0x08 // vfmadd132ss    xmm1, xmm0, dword [rdi + 4*r8 + 8]
   642      LONG $0x0911fac5             // vmovss    dword [rcx], xmm1
   643      LONG $0x107aa1c4; WORD $0x8644; BYTE $0x0c // vmovss    xmm0, dword [rsi + 4*r8 + 12]
   644      LONG $0x9971a2c4; WORD $0x8744; BYTE $0x0c // vfmadd132ss    xmm0, xmm1, dword [rdi + 4*r8 + 12]
   645      LONG $0x0111fac5             // vmovss    dword [rcx], xmm0
   646      LONG $0x04c08349             // add    r8, 4
   647      WORD $0x394c; BYTE $0xc0     // cmp    rax, r8
   648  	JNE LBB4_19
   649  LBB4_20:
   650      VZEROUPPER
   651      RET