github.com/egonelbre/exp@v0.0.0-20240430123955-ed1d3aa93911/vector/compare/axpy_amd64.s

     1  // Code generated by command: go run main.go -out axpy_amd64.s -stubs axpy_amd64.go -testhelp axpy_stub_amd64_test.go. DO NOT EDIT.
     2  
     3  #include "textflag.h"
     4  
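// Every routine in this file performs the same scalar AXPY update over strided
// single-precision vectors, roughly equivalent to the following Go (a sketch
// inferred from the assembly below; the slice-based signature is illustrative,
// the actual stubs take raw pointers and uintptr strides as shown in the
// // func comments):
//
//	func axpy(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
//		for i := uintptr(0); i < n; i++ {
//			ys[i*incy] += alpha * xs[i*incx]
//		}
//	}
//
// The V<j>A<k> suffixes appear to enumerate benchmark variants: the loop bodies
// are identical, and the A<k> part matches the padding emitted ahead of the
// loop label (see the note before the first *_A8 routine).
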
     5  // func AmdAxpyPointer_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
     6  // Requires: SSE
     7  TEXT ·AmdAxpyPointer_V0A0(SB), NOSPLIT, $0-48
     8  	MOVSS alpha+0(FP), X0   // X0 = alpha
     9  	MOVQ  xs+8(FP), AX      // AX = xs, pointer to the current x element
    10  	MOVQ  incx+16(FP), CX   // CX = incx, stride of xs in elements
    11  	MOVQ  ys+24(FP), DX     // DX = ys, pointer to the current y element
    12  	MOVQ  incy+32(FP), BX   // BX = incy, stride of ys in elements
    13  	MOVQ  n+40(FP), SI      // SI = n, number of elements to process
    14  	SHLQ  $0x02, SI         // SI = n * 4 (bytes per float32)
    15  	IMULQ CX, SI            // SI = n * 4 * incx
    16  	ADDQ  AX, SI            // SI = exclusive end pointer for xs
    17  	JMP   check_limit
    18  
    19  loop:
    20  	MOVSS (AX), X1          // X1 = *x
    21  	MULSS X0, X1            // X1 = alpha * *x
    22  	ADDSS (DX), X1          // X1 = alpha * *x + *y
    23  	MOVSS X1, (DX)          // *y = alpha * *x + *y
    24  	LEAQ  (AX)(CX*4), AX    // advance x by incx elements
    25  	LEAQ  (DX)(BX*4), DX    // advance y by incy elements
    26  
    27  check_limit:
    28  	CMPQ SI, AX             // continue while the end pointer is above x
    29  	JHI  loop
    30  	RET
    31  
    32  // func AmdAxpyPointer_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
    33  // Requires: SSE
    34  TEXT ·AmdAxpyPointer_V1A0(SB), NOSPLIT, $0-48
    35  	MOVSS alpha+0(FP), X0
    36  	MOVQ  xs+8(FP), AX
    37  	MOVQ  incx+16(FP), CX
    38  	MOVQ  ys+24(FP), DX
    39  	MOVQ  incy+32(FP), BX
    40  	MOVQ  n+40(FP), SI
    41  	SHLQ  $0x02, SI
    42  	IMULQ CX, SI
    43  	ADDQ  AX, SI
    44  	JMP   check_limit
    45  
    46  loop:
    47  	MOVSS (AX), X1
    48  	MULSS X0, X1
    49  	ADDSS (DX), X1
    50  	MOVSS X1, (DX)
    51  	LEAQ  (AX)(CX*4), AX
    52  	LEAQ  (DX)(BX*4), DX
    53  
    54  check_limit:
    55  	CMPQ SI, AX
    56  	JHI  loop
    57  	RET
    58  
    59  // func AmdAxpyPointer_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
    60  // Requires: SSE
    61  TEXT ·AmdAxpyPointer_V2A0(SB), NOSPLIT, $0-48
    62  	MOVSS alpha+0(FP), X0
    63  	MOVQ  xs+8(FP), AX
    64  	MOVQ  incx+16(FP), CX
    65  	MOVQ  ys+24(FP), DX
    66  	MOVQ  incy+32(FP), BX
    67  	MOVQ  n+40(FP), SI
    68  	SHLQ  $0x02, SI
    69  	IMULQ CX, SI
    70  	ADDQ  AX, SI
    71  	JMP   check_limit
    72  
    73  loop:
    74  	MOVSS (AX), X1
    75  	MULSS X0, X1
    76  	ADDSS (DX), X1
    77  	MOVSS X1, (DX)
    78  	LEAQ  (AX)(CX*4), AX
    79  	LEAQ  (DX)(BX*4), DX
    80  
    81  check_limit:
    82  	CMPQ SI, AX
    83  	JHI  loop
    84  	RET
    85  
    86  // func AmdAxpyPointer_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
    87  // Requires: SSE
    88  TEXT ·AmdAxpyPointer_V3A0(SB), NOSPLIT, $0-48
    89  	MOVSS alpha+0(FP), X0
    90  	MOVQ  xs+8(FP), AX
    91  	MOVQ  incx+16(FP), CX
    92  	MOVQ  ys+24(FP), DX
    93  	MOVQ  incy+32(FP), BX
    94  	MOVQ  n+40(FP), SI
    95  	SHLQ  $0x02, SI
    96  	IMULQ CX, SI
    97  	ADDQ  AX, SI
    98  	JMP   check_limit
    99  
   100  loop:
   101  	MOVSS (AX), X1
   102  	MULSS X0, X1
   103  	ADDSS (DX), X1
   104  	MOVSS X1, (DX)
   105  	LEAQ  (AX)(CX*4), AX
   106  	LEAQ  (DX)(BX*4), DX
   107  
   108  check_limit:
   109  	CMPQ SI, AX
   110  	JHI  loop
   111  	RET
   112  
   113  // func AmdAxpyPointer_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   114  // Requires: SSE
   115  TEXT ·AmdAxpyPointer_V4A0(SB), NOSPLIT, $0-48
   116  	MOVSS alpha+0(FP), X0
   117  	MOVQ  xs+8(FP), AX
   118  	MOVQ  incx+16(FP), CX
   119  	MOVQ  ys+24(FP), DX
   120  	MOVQ  incy+32(FP), BX
   121  	MOVQ  n+40(FP), SI
   122  	SHLQ  $0x02, SI
   123  	IMULQ CX, SI
   124  	ADDQ  AX, SI
   125  	JMP   check_limit
   126  
   127  loop:
   128  	MOVSS (AX), X1
   129  	MULSS X0, X1
   130  	ADDSS (DX), X1
   131  	MOVSS X1, (DX)
   132  	LEAQ  (AX)(CX*4), AX
   133  	LEAQ  (DX)(BX*4), DX
   134  
   135  check_limit:
   136  	CMPQ SI, AX
   137  	JHI  loop
   138  	RET
   139  
   140  // func AmdAxpyPointer_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   141  // Requires: SSE
   142  TEXT ·AmdAxpyPointer_V5A0(SB), NOSPLIT, $0-48
   143  	MOVSS alpha+0(FP), X0
   144  	MOVQ  xs+8(FP), AX
   145  	MOVQ  incx+16(FP), CX
   146  	MOVQ  ys+24(FP), DX
   147  	MOVQ  incy+32(FP), BX
   148  	MOVQ  n+40(FP), SI
   149  	SHLQ  $0x02, SI
   150  	IMULQ CX, SI
   151  	ADDQ  AX, SI
   152  	JMP   check_limit
   153  
   154  loop:
   155  	MOVSS (AX), X1
   156  	MULSS X0, X1
   157  	ADDSS (DX), X1
   158  	MOVSS X1, (DX)
   159  	LEAQ  (AX)(CX*4), AX
   160  	LEAQ  (DX)(BX*4), DX
   161  
   162  check_limit:
   163  	CMPQ SI, AX
   164  	JHI  loop
   165  	RET
   166  
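// The *_A8 routines below differ from the *_A0 routines above only in the
// PCALIGN $0x08 directive placed after the JMP, which pads the instruction
// stream so that the loop label that follows starts on an 8-byte boundary.
// Later variants vary that padding: *_A9 through *_A15 list one to seven NOPs
// after the PCALIGN, and *_A16 uses PCALIGN $0x10 instead, presumably to
// compare how loop placement affects throughput.
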
   167  // func AmdAxpyPointer_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   168  // Requires: SSE
   169  TEXT ·AmdAxpyPointer_V0A8(SB), NOSPLIT, $0-48
   170  	MOVSS alpha+0(FP), X0
   171  	MOVQ  xs+8(FP), AX
   172  	MOVQ  incx+16(FP), CX
   173  	MOVQ  ys+24(FP), DX
   174  	MOVQ  incy+32(FP), BX
   175  	MOVQ  n+40(FP), SI
   176  	SHLQ  $0x02, SI
   177  	IMULQ CX, SI
   178  	ADDQ  AX, SI
   179  	JMP   check_limit
   180  	PCALIGN $0x08
   181  
   182  loop:
   183  	MOVSS (AX), X1
   184  	MULSS X0, X1
   185  	ADDSS (DX), X1
   186  	MOVSS X1, (DX)
   187  	LEAQ  (AX)(CX*4), AX
   188  	LEAQ  (DX)(BX*4), DX
   189  
   190  check_limit:
   191  	CMPQ SI, AX
   192  	JHI  loop
   193  	RET
   194  
   195  // func AmdAxpyPointer_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   196  // Requires: SSE
   197  TEXT ·AmdAxpyPointer_V1A8(SB), NOSPLIT, $0-48
   198  	MOVSS alpha+0(FP), X0
   199  	MOVQ  xs+8(FP), AX
   200  	MOVQ  incx+16(FP), CX
   201  	MOVQ  ys+24(FP), DX
   202  	MOVQ  incy+32(FP), BX
   203  	MOVQ  n+40(FP), SI
   204  	SHLQ  $0x02, SI
   205  	IMULQ CX, SI
   206  	ADDQ  AX, SI
   207  	JMP   check_limit
   208  	PCALIGN $0x08
   209  
   210  loop:
   211  	MOVSS (AX), X1
   212  	MULSS X0, X1
   213  	ADDSS (DX), X1
   214  	MOVSS X1, (DX)
   215  	LEAQ  (AX)(CX*4), AX
   216  	LEAQ  (DX)(BX*4), DX
   217  
   218  check_limit:
   219  	CMPQ SI, AX
   220  	JHI  loop
   221  	RET
   222  
   223  // func AmdAxpyPointer_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   224  // Requires: SSE
   225  TEXT ·AmdAxpyPointer_V2A8(SB), NOSPLIT, $0-48
   226  	MOVSS alpha+0(FP), X0
   227  	MOVQ  xs+8(FP), AX
   228  	MOVQ  incx+16(FP), CX
   229  	MOVQ  ys+24(FP), DX
   230  	MOVQ  incy+32(FP), BX
   231  	MOVQ  n+40(FP), SI
   232  	SHLQ  $0x02, SI
   233  	IMULQ CX, SI
   234  	ADDQ  AX, SI
   235  	JMP   check_limit
   236  	PCALIGN $0x08
   237  
   238  loop:
   239  	MOVSS (AX), X1
   240  	MULSS X0, X1
   241  	ADDSS (DX), X1
   242  	MOVSS X1, (DX)
   243  	LEAQ  (AX)(CX*4), AX
   244  	LEAQ  (DX)(BX*4), DX
   245  
   246  check_limit:
   247  	CMPQ SI, AX
   248  	JHI  loop
   249  	RET
   250  
   251  // func AmdAxpyPointer_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   252  // Requires: SSE
   253  TEXT ·AmdAxpyPointer_V3A8(SB), NOSPLIT, $0-48
   254  	MOVSS alpha+0(FP), X0
   255  	MOVQ  xs+8(FP), AX
   256  	MOVQ  incx+16(FP), CX
   257  	MOVQ  ys+24(FP), DX
   258  	MOVQ  incy+32(FP), BX
   259  	MOVQ  n+40(FP), SI
   260  	SHLQ  $0x02, SI
   261  	IMULQ CX, SI
   262  	ADDQ  AX, SI
   263  	JMP   check_limit
   264  	PCALIGN $0x08
   265  
   266  loop:
   267  	MOVSS (AX), X1
   268  	MULSS X0, X1
   269  	ADDSS (DX), X1
   270  	MOVSS X1, (DX)
   271  	LEAQ  (AX)(CX*4), AX
   272  	LEAQ  (DX)(BX*4), DX
   273  
   274  check_limit:
   275  	CMPQ SI, AX
   276  	JHI  loop
   277  	RET
   278  
   279  // func AmdAxpyPointer_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   280  // Requires: SSE
   281  TEXT ·AmdAxpyPointer_V4A8(SB), NOSPLIT, $0-48
   282  	MOVSS alpha+0(FP), X0
   283  	MOVQ  xs+8(FP), AX
   284  	MOVQ  incx+16(FP), CX
   285  	MOVQ  ys+24(FP), DX
   286  	MOVQ  incy+32(FP), BX
   287  	MOVQ  n+40(FP), SI
   288  	SHLQ  $0x02, SI
   289  	IMULQ CX, SI
   290  	ADDQ  AX, SI
   291  	JMP   check_limit
   292  	PCALIGN $0x08
   293  
   294  loop:
   295  	MOVSS (AX), X1
   296  	MULSS X0, X1
   297  	ADDSS (DX), X1
   298  	MOVSS X1, (DX)
   299  	LEAQ  (AX)(CX*4), AX
   300  	LEAQ  (DX)(BX*4), DX
   301  
   302  check_limit:
   303  	CMPQ SI, AX
   304  	JHI  loop
   305  	RET
   306  
   307  // func AmdAxpyPointer_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   308  // Requires: SSE
   309  TEXT ·AmdAxpyPointer_V5A8(SB), NOSPLIT, $0-48
   310  	MOVSS alpha+0(FP), X0
   311  	MOVQ  xs+8(FP), AX
   312  	MOVQ  incx+16(FP), CX
   313  	MOVQ  ys+24(FP), DX
   314  	MOVQ  incy+32(FP), BX
   315  	MOVQ  n+40(FP), SI
   316  	SHLQ  $0x02, SI
   317  	IMULQ CX, SI
   318  	ADDQ  AX, SI
   319  	JMP   check_limit
   320  	PCALIGN $0x08
   321  
   322  loop:
   323  	MOVSS (AX), X1
   324  	MULSS X0, X1
   325  	ADDSS (DX), X1
   326  	MOVSS X1, (DX)
   327  	LEAQ  (AX)(CX*4), AX
   328  	LEAQ  (DX)(BX*4), DX
   329  
   330  check_limit:
   331  	CMPQ SI, AX
   332  	JHI  loop
   333  	RET
   334  
   335  // func AmdAxpyPointer_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   336  // Requires: SSE
   337  TEXT ·AmdAxpyPointer_V0A9(SB), NOSPLIT, $0-48
   338  	MOVSS alpha+0(FP), X0
   339  	MOVQ  xs+8(FP), AX
   340  	MOVQ  incx+16(FP), CX
   341  	MOVQ  ys+24(FP), DX
   342  	MOVQ  incy+32(FP), BX
   343  	MOVQ  n+40(FP), SI
   344  	SHLQ  $0x02, SI
   345  	IMULQ CX, SI
   346  	ADDQ  AX, SI
   347  	JMP   check_limit
   348  	PCALIGN $0x08
   349  	NOP
   350  
   351  loop:
   352  	MOVSS (AX), X1
   353  	MULSS X0, X1
   354  	ADDSS (DX), X1
   355  	MOVSS X1, (DX)
   356  	LEAQ  (AX)(CX*4), AX
   357  	LEAQ  (DX)(BX*4), DX
   358  
   359  check_limit:
   360  	CMPQ SI, AX
   361  	JHI  loop
   362  	RET
   363  
   364  // func AmdAxpyPointer_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   365  // Requires: SSE
   366  TEXT ·AmdAxpyPointer_V1A9(SB), NOSPLIT, $0-48
   367  	MOVSS alpha+0(FP), X0
   368  	MOVQ  xs+8(FP), AX
   369  	MOVQ  incx+16(FP), CX
   370  	MOVQ  ys+24(FP), DX
   371  	MOVQ  incy+32(FP), BX
   372  	MOVQ  n+40(FP), SI
   373  	SHLQ  $0x02, SI
   374  	IMULQ CX, SI
   375  	ADDQ  AX, SI
   376  	JMP   check_limit
   377  	PCALIGN $0x08
   378  	NOP
   379  
   380  loop:
   381  	MOVSS (AX), X1
   382  	MULSS X0, X1
   383  	ADDSS (DX), X1
   384  	MOVSS X1, (DX)
   385  	LEAQ  (AX)(CX*4), AX
   386  	LEAQ  (DX)(BX*4), DX
   387  
   388  check_limit:
   389  	CMPQ SI, AX
   390  	JHI  loop
   391  	RET
   392  
   393  // func AmdAxpyPointer_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   394  // Requires: SSE
   395  TEXT ·AmdAxpyPointer_V2A9(SB), NOSPLIT, $0-48
   396  	MOVSS alpha+0(FP), X0
   397  	MOVQ  xs+8(FP), AX
   398  	MOVQ  incx+16(FP), CX
   399  	MOVQ  ys+24(FP), DX
   400  	MOVQ  incy+32(FP), BX
   401  	MOVQ  n+40(FP), SI
   402  	SHLQ  $0x02, SI
   403  	IMULQ CX, SI
   404  	ADDQ  AX, SI
   405  	JMP   check_limit
   406  	PCALIGN $0x08
   407  	NOP
   408  
   409  loop:
   410  	MOVSS (AX), X1
   411  	MULSS X0, X1
   412  	ADDSS (DX), X1
   413  	MOVSS X1, (DX)
   414  	LEAQ  (AX)(CX*4), AX
   415  	LEAQ  (DX)(BX*4), DX
   416  
   417  check_limit:
   418  	CMPQ SI, AX
   419  	JHI  loop
   420  	RET
   421  
   422  // func AmdAxpyPointer_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   423  // Requires: SSE
   424  TEXT ·AmdAxpyPointer_V3A9(SB), NOSPLIT, $0-48
   425  	MOVSS alpha+0(FP), X0
   426  	MOVQ  xs+8(FP), AX
   427  	MOVQ  incx+16(FP), CX
   428  	MOVQ  ys+24(FP), DX
   429  	MOVQ  incy+32(FP), BX
   430  	MOVQ  n+40(FP), SI
   431  	SHLQ  $0x02, SI
   432  	IMULQ CX, SI
   433  	ADDQ  AX, SI
   434  	JMP   check_limit
   435  	PCALIGN $0x08
   436  	NOP
   437  
   438  loop:
   439  	MOVSS (AX), X1
   440  	MULSS X0, X1
   441  	ADDSS (DX), X1
   442  	MOVSS X1, (DX)
   443  	LEAQ  (AX)(CX*4), AX
   444  	LEAQ  (DX)(BX*4), DX
   445  
   446  check_limit:
   447  	CMPQ SI, AX
   448  	JHI  loop
   449  	RET
   450  
   451  // func AmdAxpyPointer_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   452  // Requires: SSE
   453  TEXT ·AmdAxpyPointer_V4A9(SB), NOSPLIT, $0-48
   454  	MOVSS alpha+0(FP), X0
   455  	MOVQ  xs+8(FP), AX
   456  	MOVQ  incx+16(FP), CX
   457  	MOVQ  ys+24(FP), DX
   458  	MOVQ  incy+32(FP), BX
   459  	MOVQ  n+40(FP), SI
   460  	SHLQ  $0x02, SI
   461  	IMULQ CX, SI
   462  	ADDQ  AX, SI
   463  	JMP   check_limit
   464  	PCALIGN $0x08
   465  	NOP
   466  
   467  loop:
   468  	MOVSS (AX), X1
   469  	MULSS X0, X1
   470  	ADDSS (DX), X1
   471  	MOVSS X1, (DX)
   472  	LEAQ  (AX)(CX*4), AX
   473  	LEAQ  (DX)(BX*4), DX
   474  
   475  check_limit:
   476  	CMPQ SI, AX
   477  	JHI  loop
   478  	RET
   479  
   480  // func AmdAxpyPointer_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   481  // Requires: SSE
   482  TEXT ·AmdAxpyPointer_V5A9(SB), NOSPLIT, $0-48
   483  	MOVSS alpha+0(FP), X0
   484  	MOVQ  xs+8(FP), AX
   485  	MOVQ  incx+16(FP), CX
   486  	MOVQ  ys+24(FP), DX
   487  	MOVQ  incy+32(FP), BX
   488  	MOVQ  n+40(FP), SI
   489  	SHLQ  $0x02, SI
   490  	IMULQ CX, SI
   491  	ADDQ  AX, SI
   492  	JMP   check_limit
   493  	PCALIGN $0x08
   494  	NOP
   495  
   496  loop:
   497  	MOVSS (AX), X1
   498  	MULSS X0, X1
   499  	ADDSS (DX), X1
   500  	MOVSS X1, (DX)
   501  	LEAQ  (AX)(CX*4), AX
   502  	LEAQ  (DX)(BX*4), DX
   503  
   504  check_limit:
   505  	CMPQ SI, AX
   506  	JHI  loop
   507  	RET
   508  
   509  // func AmdAxpyPointer_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   510  // Requires: SSE
   511  TEXT ·AmdAxpyPointer_V0A10(SB), NOSPLIT, $0-48
   512  	MOVSS alpha+0(FP), X0
   513  	MOVQ  xs+8(FP), AX
   514  	MOVQ  incx+16(FP), CX
   515  	MOVQ  ys+24(FP), DX
   516  	MOVQ  incy+32(FP), BX
   517  	MOVQ  n+40(FP), SI
   518  	SHLQ  $0x02, SI
   519  	IMULQ CX, SI
   520  	ADDQ  AX, SI
   521  	JMP   check_limit
   522  	PCALIGN $0x08
   523  	NOP
   524  	NOP
   525  
   526  loop:
   527  	MOVSS (AX), X1
   528  	MULSS X0, X1
   529  	ADDSS (DX), X1
   530  	MOVSS X1, (DX)
   531  	LEAQ  (AX)(CX*4), AX
   532  	LEAQ  (DX)(BX*4), DX
   533  
   534  check_limit:
   535  	CMPQ SI, AX
   536  	JHI  loop
   537  	RET
   538  
   539  // func AmdAxpyPointer_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   540  // Requires: SSE
   541  TEXT ·AmdAxpyPointer_V1A10(SB), NOSPLIT, $0-48
   542  	MOVSS alpha+0(FP), X0
   543  	MOVQ  xs+8(FP), AX
   544  	MOVQ  incx+16(FP), CX
   545  	MOVQ  ys+24(FP), DX
   546  	MOVQ  incy+32(FP), BX
   547  	MOVQ  n+40(FP), SI
   548  	SHLQ  $0x02, SI
   549  	IMULQ CX, SI
   550  	ADDQ  AX, SI
   551  	JMP   check_limit
   552  	PCALIGN $0x08
   553  	NOP
   554  	NOP
   555  
   556  loop:
   557  	MOVSS (AX), X1
   558  	MULSS X0, X1
   559  	ADDSS (DX), X1
   560  	MOVSS X1, (DX)
   561  	LEAQ  (AX)(CX*4), AX
   562  	LEAQ  (DX)(BX*4), DX
   563  
   564  check_limit:
   565  	CMPQ SI, AX
   566  	JHI  loop
   567  	RET
   568  
   569  // func AmdAxpyPointer_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   570  // Requires: SSE
   571  TEXT ·AmdAxpyPointer_V2A10(SB), NOSPLIT, $0-48
   572  	MOVSS alpha+0(FP), X0
   573  	MOVQ  xs+8(FP), AX
   574  	MOVQ  incx+16(FP), CX
   575  	MOVQ  ys+24(FP), DX
   576  	MOVQ  incy+32(FP), BX
   577  	MOVQ  n+40(FP), SI
   578  	SHLQ  $0x02, SI
   579  	IMULQ CX, SI
   580  	ADDQ  AX, SI
   581  	JMP   check_limit
   582  	PCALIGN $0x08
   583  	NOP
   584  	NOP
   585  
   586  loop:
   587  	MOVSS (AX), X1
   588  	MULSS X0, X1
   589  	ADDSS (DX), X1
   590  	MOVSS X1, (DX)
   591  	LEAQ  (AX)(CX*4), AX
   592  	LEAQ  (DX)(BX*4), DX
   593  
   594  check_limit:
   595  	CMPQ SI, AX
   596  	JHI  loop
   597  	RET
   598  
   599  // func AmdAxpyPointer_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   600  // Requires: SSE
   601  TEXT ·AmdAxpyPointer_V3A10(SB), NOSPLIT, $0-48
   602  	MOVSS alpha+0(FP), X0
   603  	MOVQ  xs+8(FP), AX
   604  	MOVQ  incx+16(FP), CX
   605  	MOVQ  ys+24(FP), DX
   606  	MOVQ  incy+32(FP), BX
   607  	MOVQ  n+40(FP), SI
   608  	SHLQ  $0x02, SI
   609  	IMULQ CX, SI
   610  	ADDQ  AX, SI
   611  	JMP   check_limit
   612  	PCALIGN $0x08
   613  	NOP
   614  	NOP
   615  
   616  loop:
   617  	MOVSS (AX), X1
   618  	MULSS X0, X1
   619  	ADDSS (DX), X1
   620  	MOVSS X1, (DX)
   621  	LEAQ  (AX)(CX*4), AX
   622  	LEAQ  (DX)(BX*4), DX
   623  
   624  check_limit:
   625  	CMPQ SI, AX
   626  	JHI  loop
   627  	RET
   628  
   629  // func AmdAxpyPointer_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   630  // Requires: SSE
   631  TEXT ·AmdAxpyPointer_V4A10(SB), NOSPLIT, $0-48
   632  	MOVSS alpha+0(FP), X0
   633  	MOVQ  xs+8(FP), AX
   634  	MOVQ  incx+16(FP), CX
   635  	MOVQ  ys+24(FP), DX
   636  	MOVQ  incy+32(FP), BX
   637  	MOVQ  n+40(FP), SI
   638  	SHLQ  $0x02, SI
   639  	IMULQ CX, SI
   640  	ADDQ  AX, SI
   641  	JMP   check_limit
   642  	PCALIGN $0x08
   643  	NOP
   644  	NOP
   645  
   646  loop:
   647  	MOVSS (AX), X1
   648  	MULSS X0, X1
   649  	ADDSS (DX), X1
   650  	MOVSS X1, (DX)
   651  	LEAQ  (AX)(CX*4), AX
   652  	LEAQ  (DX)(BX*4), DX
   653  
   654  check_limit:
   655  	CMPQ SI, AX
   656  	JHI  loop
   657  	RET
   658  
   659  // func AmdAxpyPointer_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   660  // Requires: SSE
   661  TEXT ·AmdAxpyPointer_V5A10(SB), NOSPLIT, $0-48
   662  	MOVSS alpha+0(FP), X0
   663  	MOVQ  xs+8(FP), AX
   664  	MOVQ  incx+16(FP), CX
   665  	MOVQ  ys+24(FP), DX
   666  	MOVQ  incy+32(FP), BX
   667  	MOVQ  n+40(FP), SI
   668  	SHLQ  $0x02, SI
   669  	IMULQ CX, SI
   670  	ADDQ  AX, SI
   671  	JMP   check_limit
   672  	PCALIGN $0x08
   673  	NOP
   674  	NOP
   675  
   676  loop:
   677  	MOVSS (AX), X1
   678  	MULSS X0, X1
   679  	ADDSS (DX), X1
   680  	MOVSS X1, (DX)
   681  	LEAQ  (AX)(CX*4), AX
   682  	LEAQ  (DX)(BX*4), DX
   683  
   684  check_limit:
   685  	CMPQ SI, AX
   686  	JHI  loop
   687  	RET
   688  
   689  // func AmdAxpyPointer_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   690  // Requires: SSE
   691  TEXT ·AmdAxpyPointer_V0A11(SB), NOSPLIT, $0-48
   692  	MOVSS alpha+0(FP), X0
   693  	MOVQ  xs+8(FP), AX
   694  	MOVQ  incx+16(FP), CX
   695  	MOVQ  ys+24(FP), DX
   696  	MOVQ  incy+32(FP), BX
   697  	MOVQ  n+40(FP), SI
   698  	SHLQ  $0x02, SI
   699  	IMULQ CX, SI
   700  	ADDQ  AX, SI
   701  	JMP   check_limit
   702  	PCALIGN $0x08
   703  	NOP
   704  	NOP
   705  	NOP
   706  
   707  loop:
   708  	MOVSS (AX), X1
   709  	MULSS X0, X1
   710  	ADDSS (DX), X1
   711  	MOVSS X1, (DX)
   712  	LEAQ  (AX)(CX*4), AX
   713  	LEAQ  (DX)(BX*4), DX
   714  
   715  check_limit:
   716  	CMPQ SI, AX
   717  	JHI  loop
   718  	RET
   719  
   720  // func AmdAxpyPointer_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   721  // Requires: SSE
   722  TEXT ·AmdAxpyPointer_V1A11(SB), NOSPLIT, $0-48
   723  	MOVSS alpha+0(FP), X0
   724  	MOVQ  xs+8(FP), AX
   725  	MOVQ  incx+16(FP), CX
   726  	MOVQ  ys+24(FP), DX
   727  	MOVQ  incy+32(FP), BX
   728  	MOVQ  n+40(FP), SI
   729  	SHLQ  $0x02, SI
   730  	IMULQ CX, SI
   731  	ADDQ  AX, SI
   732  	JMP   check_limit
   733  	PCALIGN $0x08
   734  	NOP
   735  	NOP
   736  	NOP
   737  
   738  loop:
   739  	MOVSS (AX), X1
   740  	MULSS X0, X1
   741  	ADDSS (DX), X1
   742  	MOVSS X1, (DX)
   743  	LEAQ  (AX)(CX*4), AX
   744  	LEAQ  (DX)(BX*4), DX
   745  
   746  check_limit:
   747  	CMPQ SI, AX
   748  	JHI  loop
   749  	RET
   750  
   751  // func AmdAxpyPointer_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   752  // Requires: SSE
   753  TEXT ·AmdAxpyPointer_V2A11(SB), NOSPLIT, $0-48
   754  	MOVSS alpha+0(FP), X0
   755  	MOVQ  xs+8(FP), AX
   756  	MOVQ  incx+16(FP), CX
   757  	MOVQ  ys+24(FP), DX
   758  	MOVQ  incy+32(FP), BX
   759  	MOVQ  n+40(FP), SI
   760  	SHLQ  $0x02, SI
   761  	IMULQ CX, SI
   762  	ADDQ  AX, SI
   763  	JMP   check_limit
   764  	PCALIGN $0x08
   765  	NOP
   766  	NOP
   767  	NOP
   768  
   769  loop:
   770  	MOVSS (AX), X1
   771  	MULSS X0, X1
   772  	ADDSS (DX), X1
   773  	MOVSS X1, (DX)
   774  	LEAQ  (AX)(CX*4), AX
   775  	LEAQ  (DX)(BX*4), DX
   776  
   777  check_limit:
   778  	CMPQ SI, AX
   779  	JHI  loop
   780  	RET
   781  
   782  // func AmdAxpyPointer_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   783  // Requires: SSE
   784  TEXT ·AmdAxpyPointer_V3A11(SB), NOSPLIT, $0-48
   785  	MOVSS alpha+0(FP), X0
   786  	MOVQ  xs+8(FP), AX
   787  	MOVQ  incx+16(FP), CX
   788  	MOVQ  ys+24(FP), DX
   789  	MOVQ  incy+32(FP), BX
   790  	MOVQ  n+40(FP), SI
   791  	SHLQ  $0x02, SI
   792  	IMULQ CX, SI
   793  	ADDQ  AX, SI
   794  	JMP   check_limit
   795  	PCALIGN $0x08
   796  	NOP
   797  	NOP
   798  	NOP
   799  
   800  loop:
   801  	MOVSS (AX), X1
   802  	MULSS X0, X1
   803  	ADDSS (DX), X1
   804  	MOVSS X1, (DX)
   805  	LEAQ  (AX)(CX*4), AX
   806  	LEAQ  (DX)(BX*4), DX
   807  
   808  check_limit:
   809  	CMPQ SI, AX
   810  	JHI  loop
   811  	RET
   812  
   813  // func AmdAxpyPointer_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   814  // Requires: SSE
   815  TEXT ·AmdAxpyPointer_V4A11(SB), NOSPLIT, $0-48
   816  	MOVSS alpha+0(FP), X0
   817  	MOVQ  xs+8(FP), AX
   818  	MOVQ  incx+16(FP), CX
   819  	MOVQ  ys+24(FP), DX
   820  	MOVQ  incy+32(FP), BX
   821  	MOVQ  n+40(FP), SI
   822  	SHLQ  $0x02, SI
   823  	IMULQ CX, SI
   824  	ADDQ  AX, SI
   825  	JMP   check_limit
   826  	PCALIGN $0x08
   827  	NOP
   828  	NOP
   829  	NOP
   830  
   831  loop:
   832  	MOVSS (AX), X1
   833  	MULSS X0, X1
   834  	ADDSS (DX), X1
   835  	MOVSS X1, (DX)
   836  	LEAQ  (AX)(CX*4), AX
   837  	LEAQ  (DX)(BX*4), DX
   838  
   839  check_limit:
   840  	CMPQ SI, AX
   841  	JHI  loop
   842  	RET
   843  
   844  // func AmdAxpyPointer_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   845  // Requires: SSE
   846  TEXT ·AmdAxpyPointer_V5A11(SB), NOSPLIT, $0-48
   847  	MOVSS alpha+0(FP), X0
   848  	MOVQ  xs+8(FP), AX
   849  	MOVQ  incx+16(FP), CX
   850  	MOVQ  ys+24(FP), DX
   851  	MOVQ  incy+32(FP), BX
   852  	MOVQ  n+40(FP), SI
   853  	SHLQ  $0x02, SI
   854  	IMULQ CX, SI
   855  	ADDQ  AX, SI
   856  	JMP   check_limit
   857  	PCALIGN $0x08
   858  	NOP
   859  	NOP
   860  	NOP
   861  
   862  loop:
   863  	MOVSS (AX), X1
   864  	MULSS X0, X1
   865  	ADDSS (DX), X1
   866  	MOVSS X1, (DX)
   867  	LEAQ  (AX)(CX*4), AX
   868  	LEAQ  (DX)(BX*4), DX
   869  
   870  check_limit:
   871  	CMPQ SI, AX
   872  	JHI  loop
   873  	RET
   874  
   875  // func AmdAxpyPointer_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   876  // Requires: SSE
   877  TEXT ·AmdAxpyPointer_V0A12(SB), NOSPLIT, $0-48
   878  	MOVSS alpha+0(FP), X0
   879  	MOVQ  xs+8(FP), AX
   880  	MOVQ  incx+16(FP), CX
   881  	MOVQ  ys+24(FP), DX
   882  	MOVQ  incy+32(FP), BX
   883  	MOVQ  n+40(FP), SI
   884  	SHLQ  $0x02, SI
   885  	IMULQ CX, SI
   886  	ADDQ  AX, SI
   887  	JMP   check_limit
   888  	PCALIGN $0x08
   889  	NOP
   890  	NOP
   891  	NOP
   892  	NOP
   893  
   894  loop:
   895  	MOVSS (AX), X1
   896  	MULSS X0, X1
   897  	ADDSS (DX), X1
   898  	MOVSS X1, (DX)
   899  	LEAQ  (AX)(CX*4), AX
   900  	LEAQ  (DX)(BX*4), DX
   901  
   902  check_limit:
   903  	CMPQ SI, AX
   904  	JHI  loop
   905  	RET
   906  
   907  // func AmdAxpyPointer_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   908  // Requires: SSE
   909  TEXT ·AmdAxpyPointer_V1A12(SB), NOSPLIT, $0-48
   910  	MOVSS alpha+0(FP), X0
   911  	MOVQ  xs+8(FP), AX
   912  	MOVQ  incx+16(FP), CX
   913  	MOVQ  ys+24(FP), DX
   914  	MOVQ  incy+32(FP), BX
   915  	MOVQ  n+40(FP), SI
   916  	SHLQ  $0x02, SI
   917  	IMULQ CX, SI
   918  	ADDQ  AX, SI
   919  	JMP   check_limit
   920  	PCALIGN $0x08
   921  	NOP
   922  	NOP
   923  	NOP
   924  	NOP
   925  
   926  loop:
   927  	MOVSS (AX), X1
   928  	MULSS X0, X1
   929  	ADDSS (DX), X1
   930  	MOVSS X1, (DX)
   931  	LEAQ  (AX)(CX*4), AX
   932  	LEAQ  (DX)(BX*4), DX
   933  
   934  check_limit:
   935  	CMPQ SI, AX
   936  	JHI  loop
   937  	RET
   938  
   939  // func AmdAxpyPointer_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   940  // Requires: SSE
   941  TEXT ·AmdAxpyPointer_V2A12(SB), NOSPLIT, $0-48
   942  	MOVSS alpha+0(FP), X0
   943  	MOVQ  xs+8(FP), AX
   944  	MOVQ  incx+16(FP), CX
   945  	MOVQ  ys+24(FP), DX
   946  	MOVQ  incy+32(FP), BX
   947  	MOVQ  n+40(FP), SI
   948  	SHLQ  $0x02, SI
   949  	IMULQ CX, SI
   950  	ADDQ  AX, SI
   951  	JMP   check_limit
   952  	PCALIGN $0x08
   953  	NOP
   954  	NOP
   955  	NOP
   956  	NOP
   957  
   958  loop:
   959  	MOVSS (AX), X1
   960  	MULSS X0, X1
   961  	ADDSS (DX), X1
   962  	MOVSS X1, (DX)
   963  	LEAQ  (AX)(CX*4), AX
   964  	LEAQ  (DX)(BX*4), DX
   965  
   966  check_limit:
   967  	CMPQ SI, AX
   968  	JHI  loop
   969  	RET
   970  
   971  // func AmdAxpyPointer_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
   972  // Requires: SSE
   973  TEXT ·AmdAxpyPointer_V3A12(SB), NOSPLIT, $0-48
   974  	MOVSS alpha+0(FP), X0
   975  	MOVQ  xs+8(FP), AX
   976  	MOVQ  incx+16(FP), CX
   977  	MOVQ  ys+24(FP), DX
   978  	MOVQ  incy+32(FP), BX
   979  	MOVQ  n+40(FP), SI
   980  	SHLQ  $0x02, SI
   981  	IMULQ CX, SI
   982  	ADDQ  AX, SI
   983  	JMP   check_limit
   984  	PCALIGN $0x08
   985  	NOP
   986  	NOP
   987  	NOP
   988  	NOP
   989  
   990  loop:
   991  	MOVSS (AX), X1
   992  	MULSS X0, X1
   993  	ADDSS (DX), X1
   994  	MOVSS X1, (DX)
   995  	LEAQ  (AX)(CX*4), AX
   996  	LEAQ  (DX)(BX*4), DX
   997  
   998  check_limit:
   999  	CMPQ SI, AX
  1000  	JHI  loop
  1001  	RET
  1002  
  1003  // func AmdAxpyPointer_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1004  // Requires: SSE
  1005  TEXT ·AmdAxpyPointer_V4A12(SB), NOSPLIT, $0-48
  1006  	MOVSS alpha+0(FP), X0
  1007  	MOVQ  xs+8(FP), AX
  1008  	MOVQ  incx+16(FP), CX
  1009  	MOVQ  ys+24(FP), DX
  1010  	MOVQ  incy+32(FP), BX
  1011  	MOVQ  n+40(FP), SI
  1012  	SHLQ  $0x02, SI
  1013  	IMULQ CX, SI
  1014  	ADDQ  AX, SI
  1015  	JMP   check_limit
  1016  	PCALIGN $0x08
  1017  	NOP
  1018  	NOP
  1019  	NOP
  1020  	NOP
  1021  
  1022  loop:
  1023  	MOVSS (AX), X1
  1024  	MULSS X0, X1
  1025  	ADDSS (DX), X1
  1026  	MOVSS X1, (DX)
  1027  	LEAQ  (AX)(CX*4), AX
  1028  	LEAQ  (DX)(BX*4), DX
  1029  
  1030  check_limit:
  1031  	CMPQ SI, AX
  1032  	JHI  loop
  1033  	RET
  1034  
  1035  // func AmdAxpyPointer_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1036  // Requires: SSE
  1037  TEXT ·AmdAxpyPointer_V5A12(SB), NOSPLIT, $0-48
  1038  	MOVSS alpha+0(FP), X0
  1039  	MOVQ  xs+8(FP), AX
  1040  	MOVQ  incx+16(FP), CX
  1041  	MOVQ  ys+24(FP), DX
  1042  	MOVQ  incy+32(FP), BX
  1043  	MOVQ  n+40(FP), SI
  1044  	SHLQ  $0x02, SI
  1045  	IMULQ CX, SI
  1046  	ADDQ  AX, SI
  1047  	JMP   check_limit
  1048  	PCALIGN $0x08
  1049  	NOP
  1050  	NOP
  1051  	NOP
  1052  	NOP
  1053  
  1054  loop:
  1055  	MOVSS (AX), X1
  1056  	MULSS X0, X1
  1057  	ADDSS (DX), X1
  1058  	MOVSS X1, (DX)
  1059  	LEAQ  (AX)(CX*4), AX
  1060  	LEAQ  (DX)(BX*4), DX
  1061  
  1062  check_limit:
  1063  	CMPQ SI, AX
  1064  	JHI  loop
  1065  	RET
  1066  
  1067  // func AmdAxpyPointer_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1068  // Requires: SSE
  1069  TEXT ·AmdAxpyPointer_V0A13(SB), NOSPLIT, $0-48
  1070  	MOVSS alpha+0(FP), X0
  1071  	MOVQ  xs+8(FP), AX
  1072  	MOVQ  incx+16(FP), CX
  1073  	MOVQ  ys+24(FP), DX
  1074  	MOVQ  incy+32(FP), BX
  1075  	MOVQ  n+40(FP), SI
  1076  	SHLQ  $0x02, SI
  1077  	IMULQ CX, SI
  1078  	ADDQ  AX, SI
  1079  	JMP   check_limit
  1080  	PCALIGN $0x08
  1081  	NOP
  1082  	NOP
  1083  	NOP
  1084  	NOP
  1085  	NOP
  1086  
  1087  loop:
  1088  	MOVSS (AX), X1
  1089  	MULSS X0, X1
  1090  	ADDSS (DX), X1
  1091  	MOVSS X1, (DX)
  1092  	LEAQ  (AX)(CX*4), AX
  1093  	LEAQ  (DX)(BX*4), DX
  1094  
  1095  check_limit:
  1096  	CMPQ SI, AX
  1097  	JHI  loop
  1098  	RET
  1099  
  1100  // func AmdAxpyPointer_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1101  // Requires: SSE
  1102  TEXT ·AmdAxpyPointer_V1A13(SB), NOSPLIT, $0-48
  1103  	MOVSS alpha+0(FP), X0
  1104  	MOVQ  xs+8(FP), AX
  1105  	MOVQ  incx+16(FP), CX
  1106  	MOVQ  ys+24(FP), DX
  1107  	MOVQ  incy+32(FP), BX
  1108  	MOVQ  n+40(FP), SI
  1109  	SHLQ  $0x02, SI
  1110  	IMULQ CX, SI
  1111  	ADDQ  AX, SI
  1112  	JMP   check_limit
  1113  	PCALIGN $0x08
  1114  	NOP
  1115  	NOP
  1116  	NOP
  1117  	NOP
  1118  	NOP
  1119  
  1120  loop:
  1121  	MOVSS (AX), X1
  1122  	MULSS X0, X1
  1123  	ADDSS (DX), X1
  1124  	MOVSS X1, (DX)
  1125  	LEAQ  (AX)(CX*4), AX
  1126  	LEAQ  (DX)(BX*4), DX
  1127  
  1128  check_limit:
  1129  	CMPQ SI, AX
  1130  	JHI  loop
  1131  	RET
  1132  
  1133  // func AmdAxpyPointer_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1134  // Requires: SSE
  1135  TEXT ·AmdAxpyPointer_V2A13(SB), NOSPLIT, $0-48
  1136  	MOVSS alpha+0(FP), X0
  1137  	MOVQ  xs+8(FP), AX
  1138  	MOVQ  incx+16(FP), CX
  1139  	MOVQ  ys+24(FP), DX
  1140  	MOVQ  incy+32(FP), BX
  1141  	MOVQ  n+40(FP), SI
  1142  	SHLQ  $0x02, SI
  1143  	IMULQ CX, SI
  1144  	ADDQ  AX, SI
  1145  	JMP   check_limit
  1146  	PCALIGN $0x08
  1147  	NOP
  1148  	NOP
  1149  	NOP
  1150  	NOP
  1151  	NOP
  1152  
  1153  loop:
  1154  	MOVSS (AX), X1
  1155  	MULSS X0, X1
  1156  	ADDSS (DX), X1
  1157  	MOVSS X1, (DX)
  1158  	LEAQ  (AX)(CX*4), AX
  1159  	LEAQ  (DX)(BX*4), DX
  1160  
  1161  check_limit:
  1162  	CMPQ SI, AX
  1163  	JHI  loop
  1164  	RET
  1165  
  1166  // func AmdAxpyPointer_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1167  // Requires: SSE
  1168  TEXT ·AmdAxpyPointer_V3A13(SB), NOSPLIT, $0-48
  1169  	MOVSS alpha+0(FP), X0
  1170  	MOVQ  xs+8(FP), AX
  1171  	MOVQ  incx+16(FP), CX
  1172  	MOVQ  ys+24(FP), DX
  1173  	MOVQ  incy+32(FP), BX
  1174  	MOVQ  n+40(FP), SI
  1175  	SHLQ  $0x02, SI
  1176  	IMULQ CX, SI
  1177  	ADDQ  AX, SI
  1178  	JMP   check_limit
  1179  	PCALIGN $0x08
  1180  	NOP
  1181  	NOP
  1182  	NOP
  1183  	NOP
  1184  	NOP
  1185  
  1186  loop:
  1187  	MOVSS (AX), X1
  1188  	MULSS X0, X1
  1189  	ADDSS (DX), X1
  1190  	MOVSS X1, (DX)
  1191  	LEAQ  (AX)(CX*4), AX
  1192  	LEAQ  (DX)(BX*4), DX
  1193  
  1194  check_limit:
  1195  	CMPQ SI, AX
  1196  	JHI  loop
  1197  	RET
  1198  
  1199  // func AmdAxpyPointer_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1200  // Requires: SSE
  1201  TEXT ·AmdAxpyPointer_V4A13(SB), NOSPLIT, $0-48
  1202  	MOVSS alpha+0(FP), X0
  1203  	MOVQ  xs+8(FP), AX
  1204  	MOVQ  incx+16(FP), CX
  1205  	MOVQ  ys+24(FP), DX
  1206  	MOVQ  incy+32(FP), BX
  1207  	MOVQ  n+40(FP), SI
  1208  	SHLQ  $0x02, SI
  1209  	IMULQ CX, SI
  1210  	ADDQ  AX, SI
  1211  	JMP   check_limit
  1212  	PCALIGN $0x08
  1213  	NOP
  1214  	NOP
  1215  	NOP
  1216  	NOP
  1217  	NOP
  1218  
  1219  loop:
  1220  	MOVSS (AX), X1
  1221  	MULSS X0, X1
  1222  	ADDSS (DX), X1
  1223  	MOVSS X1, (DX)
  1224  	LEAQ  (AX)(CX*4), AX
  1225  	LEAQ  (DX)(BX*4), DX
  1226  
  1227  check_limit:
  1228  	CMPQ SI, AX
  1229  	JHI  loop
  1230  	RET
  1231  
  1232  // func AmdAxpyPointer_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1233  // Requires: SSE
  1234  TEXT ·AmdAxpyPointer_V5A13(SB), NOSPLIT, $0-48
  1235  	MOVSS alpha+0(FP), X0
  1236  	MOVQ  xs+8(FP), AX
  1237  	MOVQ  incx+16(FP), CX
  1238  	MOVQ  ys+24(FP), DX
  1239  	MOVQ  incy+32(FP), BX
  1240  	MOVQ  n+40(FP), SI
  1241  	SHLQ  $0x02, SI
  1242  	IMULQ CX, SI
  1243  	ADDQ  AX, SI
  1244  	JMP   check_limit
  1245  	PCALIGN $0x08
  1246  	NOP
  1247  	NOP
  1248  	NOP
  1249  	NOP
  1250  	NOP
  1251  
  1252  loop:
  1253  	MOVSS (AX), X1
  1254  	MULSS X0, X1
  1255  	ADDSS (DX), X1
  1256  	MOVSS X1, (DX)
  1257  	LEAQ  (AX)(CX*4), AX
  1258  	LEAQ  (DX)(BX*4), DX
  1259  
  1260  check_limit:
  1261  	CMPQ SI, AX
  1262  	JHI  loop
  1263  	RET
  1264  
  1265  // func AmdAxpyPointer_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1266  // Requires: SSE
  1267  TEXT ·AmdAxpyPointer_V0A14(SB), NOSPLIT, $0-48
  1268  	MOVSS alpha+0(FP), X0
  1269  	MOVQ  xs+8(FP), AX
  1270  	MOVQ  incx+16(FP), CX
  1271  	MOVQ  ys+24(FP), DX
  1272  	MOVQ  incy+32(FP), BX
  1273  	MOVQ  n+40(FP), SI
  1274  	SHLQ  $0x02, SI
  1275  	IMULQ CX, SI
  1276  	ADDQ  AX, SI
  1277  	JMP   check_limit
  1278  	PCALIGN $0x08
  1279  	NOP
  1280  	NOP
  1281  	NOP
  1282  	NOP
  1283  	NOP
  1284  	NOP
  1285  
  1286  loop:
  1287  	MOVSS (AX), X1
  1288  	MULSS X0, X1
  1289  	ADDSS (DX), X1
  1290  	MOVSS X1, (DX)
  1291  	LEAQ  (AX)(CX*4), AX
  1292  	LEAQ  (DX)(BX*4), DX
  1293  
  1294  check_limit:
  1295  	CMPQ SI, AX
  1296  	JHI  loop
  1297  	RET
  1298  
  1299  // func AmdAxpyPointer_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1300  // Requires: SSE
  1301  TEXT ·AmdAxpyPointer_V1A14(SB), NOSPLIT, $0-48
  1302  	MOVSS alpha+0(FP), X0
  1303  	MOVQ  xs+8(FP), AX
  1304  	MOVQ  incx+16(FP), CX
  1305  	MOVQ  ys+24(FP), DX
  1306  	MOVQ  incy+32(FP), BX
  1307  	MOVQ  n+40(FP), SI
  1308  	SHLQ  $0x02, SI
  1309  	IMULQ CX, SI
  1310  	ADDQ  AX, SI
  1311  	JMP   check_limit
  1312  	PCALIGN $0x08
  1313  	NOP
  1314  	NOP
  1315  	NOP
  1316  	NOP
  1317  	NOP
  1318  	NOP
  1319  
  1320  loop:
  1321  	MOVSS (AX), X1
  1322  	MULSS X0, X1
  1323  	ADDSS (DX), X1
  1324  	MOVSS X1, (DX)
  1325  	LEAQ  (AX)(CX*4), AX
  1326  	LEAQ  (DX)(BX*4), DX
  1327  
  1328  check_limit:
  1329  	CMPQ SI, AX
  1330  	JHI  loop
  1331  	RET
  1332  
  1333  // func AmdAxpyPointer_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1334  // Requires: SSE
  1335  TEXT ·AmdAxpyPointer_V2A14(SB), NOSPLIT, $0-48
  1336  	MOVSS alpha+0(FP), X0
  1337  	MOVQ  xs+8(FP), AX
  1338  	MOVQ  incx+16(FP), CX
  1339  	MOVQ  ys+24(FP), DX
  1340  	MOVQ  incy+32(FP), BX
  1341  	MOVQ  n+40(FP), SI
  1342  	SHLQ  $0x02, SI
  1343  	IMULQ CX, SI
  1344  	ADDQ  AX, SI
  1345  	JMP   check_limit
  1346  	PCALIGN $0x08
  1347  	NOP
  1348  	NOP
  1349  	NOP
  1350  	NOP
  1351  	NOP
  1352  	NOP
  1353  
  1354  loop:
  1355  	MOVSS (AX), X1
  1356  	MULSS X0, X1
  1357  	ADDSS (DX), X1
  1358  	MOVSS X1, (DX)
  1359  	LEAQ  (AX)(CX*4), AX
  1360  	LEAQ  (DX)(BX*4), DX
  1361  
  1362  check_limit:
  1363  	CMPQ SI, AX
  1364  	JHI  loop
  1365  	RET
  1366  
  1367  // func AmdAxpyPointer_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1368  // Requires: SSE
  1369  TEXT ·AmdAxpyPointer_V3A14(SB), NOSPLIT, $0-48
  1370  	MOVSS alpha+0(FP), X0
  1371  	MOVQ  xs+8(FP), AX
  1372  	MOVQ  incx+16(FP), CX
  1373  	MOVQ  ys+24(FP), DX
  1374  	MOVQ  incy+32(FP), BX
  1375  	MOVQ  n+40(FP), SI
  1376  	SHLQ  $0x02, SI
  1377  	IMULQ CX, SI
  1378  	ADDQ  AX, SI
  1379  	JMP   check_limit
  1380  	PCALIGN $0x08
  1381  	NOP
  1382  	NOP
  1383  	NOP
  1384  	NOP
  1385  	NOP
  1386  	NOP
  1387  
  1388  loop:
  1389  	MOVSS (AX), X1
  1390  	MULSS X0, X1
  1391  	ADDSS (DX), X1
  1392  	MOVSS X1, (DX)
  1393  	LEAQ  (AX)(CX*4), AX
  1394  	LEAQ  (DX)(BX*4), DX
  1395  
  1396  check_limit:
  1397  	CMPQ SI, AX
  1398  	JHI  loop
  1399  	RET
  1400  
  1401  // func AmdAxpyPointer_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1402  // Requires: SSE
  1403  TEXT ·AmdAxpyPointer_V4A14(SB), NOSPLIT, $0-48
  1404  	MOVSS alpha+0(FP), X0
  1405  	MOVQ  xs+8(FP), AX
  1406  	MOVQ  incx+16(FP), CX
  1407  	MOVQ  ys+24(FP), DX
  1408  	MOVQ  incy+32(FP), BX
  1409  	MOVQ  n+40(FP), SI
  1410  	SHLQ  $0x02, SI
  1411  	IMULQ CX, SI
  1412  	ADDQ  AX, SI
  1413  	JMP   check_limit
  1414  	PCALIGN $0x08
  1415  	NOP
  1416  	NOP
  1417  	NOP
  1418  	NOP
  1419  	NOP
  1420  	NOP
  1421  
  1422  loop:
  1423  	MOVSS (AX), X1
  1424  	MULSS X0, X1
  1425  	ADDSS (DX), X1
  1426  	MOVSS X1, (DX)
  1427  	LEAQ  (AX)(CX*4), AX
  1428  	LEAQ  (DX)(BX*4), DX
  1429  
  1430  check_limit:
  1431  	CMPQ SI, AX
  1432  	JHI  loop
  1433  	RET
  1434  
  1435  // func AmdAxpyPointer_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1436  // Requires: SSE
  1437  TEXT ·AmdAxpyPointer_V5A14(SB), NOSPLIT, $0-48
  1438  	MOVSS alpha+0(FP), X0
  1439  	MOVQ  xs+8(FP), AX
  1440  	MOVQ  incx+16(FP), CX
  1441  	MOVQ  ys+24(FP), DX
  1442  	MOVQ  incy+32(FP), BX
  1443  	MOVQ  n+40(FP), SI
  1444  	SHLQ  $0x02, SI
  1445  	IMULQ CX, SI
  1446  	ADDQ  AX, SI
  1447  	JMP   check_limit
  1448  	PCALIGN $0x08
  1449  	NOP
  1450  	NOP
  1451  	NOP
  1452  	NOP
  1453  	NOP
  1454  	NOP
  1455  
  1456  loop:
  1457  	MOVSS (AX), X1
  1458  	MULSS X0, X1
  1459  	ADDSS (DX), X1
  1460  	MOVSS X1, (DX)
  1461  	LEAQ  (AX)(CX*4), AX
  1462  	LEAQ  (DX)(BX*4), DX
  1463  
  1464  check_limit:
  1465  	CMPQ SI, AX
  1466  	JHI  loop
  1467  	RET
  1468  
  1469  // func AmdAxpyPointer_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1470  // Requires: SSE
  1471  TEXT ·AmdAxpyPointer_V0A15(SB), NOSPLIT, $0-48
  1472  	MOVSS alpha+0(FP), X0
  1473  	MOVQ  xs+8(FP), AX
  1474  	MOVQ  incx+16(FP), CX
  1475  	MOVQ  ys+24(FP), DX
  1476  	MOVQ  incy+32(FP), BX
  1477  	MOVQ  n+40(FP), SI
  1478  	SHLQ  $0x02, SI
  1479  	IMULQ CX, SI
  1480  	ADDQ  AX, SI
  1481  	JMP   check_limit
  1482  	PCALIGN $0x08
  1483  	NOP
  1484  	NOP
  1485  	NOP
  1486  	NOP
  1487  	NOP
  1488  	NOP
  1489  	NOP
  1490  
  1491  loop:
  1492  	MOVSS (AX), X1
  1493  	MULSS X0, X1
  1494  	ADDSS (DX), X1
  1495  	MOVSS X1, (DX)
  1496  	LEAQ  (AX)(CX*4), AX
  1497  	LEAQ  (DX)(BX*4), DX
  1498  
  1499  check_limit:
  1500  	CMPQ SI, AX
  1501  	JHI  loop
  1502  	RET
  1503  
  1504  // func AmdAxpyPointer_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1505  // Requires: SSE
  1506  TEXT ·AmdAxpyPointer_V1A15(SB), NOSPLIT, $0-48
  1507  	MOVSS alpha+0(FP), X0
  1508  	MOVQ  xs+8(FP), AX
  1509  	MOVQ  incx+16(FP), CX
  1510  	MOVQ  ys+24(FP), DX
  1511  	MOVQ  incy+32(FP), BX
  1512  	MOVQ  n+40(FP), SI
  1513  	SHLQ  $0x02, SI
  1514  	IMULQ CX, SI
  1515  	ADDQ  AX, SI
  1516  	JMP   check_limit
  1517  	PCALIGN $0x08
  1518  	NOP
  1519  	NOP
  1520  	NOP
  1521  	NOP
  1522  	NOP
  1523  	NOP
  1524  	NOP
  1525  
  1526  loop:
  1527  	MOVSS (AX), X1
  1528  	MULSS X0, X1
  1529  	ADDSS (DX), X1
  1530  	MOVSS X1, (DX)
  1531  	LEAQ  (AX)(CX*4), AX
  1532  	LEAQ  (DX)(BX*4), DX
  1533  
  1534  check_limit:
  1535  	CMPQ SI, AX
  1536  	JHI  loop
  1537  	RET
  1538  
  1539  // func AmdAxpyPointer_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1540  // Requires: SSE
  1541  TEXT ·AmdAxpyPointer_V2A15(SB), NOSPLIT, $0-48
  1542  	MOVSS alpha+0(FP), X0
  1543  	MOVQ  xs+8(FP), AX
  1544  	MOVQ  incx+16(FP), CX
  1545  	MOVQ  ys+24(FP), DX
  1546  	MOVQ  incy+32(FP), BX
  1547  	MOVQ  n+40(FP), SI
  1548  	SHLQ  $0x02, SI
  1549  	IMULQ CX, SI
  1550  	ADDQ  AX, SI
  1551  	JMP   check_limit
  1552  	PCALIGN $0x08
  1553  	NOP
  1554  	NOP
  1555  	NOP
  1556  	NOP
  1557  	NOP
  1558  	NOP
  1559  	NOP
  1560  
  1561  loop:
  1562  	MOVSS (AX), X1
  1563  	MULSS X0, X1
  1564  	ADDSS (DX), X1
  1565  	MOVSS X1, (DX)
  1566  	LEAQ  (AX)(CX*4), AX
  1567  	LEAQ  (DX)(BX*4), DX
  1568  
  1569  check_limit:
  1570  	CMPQ SI, AX
  1571  	JHI  loop
  1572  	RET
  1573  
  1574  // func AmdAxpyPointer_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1575  // Requires: SSE
  1576  TEXT ·AmdAxpyPointer_V3A15(SB), NOSPLIT, $0-48
  1577  	MOVSS alpha+0(FP), X0
  1578  	MOVQ  xs+8(FP), AX
  1579  	MOVQ  incx+16(FP), CX
  1580  	MOVQ  ys+24(FP), DX
  1581  	MOVQ  incy+32(FP), BX
  1582  	MOVQ  n+40(FP), SI
  1583  	SHLQ  $0x02, SI
  1584  	IMULQ CX, SI
  1585  	ADDQ  AX, SI
  1586  	JMP   check_limit
  1587  	PCALIGN $0x08
  1588  	NOP
  1589  	NOP
  1590  	NOP
  1591  	NOP
  1592  	NOP
  1593  	NOP
  1594  	NOP
  1595  
  1596  loop:
  1597  	MOVSS (AX), X1
  1598  	MULSS X0, X1
  1599  	ADDSS (DX), X1
  1600  	MOVSS X1, (DX)
  1601  	LEAQ  (AX)(CX*4), AX
  1602  	LEAQ  (DX)(BX*4), DX
  1603  
  1604  check_limit:
  1605  	CMPQ SI, AX
  1606  	JHI  loop
  1607  	RET
  1608  
  1609  // func AmdAxpyPointer_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1610  // Requires: SSE
  1611  TEXT ·AmdAxpyPointer_V4A15(SB), NOSPLIT, $0-48
  1612  	MOVSS alpha+0(FP), X0
  1613  	MOVQ  xs+8(FP), AX
  1614  	MOVQ  incx+16(FP), CX
  1615  	MOVQ  ys+24(FP), DX
  1616  	MOVQ  incy+32(FP), BX
  1617  	MOVQ  n+40(FP), SI
  1618  	SHLQ  $0x02, SI
  1619  	IMULQ CX, SI
  1620  	ADDQ  AX, SI
  1621  	JMP   check_limit
  1622  	PCALIGN $0x08
  1623  	NOP
  1624  	NOP
  1625  	NOP
  1626  	NOP
  1627  	NOP
  1628  	NOP
  1629  	NOP
  1630  
  1631  loop:
  1632  	MOVSS (AX), X1
  1633  	MULSS X0, X1
  1634  	ADDSS (DX), X1
  1635  	MOVSS X1, (DX)
  1636  	LEAQ  (AX)(CX*4), AX
  1637  	LEAQ  (DX)(BX*4), DX
  1638  
  1639  check_limit:
  1640  	CMPQ SI, AX
  1641  	JHI  loop
  1642  	RET
  1643  
  1644  // func AmdAxpyPointer_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1645  // Requires: SSE
  1646  TEXT ·AmdAxpyPointer_V5A15(SB), NOSPLIT, $0-48
  1647  	MOVSS alpha+0(FP), X0
  1648  	MOVQ  xs+8(FP), AX
  1649  	MOVQ  incx+16(FP), CX
  1650  	MOVQ  ys+24(FP), DX
  1651  	MOVQ  incy+32(FP), BX
  1652  	MOVQ  n+40(FP), SI
  1653  	SHLQ  $0x02, SI
  1654  	IMULQ CX, SI
  1655  	ADDQ  AX, SI
  1656  	JMP   check_limit
  1657  	PCALIGN $0x08
  1658  	NOP
  1659  	NOP
  1660  	NOP
  1661  	NOP
  1662  	NOP
  1663  	NOP
  1664  	NOP
  1665  
  1666  loop:
  1667  	MOVSS (AX), X1
  1668  	MULSS X0, X1
  1669  	ADDSS (DX), X1
  1670  	MOVSS X1, (DX)
  1671  	LEAQ  (AX)(CX*4), AX
  1672  	LEAQ  (DX)(BX*4), DX
  1673  
  1674  check_limit:
  1675  	CMPQ SI, AX
  1676  	JHI  loop
  1677  	RET
  1678  
  1679  // func AmdAxpyPointer_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1680  // Requires: SSE
  1681  TEXT ·AmdAxpyPointer_V0A16(SB), NOSPLIT, $0-48
  1682  	MOVSS alpha+0(FP), X0
  1683  	MOVQ  xs+8(FP), AX
  1684  	MOVQ  incx+16(FP), CX
  1685  	MOVQ  ys+24(FP), DX
  1686  	MOVQ  incy+32(FP), BX
  1687  	MOVQ  n+40(FP), SI
  1688  	SHLQ  $0x02, SI
  1689  	IMULQ CX, SI
  1690  	ADDQ  AX, SI
  1691  	JMP   check_limit
  1692  	PCALIGN $0x10
  1693  
  1694  loop:
  1695  	MOVSS (AX), X1
  1696  	MULSS X0, X1
  1697  	ADDSS (DX), X1
  1698  	MOVSS X1, (DX)
  1699  	LEAQ  (AX)(CX*4), AX
  1700  	LEAQ  (DX)(BX*4), DX
  1701  
  1702  check_limit:
  1703  	CMPQ SI, AX
  1704  	JHI  loop
  1705  	RET
  1706  
  1707  // func AmdAxpyPointer_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1708  // Requires: SSE
  1709  TEXT ·AmdAxpyPointer_V1A16(SB), NOSPLIT, $0-48
  1710  	MOVSS alpha+0(FP), X0
  1711  	MOVQ  xs+8(FP), AX
  1712  	MOVQ  incx+16(FP), CX
  1713  	MOVQ  ys+24(FP), DX
  1714  	MOVQ  incy+32(FP), BX
  1715  	MOVQ  n+40(FP), SI
  1716  	SHLQ  $0x02, SI
  1717  	IMULQ CX, SI
  1718  	ADDQ  AX, SI
  1719  	JMP   check_limit
  1720  	PCALIGN $0x10
  1721  
  1722  loop:
  1723  	MOVSS (AX), X1
  1724  	MULSS X0, X1
  1725  	ADDSS (DX), X1
  1726  	MOVSS X1, (DX)
  1727  	LEAQ  (AX)(CX*4), AX
  1728  	LEAQ  (DX)(BX*4), DX
  1729  
  1730  check_limit:
  1731  	CMPQ SI, AX
  1732  	JHI  loop
  1733  	RET
  1734  
  1735  // func AmdAxpyPointer_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1736  // Requires: SSE
  1737  TEXT ·AmdAxpyPointer_V2A16(SB), NOSPLIT, $0-48
  1738  	MOVSS alpha+0(FP), X0
  1739  	MOVQ  xs+8(FP), AX
  1740  	MOVQ  incx+16(FP), CX
  1741  	MOVQ  ys+24(FP), DX
  1742  	MOVQ  incy+32(FP), BX
  1743  	MOVQ  n+40(FP), SI
  1744  	SHLQ  $0x02, SI
  1745  	IMULQ CX, SI
  1746  	ADDQ  AX, SI
  1747  	JMP   check_limit
  1748  	PCALIGN $0x10
  1749  
  1750  loop:
  1751  	MOVSS (AX), X1
  1752  	MULSS X0, X1
  1753  	ADDSS (DX), X1
  1754  	MOVSS X1, (DX)
  1755  	LEAQ  (AX)(CX*4), AX
  1756  	LEAQ  (DX)(BX*4), DX
  1757  
  1758  check_limit:
  1759  	CMPQ SI, AX
  1760  	JHI  loop
  1761  	RET
  1762  
  1763  // func AmdAxpyPointer_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1764  // Requires: SSE
  1765  TEXT ·AmdAxpyPointer_V3A16(SB), NOSPLIT, $0-48
  1766  	MOVSS alpha+0(FP), X0
  1767  	MOVQ  xs+8(FP), AX
  1768  	MOVQ  incx+16(FP), CX
  1769  	MOVQ  ys+24(FP), DX
  1770  	MOVQ  incy+32(FP), BX
  1771  	MOVQ  n+40(FP), SI
  1772  	SHLQ  $0x02, SI
  1773  	IMULQ CX, SI
  1774  	ADDQ  AX, SI
  1775  	JMP   check_limit
  1776  	PCALIGN $0x10
  1777  
  1778  loop:
  1779  	MOVSS (AX), X1
  1780  	MULSS X0, X1
  1781  	ADDSS (DX), X1
  1782  	MOVSS X1, (DX)
  1783  	LEAQ  (AX)(CX*4), AX
  1784  	LEAQ  (DX)(BX*4), DX
  1785  
  1786  check_limit:
  1787  	CMPQ SI, AX
  1788  	JHI  loop
  1789  	RET
  1790  
  1791  // func AmdAxpyPointer_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1792  // Requires: SSE
  1793  TEXT ·AmdAxpyPointer_V4A16(SB), NOSPLIT, $0-48
  1794  	MOVSS alpha+0(FP), X0
  1795  	MOVQ  xs+8(FP), AX
  1796  	MOVQ  incx+16(FP), CX
  1797  	MOVQ  ys+24(FP), DX
  1798  	MOVQ  incy+32(FP), BX
  1799  	MOVQ  n+40(FP), SI
  1800  	SHLQ  $0x02, SI
  1801  	IMULQ CX, SI
  1802  	ADDQ  AX, SI
  1803  	JMP   check_limit
  1804  	PCALIGN $0x10
  1805  
  1806  loop:
  1807  	MOVSS (AX), X1
  1808  	MULSS X0, X1
  1809  	ADDSS (DX), X1
  1810  	MOVSS X1, (DX)
  1811  	LEAQ  (AX)(CX*4), AX
  1812  	LEAQ  (DX)(BX*4), DX
  1813  
  1814  check_limit:
  1815  	CMPQ SI, AX
  1816  	JHI  loop
  1817  	RET
  1818  
  1819  // func AmdAxpyPointer_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1820  // Requires: SSE
  1821  TEXT ·AmdAxpyPointer_V5A16(SB), NOSPLIT, $0-48
  1822  	MOVSS alpha+0(FP), X0
  1823  	MOVQ  xs+8(FP), AX
  1824  	MOVQ  incx+16(FP), CX
  1825  	MOVQ  ys+24(FP), DX
  1826  	MOVQ  incy+32(FP), BX
  1827  	MOVQ  n+40(FP), SI
  1828  	SHLQ  $0x02, SI
  1829  	IMULQ CX, SI
  1830  	ADDQ  AX, SI
  1831  	JMP   check_limit
  1832  	PCALIGN $0x10
  1833  
  1834  loop:
  1835  	MOVSS (AX), X1
  1836  	MULSS X0, X1
  1837  	ADDSS (DX), X1
  1838  	MOVSS X1, (DX)
  1839  	LEAQ  (AX)(CX*4), AX
  1840  	LEAQ  (DX)(BX*4), DX
  1841  
  1842  check_limit:
  1843  	CMPQ SI, AX
  1844  	JHI  loop
  1845  	RET
  1846  
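// The AmdAxpyPointerLoop_* routines below compute the same result as the
// AmdAxpyPointer_* routines above, but terminate the loop with an explicit
// element counter: DI is zeroed before the loop, incremented once per
// iteration, and compared against n at check_limit, rather than comparing the
// xs pointer against a precomputed end pointer. The alignment-padding variants
// follow the same A<k> scheme.
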
  1847  // func AmdAxpyPointerLoop_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1848  // Requires: SSE
  1849  TEXT ·AmdAxpyPointerLoop_V0A0(SB), NOSPLIT, $0-48
  1850  	MOVSS alpha+0(FP), X0
  1851  	MOVQ  xs+8(FP), AX
  1852  	MOVQ  incx+16(FP), CX
  1853  	MOVQ  ys+24(FP), DX
  1854  	MOVQ  incy+32(FP), BX
  1855  	MOVQ  n+40(FP), SI
  1856  	XORQ  DI, DI
  1857  	JMP   check_limit
  1858  
  1859  loop:
  1860  	MOVSS (AX), X1
  1861  	MULSS X0, X1
  1862  	ADDSS (DX), X1
  1863  	MOVSS X1, (DX)
  1864  	INCQ  DI
  1865  	LEAQ  (AX)(CX*4), AX
  1866  	LEAQ  (DX)(BX*4), DX
  1867  
  1868  check_limit:
  1869  	CMPQ SI, DI
  1870  	JHI  loop
  1871  	RET
  1872  
  1873  // func AmdAxpyPointerLoop_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1874  // Requires: SSE
  1875  TEXT ·AmdAxpyPointerLoop_V1A0(SB), NOSPLIT, $0-48
  1876  	MOVSS alpha+0(FP), X0
  1877  	MOVQ  xs+8(FP), AX
  1878  	MOVQ  incx+16(FP), CX
  1879  	MOVQ  ys+24(FP), DX
  1880  	MOVQ  incy+32(FP), BX
  1881  	MOVQ  n+40(FP), SI
  1882  	XORQ  DI, DI
  1883  	JMP   check_limit
  1884  
  1885  loop:
  1886  	MOVSS (AX), X1
  1887  	MULSS X0, X1
  1888  	ADDSS (DX), X1
  1889  	MOVSS X1, (DX)
  1890  	INCQ  DI
  1891  	LEAQ  (AX)(CX*4), AX
  1892  	LEAQ  (DX)(BX*4), DX
  1893  
  1894  check_limit:
  1895  	CMPQ SI, DI
  1896  	JHI  loop
  1897  	RET
  1898  
  1899  // func AmdAxpyPointerLoop_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1900  // Requires: SSE
  1901  TEXT ·AmdAxpyPointerLoop_V2A0(SB), NOSPLIT, $0-48
  1902  	MOVSS alpha+0(FP), X0
  1903  	MOVQ  xs+8(FP), AX
  1904  	MOVQ  incx+16(FP), CX
  1905  	MOVQ  ys+24(FP), DX
  1906  	MOVQ  incy+32(FP), BX
  1907  	MOVQ  n+40(FP), SI
  1908  	XORQ  DI, DI
  1909  	JMP   check_limit
  1910  
  1911  loop:
  1912  	MOVSS (AX), X1
  1913  	MULSS X0, X1
  1914  	ADDSS (DX), X1
  1915  	MOVSS X1, (DX)
  1916  	INCQ  DI
  1917  	LEAQ  (AX)(CX*4), AX
  1918  	LEAQ  (DX)(BX*4), DX
  1919  
  1920  check_limit:
  1921  	CMPQ SI, DI
  1922  	JHI  loop
  1923  	RET
  1924  
  1925  // func AmdAxpyPointerLoop_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1926  // Requires: SSE
  1927  TEXT ·AmdAxpyPointerLoop_V3A0(SB), NOSPLIT, $0-48
  1928  	MOVSS alpha+0(FP), X0
  1929  	MOVQ  xs+8(FP), AX
  1930  	MOVQ  incx+16(FP), CX
  1931  	MOVQ  ys+24(FP), DX
  1932  	MOVQ  incy+32(FP), BX
  1933  	MOVQ  n+40(FP), SI
  1934  	XORQ  DI, DI
  1935  	JMP   check_limit
  1936  
  1937  loop:
  1938  	MOVSS (AX), X1
  1939  	MULSS X0, X1
  1940  	ADDSS (DX), X1
  1941  	MOVSS X1, (DX)
  1942  	INCQ  DI
  1943  	LEAQ  (AX)(CX*4), AX
  1944  	LEAQ  (DX)(BX*4), DX
  1945  
  1946  check_limit:
  1947  	CMPQ SI, DI
  1948  	JHI  loop
  1949  	RET
  1950  
  1951  // func AmdAxpyPointerLoop_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1952  // Requires: SSE
  1953  TEXT ·AmdAxpyPointerLoop_V4A0(SB), NOSPLIT, $0-48
  1954  	MOVSS alpha+0(FP), X0
  1955  	MOVQ  xs+8(FP), AX
  1956  	MOVQ  incx+16(FP), CX
  1957  	MOVQ  ys+24(FP), DX
  1958  	MOVQ  incy+32(FP), BX
  1959  	MOVQ  n+40(FP), SI
  1960  	XORQ  DI, DI
  1961  	JMP   check_limit
  1962  
  1963  loop:
  1964  	MOVSS (AX), X1
  1965  	MULSS X0, X1
  1966  	ADDSS (DX), X1
  1967  	MOVSS X1, (DX)
  1968  	INCQ  DI
  1969  	LEAQ  (AX)(CX*4), AX
  1970  	LEAQ  (DX)(BX*4), DX
  1971  
  1972  check_limit:
  1973  	CMPQ SI, DI
  1974  	JHI  loop
  1975  	RET
  1976  
  1977  // func AmdAxpyPointerLoop_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  1978  // Requires: SSE
  1979  TEXT ·AmdAxpyPointerLoop_V5A0(SB), NOSPLIT, $0-48
  1980  	MOVSS alpha+0(FP), X0
  1981  	MOVQ  xs+8(FP), AX
  1982  	MOVQ  incx+16(FP), CX
  1983  	MOVQ  ys+24(FP), DX
  1984  	MOVQ  incy+32(FP), BX
  1985  	MOVQ  n+40(FP), SI
  1986  	XORQ  DI, DI
  1987  	JMP   check_limit
  1988  
  1989  loop:
  1990  	MOVSS (AX), X1
  1991  	MULSS X0, X1
  1992  	ADDSS (DX), X1
  1993  	MOVSS X1, (DX)
  1994  	INCQ  DI
  1995  	LEAQ  (AX)(CX*4), AX
  1996  	LEAQ  (DX)(BX*4), DX
  1997  
  1998  check_limit:
  1999  	CMPQ SI, DI
  2000  	JHI  loop
  2001  	RET
  2002  
  2003  // func AmdAxpyPointerLoop_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2004  // Requires: SSE
  2005  TEXT ·AmdAxpyPointerLoop_V0A8(SB), NOSPLIT, $0-48
  2006  	MOVSS alpha+0(FP), X0
  2007  	MOVQ  xs+8(FP), AX
  2008  	MOVQ  incx+16(FP), CX
  2009  	MOVQ  ys+24(FP), DX
  2010  	MOVQ  incy+32(FP), BX
  2011  	MOVQ  n+40(FP), SI
  2012  	XORQ  DI, DI
  2013  	JMP   check_limit
  2014  	PCALIGN $0x08
  2015  
  2016  loop:
  2017  	MOVSS (AX), X1
  2018  	MULSS X0, X1
  2019  	ADDSS (DX), X1
  2020  	MOVSS X1, (DX)
  2021  	INCQ  DI
  2022  	LEAQ  (AX)(CX*4), AX
  2023  	LEAQ  (DX)(BX*4), DX
  2024  
  2025  check_limit:
  2026  	CMPQ SI, DI
  2027  	JHI  loop
  2028  	RET
  2029  
  2030  // func AmdAxpyPointerLoop_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2031  // Requires: SSE
  2032  TEXT ·AmdAxpyPointerLoop_V1A8(SB), NOSPLIT, $0-48
  2033  	MOVSS alpha+0(FP), X0
  2034  	MOVQ  xs+8(FP), AX
  2035  	MOVQ  incx+16(FP), CX
  2036  	MOVQ  ys+24(FP), DX
  2037  	MOVQ  incy+32(FP), BX
  2038  	MOVQ  n+40(FP), SI
  2039  	XORQ  DI, DI
  2040  	JMP   check_limit
  2041  	PCALIGN $0x08
  2042  
  2043  loop:
  2044  	MOVSS (AX), X1
  2045  	MULSS X0, X1
  2046  	ADDSS (DX), X1
  2047  	MOVSS X1, (DX)
  2048  	INCQ  DI
  2049  	LEAQ  (AX)(CX*4), AX
  2050  	LEAQ  (DX)(BX*4), DX
  2051  
  2052  check_limit:
  2053  	CMPQ SI, DI
  2054  	JHI  loop
  2055  	RET
  2056  
  2057  // func AmdAxpyPointerLoop_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2058  // Requires: SSE
  2059  TEXT ·AmdAxpyPointerLoop_V2A8(SB), NOSPLIT, $0-48
  2060  	MOVSS alpha+0(FP), X0
  2061  	MOVQ  xs+8(FP), AX
  2062  	MOVQ  incx+16(FP), CX
  2063  	MOVQ  ys+24(FP), DX
  2064  	MOVQ  incy+32(FP), BX
  2065  	MOVQ  n+40(FP), SI
  2066  	XORQ  DI, DI
  2067  	JMP   check_limit
  2068  	PCALIGN $0x08
  2069  
  2070  loop:
  2071  	MOVSS (AX), X1
  2072  	MULSS X0, X1
  2073  	ADDSS (DX), X1
  2074  	MOVSS X1, (DX)
  2075  	INCQ  DI
  2076  	LEAQ  (AX)(CX*4), AX
  2077  	LEAQ  (DX)(BX*4), DX
  2078  
  2079  check_limit:
  2080  	CMPQ SI, DI
  2081  	JHI  loop
  2082  	RET
  2083  
  2084  // func AmdAxpyPointerLoop_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2085  // Requires: SSE
  2086  TEXT ·AmdAxpyPointerLoop_V3A8(SB), NOSPLIT, $0-48
  2087  	MOVSS alpha+0(FP), X0
  2088  	MOVQ  xs+8(FP), AX
  2089  	MOVQ  incx+16(FP), CX
  2090  	MOVQ  ys+24(FP), DX
  2091  	MOVQ  incy+32(FP), BX
  2092  	MOVQ  n+40(FP), SI
  2093  	XORQ  DI, DI
  2094  	JMP   check_limit
  2095  	PCALIGN $0x08
  2096  
  2097  loop:
  2098  	MOVSS (AX), X1
  2099  	MULSS X0, X1
  2100  	ADDSS (DX), X1
  2101  	MOVSS X1, (DX)
  2102  	INCQ  DI
  2103  	LEAQ  (AX)(CX*4), AX
  2104  	LEAQ  (DX)(BX*4), DX
  2105  
  2106  check_limit:
  2107  	CMPQ SI, DI
  2108  	JHI  loop
  2109  	RET
  2110  
  2111  // func AmdAxpyPointerLoop_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2112  // Requires: SSE
  2113  TEXT ·AmdAxpyPointerLoop_V4A8(SB), NOSPLIT, $0-48
  2114  	MOVSS alpha+0(FP), X0
  2115  	MOVQ  xs+8(FP), AX
  2116  	MOVQ  incx+16(FP), CX
  2117  	MOVQ  ys+24(FP), DX
  2118  	MOVQ  incy+32(FP), BX
  2119  	MOVQ  n+40(FP), SI
  2120  	XORQ  DI, DI
  2121  	JMP   check_limit
  2122  	PCALIGN $0x08
  2123  
  2124  loop:
  2125  	MOVSS (AX), X1
  2126  	MULSS X0, X1
  2127  	ADDSS (DX), X1
  2128  	MOVSS X1, (DX)
  2129  	INCQ  DI
  2130  	LEAQ  (AX)(CX*4), AX
  2131  	LEAQ  (DX)(BX*4), DX
  2132  
  2133  check_limit:
  2134  	CMPQ SI, DI
  2135  	JHI  loop
  2136  	RET
  2137  
  2138  // func AmdAxpyPointerLoop_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2139  // Requires: SSE
  2140  TEXT ·AmdAxpyPointerLoop_V5A8(SB), NOSPLIT, $0-48
  2141  	MOVSS alpha+0(FP), X0
  2142  	MOVQ  xs+8(FP), AX
  2143  	MOVQ  incx+16(FP), CX
  2144  	MOVQ  ys+24(FP), DX
  2145  	MOVQ  incy+32(FP), BX
  2146  	MOVQ  n+40(FP), SI
  2147  	XORQ  DI, DI
  2148  	JMP   check_limit
  2149  	PCALIGN $0x08
  2150  
  2151  loop:
  2152  	MOVSS (AX), X1
  2153  	MULSS X0, X1
  2154  	ADDSS (DX), X1
  2155  	MOVSS X1, (DX)
  2156  	INCQ  DI
  2157  	LEAQ  (AX)(CX*4), AX
  2158  	LEAQ  (DX)(BX*4), DX
  2159  
  2160  check_limit:
  2161  	CMPQ SI, DI
  2162  	JHI  loop
  2163  	RET
  2164  
  2165  // func AmdAxpyPointerLoop_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2166  // Requires: SSE
  2167  TEXT ·AmdAxpyPointerLoop_V0A9(SB), NOSPLIT, $0-48
  2168  	MOVSS alpha+0(FP), X0
  2169  	MOVQ  xs+8(FP), AX
  2170  	MOVQ  incx+16(FP), CX
  2171  	MOVQ  ys+24(FP), DX
  2172  	MOVQ  incy+32(FP), BX
  2173  	MOVQ  n+40(FP), SI
  2174  	XORQ  DI, DI
  2175  	JMP   check_limit
  2176  	PCALIGN $0x08
  2177  	NOP
  2178  
  2179  loop:
  2180  	MOVSS (AX), X1
  2181  	MULSS X0, X1
  2182  	ADDSS (DX), X1
  2183  	MOVSS X1, (DX)
  2184  	INCQ  DI
  2185  	LEAQ  (AX)(CX*4), AX
  2186  	LEAQ  (DX)(BX*4), DX
  2187  
  2188  check_limit:
  2189  	CMPQ SI, DI
  2190  	JHI  loop
  2191  	RET
  2192  
  2193  // func AmdAxpyPointerLoop_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2194  // Requires: SSE
  2195  TEXT ·AmdAxpyPointerLoop_V1A9(SB), NOSPLIT, $0-48
  2196  	MOVSS alpha+0(FP), X0
  2197  	MOVQ  xs+8(FP), AX
  2198  	MOVQ  incx+16(FP), CX
  2199  	MOVQ  ys+24(FP), DX
  2200  	MOVQ  incy+32(FP), BX
  2201  	MOVQ  n+40(FP), SI
  2202  	XORQ  DI, DI
  2203  	JMP   check_limit
  2204  	PCALIGN $0x08
  2205  	NOP
  2206  
  2207  loop:
  2208  	MOVSS (AX), X1
  2209  	MULSS X0, X1
  2210  	ADDSS (DX), X1
  2211  	MOVSS X1, (DX)
  2212  	INCQ  DI
  2213  	LEAQ  (AX)(CX*4), AX
  2214  	LEAQ  (DX)(BX*4), DX
  2215  
  2216  check_limit:
  2217  	CMPQ SI, DI
  2218  	JHI  loop
  2219  	RET
  2220  
  2221  // func AmdAxpyPointerLoop_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2222  // Requires: SSE
  2223  TEXT ·AmdAxpyPointerLoop_V2A9(SB), NOSPLIT, $0-48
  2224  	MOVSS alpha+0(FP), X0
  2225  	MOVQ  xs+8(FP), AX
  2226  	MOVQ  incx+16(FP), CX
  2227  	MOVQ  ys+24(FP), DX
  2228  	MOVQ  incy+32(FP), BX
  2229  	MOVQ  n+40(FP), SI
  2230  	XORQ  DI, DI
  2231  	JMP   check_limit
  2232  	PCALIGN $0x08
  2233  	NOP
  2234  
  2235  loop:
  2236  	MOVSS (AX), X1
  2237  	MULSS X0, X1
  2238  	ADDSS (DX), X1
  2239  	MOVSS X1, (DX)
  2240  	INCQ  DI
  2241  	LEAQ  (AX)(CX*4), AX
  2242  	LEAQ  (DX)(BX*4), DX
  2243  
  2244  check_limit:
  2245  	CMPQ SI, DI
  2246  	JHI  loop
  2247  	RET
  2248  
  2249  // func AmdAxpyPointerLoop_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2250  // Requires: SSE
  2251  TEXT ·AmdAxpyPointerLoop_V3A9(SB), NOSPLIT, $0-48
  2252  	MOVSS alpha+0(FP), X0
  2253  	MOVQ  xs+8(FP), AX
  2254  	MOVQ  incx+16(FP), CX
  2255  	MOVQ  ys+24(FP), DX
  2256  	MOVQ  incy+32(FP), BX
  2257  	MOVQ  n+40(FP), SI
  2258  	XORQ  DI, DI
  2259  	JMP   check_limit
  2260  	PCALIGN $0x08
  2261  	NOP
  2262  
  2263  loop:
  2264  	MOVSS (AX), X1
  2265  	MULSS X0, X1
  2266  	ADDSS (DX), X1
  2267  	MOVSS X1, (DX)
  2268  	INCQ  DI
  2269  	LEAQ  (AX)(CX*4), AX
  2270  	LEAQ  (DX)(BX*4), DX
  2271  
  2272  check_limit:
  2273  	CMPQ SI, DI
  2274  	JHI  loop
  2275  	RET
  2276  
  2277  // func AmdAxpyPointerLoop_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2278  // Requires: SSE
  2279  TEXT ·AmdAxpyPointerLoop_V4A9(SB), NOSPLIT, $0-48
  2280  	MOVSS alpha+0(FP), X0
  2281  	MOVQ  xs+8(FP), AX
  2282  	MOVQ  incx+16(FP), CX
  2283  	MOVQ  ys+24(FP), DX
  2284  	MOVQ  incy+32(FP), BX
  2285  	MOVQ  n+40(FP), SI
  2286  	XORQ  DI, DI
  2287  	JMP   check_limit
  2288  	PCALIGN $0x08
  2289  	NOP
  2290  
  2291  loop:
  2292  	MOVSS (AX), X1
  2293  	MULSS X0, X1
  2294  	ADDSS (DX), X1
  2295  	MOVSS X1, (DX)
  2296  	INCQ  DI
  2297  	LEAQ  (AX)(CX*4), AX
  2298  	LEAQ  (DX)(BX*4), DX
  2299  
  2300  check_limit:
  2301  	CMPQ SI, DI
  2302  	JHI  loop
  2303  	RET
  2304  
  2305  // func AmdAxpyPointerLoop_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2306  // Requires: SSE
  2307  TEXT ·AmdAxpyPointerLoop_V5A9(SB), NOSPLIT, $0-48
  2308  	MOVSS alpha+0(FP), X0
  2309  	MOVQ  xs+8(FP), AX
  2310  	MOVQ  incx+16(FP), CX
  2311  	MOVQ  ys+24(FP), DX
  2312  	MOVQ  incy+32(FP), BX
  2313  	MOVQ  n+40(FP), SI
  2314  	XORQ  DI, DI
  2315  	JMP   check_limit
  2316  	PCALIGN $0x08
  2317  	NOP
  2318  
  2319  loop:
  2320  	MOVSS (AX), X1
  2321  	MULSS X0, X1
  2322  	ADDSS (DX), X1
  2323  	MOVSS X1, (DX)
  2324  	INCQ  DI
  2325  	LEAQ  (AX)(CX*4), AX
  2326  	LEAQ  (DX)(BX*4), DX
  2327  
  2328  check_limit:
  2329  	CMPQ SI, DI
  2330  	JHI  loop
  2331  	RET
  2332  
  2333  // func AmdAxpyPointerLoop_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2334  // Requires: SSE
  2335  TEXT ·AmdAxpyPointerLoop_V0A10(SB), NOSPLIT, $0-48
  2336  	MOVSS alpha+0(FP), X0
  2337  	MOVQ  xs+8(FP), AX
  2338  	MOVQ  incx+16(FP), CX
  2339  	MOVQ  ys+24(FP), DX
  2340  	MOVQ  incy+32(FP), BX
  2341  	MOVQ  n+40(FP), SI
  2342  	XORQ  DI, DI
  2343  	JMP   check_limit
  2344  	PCALIGN $0x08
  2345  	NOP
  2346  	NOP
  2347  
  2348  loop:
  2349  	MOVSS (AX), X1
  2350  	MULSS X0, X1
  2351  	ADDSS (DX), X1
  2352  	MOVSS X1, (DX)
  2353  	INCQ  DI
  2354  	LEAQ  (AX)(CX*4), AX
  2355  	LEAQ  (DX)(BX*4), DX
  2356  
  2357  check_limit:
  2358  	CMPQ SI, DI
  2359  	JHI  loop
  2360  	RET
  2361  
  2362  // func AmdAxpyPointerLoop_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2363  // Requires: SSE
  2364  TEXT ·AmdAxpyPointerLoop_V1A10(SB), NOSPLIT, $0-48
  2365  	MOVSS alpha+0(FP), X0
  2366  	MOVQ  xs+8(FP), AX
  2367  	MOVQ  incx+16(FP), CX
  2368  	MOVQ  ys+24(FP), DX
  2369  	MOVQ  incy+32(FP), BX
  2370  	MOVQ  n+40(FP), SI
  2371  	XORQ  DI, DI
  2372  	JMP   check_limit
  2373  	PCALIGN $0x08
  2374  	NOP
  2375  	NOP
  2376  
  2377  loop:
  2378  	MOVSS (AX), X1
  2379  	MULSS X0, X1
  2380  	ADDSS (DX), X1
  2381  	MOVSS X1, (DX)
  2382  	INCQ  DI
  2383  	LEAQ  (AX)(CX*4), AX
  2384  	LEAQ  (DX)(BX*4), DX
  2385  
  2386  check_limit:
  2387  	CMPQ SI, DI
  2388  	JHI  loop
  2389  	RET
  2390  
  2391  // func AmdAxpyPointerLoop_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2392  // Requires: SSE
  2393  TEXT ·AmdAxpyPointerLoop_V2A10(SB), NOSPLIT, $0-48
  2394  	MOVSS alpha+0(FP), X0
  2395  	MOVQ  xs+8(FP), AX
  2396  	MOVQ  incx+16(FP), CX
  2397  	MOVQ  ys+24(FP), DX
  2398  	MOVQ  incy+32(FP), BX
  2399  	MOVQ  n+40(FP), SI
  2400  	XORQ  DI, DI
  2401  	JMP   check_limit
  2402  	PCALIGN $0x08
  2403  	NOP
  2404  	NOP
  2405  
  2406  loop:
  2407  	MOVSS (AX), X1
  2408  	MULSS X0, X1
  2409  	ADDSS (DX), X1
  2410  	MOVSS X1, (DX)
  2411  	INCQ  DI
  2412  	LEAQ  (AX)(CX*4), AX
  2413  	LEAQ  (DX)(BX*4), DX
  2414  
  2415  check_limit:
  2416  	CMPQ SI, DI
  2417  	JHI  loop
  2418  	RET
  2419  
  2420  // func AmdAxpyPointerLoop_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2421  // Requires: SSE
  2422  TEXT ·AmdAxpyPointerLoop_V3A10(SB), NOSPLIT, $0-48
  2423  	MOVSS alpha+0(FP), X0
  2424  	MOVQ  xs+8(FP), AX
  2425  	MOVQ  incx+16(FP), CX
  2426  	MOVQ  ys+24(FP), DX
  2427  	MOVQ  incy+32(FP), BX
  2428  	MOVQ  n+40(FP), SI
  2429  	XORQ  DI, DI
  2430  	JMP   check_limit
  2431  	PCALIGN $0x08
  2432  	NOP
  2433  	NOP
  2434  
  2435  loop:
  2436  	MOVSS (AX), X1
  2437  	MULSS X0, X1
  2438  	ADDSS (DX), X1
  2439  	MOVSS X1, (DX)
  2440  	INCQ  DI
  2441  	LEAQ  (AX)(CX*4), AX
  2442  	LEAQ  (DX)(BX*4), DX
  2443  
  2444  check_limit:
  2445  	CMPQ SI, DI
  2446  	JHI  loop
  2447  	RET
  2448  
  2449  // func AmdAxpyPointerLoop_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2450  // Requires: SSE
  2451  TEXT ·AmdAxpyPointerLoop_V4A10(SB), NOSPLIT, $0-48
  2452  	MOVSS alpha+0(FP), X0
  2453  	MOVQ  xs+8(FP), AX
  2454  	MOVQ  incx+16(FP), CX
  2455  	MOVQ  ys+24(FP), DX
  2456  	MOVQ  incy+32(FP), BX
  2457  	MOVQ  n+40(FP), SI
  2458  	XORQ  DI, DI
  2459  	JMP   check_limit
  2460  	PCALIGN $0x08
  2461  	NOP
  2462  	NOP
  2463  
  2464  loop:
  2465  	MOVSS (AX), X1
  2466  	MULSS X0, X1
  2467  	ADDSS (DX), X1
  2468  	MOVSS X1, (DX)
  2469  	INCQ  DI
  2470  	LEAQ  (AX)(CX*4), AX
  2471  	LEAQ  (DX)(BX*4), DX
  2472  
  2473  check_limit:
  2474  	CMPQ SI, DI
  2475  	JHI  loop
  2476  	RET
  2477  
  2478  // func AmdAxpyPointerLoop_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2479  // Requires: SSE
  2480  TEXT ·AmdAxpyPointerLoop_V5A10(SB), NOSPLIT, $0-48
  2481  	MOVSS alpha+0(FP), X0
  2482  	MOVQ  xs+8(FP), AX
  2483  	MOVQ  incx+16(FP), CX
  2484  	MOVQ  ys+24(FP), DX
  2485  	MOVQ  incy+32(FP), BX
  2486  	MOVQ  n+40(FP), SI
  2487  	XORQ  DI, DI
  2488  	JMP   check_limit
  2489  	PCALIGN $0x08
  2490  	NOP
  2491  	NOP
  2492  
  2493  loop:
  2494  	MOVSS (AX), X1
  2495  	MULSS X0, X1
  2496  	ADDSS (DX), X1
  2497  	MOVSS X1, (DX)
  2498  	INCQ  DI
  2499  	LEAQ  (AX)(CX*4), AX
  2500  	LEAQ  (DX)(BX*4), DX
  2501  
  2502  check_limit:
  2503  	CMPQ SI, DI
  2504  	JHI  loop
  2505  	RET
  2506  
  2507  // func AmdAxpyPointerLoop_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2508  // Requires: SSE
  2509  TEXT ·AmdAxpyPointerLoop_V0A11(SB), NOSPLIT, $0-48
  2510  	MOVSS alpha+0(FP), X0
  2511  	MOVQ  xs+8(FP), AX
  2512  	MOVQ  incx+16(FP), CX
  2513  	MOVQ  ys+24(FP), DX
  2514  	MOVQ  incy+32(FP), BX
  2515  	MOVQ  n+40(FP), SI
  2516  	XORQ  DI, DI
  2517  	JMP   check_limit
  2518  	PCALIGN $0x08
  2519  	NOP
  2520  	NOP
  2521  	NOP
  2522  
  2523  loop:
  2524  	MOVSS (AX), X1
  2525  	MULSS X0, X1
  2526  	ADDSS (DX), X1
  2527  	MOVSS X1, (DX)
  2528  	INCQ  DI
  2529  	LEAQ  (AX)(CX*4), AX
  2530  	LEAQ  (DX)(BX*4), DX
  2531  
  2532  check_limit:
  2533  	CMPQ SI, DI
  2534  	JHI  loop
  2535  	RET
  2536  
  2537  // func AmdAxpyPointerLoop_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2538  // Requires: SSE
  2539  TEXT ·AmdAxpyPointerLoop_V1A11(SB), NOSPLIT, $0-48
  2540  	MOVSS alpha+0(FP), X0
  2541  	MOVQ  xs+8(FP), AX
  2542  	MOVQ  incx+16(FP), CX
  2543  	MOVQ  ys+24(FP), DX
  2544  	MOVQ  incy+32(FP), BX
  2545  	MOVQ  n+40(FP), SI
  2546  	XORQ  DI, DI
  2547  	JMP   check_limit
  2548  	PCALIGN $0x08
  2549  	NOP
  2550  	NOP
  2551  	NOP
  2552  
  2553  loop:
  2554  	MOVSS (AX), X1
  2555  	MULSS X0, X1
  2556  	ADDSS (DX), X1
  2557  	MOVSS X1, (DX)
  2558  	INCQ  DI
  2559  	LEAQ  (AX)(CX*4), AX
  2560  	LEAQ  (DX)(BX*4), DX
  2561  
  2562  check_limit:
  2563  	CMPQ SI, DI
  2564  	JHI  loop
  2565  	RET
  2566  
  2567  // func AmdAxpyPointerLoop_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2568  // Requires: SSE
  2569  TEXT ·AmdAxpyPointerLoop_V2A11(SB), NOSPLIT, $0-48
  2570  	MOVSS alpha+0(FP), X0
  2571  	MOVQ  xs+8(FP), AX
  2572  	MOVQ  incx+16(FP), CX
  2573  	MOVQ  ys+24(FP), DX
  2574  	MOVQ  incy+32(FP), BX
  2575  	MOVQ  n+40(FP), SI
  2576  	XORQ  DI, DI
  2577  	JMP   check_limit
  2578  	PCALIGN $0x08
  2579  	NOP
  2580  	NOP
  2581  	NOP
  2582  
  2583  loop:
  2584  	MOVSS (AX), X1
  2585  	MULSS X0, X1
  2586  	ADDSS (DX), X1
  2587  	MOVSS X1, (DX)
  2588  	INCQ  DI
  2589  	LEAQ  (AX)(CX*4), AX
  2590  	LEAQ  (DX)(BX*4), DX
  2591  
  2592  check_limit:
  2593  	CMPQ SI, DI
  2594  	JHI  loop
  2595  	RET
  2596  
  2597  // func AmdAxpyPointerLoop_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2598  // Requires: SSE
  2599  TEXT ·AmdAxpyPointerLoop_V3A11(SB), NOSPLIT, $0-48
  2600  	MOVSS alpha+0(FP), X0
  2601  	MOVQ  xs+8(FP), AX
  2602  	MOVQ  incx+16(FP), CX
  2603  	MOVQ  ys+24(FP), DX
  2604  	MOVQ  incy+32(FP), BX
  2605  	MOVQ  n+40(FP), SI
  2606  	XORQ  DI, DI
  2607  	JMP   check_limit
  2608  	PCALIGN $0x08
  2609  	NOP
  2610  	NOP
  2611  	NOP
  2612  
  2613  loop:
  2614  	MOVSS (AX), X1
  2615  	MULSS X0, X1
  2616  	ADDSS (DX), X1
  2617  	MOVSS X1, (DX)
  2618  	INCQ  DI
  2619  	LEAQ  (AX)(CX*4), AX
  2620  	LEAQ  (DX)(BX*4), DX
  2621  
  2622  check_limit:
  2623  	CMPQ SI, DI
  2624  	JHI  loop
  2625  	RET
  2626  
  2627  // func AmdAxpyPointerLoop_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2628  // Requires: SSE
  2629  TEXT ·AmdAxpyPointerLoop_V4A11(SB), NOSPLIT, $0-48
  2630  	MOVSS alpha+0(FP), X0
  2631  	MOVQ  xs+8(FP), AX
  2632  	MOVQ  incx+16(FP), CX
  2633  	MOVQ  ys+24(FP), DX
  2634  	MOVQ  incy+32(FP), BX
  2635  	MOVQ  n+40(FP), SI
  2636  	XORQ  DI, DI
  2637  	JMP   check_limit
  2638  	PCALIGN $0x08
  2639  	NOP
  2640  	NOP
  2641  	NOP
  2642  
  2643  loop:
  2644  	MOVSS (AX), X1
  2645  	MULSS X0, X1
  2646  	ADDSS (DX), X1
  2647  	MOVSS X1, (DX)
  2648  	INCQ  DI
  2649  	LEAQ  (AX)(CX*4), AX
  2650  	LEAQ  (DX)(BX*4), DX
  2651  
  2652  check_limit:
  2653  	CMPQ SI, DI
  2654  	JHI  loop
  2655  	RET
  2656  
  2657  // func AmdAxpyPointerLoop_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2658  // Requires: SSE
  2659  TEXT ·AmdAxpyPointerLoop_V5A11(SB), NOSPLIT, $0-48
  2660  	MOVSS alpha+0(FP), X0
  2661  	MOVQ  xs+8(FP), AX
  2662  	MOVQ  incx+16(FP), CX
  2663  	MOVQ  ys+24(FP), DX
  2664  	MOVQ  incy+32(FP), BX
  2665  	MOVQ  n+40(FP), SI
  2666  	XORQ  DI, DI
  2667  	JMP   check_limit
  2668  	PCALIGN $0x08
  2669  	NOP
  2670  	NOP
  2671  	NOP
  2672  
  2673  loop:
  2674  	MOVSS (AX), X1
  2675  	MULSS X0, X1
  2676  	ADDSS (DX), X1
  2677  	MOVSS X1, (DX)
  2678  	INCQ  DI
  2679  	LEAQ  (AX)(CX*4), AX
  2680  	LEAQ  (DX)(BX*4), DX
  2681  
  2682  check_limit:
  2683  	CMPQ SI, DI
  2684  	JHI  loop
  2685  	RET
  2686  
  2687  // func AmdAxpyPointerLoop_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2688  // Requires: SSE
  2689  TEXT ·AmdAxpyPointerLoop_V0A12(SB), NOSPLIT, $0-48
  2690  	MOVSS alpha+0(FP), X0
  2691  	MOVQ  xs+8(FP), AX
  2692  	MOVQ  incx+16(FP), CX
  2693  	MOVQ  ys+24(FP), DX
  2694  	MOVQ  incy+32(FP), BX
  2695  	MOVQ  n+40(FP), SI
  2696  	XORQ  DI, DI
  2697  	JMP   check_limit
  2698  	PCALIGN $0x08
  2699  	NOP
  2700  	NOP
  2701  	NOP
  2702  	NOP
  2703  
  2704  loop:
  2705  	MOVSS (AX), X1
  2706  	MULSS X0, X1
  2707  	ADDSS (DX), X1
  2708  	MOVSS X1, (DX)
  2709  	INCQ  DI
  2710  	LEAQ  (AX)(CX*4), AX
  2711  	LEAQ  (DX)(BX*4), DX
  2712  
  2713  check_limit:
  2714  	CMPQ SI, DI
  2715  	JHI  loop
  2716  	RET
  2717  
  2718  // func AmdAxpyPointerLoop_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2719  // Requires: SSE
  2720  TEXT ·AmdAxpyPointerLoop_V1A12(SB), NOSPLIT, $0-48
  2721  	MOVSS alpha+0(FP), X0
  2722  	MOVQ  xs+8(FP), AX
  2723  	MOVQ  incx+16(FP), CX
  2724  	MOVQ  ys+24(FP), DX
  2725  	MOVQ  incy+32(FP), BX
  2726  	MOVQ  n+40(FP), SI
  2727  	XORQ  DI, DI
  2728  	JMP   check_limit
  2729  	PCALIGN $0x08
  2730  	NOP
  2731  	NOP
  2732  	NOP
  2733  	NOP
  2734  
  2735  loop:
  2736  	MOVSS (AX), X1
  2737  	MULSS X0, X1
  2738  	ADDSS (DX), X1
  2739  	MOVSS X1, (DX)
  2740  	INCQ  DI
  2741  	LEAQ  (AX)(CX*4), AX
  2742  	LEAQ  (DX)(BX*4), DX
  2743  
  2744  check_limit:
  2745  	CMPQ SI, DI
  2746  	JHI  loop
  2747  	RET
  2748  
  2749  // func AmdAxpyPointerLoop_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2750  // Requires: SSE
  2751  TEXT ·AmdAxpyPointerLoop_V2A12(SB), NOSPLIT, $0-48
  2752  	MOVSS alpha+0(FP), X0
  2753  	MOVQ  xs+8(FP), AX
  2754  	MOVQ  incx+16(FP), CX
  2755  	MOVQ  ys+24(FP), DX
  2756  	MOVQ  incy+32(FP), BX
  2757  	MOVQ  n+40(FP), SI
  2758  	XORQ  DI, DI
  2759  	JMP   check_limit
  2760  	PCALIGN $0x08
  2761  	NOP
  2762  	NOP
  2763  	NOP
  2764  	NOP
  2765  
  2766  loop:
  2767  	MOVSS (AX), X1
  2768  	MULSS X0, X1
  2769  	ADDSS (DX), X1
  2770  	MOVSS X1, (DX)
  2771  	INCQ  DI
  2772  	LEAQ  (AX)(CX*4), AX
  2773  	LEAQ  (DX)(BX*4), DX
  2774  
  2775  check_limit:
  2776  	CMPQ SI, DI
  2777  	JHI  loop
  2778  	RET
  2779  
  2780  // func AmdAxpyPointerLoop_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2781  // Requires: SSE
  2782  TEXT ·AmdAxpyPointerLoop_V3A12(SB), NOSPLIT, $0-48
  2783  	MOVSS alpha+0(FP), X0
  2784  	MOVQ  xs+8(FP), AX
  2785  	MOVQ  incx+16(FP), CX
  2786  	MOVQ  ys+24(FP), DX
  2787  	MOVQ  incy+32(FP), BX
  2788  	MOVQ  n+40(FP), SI
  2789  	XORQ  DI, DI
  2790  	JMP   check_limit
  2791  	PCALIGN $0x08
  2792  	NOP
  2793  	NOP
  2794  	NOP
  2795  	NOP
  2796  
  2797  loop:
  2798  	MOVSS (AX), X1
  2799  	MULSS X0, X1
  2800  	ADDSS (DX), X1
  2801  	MOVSS X1, (DX)
  2802  	INCQ  DI
  2803  	LEAQ  (AX)(CX*4), AX
  2804  	LEAQ  (DX)(BX*4), DX
  2805  
  2806  check_limit:
  2807  	CMPQ SI, DI
  2808  	JHI  loop
  2809  	RET
  2810  
  2811  // func AmdAxpyPointerLoop_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2812  // Requires: SSE
  2813  TEXT ·AmdAxpyPointerLoop_V4A12(SB), NOSPLIT, $0-48
  2814  	MOVSS alpha+0(FP), X0
  2815  	MOVQ  xs+8(FP), AX
  2816  	MOVQ  incx+16(FP), CX
  2817  	MOVQ  ys+24(FP), DX
  2818  	MOVQ  incy+32(FP), BX
  2819  	MOVQ  n+40(FP), SI
  2820  	XORQ  DI, DI
  2821  	JMP   check_limit
  2822  	PCALIGN $0x08
  2823  	NOP
  2824  	NOP
  2825  	NOP
  2826  	NOP
  2827  
  2828  loop:
  2829  	MOVSS (AX), X1
  2830  	MULSS X0, X1
  2831  	ADDSS (DX), X1
  2832  	MOVSS X1, (DX)
  2833  	INCQ  DI
  2834  	LEAQ  (AX)(CX*4), AX
  2835  	LEAQ  (DX)(BX*4), DX
  2836  
  2837  check_limit:
  2838  	CMPQ SI, DI
  2839  	JHI  loop
  2840  	RET
  2841  
  2842  // func AmdAxpyPointerLoop_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2843  // Requires: SSE
  2844  TEXT ·AmdAxpyPointerLoop_V5A12(SB), NOSPLIT, $0-48
  2845  	MOVSS alpha+0(FP), X0
  2846  	MOVQ  xs+8(FP), AX
  2847  	MOVQ  incx+16(FP), CX
  2848  	MOVQ  ys+24(FP), DX
  2849  	MOVQ  incy+32(FP), BX
  2850  	MOVQ  n+40(FP), SI
  2851  	XORQ  DI, DI
  2852  	JMP   check_limit
  2853  	PCALIGN $0x08
  2854  	NOP
  2855  	NOP
  2856  	NOP
  2857  	NOP
  2858  
  2859  loop:
  2860  	MOVSS (AX), X1
  2861  	MULSS X0, X1
  2862  	ADDSS (DX), X1
  2863  	MOVSS X1, (DX)
  2864  	INCQ  DI
  2865  	LEAQ  (AX)(CX*4), AX
  2866  	LEAQ  (DX)(BX*4), DX
  2867  
  2868  check_limit:
  2869  	CMPQ SI, DI
  2870  	JHI  loop
  2871  	RET
  2872  
  2873  // func AmdAxpyPointerLoop_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2874  // Requires: SSE
  2875  TEXT ·AmdAxpyPointerLoop_V0A13(SB), NOSPLIT, $0-48
  2876  	MOVSS alpha+0(FP), X0
  2877  	MOVQ  xs+8(FP), AX
  2878  	MOVQ  incx+16(FP), CX
  2879  	MOVQ  ys+24(FP), DX
  2880  	MOVQ  incy+32(FP), BX
  2881  	MOVQ  n+40(FP), SI
  2882  	XORQ  DI, DI
  2883  	JMP   check_limit
  2884  	PCALIGN $0x08
  2885  	NOP
  2886  	NOP
  2887  	NOP
  2888  	NOP
  2889  	NOP
  2890  
  2891  loop:
  2892  	MOVSS (AX), X1
  2893  	MULSS X0, X1
  2894  	ADDSS (DX), X1
  2895  	MOVSS X1, (DX)
  2896  	INCQ  DI
  2897  	LEAQ  (AX)(CX*4), AX
  2898  	LEAQ  (DX)(BX*4), DX
  2899  
  2900  check_limit:
  2901  	CMPQ SI, DI
  2902  	JHI  loop
  2903  	RET
  2904  
  2905  // func AmdAxpyPointerLoop_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2906  // Requires: SSE
  2907  TEXT ·AmdAxpyPointerLoop_V1A13(SB), NOSPLIT, $0-48
  2908  	MOVSS alpha+0(FP), X0
  2909  	MOVQ  xs+8(FP), AX
  2910  	MOVQ  incx+16(FP), CX
  2911  	MOVQ  ys+24(FP), DX
  2912  	MOVQ  incy+32(FP), BX
  2913  	MOVQ  n+40(FP), SI
  2914  	XORQ  DI, DI
  2915  	JMP   check_limit
  2916  	PCALIGN $0x08
  2917  	NOP
  2918  	NOP
  2919  	NOP
  2920  	NOP
  2921  	NOP
  2922  
  2923  loop:
  2924  	MOVSS (AX), X1
  2925  	MULSS X0, X1
  2926  	ADDSS (DX), X1
  2927  	MOVSS X1, (DX)
  2928  	INCQ  DI
  2929  	LEAQ  (AX)(CX*4), AX
  2930  	LEAQ  (DX)(BX*4), DX
  2931  
  2932  check_limit:
  2933  	CMPQ SI, DI
  2934  	JHI  loop
  2935  	RET
  2936  
  2937  // func AmdAxpyPointerLoop_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2938  // Requires: SSE
  2939  TEXT ·AmdAxpyPointerLoop_V2A13(SB), NOSPLIT, $0-48
  2940  	MOVSS alpha+0(FP), X0
  2941  	MOVQ  xs+8(FP), AX
  2942  	MOVQ  incx+16(FP), CX
  2943  	MOVQ  ys+24(FP), DX
  2944  	MOVQ  incy+32(FP), BX
  2945  	MOVQ  n+40(FP), SI
  2946  	XORQ  DI, DI
  2947  	JMP   check_limit
  2948  	PCALIGN $0x08
  2949  	NOP
  2950  	NOP
  2951  	NOP
  2952  	NOP
  2953  	NOP
  2954  
  2955  loop:
  2956  	MOVSS (AX), X1
  2957  	MULSS X0, X1
  2958  	ADDSS (DX), X1
  2959  	MOVSS X1, (DX)
  2960  	INCQ  DI
  2961  	LEAQ  (AX)(CX*4), AX
  2962  	LEAQ  (DX)(BX*4), DX
  2963  
  2964  check_limit:
  2965  	CMPQ SI, DI
  2966  	JHI  loop
  2967  	RET
  2968  
  2969  // func AmdAxpyPointerLoop_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  2970  // Requires: SSE
  2971  TEXT ·AmdAxpyPointerLoop_V3A13(SB), NOSPLIT, $0-48
  2972  	MOVSS alpha+0(FP), X0
  2973  	MOVQ  xs+8(FP), AX
  2974  	MOVQ  incx+16(FP), CX
  2975  	MOVQ  ys+24(FP), DX
  2976  	MOVQ  incy+32(FP), BX
  2977  	MOVQ  n+40(FP), SI
  2978  	XORQ  DI, DI
  2979  	JMP   check_limit
  2980  	PCALIGN $0x08
  2981  	NOP
  2982  	NOP
  2983  	NOP
  2984  	NOP
  2985  	NOP
  2986  
  2987  loop:
  2988  	MOVSS (AX), X1
  2989  	MULSS X0, X1
  2990  	ADDSS (DX), X1
  2991  	MOVSS X1, (DX)
  2992  	INCQ  DI
  2993  	LEAQ  (AX)(CX*4), AX
  2994  	LEAQ  (DX)(BX*4), DX
  2995  
  2996  check_limit:
  2997  	CMPQ SI, DI
  2998  	JHI  loop
  2999  	RET
  3000  
  3001  // func AmdAxpyPointerLoop_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3002  // Requires: SSE
  3003  TEXT ·AmdAxpyPointerLoop_V4A13(SB), NOSPLIT, $0-48
  3004  	MOVSS alpha+0(FP), X0
  3005  	MOVQ  xs+8(FP), AX
  3006  	MOVQ  incx+16(FP), CX
  3007  	MOVQ  ys+24(FP), DX
  3008  	MOVQ  incy+32(FP), BX
  3009  	MOVQ  n+40(FP), SI
  3010  	XORQ  DI, DI
  3011  	JMP   check_limit
  3012  	PCALIGN $0x08
  3013  	NOP
  3014  	NOP
  3015  	NOP
  3016  	NOP
  3017  	NOP
  3018  
  3019  loop:
  3020  	MOVSS (AX), X1
  3021  	MULSS X0, X1
  3022  	ADDSS (DX), X1
  3023  	MOVSS X1, (DX)
  3024  	INCQ  DI
  3025  	LEAQ  (AX)(CX*4), AX
  3026  	LEAQ  (DX)(BX*4), DX
  3027  
  3028  check_limit:
  3029  	CMPQ SI, DI
  3030  	JHI  loop
  3031  	RET
  3032  
  3033  // func AmdAxpyPointerLoop_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3034  // Requires: SSE
  3035  TEXT ·AmdAxpyPointerLoop_V5A13(SB), NOSPLIT, $0-48
  3036  	MOVSS alpha+0(FP), X0
  3037  	MOVQ  xs+8(FP), AX
  3038  	MOVQ  incx+16(FP), CX
  3039  	MOVQ  ys+24(FP), DX
  3040  	MOVQ  incy+32(FP), BX
  3041  	MOVQ  n+40(FP), SI
  3042  	XORQ  DI, DI
  3043  	JMP   check_limit
  3044  	PCALIGN $0x08
  3045  	NOP
  3046  	NOP
  3047  	NOP
  3048  	NOP
  3049  	NOP
  3050  
  3051  loop:
  3052  	MOVSS (AX), X1
  3053  	MULSS X0, X1
  3054  	ADDSS (DX), X1
  3055  	MOVSS X1, (DX)
  3056  	INCQ  DI
  3057  	LEAQ  (AX)(CX*4), AX
  3058  	LEAQ  (DX)(BX*4), DX
  3059  
  3060  check_limit:
  3061  	CMPQ SI, DI
  3062  	JHI  loop
  3063  	RET
  3064  
  3065  // func AmdAxpyPointerLoop_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3066  // Requires: SSE
  3067  TEXT ·AmdAxpyPointerLoop_V0A14(SB), NOSPLIT, $0-48
  3068  	MOVSS alpha+0(FP), X0
  3069  	MOVQ  xs+8(FP), AX
  3070  	MOVQ  incx+16(FP), CX
  3071  	MOVQ  ys+24(FP), DX
  3072  	MOVQ  incy+32(FP), BX
  3073  	MOVQ  n+40(FP), SI
  3074  	XORQ  DI, DI
  3075  	JMP   check_limit
  3076  	PCALIGN $0x08
  3077  	NOP
  3078  	NOP
  3079  	NOP
  3080  	NOP
  3081  	NOP
  3082  	NOP
  3083  
  3084  loop:
  3085  	MOVSS (AX), X1
  3086  	MULSS X0, X1
  3087  	ADDSS (DX), X1
  3088  	MOVSS X1, (DX)
  3089  	INCQ  DI
  3090  	LEAQ  (AX)(CX*4), AX
  3091  	LEAQ  (DX)(BX*4), DX
  3092  
  3093  check_limit:
  3094  	CMPQ SI, DI
  3095  	JHI  loop
  3096  	RET
  3097  
  3098  // func AmdAxpyPointerLoop_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3099  // Requires: SSE
  3100  TEXT ·AmdAxpyPointerLoop_V1A14(SB), NOSPLIT, $0-48
  3101  	MOVSS alpha+0(FP), X0
  3102  	MOVQ  xs+8(FP), AX
  3103  	MOVQ  incx+16(FP), CX
  3104  	MOVQ  ys+24(FP), DX
  3105  	MOVQ  incy+32(FP), BX
  3106  	MOVQ  n+40(FP), SI
  3107  	XORQ  DI, DI
  3108  	JMP   check_limit
  3109  	PCALIGN $0x08
  3110  	NOP
  3111  	NOP
  3112  	NOP
  3113  	NOP
  3114  	NOP
  3115  	NOP
  3116  
  3117  loop:
  3118  	MOVSS (AX), X1
  3119  	MULSS X0, X1
  3120  	ADDSS (DX), X1
  3121  	MOVSS X1, (DX)
  3122  	INCQ  DI
  3123  	LEAQ  (AX)(CX*4), AX
  3124  	LEAQ  (DX)(BX*4), DX
  3125  
  3126  check_limit:
  3127  	CMPQ SI, DI
  3128  	JHI  loop
  3129  	RET
  3130  
  3131  // func AmdAxpyPointerLoop_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3132  // Requires: SSE
  3133  TEXT ·AmdAxpyPointerLoop_V2A14(SB), NOSPLIT, $0-48
  3134  	MOVSS alpha+0(FP), X0
  3135  	MOVQ  xs+8(FP), AX
  3136  	MOVQ  incx+16(FP), CX
  3137  	MOVQ  ys+24(FP), DX
  3138  	MOVQ  incy+32(FP), BX
  3139  	MOVQ  n+40(FP), SI
  3140  	XORQ  DI, DI
  3141  	JMP   check_limit
  3142  	PCALIGN $0x08
  3143  	NOP
  3144  	NOP
  3145  	NOP
  3146  	NOP
  3147  	NOP
  3148  	NOP
  3149  
  3150  loop:
  3151  	MOVSS (AX), X1
  3152  	MULSS X0, X1
  3153  	ADDSS (DX), X1
  3154  	MOVSS X1, (DX)
  3155  	INCQ  DI
  3156  	LEAQ  (AX)(CX*4), AX
  3157  	LEAQ  (DX)(BX*4), DX
  3158  
  3159  check_limit:
  3160  	CMPQ SI, DI
  3161  	JHI  loop
  3162  	RET
  3163  
  3164  // func AmdAxpyPointerLoop_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3165  // Requires: SSE
  3166  TEXT ·AmdAxpyPointerLoop_V3A14(SB), NOSPLIT, $0-48
  3167  	MOVSS alpha+0(FP), X0
  3168  	MOVQ  xs+8(FP), AX
  3169  	MOVQ  incx+16(FP), CX
  3170  	MOVQ  ys+24(FP), DX
  3171  	MOVQ  incy+32(FP), BX
  3172  	MOVQ  n+40(FP), SI
  3173  	XORQ  DI, DI
  3174  	JMP   check_limit
  3175  	PCALIGN $0x08
  3176  	NOP
  3177  	NOP
  3178  	NOP
  3179  	NOP
  3180  	NOP
  3181  	NOP
  3182  
  3183  loop:
  3184  	MOVSS (AX), X1
  3185  	MULSS X0, X1
  3186  	ADDSS (DX), X1
  3187  	MOVSS X1, (DX)
  3188  	INCQ  DI
  3189  	LEAQ  (AX)(CX*4), AX
  3190  	LEAQ  (DX)(BX*4), DX
  3191  
  3192  check_limit:
  3193  	CMPQ SI, DI
  3194  	JHI  loop
  3195  	RET
  3196  
  3197  // func AmdAxpyPointerLoop_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3198  // Requires: SSE
  3199  TEXT ·AmdAxpyPointerLoop_V4A14(SB), NOSPLIT, $0-48
  3200  	MOVSS alpha+0(FP), X0
  3201  	MOVQ  xs+8(FP), AX
  3202  	MOVQ  incx+16(FP), CX
  3203  	MOVQ  ys+24(FP), DX
  3204  	MOVQ  incy+32(FP), BX
  3205  	MOVQ  n+40(FP), SI
  3206  	XORQ  DI, DI
  3207  	JMP   check_limit
  3208  	PCALIGN $0x08
  3209  	NOP
  3210  	NOP
  3211  	NOP
  3212  	NOP
  3213  	NOP
  3214  	NOP
  3215  
  3216  loop:
  3217  	MOVSS (AX), X1
  3218  	MULSS X0, X1
  3219  	ADDSS (DX), X1
  3220  	MOVSS X1, (DX)
  3221  	INCQ  DI
  3222  	LEAQ  (AX)(CX*4), AX
  3223  	LEAQ  (DX)(BX*4), DX
  3224  
  3225  check_limit:
  3226  	CMPQ SI, DI
  3227  	JHI  loop
  3228  	RET
  3229  
  3230  // func AmdAxpyPointerLoop_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3231  // Requires: SSE
  3232  TEXT ·AmdAxpyPointerLoop_V5A14(SB), NOSPLIT, $0-48
  3233  	MOVSS alpha+0(FP), X0
  3234  	MOVQ  xs+8(FP), AX
  3235  	MOVQ  incx+16(FP), CX
  3236  	MOVQ  ys+24(FP), DX
  3237  	MOVQ  incy+32(FP), BX
  3238  	MOVQ  n+40(FP), SI
  3239  	XORQ  DI, DI
  3240  	JMP   check_limit
  3241  	PCALIGN $0x08
  3242  	NOP
  3243  	NOP
  3244  	NOP
  3245  	NOP
  3246  	NOP
  3247  	NOP
  3248  
  3249  loop:
  3250  	MOVSS (AX), X1
  3251  	MULSS X0, X1
  3252  	ADDSS (DX), X1
  3253  	MOVSS X1, (DX)
  3254  	INCQ  DI
  3255  	LEAQ  (AX)(CX*4), AX
  3256  	LEAQ  (DX)(BX*4), DX
  3257  
  3258  check_limit:
  3259  	CMPQ SI, DI
  3260  	JHI  loop
  3261  	RET
  3262  
  3263  // func AmdAxpyPointerLoop_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3264  // Requires: SSE
  3265  TEXT ·AmdAxpyPointerLoop_V0A15(SB), NOSPLIT, $0-48
  3266  	MOVSS alpha+0(FP), X0
  3267  	MOVQ  xs+8(FP), AX
  3268  	MOVQ  incx+16(FP), CX
  3269  	MOVQ  ys+24(FP), DX
  3270  	MOVQ  incy+32(FP), BX
  3271  	MOVQ  n+40(FP), SI
  3272  	XORQ  DI, DI
  3273  	JMP   check_limit
  3274  	PCALIGN $0x08
  3275  	NOP
  3276  	NOP
  3277  	NOP
  3278  	NOP
  3279  	NOP
  3280  	NOP
  3281  	NOP
  3282  
  3283  loop:
  3284  	MOVSS (AX), X1
  3285  	MULSS X0, X1
  3286  	ADDSS (DX), X1
  3287  	MOVSS X1, (DX)
  3288  	INCQ  DI
  3289  	LEAQ  (AX)(CX*4), AX
  3290  	LEAQ  (DX)(BX*4), DX
  3291  
  3292  check_limit:
  3293  	CMPQ SI, DI
  3294  	JHI  loop
  3295  	RET
  3296  
  3297  // func AmdAxpyPointerLoop_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3298  // Requires: SSE
  3299  TEXT ·AmdAxpyPointerLoop_V1A15(SB), NOSPLIT, $0-48
  3300  	MOVSS alpha+0(FP), X0
  3301  	MOVQ  xs+8(FP), AX
  3302  	MOVQ  incx+16(FP), CX
  3303  	MOVQ  ys+24(FP), DX
  3304  	MOVQ  incy+32(FP), BX
  3305  	MOVQ  n+40(FP), SI
  3306  	XORQ  DI, DI
  3307  	JMP   check_limit
  3308  	PCALIGN $0x08
  3309  	NOP
  3310  	NOP
  3311  	NOP
  3312  	NOP
  3313  	NOP
  3314  	NOP
  3315  	NOP
  3316  
  3317  loop:
  3318  	MOVSS (AX), X1
  3319  	MULSS X0, X1
  3320  	ADDSS (DX), X1
  3321  	MOVSS X1, (DX)
  3322  	INCQ  DI
  3323  	LEAQ  (AX)(CX*4), AX
  3324  	LEAQ  (DX)(BX*4), DX
  3325  
  3326  check_limit:
  3327  	CMPQ SI, DI
  3328  	JHI  loop
  3329  	RET
  3330  
  3331  // func AmdAxpyPointerLoop_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3332  // Requires: SSE
  3333  TEXT ·AmdAxpyPointerLoop_V2A15(SB), NOSPLIT, $0-48
  3334  	MOVSS alpha+0(FP), X0
  3335  	MOVQ  xs+8(FP), AX
  3336  	MOVQ  incx+16(FP), CX
  3337  	MOVQ  ys+24(FP), DX
  3338  	MOVQ  incy+32(FP), BX
  3339  	MOVQ  n+40(FP), SI
  3340  	XORQ  DI, DI
  3341  	JMP   check_limit
  3342  	PCALIGN $0x08
  3343  	NOP
  3344  	NOP
  3345  	NOP
  3346  	NOP
  3347  	NOP
  3348  	NOP
  3349  	NOP
  3350  
  3351  loop:
  3352  	MOVSS (AX), X1
  3353  	MULSS X0, X1
  3354  	ADDSS (DX), X1
  3355  	MOVSS X1, (DX)
  3356  	INCQ  DI
  3357  	LEAQ  (AX)(CX*4), AX
  3358  	LEAQ  (DX)(BX*4), DX
  3359  
  3360  check_limit:
  3361  	CMPQ SI, DI
  3362  	JHI  loop
  3363  	RET
  3364  
  3365  // func AmdAxpyPointerLoop_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3366  // Requires: SSE
  3367  TEXT ·AmdAxpyPointerLoop_V3A15(SB), NOSPLIT, $0-48
  3368  	MOVSS alpha+0(FP), X0
  3369  	MOVQ  xs+8(FP), AX
  3370  	MOVQ  incx+16(FP), CX
  3371  	MOVQ  ys+24(FP), DX
  3372  	MOVQ  incy+32(FP), BX
  3373  	MOVQ  n+40(FP), SI
  3374  	XORQ  DI, DI
  3375  	JMP   check_limit
  3376  	PCALIGN $0x08
  3377  	NOP
  3378  	NOP
  3379  	NOP
  3380  	NOP
  3381  	NOP
  3382  	NOP
  3383  	NOP
  3384  
  3385  loop:
  3386  	MOVSS (AX), X1
  3387  	MULSS X0, X1
  3388  	ADDSS (DX), X1
  3389  	MOVSS X1, (DX)
  3390  	INCQ  DI
  3391  	LEAQ  (AX)(CX*4), AX
  3392  	LEAQ  (DX)(BX*4), DX
  3393  
  3394  check_limit:
  3395  	CMPQ SI, DI
  3396  	JHI  loop
  3397  	RET
  3398  
  3399  // func AmdAxpyPointerLoop_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3400  // Requires: SSE
  3401  TEXT ·AmdAxpyPointerLoop_V4A15(SB), NOSPLIT, $0-48
  3402  	MOVSS alpha+0(FP), X0
  3403  	MOVQ  xs+8(FP), AX
  3404  	MOVQ  incx+16(FP), CX
  3405  	MOVQ  ys+24(FP), DX
  3406  	MOVQ  incy+32(FP), BX
  3407  	MOVQ  n+40(FP), SI
  3408  	XORQ  DI, DI
  3409  	JMP   check_limit
  3410  	PCALIGN $0x08
  3411  	NOP
  3412  	NOP
  3413  	NOP
  3414  	NOP
  3415  	NOP
  3416  	NOP
  3417  	NOP
  3418  
  3419  loop:
  3420  	MOVSS (AX), X1
  3421  	MULSS X0, X1
  3422  	ADDSS (DX), X1
  3423  	MOVSS X1, (DX)
  3424  	INCQ  DI
  3425  	LEAQ  (AX)(CX*4), AX
  3426  	LEAQ  (DX)(BX*4), DX
  3427  
  3428  check_limit:
  3429  	CMPQ SI, DI
  3430  	JHI  loop
  3431  	RET
  3432  
  3433  // func AmdAxpyPointerLoop_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3434  // Requires: SSE
  3435  TEXT ·AmdAxpyPointerLoop_V5A15(SB), NOSPLIT, $0-48
  3436  	MOVSS alpha+0(FP), X0
  3437  	MOVQ  xs+8(FP), AX
  3438  	MOVQ  incx+16(FP), CX
  3439  	MOVQ  ys+24(FP), DX
  3440  	MOVQ  incy+32(FP), BX
  3441  	MOVQ  n+40(FP), SI
  3442  	XORQ  DI, DI
  3443  	JMP   check_limit
  3444  	PCALIGN $0x08
  3445  	NOP
  3446  	NOP
  3447  	NOP
  3448  	NOP
  3449  	NOP
  3450  	NOP
  3451  	NOP
  3452  
  3453  loop:
  3454  	MOVSS (AX), X1
  3455  	MULSS X0, X1
  3456  	ADDSS (DX), X1
  3457  	MOVSS X1, (DX)
  3458  	INCQ  DI
  3459  	LEAQ  (AX)(CX*4), AX
  3460  	LEAQ  (DX)(BX*4), DX
  3461  
  3462  check_limit:
  3463  	CMPQ SI, DI
  3464  	JHI  loop
  3465  	RET
  3466  
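// Editorial note: for the A16 variants the generator switches from PCALIGN $0x08
// plus NOP padding to a single PCALIGN $0x10, i.e. the loop label is placed on a
// 16-byte boundary with no extra one-byte NOPs.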
  3467  // func AmdAxpyPointerLoop_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3468  // Requires: SSE
  3469  TEXT ·AmdAxpyPointerLoop_V0A16(SB), NOSPLIT, $0-48
  3470  	MOVSS alpha+0(FP), X0
  3471  	MOVQ  xs+8(FP), AX
  3472  	MOVQ  incx+16(FP), CX
  3473  	MOVQ  ys+24(FP), DX
  3474  	MOVQ  incy+32(FP), BX
  3475  	MOVQ  n+40(FP), SI
  3476  	XORQ  DI, DI
  3477  	JMP   check_limit
  3478  	PCALIGN $0x10
  3479  
  3480  loop:
  3481  	MOVSS (AX), X1
  3482  	MULSS X0, X1
  3483  	ADDSS (DX), X1
  3484  	MOVSS X1, (DX)
  3485  	INCQ  DI
  3486  	LEAQ  (AX)(CX*4), AX
  3487  	LEAQ  (DX)(BX*4), DX
  3488  
  3489  check_limit:
  3490  	CMPQ SI, DI
  3491  	JHI  loop
  3492  	RET
  3493  
  3494  // func AmdAxpyPointerLoop_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3495  // Requires: SSE
  3496  TEXT ·AmdAxpyPointerLoop_V1A16(SB), NOSPLIT, $0-48
  3497  	MOVSS alpha+0(FP), X0
  3498  	MOVQ  xs+8(FP), AX
  3499  	MOVQ  incx+16(FP), CX
  3500  	MOVQ  ys+24(FP), DX
  3501  	MOVQ  incy+32(FP), BX
  3502  	MOVQ  n+40(FP), SI
  3503  	XORQ  DI, DI
  3504  	JMP   check_limit
  3505  	PCALIGN $0x10
  3506  
  3507  loop:
  3508  	MOVSS (AX), X1
  3509  	MULSS X0, X1
  3510  	ADDSS (DX), X1
  3511  	MOVSS X1, (DX)
  3512  	INCQ  DI
  3513  	LEAQ  (AX)(CX*4), AX
  3514  	LEAQ  (DX)(BX*4), DX
  3515  
  3516  check_limit:
  3517  	CMPQ SI, DI
  3518  	JHI  loop
  3519  	RET
  3520  
  3521  // func AmdAxpyPointerLoop_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3522  // Requires: SSE
  3523  TEXT ·AmdAxpyPointerLoop_V2A16(SB), NOSPLIT, $0-48
  3524  	MOVSS alpha+0(FP), X0
  3525  	MOVQ  xs+8(FP), AX
  3526  	MOVQ  incx+16(FP), CX
  3527  	MOVQ  ys+24(FP), DX
  3528  	MOVQ  incy+32(FP), BX
  3529  	MOVQ  n+40(FP), SI
  3530  	XORQ  DI, DI
  3531  	JMP   check_limit
  3532  	PCALIGN $0x10
  3533  
  3534  loop:
  3535  	MOVSS (AX), X1
  3536  	MULSS X0, X1
  3537  	ADDSS (DX), X1
  3538  	MOVSS X1, (DX)
  3539  	INCQ  DI
  3540  	LEAQ  (AX)(CX*4), AX
  3541  	LEAQ  (DX)(BX*4), DX
  3542  
  3543  check_limit:
  3544  	CMPQ SI, DI
  3545  	JHI  loop
  3546  	RET
  3547  
  3548  // func AmdAxpyPointerLoop_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3549  // Requires: SSE
  3550  TEXT ·AmdAxpyPointerLoop_V3A16(SB), NOSPLIT, $0-48
  3551  	MOVSS alpha+0(FP), X0
  3552  	MOVQ  xs+8(FP), AX
  3553  	MOVQ  incx+16(FP), CX
  3554  	MOVQ  ys+24(FP), DX
  3555  	MOVQ  incy+32(FP), BX
  3556  	MOVQ  n+40(FP), SI
  3557  	XORQ  DI, DI
  3558  	JMP   check_limit
  3559  	PCALIGN $0x10
  3560  
  3561  loop:
  3562  	MOVSS (AX), X1
  3563  	MULSS X0, X1
  3564  	ADDSS (DX), X1
  3565  	MOVSS X1, (DX)
  3566  	INCQ  DI
  3567  	LEAQ  (AX)(CX*4), AX
  3568  	LEAQ  (DX)(BX*4), DX
  3569  
  3570  check_limit:
  3571  	CMPQ SI, DI
  3572  	JHI  loop
  3573  	RET
  3574  
  3575  // func AmdAxpyPointerLoop_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3576  // Requires: SSE
  3577  TEXT ·AmdAxpyPointerLoop_V4A16(SB), NOSPLIT, $0-48
  3578  	MOVSS alpha+0(FP), X0
  3579  	MOVQ  xs+8(FP), AX
  3580  	MOVQ  incx+16(FP), CX
  3581  	MOVQ  ys+24(FP), DX
  3582  	MOVQ  incy+32(FP), BX
  3583  	MOVQ  n+40(FP), SI
  3584  	XORQ  DI, DI
  3585  	JMP   check_limit
  3586  	PCALIGN $0x10
  3587  
  3588  loop:
  3589  	MOVSS (AX), X1
  3590  	MULSS X0, X1
  3591  	ADDSS (DX), X1
  3592  	MOVSS X1, (DX)
  3593  	INCQ  DI
  3594  	LEAQ  (AX)(CX*4), AX
  3595  	LEAQ  (DX)(BX*4), DX
  3596  
  3597  check_limit:
  3598  	CMPQ SI, DI
  3599  	JHI  loop
  3600  	RET
  3601  
  3602  // func AmdAxpyPointerLoop_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3603  // Requires: SSE
  3604  TEXT ·AmdAxpyPointerLoop_V5A16(SB), NOSPLIT, $0-48
  3605  	MOVSS alpha+0(FP), X0
  3606  	MOVQ  xs+8(FP), AX
  3607  	MOVQ  incx+16(FP), CX
  3608  	MOVQ  ys+24(FP), DX
  3609  	MOVQ  incy+32(FP), BX
  3610  	MOVQ  n+40(FP), SI
  3611  	XORQ  DI, DI
  3612  	JMP   check_limit
  3613  	PCALIGN $0x10
  3614  
  3615  loop:
  3616  	MOVSS (AX), X1
  3617  	MULSS X0, X1
  3618  	ADDSS (DX), X1
  3619  	MOVSS X1, (DX)
  3620  	INCQ  DI
  3621  	LEAQ  (AX)(CX*4), AX
  3622  	LEAQ  (DX)(BX*4), DX
  3623  
  3624  check_limit:
  3625  	CMPQ SI, DI
  3626  	JHI  loop
  3627  	RET
  3628  
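// Editorial note: the AmdAxpyPointerLoopX_* variants below drop the separate DI
// counter; they decrement the remaining count in SI inside the loop body and
// compare it against zero (CMPQ SI, $0x00 / JHI), so n itself is the loop-carried
// value instead of an index, freeing one register.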
  3629  // func AmdAxpyPointerLoopX_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3630  // Requires: SSE
  3631  TEXT ·AmdAxpyPointerLoopX_V0A0(SB), NOSPLIT, $0-48
  3632  	MOVSS alpha+0(FP), X0
  3633  	MOVQ  xs+8(FP), AX
  3634  	MOVQ  incx+16(FP), CX
  3635  	MOVQ  ys+24(FP), DX
  3636  	MOVQ  incy+32(FP), BX
  3637  	MOVQ  n+40(FP), SI
  3638  	JMP   check_limit
  3639  
  3640  loop:
  3641  	MOVSS (AX), X1
  3642  	MULSS X0, X1
  3643  	ADDSS (DX), X1
  3644  	MOVSS X1, (DX)
  3645  	DECQ  SI
  3646  	LEAQ  (AX)(CX*4), AX
  3647  	LEAQ  (DX)(BX*4), DX
  3648  
  3649  check_limit:
  3650  	CMPQ SI, $0x00
  3651  	JHI  loop
  3652  	RET
  3653  
  3654  // func AmdAxpyPointerLoopX_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3655  // Requires: SSE
  3656  TEXT ·AmdAxpyPointerLoopX_V1A0(SB), NOSPLIT, $0-48
  3657  	MOVSS alpha+0(FP), X0
  3658  	MOVQ  xs+8(FP), AX
  3659  	MOVQ  incx+16(FP), CX
  3660  	MOVQ  ys+24(FP), DX
  3661  	MOVQ  incy+32(FP), BX
  3662  	MOVQ  n+40(FP), SI
  3663  	JMP   check_limit
  3664  
  3665  loop:
  3666  	MOVSS (AX), X1
  3667  	MULSS X0, X1
  3668  	ADDSS (DX), X1
  3669  	MOVSS X1, (DX)
  3670  	DECQ  SI
  3671  	LEAQ  (AX)(CX*4), AX
  3672  	LEAQ  (DX)(BX*4), DX
  3673  
  3674  check_limit:
  3675  	CMPQ SI, $0x00
  3676  	JHI  loop
  3677  	RET
  3678  
  3679  // func AmdAxpyPointerLoopX_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3680  // Requires: SSE
  3681  TEXT ·AmdAxpyPointerLoopX_V2A0(SB), NOSPLIT, $0-48
  3682  	MOVSS alpha+0(FP), X0
  3683  	MOVQ  xs+8(FP), AX
  3684  	MOVQ  incx+16(FP), CX
  3685  	MOVQ  ys+24(FP), DX
  3686  	MOVQ  incy+32(FP), BX
  3687  	MOVQ  n+40(FP), SI
  3688  	JMP   check_limit
  3689  
  3690  loop:
  3691  	MOVSS (AX), X1
  3692  	MULSS X0, X1
  3693  	ADDSS (DX), X1
  3694  	MOVSS X1, (DX)
  3695  	DECQ  SI
  3696  	LEAQ  (AX)(CX*4), AX
  3697  	LEAQ  (DX)(BX*4), DX
  3698  
  3699  check_limit:
  3700  	CMPQ SI, $0x00
  3701  	JHI  loop
  3702  	RET
  3703  
  3704  // func AmdAxpyPointerLoopX_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3705  // Requires: SSE
  3706  TEXT ·AmdAxpyPointerLoopX_V3A0(SB), NOSPLIT, $0-48
  3707  	MOVSS alpha+0(FP), X0
  3708  	MOVQ  xs+8(FP), AX
  3709  	MOVQ  incx+16(FP), CX
  3710  	MOVQ  ys+24(FP), DX
  3711  	MOVQ  incy+32(FP), BX
  3712  	MOVQ  n+40(FP), SI
  3713  	JMP   check_limit
  3714  
  3715  loop:
  3716  	MOVSS (AX), X1
  3717  	MULSS X0, X1
  3718  	ADDSS (DX), X1
  3719  	MOVSS X1, (DX)
  3720  	DECQ  SI
  3721  	LEAQ  (AX)(CX*4), AX
  3722  	LEAQ  (DX)(BX*4), DX
  3723  
  3724  check_limit:
  3725  	CMPQ SI, $0x00
  3726  	JHI  loop
  3727  	RET
  3728  
  3729  // func AmdAxpyPointerLoopX_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3730  // Requires: SSE
  3731  TEXT ·AmdAxpyPointerLoopX_V4A0(SB), NOSPLIT, $0-48
  3732  	MOVSS alpha+0(FP), X0
  3733  	MOVQ  xs+8(FP), AX
  3734  	MOVQ  incx+16(FP), CX
  3735  	MOVQ  ys+24(FP), DX
  3736  	MOVQ  incy+32(FP), BX
  3737  	MOVQ  n+40(FP), SI
  3738  	JMP   check_limit
  3739  
  3740  loop:
  3741  	MOVSS (AX), X1
  3742  	MULSS X0, X1
  3743  	ADDSS (DX), X1
  3744  	MOVSS X1, (DX)
  3745  	DECQ  SI
  3746  	LEAQ  (AX)(CX*4), AX
  3747  	LEAQ  (DX)(BX*4), DX
  3748  
  3749  check_limit:
  3750  	CMPQ SI, $0x00
  3751  	JHI  loop
  3752  	RET
  3753  
  3754  // func AmdAxpyPointerLoopX_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3755  // Requires: SSE
  3756  TEXT ·AmdAxpyPointerLoopX_V5A0(SB), NOSPLIT, $0-48
  3757  	MOVSS alpha+0(FP), X0
  3758  	MOVQ  xs+8(FP), AX
  3759  	MOVQ  incx+16(FP), CX
  3760  	MOVQ  ys+24(FP), DX
  3761  	MOVQ  incy+32(FP), BX
  3762  	MOVQ  n+40(FP), SI
  3763  	JMP   check_limit
  3764  
  3765  loop:
  3766  	MOVSS (AX), X1
  3767  	MULSS X0, X1
  3768  	ADDSS (DX), X1
  3769  	MOVSS X1, (DX)
  3770  	DECQ  SI
  3771  	LEAQ  (AX)(CX*4), AX
  3772  	LEAQ  (DX)(BX*4), DX
  3773  
  3774  check_limit:
  3775  	CMPQ SI, $0x00
  3776  	JHI  loop
  3777  	RET
  3778  
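// Editorial note: from here the LoopX family repeats the same alignment-padding
// sweep as the counter-based family above — PCALIGN $0x08 followed by a growing
// number of NOPs (the A8, A9, A10, ... suffixes) ahead of the loop label.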
  3779  // func AmdAxpyPointerLoopX_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3780  // Requires: SSE
  3781  TEXT ·AmdAxpyPointerLoopX_V0A8(SB), NOSPLIT, $0-48
  3782  	MOVSS alpha+0(FP), X0
  3783  	MOVQ  xs+8(FP), AX
  3784  	MOVQ  incx+16(FP), CX
  3785  	MOVQ  ys+24(FP), DX
  3786  	MOVQ  incy+32(FP), BX
  3787  	MOVQ  n+40(FP), SI
  3788  	JMP   check_limit
  3789  	PCALIGN $0x08
  3790  
  3791  loop:
  3792  	MOVSS (AX), X1
  3793  	MULSS X0, X1
  3794  	ADDSS (DX), X1
  3795  	MOVSS X1, (DX)
  3796  	DECQ  SI
  3797  	LEAQ  (AX)(CX*4), AX
  3798  	LEAQ  (DX)(BX*4), DX
  3799  
  3800  check_limit:
  3801  	CMPQ SI, $0x00
  3802  	JHI  loop
  3803  	RET
  3804  
  3805  // func AmdAxpyPointerLoopX_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3806  // Requires: SSE
  3807  TEXT ·AmdAxpyPointerLoopX_V1A8(SB), NOSPLIT, $0-48
  3808  	MOVSS alpha+0(FP), X0
  3809  	MOVQ  xs+8(FP), AX
  3810  	MOVQ  incx+16(FP), CX
  3811  	MOVQ  ys+24(FP), DX
  3812  	MOVQ  incy+32(FP), BX
  3813  	MOVQ  n+40(FP), SI
  3814  	JMP   check_limit
  3815  	PCALIGN $0x08
  3816  
  3817  loop:
  3818  	MOVSS (AX), X1
  3819  	MULSS X0, X1
  3820  	ADDSS (DX), X1
  3821  	MOVSS X1, (DX)
  3822  	DECQ  SI
  3823  	LEAQ  (AX)(CX*4), AX
  3824  	LEAQ  (DX)(BX*4), DX
  3825  
  3826  check_limit:
  3827  	CMPQ SI, $0x00
  3828  	JHI  loop
  3829  	RET
  3830  
  3831  // func AmdAxpyPointerLoopX_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3832  // Requires: SSE
  3833  TEXT ·AmdAxpyPointerLoopX_V2A8(SB), NOSPLIT, $0-48
  3834  	MOVSS alpha+0(FP), X0
  3835  	MOVQ  xs+8(FP), AX
  3836  	MOVQ  incx+16(FP), CX
  3837  	MOVQ  ys+24(FP), DX
  3838  	MOVQ  incy+32(FP), BX
  3839  	MOVQ  n+40(FP), SI
  3840  	JMP   check_limit
  3841  	PCALIGN $0x08
  3842  
  3843  loop:
  3844  	MOVSS (AX), X1
  3845  	MULSS X0, X1
  3846  	ADDSS (DX), X1
  3847  	MOVSS X1, (DX)
  3848  	DECQ  SI
  3849  	LEAQ  (AX)(CX*4), AX
  3850  	LEAQ  (DX)(BX*4), DX
  3851  
  3852  check_limit:
  3853  	CMPQ SI, $0x00
  3854  	JHI  loop
  3855  	RET
  3856  
  3857  // func AmdAxpyPointerLoopX_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3858  // Requires: SSE
  3859  TEXT ·AmdAxpyPointerLoopX_V3A8(SB), NOSPLIT, $0-48
  3860  	MOVSS alpha+0(FP), X0
  3861  	MOVQ  xs+8(FP), AX
  3862  	MOVQ  incx+16(FP), CX
  3863  	MOVQ  ys+24(FP), DX
  3864  	MOVQ  incy+32(FP), BX
  3865  	MOVQ  n+40(FP), SI
  3866  	JMP   check_limit
  3867  	PCALIGN $0x08
  3868  
  3869  loop:
  3870  	MOVSS (AX), X1
  3871  	MULSS X0, X1
  3872  	ADDSS (DX), X1
  3873  	MOVSS X1, (DX)
  3874  	DECQ  SI
  3875  	LEAQ  (AX)(CX*4), AX
  3876  	LEAQ  (DX)(BX*4), DX
  3877  
  3878  check_limit:
  3879  	CMPQ SI, $0x00
  3880  	JHI  loop
  3881  	RET
  3882  
  3883  // func AmdAxpyPointerLoopX_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3884  // Requires: SSE
  3885  TEXT ·AmdAxpyPointerLoopX_V4A8(SB), NOSPLIT, $0-48
  3886  	MOVSS alpha+0(FP), X0
  3887  	MOVQ  xs+8(FP), AX
  3888  	MOVQ  incx+16(FP), CX
  3889  	MOVQ  ys+24(FP), DX
  3890  	MOVQ  incy+32(FP), BX
  3891  	MOVQ  n+40(FP), SI
  3892  	JMP   check_limit
  3893  	PCALIGN $0x08
  3894  
  3895  loop:
  3896  	MOVSS (AX), X1
  3897  	MULSS X0, X1
  3898  	ADDSS (DX), X1
  3899  	MOVSS X1, (DX)
  3900  	DECQ  SI
  3901  	LEAQ  (AX)(CX*4), AX
  3902  	LEAQ  (DX)(BX*4), DX
  3903  
  3904  check_limit:
  3905  	CMPQ SI, $0x00
  3906  	JHI  loop
  3907  	RET
  3908  
  3909  // func AmdAxpyPointerLoopX_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3910  // Requires: SSE
  3911  TEXT ·AmdAxpyPointerLoopX_V5A8(SB), NOSPLIT, $0-48
  3912  	MOVSS alpha+0(FP), X0
  3913  	MOVQ  xs+8(FP), AX
  3914  	MOVQ  incx+16(FP), CX
  3915  	MOVQ  ys+24(FP), DX
  3916  	MOVQ  incy+32(FP), BX
  3917  	MOVQ  n+40(FP), SI
  3918  	JMP   check_limit
  3919  	PCALIGN $0x08
  3920  
  3921  loop:
  3922  	MOVSS (AX), X1
  3923  	MULSS X0, X1
  3924  	ADDSS (DX), X1
  3925  	MOVSS X1, (DX)
  3926  	DECQ  SI
  3927  	LEAQ  (AX)(CX*4), AX
  3928  	LEAQ  (DX)(BX*4), DX
  3929  
  3930  check_limit:
  3931  	CMPQ SI, $0x00
  3932  	JHI  loop
  3933  	RET
  3934  
  3935  // func AmdAxpyPointerLoopX_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3936  // Requires: SSE
  3937  TEXT ·AmdAxpyPointerLoopX_V0A9(SB), NOSPLIT, $0-48
  3938  	MOVSS alpha+0(FP), X0
  3939  	MOVQ  xs+8(FP), AX
  3940  	MOVQ  incx+16(FP), CX
  3941  	MOVQ  ys+24(FP), DX
  3942  	MOVQ  incy+32(FP), BX
  3943  	MOVQ  n+40(FP), SI
  3944  	JMP   check_limit
  3945  	PCALIGN $0x08
  3946  	NOP
  3947  
  3948  loop:
  3949  	MOVSS (AX), X1
  3950  	MULSS X0, X1
  3951  	ADDSS (DX), X1
  3952  	MOVSS X1, (DX)
  3953  	DECQ  SI
  3954  	LEAQ  (AX)(CX*4), AX
  3955  	LEAQ  (DX)(BX*4), DX
  3956  
  3957  check_limit:
  3958  	CMPQ SI, $0x00
  3959  	JHI  loop
  3960  	RET
  3961  
  3962  // func AmdAxpyPointerLoopX_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3963  // Requires: SSE
  3964  TEXT ·AmdAxpyPointerLoopX_V1A9(SB), NOSPLIT, $0-48
  3965  	MOVSS alpha+0(FP), X0
  3966  	MOVQ  xs+8(FP), AX
  3967  	MOVQ  incx+16(FP), CX
  3968  	MOVQ  ys+24(FP), DX
  3969  	MOVQ  incy+32(FP), BX
  3970  	MOVQ  n+40(FP), SI
  3971  	JMP   check_limit
  3972  	PCALIGN $0x08
  3973  	NOP
  3974  
  3975  loop:
  3976  	MOVSS (AX), X1
  3977  	MULSS X0, X1
  3978  	ADDSS (DX), X1
  3979  	MOVSS X1, (DX)
  3980  	DECQ  SI
  3981  	LEAQ  (AX)(CX*4), AX
  3982  	LEAQ  (DX)(BX*4), DX
  3983  
  3984  check_limit:
  3985  	CMPQ SI, $0x00
  3986  	JHI  loop
  3987  	RET
  3988  
  3989  // func AmdAxpyPointerLoopX_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  3990  // Requires: SSE
  3991  TEXT ·AmdAxpyPointerLoopX_V2A9(SB), NOSPLIT, $0-48
  3992  	MOVSS alpha+0(FP), X0
  3993  	MOVQ  xs+8(FP), AX
  3994  	MOVQ  incx+16(FP), CX
  3995  	MOVQ  ys+24(FP), DX
  3996  	MOVQ  incy+32(FP), BX
  3997  	MOVQ  n+40(FP), SI
  3998  	JMP   check_limit
  3999  	PCALIGN $0x08
  4000  	NOP
  4001  
  4002  loop:
  4003  	MOVSS (AX), X1
  4004  	MULSS X0, X1
  4005  	ADDSS (DX), X1
  4006  	MOVSS X1, (DX)
  4007  	DECQ  SI
  4008  	LEAQ  (AX)(CX*4), AX
  4009  	LEAQ  (DX)(BX*4), DX
  4010  
  4011  check_limit:
  4012  	CMPQ SI, $0x00
  4013  	JHI  loop
  4014  	RET
  4015  
  4016  // func AmdAxpyPointerLoopX_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4017  // Requires: SSE
  4018  TEXT ·AmdAxpyPointerLoopX_V3A9(SB), NOSPLIT, $0-48
  4019  	MOVSS alpha+0(FP), X0
  4020  	MOVQ  xs+8(FP), AX
  4021  	MOVQ  incx+16(FP), CX
  4022  	MOVQ  ys+24(FP), DX
  4023  	MOVQ  incy+32(FP), BX
  4024  	MOVQ  n+40(FP), SI
  4025  	JMP   check_limit
  4026  	PCALIGN $0x08
  4027  	NOP
  4028  
  4029  loop:
  4030  	MOVSS (AX), X1
  4031  	MULSS X0, X1
  4032  	ADDSS (DX), X1
  4033  	MOVSS X1, (DX)
  4034  	DECQ  SI
  4035  	LEAQ  (AX)(CX*4), AX
  4036  	LEAQ  (DX)(BX*4), DX
  4037  
  4038  check_limit:
  4039  	CMPQ SI, $0x00
  4040  	JHI  loop
  4041  	RET
  4042  
  4043  // func AmdAxpyPointerLoopX_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4044  // Requires: SSE
  4045  TEXT ·AmdAxpyPointerLoopX_V4A9(SB), NOSPLIT, $0-48
  4046  	MOVSS alpha+0(FP), X0
  4047  	MOVQ  xs+8(FP), AX
  4048  	MOVQ  incx+16(FP), CX
  4049  	MOVQ  ys+24(FP), DX
  4050  	MOVQ  incy+32(FP), BX
  4051  	MOVQ  n+40(FP), SI
  4052  	JMP   check_limit
  4053  	PCALIGN $0x08
  4054  	NOP
  4055  
  4056  loop:
  4057  	MOVSS (AX), X1
  4058  	MULSS X0, X1
  4059  	ADDSS (DX), X1
  4060  	MOVSS X1, (DX)
  4061  	DECQ  SI
  4062  	LEAQ  (AX)(CX*4), AX
  4063  	LEAQ  (DX)(BX*4), DX
  4064  
  4065  check_limit:
  4066  	CMPQ SI, $0x00
  4067  	JHI  loop
  4068  	RET
  4069  
  4070  // func AmdAxpyPointerLoopX_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4071  // Requires: SSE
  4072  TEXT ·AmdAxpyPointerLoopX_V5A9(SB), NOSPLIT, $0-48
  4073  	MOVSS alpha+0(FP), X0
  4074  	MOVQ  xs+8(FP), AX
  4075  	MOVQ  incx+16(FP), CX
  4076  	MOVQ  ys+24(FP), DX
  4077  	MOVQ  incy+32(FP), BX
  4078  	MOVQ  n+40(FP), SI
  4079  	JMP   check_limit
  4080  	PCALIGN $0x08
  4081  	NOP
  4082  
  4083  loop:
  4084  	MOVSS (AX), X1
  4085  	MULSS X0, X1
  4086  	ADDSS (DX), X1
  4087  	MOVSS X1, (DX)
  4088  	DECQ  SI
  4089  	LEAQ  (AX)(CX*4), AX
  4090  	LEAQ  (DX)(BX*4), DX
  4091  
  4092  check_limit:
  4093  	CMPQ SI, $0x00
  4094  	JHI  loop
  4095  	RET
  4096  
  4097  // func AmdAxpyPointerLoopX_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4098  // Requires: SSE
  4099  TEXT ·AmdAxpyPointerLoopX_V0A10(SB), NOSPLIT, $0-48
  4100  	MOVSS alpha+0(FP), X0
  4101  	MOVQ  xs+8(FP), AX
  4102  	MOVQ  incx+16(FP), CX
  4103  	MOVQ  ys+24(FP), DX
  4104  	MOVQ  incy+32(FP), BX
  4105  	MOVQ  n+40(FP), SI
  4106  	JMP   check_limit
  4107  	PCALIGN $0x08
  4108  	NOP
  4109  	NOP
  4110  
  4111  loop:
  4112  	MOVSS (AX), X1
  4113  	MULSS X0, X1
  4114  	ADDSS (DX), X1
  4115  	MOVSS X1, (DX)
  4116  	DECQ  SI
  4117  	LEAQ  (AX)(CX*4), AX
  4118  	LEAQ  (DX)(BX*4), DX
  4119  
  4120  check_limit:
  4121  	CMPQ SI, $0x00
  4122  	JHI  loop
  4123  	RET
  4124  
  4125  // func AmdAxpyPointerLoopX_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4126  // Requires: SSE
  4127  TEXT ·AmdAxpyPointerLoopX_V1A10(SB), NOSPLIT, $0-48
  4128  	MOVSS alpha+0(FP), X0
  4129  	MOVQ  xs+8(FP), AX
  4130  	MOVQ  incx+16(FP), CX
  4131  	MOVQ  ys+24(FP), DX
  4132  	MOVQ  incy+32(FP), BX
  4133  	MOVQ  n+40(FP), SI
  4134  	JMP   check_limit
  4135  	PCALIGN $0x08
  4136  	NOP
  4137  	NOP
  4138  
  4139  loop:
  4140  	MOVSS (AX), X1
  4141  	MULSS X0, X1
  4142  	ADDSS (DX), X1
  4143  	MOVSS X1, (DX)
  4144  	DECQ  SI
  4145  	LEAQ  (AX)(CX*4), AX
  4146  	LEAQ  (DX)(BX*4), DX
  4147  
  4148  check_limit:
  4149  	CMPQ SI, $0x00
  4150  	JHI  loop
  4151  	RET
  4152  
  4153  // func AmdAxpyPointerLoopX_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4154  // Requires: SSE
  4155  TEXT ·AmdAxpyPointerLoopX_V2A10(SB), NOSPLIT, $0-48
  4156  	MOVSS alpha+0(FP), X0
  4157  	MOVQ  xs+8(FP), AX
  4158  	MOVQ  incx+16(FP), CX
  4159  	MOVQ  ys+24(FP), DX
  4160  	MOVQ  incy+32(FP), BX
  4161  	MOVQ  n+40(FP), SI
  4162  	JMP   check_limit
  4163  	PCALIGN $0x08
  4164  	NOP
  4165  	NOP
  4166  
  4167  loop:
  4168  	MOVSS (AX), X1
  4169  	MULSS X0, X1
  4170  	ADDSS (DX), X1
  4171  	MOVSS X1, (DX)
  4172  	DECQ  SI
  4173  	LEAQ  (AX)(CX*4), AX
  4174  	LEAQ  (DX)(BX*4), DX
  4175  
  4176  check_limit:
  4177  	CMPQ SI, $0x00
  4178  	JHI  loop
  4179  	RET
  4180  
  4181  // func AmdAxpyPointerLoopX_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4182  // Requires: SSE
  4183  TEXT ·AmdAxpyPointerLoopX_V3A10(SB), NOSPLIT, $0-48
  4184  	MOVSS alpha+0(FP), X0
  4185  	MOVQ  xs+8(FP), AX
  4186  	MOVQ  incx+16(FP), CX
  4187  	MOVQ  ys+24(FP), DX
  4188  	MOVQ  incy+32(FP), BX
  4189  	MOVQ  n+40(FP), SI
  4190  	JMP   check_limit
  4191  	PCALIGN $0x08
  4192  	NOP
  4193  	NOP
  4194  
  4195  loop:
  4196  	MOVSS (AX), X1
  4197  	MULSS X0, X1
  4198  	ADDSS (DX), X1
  4199  	MOVSS X1, (DX)
  4200  	DECQ  SI
  4201  	LEAQ  (AX)(CX*4), AX
  4202  	LEAQ  (DX)(BX*4), DX
  4203  
  4204  check_limit:
  4205  	CMPQ SI, $0x00
  4206  	JHI  loop
  4207  	RET
  4208  
  4209  // func AmdAxpyPointerLoopX_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4210  // Requires: SSE
  4211  TEXT ·AmdAxpyPointerLoopX_V4A10(SB), NOSPLIT, $0-48
  4212  	MOVSS alpha+0(FP), X0
  4213  	MOVQ  xs+8(FP), AX
  4214  	MOVQ  incx+16(FP), CX
  4215  	MOVQ  ys+24(FP), DX
  4216  	MOVQ  incy+32(FP), BX
  4217  	MOVQ  n+40(FP), SI
  4218  	JMP   check_limit
  4219  	PCALIGN $0x08
  4220  	NOP
  4221  	NOP
  4222  
  4223  loop:
  4224  	MOVSS (AX), X1
  4225  	MULSS X0, X1
  4226  	ADDSS (DX), X1
  4227  	MOVSS X1, (DX)
  4228  	DECQ  SI
  4229  	LEAQ  (AX)(CX*4), AX
  4230  	LEAQ  (DX)(BX*4), DX
  4231  
  4232  check_limit:
  4233  	CMPQ SI, $0x00
  4234  	JHI  loop
  4235  	RET
  4236  
  4237  // func AmdAxpyPointerLoopX_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4238  // Requires: SSE
  4239  TEXT ·AmdAxpyPointerLoopX_V5A10(SB), NOSPLIT, $0-48
  4240  	MOVSS alpha+0(FP), X0
  4241  	MOVQ  xs+8(FP), AX
  4242  	MOVQ  incx+16(FP), CX
  4243  	MOVQ  ys+24(FP), DX
  4244  	MOVQ  incy+32(FP), BX
  4245  	MOVQ  n+40(FP), SI
  4246  	JMP   check_limit
  4247  	PCALIGN $0x08
  4248  	NOP
  4249  	NOP
  4250  
  4251  loop:
  4252  	MOVSS (AX), X1
  4253  	MULSS X0, X1
  4254  	ADDSS (DX), X1
  4255  	MOVSS X1, (DX)
  4256  	DECQ  SI
  4257  	LEAQ  (AX)(CX*4), AX
  4258  	LEAQ  (DX)(BX*4), DX
  4259  
  4260  check_limit:
  4261  	CMPQ SI, $0x00
  4262  	JHI  loop
  4263  	RET
  4264  
  4265  // func AmdAxpyPointerLoopX_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4266  // Requires: SSE
  4267  TEXT ·AmdAxpyPointerLoopX_V0A11(SB), NOSPLIT, $0-48
  4268  	MOVSS alpha+0(FP), X0
  4269  	MOVQ  xs+8(FP), AX
  4270  	MOVQ  incx+16(FP), CX
  4271  	MOVQ  ys+24(FP), DX
  4272  	MOVQ  incy+32(FP), BX
  4273  	MOVQ  n+40(FP), SI
  4274  	JMP   check_limit
  4275  	PCALIGN $0x08
  4276  	NOP
  4277  	NOP
  4278  	NOP
  4279  
  4280  loop:
  4281  	MOVSS (AX), X1
  4282  	MULSS X0, X1
  4283  	ADDSS (DX), X1
  4284  	MOVSS X1, (DX)
  4285  	DECQ  SI
  4286  	LEAQ  (AX)(CX*4), AX
  4287  	LEAQ  (DX)(BX*4), DX
  4288  
  4289  check_limit:
  4290  	CMPQ SI, $0x00
  4291  	JHI  loop
  4292  	RET
  4293  
  4294  // func AmdAxpyPointerLoopX_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4295  // Requires: SSE
  4296  TEXT ·AmdAxpyPointerLoopX_V1A11(SB), NOSPLIT, $0-48
  4297  	MOVSS alpha+0(FP), X0
  4298  	MOVQ  xs+8(FP), AX
  4299  	MOVQ  incx+16(FP), CX
  4300  	MOVQ  ys+24(FP), DX
  4301  	MOVQ  incy+32(FP), BX
  4302  	MOVQ  n+40(FP), SI
  4303  	JMP   check_limit
  4304  	PCALIGN $0x08
  4305  	NOP
  4306  	NOP
  4307  	NOP
  4308  
  4309  loop:
  4310  	MOVSS (AX), X1
  4311  	MULSS X0, X1
  4312  	ADDSS (DX), X1
  4313  	MOVSS X1, (DX)
  4314  	DECQ  SI
  4315  	LEAQ  (AX)(CX*4), AX
  4316  	LEAQ  (DX)(BX*4), DX
  4317  
  4318  check_limit:
  4319  	CMPQ SI, $0x00
  4320  	JHI  loop
  4321  	RET
  4322  
  4323  // func AmdAxpyPointerLoopX_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4324  // Requires: SSE
  4325  TEXT ·AmdAxpyPointerLoopX_V2A11(SB), NOSPLIT, $0-48
  4326  	MOVSS alpha+0(FP), X0
  4327  	MOVQ  xs+8(FP), AX
  4328  	MOVQ  incx+16(FP), CX
  4329  	MOVQ  ys+24(FP), DX
  4330  	MOVQ  incy+32(FP), BX
  4331  	MOVQ  n+40(FP), SI
  4332  	JMP   check_limit
  4333  	PCALIGN $0x08
  4334  	NOP
  4335  	NOP
  4336  	NOP
  4337  
  4338  loop:
  4339  	MOVSS (AX), X1
  4340  	MULSS X0, X1
  4341  	ADDSS (DX), X1
  4342  	MOVSS X1, (DX)
  4343  	DECQ  SI
  4344  	LEAQ  (AX)(CX*4), AX
  4345  	LEAQ  (DX)(BX*4), DX
  4346  
  4347  check_limit:
  4348  	CMPQ SI, $0x00
  4349  	JHI  loop
  4350  	RET
  4351  
  4352  // func AmdAxpyPointerLoopX_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4353  // Requires: SSE
  4354  TEXT ·AmdAxpyPointerLoopX_V3A11(SB), NOSPLIT, $0-48
  4355  	MOVSS alpha+0(FP), X0
  4356  	MOVQ  xs+8(FP), AX
  4357  	MOVQ  incx+16(FP), CX
  4358  	MOVQ  ys+24(FP), DX
  4359  	MOVQ  incy+32(FP), BX
  4360  	MOVQ  n+40(FP), SI
  4361  	JMP   check_limit
  4362  	PCALIGN $0x08
  4363  	NOP
  4364  	NOP
  4365  	NOP
  4366  
  4367  loop:
  4368  	MOVSS (AX), X1
  4369  	MULSS X0, X1
  4370  	ADDSS (DX), X1
  4371  	MOVSS X1, (DX)
  4372  	DECQ  SI
  4373  	LEAQ  (AX)(CX*4), AX
  4374  	LEAQ  (DX)(BX*4), DX
  4375  
  4376  check_limit:
  4377  	CMPQ SI, $0x00
  4378  	JHI  loop
  4379  	RET
  4380  
  4381  // func AmdAxpyPointerLoopX_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4382  // Requires: SSE
  4383  TEXT ·AmdAxpyPointerLoopX_V4A11(SB), NOSPLIT, $0-48
  4384  	MOVSS alpha+0(FP), X0
  4385  	MOVQ  xs+8(FP), AX
  4386  	MOVQ  incx+16(FP), CX
  4387  	MOVQ  ys+24(FP), DX
  4388  	MOVQ  incy+32(FP), BX
  4389  	MOVQ  n+40(FP), SI
  4390  	JMP   check_limit
  4391  	PCALIGN $0x08
  4392  	NOP
  4393  	NOP
  4394  	NOP
  4395  
  4396  loop:
  4397  	MOVSS (AX), X1
  4398  	MULSS X0, X1
  4399  	ADDSS (DX), X1
  4400  	MOVSS X1, (DX)
  4401  	DECQ  SI
  4402  	LEAQ  (AX)(CX*4), AX
  4403  	LEAQ  (DX)(BX*4), DX
  4404  
  4405  check_limit:
  4406  	CMPQ SI, $0x00
  4407  	JHI  loop
  4408  	RET
  4409  
  4410  // func AmdAxpyPointerLoopX_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4411  // Requires: SSE
  4412  TEXT ·AmdAxpyPointerLoopX_V5A11(SB), NOSPLIT, $0-48
  4413  	MOVSS alpha+0(FP), X0
  4414  	MOVQ  xs+8(FP), AX
  4415  	MOVQ  incx+16(FP), CX
  4416  	MOVQ  ys+24(FP), DX
  4417  	MOVQ  incy+32(FP), BX
  4418  	MOVQ  n+40(FP), SI
  4419  	JMP   check_limit
  4420  	PCALIGN $0x08
  4421  	NOP
  4422  	NOP
  4423  	NOP
  4424  
  4425  loop:
  4426  	MOVSS (AX), X1
  4427  	MULSS X0, X1
  4428  	ADDSS (DX), X1
  4429  	MOVSS X1, (DX)
  4430  	DECQ  SI
  4431  	LEAQ  (AX)(CX*4), AX
  4432  	LEAQ  (DX)(BX*4), DX
  4433  
  4434  check_limit:
  4435  	CMPQ SI, $0x00
  4436  	JHI  loop
  4437  	RET
  4438  
  4439  // func AmdAxpyPointerLoopX_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4440  // Requires: SSE
  4441  TEXT ·AmdAxpyPointerLoopX_V0A12(SB), NOSPLIT, $0-48
  4442  	MOVSS alpha+0(FP), X0
  4443  	MOVQ  xs+8(FP), AX
  4444  	MOVQ  incx+16(FP), CX
  4445  	MOVQ  ys+24(FP), DX
  4446  	MOVQ  incy+32(FP), BX
  4447  	MOVQ  n+40(FP), SI
  4448  	JMP   check_limit
  4449  	PCALIGN $0x08
  4450  	NOP
  4451  	NOP
  4452  	NOP
  4453  	NOP
  4454  
  4455  loop:
  4456  	MOVSS (AX), X1
  4457  	MULSS X0, X1
  4458  	ADDSS (DX), X1
  4459  	MOVSS X1, (DX)
  4460  	DECQ  SI
  4461  	LEAQ  (AX)(CX*4), AX
  4462  	LEAQ  (DX)(BX*4), DX
  4463  
  4464  check_limit:
  4465  	CMPQ SI, $0x00
  4466  	JHI  loop
  4467  	RET
  4468  
  4469  // func AmdAxpyPointerLoopX_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4470  // Requires: SSE
  4471  TEXT ·AmdAxpyPointerLoopX_V1A12(SB), NOSPLIT, $0-48
  4472  	MOVSS alpha+0(FP), X0
  4473  	MOVQ  xs+8(FP), AX
  4474  	MOVQ  incx+16(FP), CX
  4475  	MOVQ  ys+24(FP), DX
  4476  	MOVQ  incy+32(FP), BX
  4477  	MOVQ  n+40(FP), SI
  4478  	JMP   check_limit
  4479  	PCALIGN $0x08
  4480  	NOP
  4481  	NOP
  4482  	NOP
  4483  	NOP
  4484  
  4485  loop:
  4486  	MOVSS (AX), X1
  4487  	MULSS X0, X1
  4488  	ADDSS (DX), X1
  4489  	MOVSS X1, (DX)
  4490  	DECQ  SI
  4491  	LEAQ  (AX)(CX*4), AX
  4492  	LEAQ  (DX)(BX*4), DX
  4493  
  4494  check_limit:
  4495  	CMPQ SI, $0x00
  4496  	JHI  loop
  4497  	RET
  4498  
  4499  // func AmdAxpyPointerLoopX_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4500  // Requires: SSE
  4501  TEXT ·AmdAxpyPointerLoopX_V2A12(SB), NOSPLIT, $0-48
  4502  	MOVSS alpha+0(FP), X0
  4503  	MOVQ  xs+8(FP), AX
  4504  	MOVQ  incx+16(FP), CX
  4505  	MOVQ  ys+24(FP), DX
  4506  	MOVQ  incy+32(FP), BX
  4507  	MOVQ  n+40(FP), SI
  4508  	JMP   check_limit
  4509  	PCALIGN $0x08
  4510  	NOP
  4511  	NOP
  4512  	NOP
  4513  	NOP
  4514  
  4515  loop:
  4516  	MOVSS (AX), X1
  4517  	MULSS X0, X1
  4518  	ADDSS (DX), X1
  4519  	MOVSS X1, (DX)
  4520  	DECQ  SI
  4521  	LEAQ  (AX)(CX*4), AX
  4522  	LEAQ  (DX)(BX*4), DX
  4523  
  4524  check_limit:
  4525  	CMPQ SI, $0x00
  4526  	JHI  loop
  4527  	RET
  4528  
  4529  // func AmdAxpyPointerLoopX_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4530  // Requires: SSE
  4531  TEXT ·AmdAxpyPointerLoopX_V3A12(SB), NOSPLIT, $0-48
  4532  	MOVSS alpha+0(FP), X0
  4533  	MOVQ  xs+8(FP), AX
  4534  	MOVQ  incx+16(FP), CX
  4535  	MOVQ  ys+24(FP), DX
  4536  	MOVQ  incy+32(FP), BX
  4537  	MOVQ  n+40(FP), SI
  4538  	JMP   check_limit
  4539  	PCALIGN $0x08
  4540  	NOP
  4541  	NOP
  4542  	NOP
  4543  	NOP
  4544  
  4545  loop:
  4546  	MOVSS (AX), X1
  4547  	MULSS X0, X1
  4548  	ADDSS (DX), X1
  4549  	MOVSS X1, (DX)
  4550  	DECQ  SI
  4551  	LEAQ  (AX)(CX*4), AX
  4552  	LEAQ  (DX)(BX*4), DX
  4553  
  4554  check_limit:
  4555  	CMPQ SI, $0x00
  4556  	JHI  loop
  4557  	RET
  4558  
  4559  // func AmdAxpyPointerLoopX_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4560  // Requires: SSE
  4561  TEXT ·AmdAxpyPointerLoopX_V4A12(SB), NOSPLIT, $0-48
  4562  	MOVSS alpha+0(FP), X0
  4563  	MOVQ  xs+8(FP), AX
  4564  	MOVQ  incx+16(FP), CX
  4565  	MOVQ  ys+24(FP), DX
  4566  	MOVQ  incy+32(FP), BX
  4567  	MOVQ  n+40(FP), SI
  4568  	JMP   check_limit
  4569  	PCALIGN $0x08
  4570  	NOP
  4571  	NOP
  4572  	NOP
  4573  	NOP
  4574  
  4575  loop:
  4576  	MOVSS (AX), X1
  4577  	MULSS X0, X1
  4578  	ADDSS (DX), X1
  4579  	MOVSS X1, (DX)
  4580  	DECQ  SI
  4581  	LEAQ  (AX)(CX*4), AX
  4582  	LEAQ  (DX)(BX*4), DX
  4583  
  4584  check_limit:
  4585  	CMPQ SI, $0x00
  4586  	JHI  loop
  4587  	RET
  4588  
  4589  // func AmdAxpyPointerLoopX_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4590  // Requires: SSE
  4591  TEXT ·AmdAxpyPointerLoopX_V5A12(SB), NOSPLIT, $0-48
  4592  	MOVSS alpha+0(FP), X0
  4593  	MOVQ  xs+8(FP), AX
  4594  	MOVQ  incx+16(FP), CX
  4595  	MOVQ  ys+24(FP), DX
  4596  	MOVQ  incy+32(FP), BX
  4597  	MOVQ  n+40(FP), SI
  4598  	JMP   check_limit
  4599  	PCALIGN $0x08
  4600  	NOP
  4601  	NOP
  4602  	NOP
  4603  	NOP
  4604  
  4605  loop:
  4606  	MOVSS (AX), X1
  4607  	MULSS X0, X1
  4608  	ADDSS (DX), X1
  4609  	MOVSS X1, (DX)
  4610  	DECQ  SI
  4611  	LEAQ  (AX)(CX*4), AX
  4612  	LEAQ  (DX)(BX*4), DX
  4613  
  4614  check_limit:
  4615  	CMPQ SI, $0x00
  4616  	JHI  loop
  4617  	RET
  4618  
  4619  // func AmdAxpyPointerLoopX_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4620  // Requires: SSE
  4621  TEXT ·AmdAxpyPointerLoopX_V0A13(SB), NOSPLIT, $0-48
  4622  	MOVSS alpha+0(FP), X0
  4623  	MOVQ  xs+8(FP), AX
  4624  	MOVQ  incx+16(FP), CX
  4625  	MOVQ  ys+24(FP), DX
  4626  	MOVQ  incy+32(FP), BX
  4627  	MOVQ  n+40(FP), SI
  4628  	JMP   check_limit
  4629  	PCALIGN $0x08
  4630  	NOP
  4631  	NOP
  4632  	NOP
  4633  	NOP
  4634  	NOP
  4635  
  4636  loop:
  4637  	MOVSS (AX), X1
  4638  	MULSS X0, X1
  4639  	ADDSS (DX), X1
  4640  	MOVSS X1, (DX)
  4641  	DECQ  SI
  4642  	LEAQ  (AX)(CX*4), AX
  4643  	LEAQ  (DX)(BX*4), DX
  4644  
  4645  check_limit:
  4646  	CMPQ SI, $0x00
  4647  	JHI  loop
  4648  	RET
  4649  
  4650  // func AmdAxpyPointerLoopX_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4651  // Requires: SSE
  4652  TEXT ·AmdAxpyPointerLoopX_V1A13(SB), NOSPLIT, $0-48
  4653  	MOVSS alpha+0(FP), X0
  4654  	MOVQ  xs+8(FP), AX
  4655  	MOVQ  incx+16(FP), CX
  4656  	MOVQ  ys+24(FP), DX
  4657  	MOVQ  incy+32(FP), BX
  4658  	MOVQ  n+40(FP), SI
  4659  	JMP   check_limit
  4660  	PCALIGN $0x08
  4661  	NOP
  4662  	NOP
  4663  	NOP
  4664  	NOP
  4665  	NOP
  4666  
  4667  loop:
  4668  	MOVSS (AX), X1
  4669  	MULSS X0, X1
  4670  	ADDSS (DX), X1
  4671  	MOVSS X1, (DX)
  4672  	DECQ  SI
  4673  	LEAQ  (AX)(CX*4), AX
  4674  	LEAQ  (DX)(BX*4), DX
  4675  
  4676  check_limit:
  4677  	CMPQ SI, $0x00
  4678  	JHI  loop
  4679  	RET
  4680  
  4681  // func AmdAxpyPointerLoopX_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4682  // Requires: SSE
  4683  TEXT ·AmdAxpyPointerLoopX_V2A13(SB), NOSPLIT, $0-48
  4684  	MOVSS alpha+0(FP), X0
  4685  	MOVQ  xs+8(FP), AX
  4686  	MOVQ  incx+16(FP), CX
  4687  	MOVQ  ys+24(FP), DX
  4688  	MOVQ  incy+32(FP), BX
  4689  	MOVQ  n+40(FP), SI
  4690  	JMP   check_limit
  4691  	PCALIGN $0x08
  4692  	NOP
  4693  	NOP
  4694  	NOP
  4695  	NOP
  4696  	NOP
  4697  
  4698  loop:
  4699  	MOVSS (AX), X1
  4700  	MULSS X0, X1
  4701  	ADDSS (DX), X1
  4702  	MOVSS X1, (DX)
  4703  	DECQ  SI
  4704  	LEAQ  (AX)(CX*4), AX
  4705  	LEAQ  (DX)(BX*4), DX
  4706  
  4707  check_limit:
  4708  	CMPQ SI, $0x00
  4709  	JHI  loop
  4710  	RET
  4711  
  4712  // func AmdAxpyPointerLoopX_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4713  // Requires: SSE
  4714  TEXT ·AmdAxpyPointerLoopX_V3A13(SB), NOSPLIT, $0-48
  4715  	MOVSS alpha+0(FP), X0
  4716  	MOVQ  xs+8(FP), AX
  4717  	MOVQ  incx+16(FP), CX
  4718  	MOVQ  ys+24(FP), DX
  4719  	MOVQ  incy+32(FP), BX
  4720  	MOVQ  n+40(FP), SI
  4721  	JMP   check_limit
  4722  	PCALIGN $0x08
  4723  	NOP
  4724  	NOP
  4725  	NOP
  4726  	NOP
  4727  	NOP
  4728  
  4729  loop:
  4730  	MOVSS (AX), X1
  4731  	MULSS X0, X1
  4732  	ADDSS (DX), X1
  4733  	MOVSS X1, (DX)
  4734  	DECQ  SI
  4735  	LEAQ  (AX)(CX*4), AX
  4736  	LEAQ  (DX)(BX*4), DX
  4737  
  4738  check_limit:
  4739  	CMPQ SI, $0x00
  4740  	JHI  loop
  4741  	RET
  4742  
  4743  // func AmdAxpyPointerLoopX_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4744  // Requires: SSE
  4745  TEXT ·AmdAxpyPointerLoopX_V4A13(SB), NOSPLIT, $0-48
  4746  	MOVSS alpha+0(FP), X0
  4747  	MOVQ  xs+8(FP), AX
  4748  	MOVQ  incx+16(FP), CX
  4749  	MOVQ  ys+24(FP), DX
  4750  	MOVQ  incy+32(FP), BX
  4751  	MOVQ  n+40(FP), SI
  4752  	JMP   check_limit
  4753  	PCALIGN $0x08
  4754  	NOP
  4755  	NOP
  4756  	NOP
  4757  	NOP
  4758  	NOP
  4759  
  4760  loop:
  4761  	MOVSS (AX), X1
  4762  	MULSS X0, X1
  4763  	ADDSS (DX), X1
  4764  	MOVSS X1, (DX)
  4765  	DECQ  SI
  4766  	LEAQ  (AX)(CX*4), AX
  4767  	LEAQ  (DX)(BX*4), DX
  4768  
  4769  check_limit:
  4770  	CMPQ SI, $0x00
  4771  	JHI  loop
  4772  	RET
  4773  
  4774  // func AmdAxpyPointerLoopX_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4775  // Requires: SSE
  4776  TEXT ·AmdAxpyPointerLoopX_V5A13(SB), NOSPLIT, $0-48
  4777  	MOVSS alpha+0(FP), X0
  4778  	MOVQ  xs+8(FP), AX
  4779  	MOVQ  incx+16(FP), CX
  4780  	MOVQ  ys+24(FP), DX
  4781  	MOVQ  incy+32(FP), BX
  4782  	MOVQ  n+40(FP), SI
  4783  	JMP   check_limit
  4784  	PCALIGN $0x08
  4785  	NOP
  4786  	NOP
  4787  	NOP
  4788  	NOP
  4789  	NOP
  4790  
  4791  loop:
  4792  	MOVSS (AX), X1
  4793  	MULSS X0, X1
  4794  	ADDSS (DX), X1
  4795  	MOVSS X1, (DX)
  4796  	DECQ  SI
  4797  	LEAQ  (AX)(CX*4), AX
  4798  	LEAQ  (DX)(BX*4), DX
  4799  
  4800  check_limit:
  4801  	CMPQ SI, $0x00
  4802  	JHI  loop
  4803  	RET
  4804  
  4805  // func AmdAxpyPointerLoopX_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4806  // Requires: SSE
  4807  TEXT ·AmdAxpyPointerLoopX_V0A14(SB), NOSPLIT, $0-48
  4808  	MOVSS alpha+0(FP), X0
  4809  	MOVQ  xs+8(FP), AX
  4810  	MOVQ  incx+16(FP), CX
  4811  	MOVQ  ys+24(FP), DX
  4812  	MOVQ  incy+32(FP), BX
  4813  	MOVQ  n+40(FP), SI
  4814  	JMP   check_limit
  4815  	PCALIGN $0x08
  4816  	NOP
  4817  	NOP
  4818  	NOP
  4819  	NOP
  4820  	NOP
  4821  	NOP
  4822  
  4823  loop:
  4824  	MOVSS (AX), X1
  4825  	MULSS X0, X1
  4826  	ADDSS (DX), X1
  4827  	MOVSS X1, (DX)
  4828  	DECQ  SI
  4829  	LEAQ  (AX)(CX*4), AX
  4830  	LEAQ  (DX)(BX*4), DX
  4831  
  4832  check_limit:
  4833  	CMPQ SI, $0x00
  4834  	JHI  loop
  4835  	RET
  4836  
  4837  // func AmdAxpyPointerLoopX_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4838  // Requires: SSE
  4839  TEXT ·AmdAxpyPointerLoopX_V1A14(SB), NOSPLIT, $0-48
  4840  	MOVSS alpha+0(FP), X0
  4841  	MOVQ  xs+8(FP), AX
  4842  	MOVQ  incx+16(FP), CX
  4843  	MOVQ  ys+24(FP), DX
  4844  	MOVQ  incy+32(FP), BX
  4845  	MOVQ  n+40(FP), SI
  4846  	JMP   check_limit
  4847  	PCALIGN $0x08
  4848  	NOP
  4849  	NOP
  4850  	NOP
  4851  	NOP
  4852  	NOP
  4853  	NOP
  4854  
  4855  loop:
  4856  	MOVSS (AX), X1
  4857  	MULSS X0, X1
  4858  	ADDSS (DX), X1
  4859  	MOVSS X1, (DX)
  4860  	DECQ  SI
  4861  	LEAQ  (AX)(CX*4), AX
  4862  	LEAQ  (DX)(BX*4), DX
  4863  
  4864  check_limit:
  4865  	CMPQ SI, $0x00
  4866  	JHI  loop
  4867  	RET
  4868  
  4869  // func AmdAxpyPointerLoopX_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4870  // Requires: SSE
  4871  TEXT ·AmdAxpyPointerLoopX_V2A14(SB), NOSPLIT, $0-48
  4872  	MOVSS alpha+0(FP), X0
  4873  	MOVQ  xs+8(FP), AX
  4874  	MOVQ  incx+16(FP), CX
  4875  	MOVQ  ys+24(FP), DX
  4876  	MOVQ  incy+32(FP), BX
  4877  	MOVQ  n+40(FP), SI
  4878  	JMP   check_limit
  4879  	PCALIGN $0x08
  4880  	NOP
  4881  	NOP
  4882  	NOP
  4883  	NOP
  4884  	NOP
  4885  	NOP
  4886  
  4887  loop:
  4888  	MOVSS (AX), X1
  4889  	MULSS X0, X1
  4890  	ADDSS (DX), X1
  4891  	MOVSS X1, (DX)
  4892  	DECQ  SI
  4893  	LEAQ  (AX)(CX*4), AX
  4894  	LEAQ  (DX)(BX*4), DX
  4895  
  4896  check_limit:
  4897  	CMPQ SI, $0x00
  4898  	JHI  loop
  4899  	RET
  4900  
  4901  // func AmdAxpyPointerLoopX_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4902  // Requires: SSE
  4903  TEXT ·AmdAxpyPointerLoopX_V3A14(SB), NOSPLIT, $0-48
  4904  	MOVSS alpha+0(FP), X0
  4905  	MOVQ  xs+8(FP), AX
  4906  	MOVQ  incx+16(FP), CX
  4907  	MOVQ  ys+24(FP), DX
  4908  	MOVQ  incy+32(FP), BX
  4909  	MOVQ  n+40(FP), SI
  4910  	JMP   check_limit
  4911  	PCALIGN $0x08
  4912  	NOP
  4913  	NOP
  4914  	NOP
  4915  	NOP
  4916  	NOP
  4917  	NOP
  4918  
  4919  loop:
  4920  	MOVSS (AX), X1
  4921  	MULSS X0, X1
  4922  	ADDSS (DX), X1
  4923  	MOVSS X1, (DX)
  4924  	DECQ  SI
  4925  	LEAQ  (AX)(CX*4), AX
  4926  	LEAQ  (DX)(BX*4), DX
  4927  
  4928  check_limit:
  4929  	CMPQ SI, $0x00
  4930  	JHI  loop
  4931  	RET
  4932  
  4933  // func AmdAxpyPointerLoopX_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4934  // Requires: SSE
  4935  TEXT ·AmdAxpyPointerLoopX_V4A14(SB), NOSPLIT, $0-48
  4936  	MOVSS alpha+0(FP), X0
  4937  	MOVQ  xs+8(FP), AX
  4938  	MOVQ  incx+16(FP), CX
  4939  	MOVQ  ys+24(FP), DX
  4940  	MOVQ  incy+32(FP), BX
  4941  	MOVQ  n+40(FP), SI
  4942  	JMP   check_limit
  4943  	PCALIGN $0x08
  4944  	NOP
  4945  	NOP
  4946  	NOP
  4947  	NOP
  4948  	NOP
  4949  	NOP
  4950  
  4951  loop:
  4952  	MOVSS (AX), X1
  4953  	MULSS X0, X1
  4954  	ADDSS (DX), X1
  4955  	MOVSS X1, (DX)
  4956  	DECQ  SI
  4957  	LEAQ  (AX)(CX*4), AX
  4958  	LEAQ  (DX)(BX*4), DX
  4959  
  4960  check_limit:
  4961  	CMPQ SI, $0x00
  4962  	JHI  loop
  4963  	RET
  4964  
  4965  // func AmdAxpyPointerLoopX_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4966  // Requires: SSE
  4967  TEXT ·AmdAxpyPointerLoopX_V5A14(SB), NOSPLIT, $0-48
  4968  	MOVSS alpha+0(FP), X0
  4969  	MOVQ  xs+8(FP), AX
  4970  	MOVQ  incx+16(FP), CX
  4971  	MOVQ  ys+24(FP), DX
  4972  	MOVQ  incy+32(FP), BX
  4973  	MOVQ  n+40(FP), SI
  4974  	JMP   check_limit
  4975  	PCALIGN $0x08
  4976  	NOP
  4977  	NOP
  4978  	NOP
  4979  	NOP
  4980  	NOP
  4981  	NOP
  4982  
  4983  loop:
  4984  	MOVSS (AX), X1
  4985  	MULSS X0, X1
  4986  	ADDSS (DX), X1
  4987  	MOVSS X1, (DX)
  4988  	DECQ  SI
  4989  	LEAQ  (AX)(CX*4), AX
  4990  	LEAQ  (DX)(BX*4), DX
  4991  
  4992  check_limit:
  4993  	CMPQ SI, $0x00
  4994  	JHI  loop
  4995  	RET
  4996  
  4997  // func AmdAxpyPointerLoopX_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  4998  // Requires: SSE
  4999  TEXT ·AmdAxpyPointerLoopX_V0A15(SB), NOSPLIT, $0-48
  5000  	MOVSS alpha+0(FP), X0
  5001  	MOVQ  xs+8(FP), AX
  5002  	MOVQ  incx+16(FP), CX
  5003  	MOVQ  ys+24(FP), DX
  5004  	MOVQ  incy+32(FP), BX
  5005  	MOVQ  n+40(FP), SI
  5006  	JMP   check_limit
  5007  	PCALIGN $0x08
  5008  	NOP
  5009  	NOP
  5010  	NOP
  5011  	NOP
  5012  	NOP
  5013  	NOP
  5014  	NOP
  5015  
  5016  loop:
  5017  	MOVSS (AX), X1
  5018  	MULSS X0, X1
  5019  	ADDSS (DX), X1
  5020  	MOVSS X1, (DX)
  5021  	DECQ  SI
  5022  	LEAQ  (AX)(CX*4), AX
  5023  	LEAQ  (DX)(BX*4), DX
  5024  
  5025  check_limit:
  5026  	CMPQ SI, $0x00
  5027  	JHI  loop
  5028  	RET
  5029  
  5030  // func AmdAxpyPointerLoopX_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5031  // Requires: SSE
  5032  TEXT ·AmdAxpyPointerLoopX_V1A15(SB), NOSPLIT, $0-48
  5033  	MOVSS alpha+0(FP), X0
  5034  	MOVQ  xs+8(FP), AX
  5035  	MOVQ  incx+16(FP), CX
  5036  	MOVQ  ys+24(FP), DX
  5037  	MOVQ  incy+32(FP), BX
  5038  	MOVQ  n+40(FP), SI
  5039  	JMP   check_limit
  5040  	PCALIGN $0x08
  5041  	NOP
  5042  	NOP
  5043  	NOP
  5044  	NOP
  5045  	NOP
  5046  	NOP
  5047  	NOP
  5048  
  5049  loop:
  5050  	MOVSS (AX), X1
  5051  	MULSS X0, X1
  5052  	ADDSS (DX), X1
  5053  	MOVSS X1, (DX)
  5054  	DECQ  SI
  5055  	LEAQ  (AX)(CX*4), AX
  5056  	LEAQ  (DX)(BX*4), DX
  5057  
  5058  check_limit:
  5059  	CMPQ SI, $0x00
  5060  	JHI  loop
  5061  	RET
  5062  
  5063  // func AmdAxpyPointerLoopX_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5064  // Requires: SSE
  5065  TEXT ·AmdAxpyPointerLoopX_V2A15(SB), NOSPLIT, $0-48
  5066  	MOVSS alpha+0(FP), X0
  5067  	MOVQ  xs+8(FP), AX
  5068  	MOVQ  incx+16(FP), CX
  5069  	MOVQ  ys+24(FP), DX
  5070  	MOVQ  incy+32(FP), BX
  5071  	MOVQ  n+40(FP), SI
  5072  	JMP   check_limit
  5073  	PCALIGN $0x08
  5074  	NOP
  5075  	NOP
  5076  	NOP
  5077  	NOP
  5078  	NOP
  5079  	NOP
  5080  	NOP
  5081  
  5082  loop:
  5083  	MOVSS (AX), X1
  5084  	MULSS X0, X1
  5085  	ADDSS (DX), X1
  5086  	MOVSS X1, (DX)
  5087  	DECQ  SI
  5088  	LEAQ  (AX)(CX*4), AX
  5089  	LEAQ  (DX)(BX*4), DX
  5090  
  5091  check_limit:
  5092  	CMPQ SI, $0x00
  5093  	JHI  loop
  5094  	RET
  5095  
  5096  // func AmdAxpyPointerLoopX_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5097  // Requires: SSE
  5098  TEXT ·AmdAxpyPointerLoopX_V3A15(SB), NOSPLIT, $0-48
  5099  	MOVSS alpha+0(FP), X0
  5100  	MOVQ  xs+8(FP), AX
  5101  	MOVQ  incx+16(FP), CX
  5102  	MOVQ  ys+24(FP), DX
  5103  	MOVQ  incy+32(FP), BX
  5104  	MOVQ  n+40(FP), SI
  5105  	JMP   check_limit
  5106  	PCALIGN $0x08
  5107  	NOP
  5108  	NOP
  5109  	NOP
  5110  	NOP
  5111  	NOP
  5112  	NOP
  5113  	NOP
  5114  
  5115  loop:
  5116  	MOVSS (AX), X1
  5117  	MULSS X0, X1
  5118  	ADDSS (DX), X1
  5119  	MOVSS X1, (DX)
  5120  	DECQ  SI
  5121  	LEAQ  (AX)(CX*4), AX
  5122  	LEAQ  (DX)(BX*4), DX
  5123  
  5124  check_limit:
  5125  	CMPQ SI, $0x00
  5126  	JHI  loop
  5127  	RET
  5128  
  5129  // func AmdAxpyPointerLoopX_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5130  // Requires: SSE
  5131  TEXT ·AmdAxpyPointerLoopX_V4A15(SB), NOSPLIT, $0-48
  5132  	MOVSS alpha+0(FP), X0
  5133  	MOVQ  xs+8(FP), AX
  5134  	MOVQ  incx+16(FP), CX
  5135  	MOVQ  ys+24(FP), DX
  5136  	MOVQ  incy+32(FP), BX
  5137  	MOVQ  n+40(FP), SI
  5138  	JMP   check_limit
  5139  	PCALIGN $0x08
  5140  	NOP
  5141  	NOP
  5142  	NOP
  5143  	NOP
  5144  	NOP
  5145  	NOP
  5146  	NOP
  5147  
  5148  loop:
  5149  	MOVSS (AX), X1
  5150  	MULSS X0, X1
  5151  	ADDSS (DX), X1
  5152  	MOVSS X1, (DX)
  5153  	DECQ  SI
  5154  	LEAQ  (AX)(CX*4), AX
  5155  	LEAQ  (DX)(BX*4), DX
  5156  
  5157  check_limit:
  5158  	CMPQ SI, $0x00
  5159  	JHI  loop
  5160  	RET
  5161  
  5162  // func AmdAxpyPointerLoopX_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5163  // Requires: SSE
  5164  TEXT ·AmdAxpyPointerLoopX_V5A15(SB), NOSPLIT, $0-48
  5165  	MOVSS alpha+0(FP), X0
  5166  	MOVQ  xs+8(FP), AX
  5167  	MOVQ  incx+16(FP), CX
  5168  	MOVQ  ys+24(FP), DX
  5169  	MOVQ  incy+32(FP), BX
  5170  	MOVQ  n+40(FP), SI
  5171  	JMP   check_limit
  5172  	PCALIGN $0x08
  5173  	NOP
  5174  	NOP
  5175  	NOP
  5176  	NOP
  5177  	NOP
  5178  	NOP
  5179  	NOP
  5180  
  5181  loop:
  5182  	MOVSS (AX), X1
  5183  	MULSS X0, X1
  5184  	ADDSS (DX), X1
  5185  	MOVSS X1, (DX)
  5186  	DECQ  SI
  5187  	LEAQ  (AX)(CX*4), AX
  5188  	LEAQ  (DX)(BX*4), DX
  5189  
  5190  check_limit:
  5191  	CMPQ SI, $0x00
  5192  	JHI  loop
  5193  	RET
  5194  
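// The A16 variants below request a wider loop-entry alignment: PCALIGN $0x10
// (16 bytes) with no trailing NOP padding, in contrast to the PCALIGN $0x08
// plus NOP padding used by the A9 through A15 variants above.
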
  5195  // func AmdAxpyPointerLoopX_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5196  // Requires: SSE
  5197  TEXT ·AmdAxpyPointerLoopX_V0A16(SB), NOSPLIT, $0-48
  5198  	MOVSS alpha+0(FP), X0
  5199  	MOVQ  xs+8(FP), AX
  5200  	MOVQ  incx+16(FP), CX
  5201  	MOVQ  ys+24(FP), DX
  5202  	MOVQ  incy+32(FP), BX
  5203  	MOVQ  n+40(FP), SI
  5204  	JMP   check_limit
  5205  	PCALIGN $0x10
  5206  
  5207  loop:
  5208  	MOVSS (AX), X1
  5209  	MULSS X0, X1
  5210  	ADDSS (DX), X1
  5211  	MOVSS X1, (DX)
  5212  	DECQ  SI
  5213  	LEAQ  (AX)(CX*4), AX
  5214  	LEAQ  (DX)(BX*4), DX
  5215  
  5216  check_limit:
  5217  	CMPQ SI, $0x00
  5218  	JHI  loop
  5219  	RET
  5220  
  5221  // func AmdAxpyPointerLoopX_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5222  // Requires: SSE
  5223  TEXT ·AmdAxpyPointerLoopX_V1A16(SB), NOSPLIT, $0-48
  5224  	MOVSS alpha+0(FP), X0
  5225  	MOVQ  xs+8(FP), AX
  5226  	MOVQ  incx+16(FP), CX
  5227  	MOVQ  ys+24(FP), DX
  5228  	MOVQ  incy+32(FP), BX
  5229  	MOVQ  n+40(FP), SI
  5230  	JMP   check_limit
  5231  	PCALIGN $0x10
  5232  
  5233  loop:
  5234  	MOVSS (AX), X1
  5235  	MULSS X0, X1
  5236  	ADDSS (DX), X1
  5237  	MOVSS X1, (DX)
  5238  	DECQ  SI
  5239  	LEAQ  (AX)(CX*4), AX
  5240  	LEAQ  (DX)(BX*4), DX
  5241  
  5242  check_limit:
  5243  	CMPQ SI, $0x00
  5244  	JHI  loop
  5245  	RET
  5246  
  5247  // func AmdAxpyPointerLoopX_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5248  // Requires: SSE
  5249  TEXT ·AmdAxpyPointerLoopX_V2A16(SB), NOSPLIT, $0-48
  5250  	MOVSS alpha+0(FP), X0
  5251  	MOVQ  xs+8(FP), AX
  5252  	MOVQ  incx+16(FP), CX
  5253  	MOVQ  ys+24(FP), DX
  5254  	MOVQ  incy+32(FP), BX
  5255  	MOVQ  n+40(FP), SI
  5256  	JMP   check_limit
  5257  	PCALIGN $0x10
  5258  
  5259  loop:
  5260  	MOVSS (AX), X1
  5261  	MULSS X0, X1
  5262  	ADDSS (DX), X1
  5263  	MOVSS X1, (DX)
  5264  	DECQ  SI
  5265  	LEAQ  (AX)(CX*4), AX
  5266  	LEAQ  (DX)(BX*4), DX
  5267  
  5268  check_limit:
  5269  	CMPQ SI, $0x00
  5270  	JHI  loop
  5271  	RET
  5272  
  5273  // func AmdAxpyPointerLoopX_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5274  // Requires: SSE
  5275  TEXT ·AmdAxpyPointerLoopX_V3A16(SB), NOSPLIT, $0-48
  5276  	MOVSS alpha+0(FP), X0
  5277  	MOVQ  xs+8(FP), AX
  5278  	MOVQ  incx+16(FP), CX
  5279  	MOVQ  ys+24(FP), DX
  5280  	MOVQ  incy+32(FP), BX
  5281  	MOVQ  n+40(FP), SI
  5282  	JMP   check_limit
  5283  	PCALIGN $0x10
  5284  
  5285  loop:
  5286  	MOVSS (AX), X1
  5287  	MULSS X0, X1
  5288  	ADDSS (DX), X1
  5289  	MOVSS X1, (DX)
  5290  	DECQ  SI
  5291  	LEAQ  (AX)(CX*4), AX
  5292  	LEAQ  (DX)(BX*4), DX
  5293  
  5294  check_limit:
  5295  	CMPQ SI, $0x00
  5296  	JHI  loop
  5297  	RET
  5298  
  5299  // func AmdAxpyPointerLoopX_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5300  // Requires: SSE
  5301  TEXT ·AmdAxpyPointerLoopX_V4A16(SB), NOSPLIT, $0-48
  5302  	MOVSS alpha+0(FP), X0
  5303  	MOVQ  xs+8(FP), AX
  5304  	MOVQ  incx+16(FP), CX
  5305  	MOVQ  ys+24(FP), DX
  5306  	MOVQ  incy+32(FP), BX
  5307  	MOVQ  n+40(FP), SI
  5308  	JMP   check_limit
  5309  	PCALIGN $0x10
  5310  
  5311  loop:
  5312  	MOVSS (AX), X1
  5313  	MULSS X0, X1
  5314  	ADDSS (DX), X1
  5315  	MOVSS X1, (DX)
  5316  	DECQ  SI
  5317  	LEAQ  (AX)(CX*4), AX
  5318  	LEAQ  (DX)(BX*4), DX
  5319  
  5320  check_limit:
  5321  	CMPQ SI, $0x00
  5322  	JHI  loop
  5323  	RET
  5324  
  5325  // func AmdAxpyPointerLoopX_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5326  // Requires: SSE
  5327  TEXT ·AmdAxpyPointerLoopX_V5A16(SB), NOSPLIT, $0-48
  5328  	MOVSS alpha+0(FP), X0
  5329  	MOVQ  xs+8(FP), AX
  5330  	MOVQ  incx+16(FP), CX
  5331  	MOVQ  ys+24(FP), DX
  5332  	MOVQ  incy+32(FP), BX
  5333  	MOVQ  n+40(FP), SI
  5334  	JMP   check_limit
  5335  	PCALIGN $0x10
  5336  
  5337  loop:
  5338  	MOVSS (AX), X1
  5339  	MULSS X0, X1
  5340  	ADDSS (DX), X1
  5341  	MOVSS X1, (DX)
  5342  	DECQ  SI
  5343  	LEAQ  (AX)(CX*4), AX
  5344  	LEAQ  (DX)(BX*4), DX
  5345  
  5346  check_limit:
  5347  	CMPQ SI, $0x00
  5348  	JHI  loop
  5349  	RET
  5350  
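// The AmdAxpyUnsafeX_* kernels below compute the same y += alpha*x update,
// but keep the base pointers in AX and DX fixed and instead maintain element
// indices in DI and R8 (zeroed with XORQ), using scaled addressing such as
// (AX)(DI*4) and adding incx/incy to the indices each iteration. A rough Go
// sketch of the semantics (illustrative names only, not part of this
// generated file):
//
//	var i, j uintptr
//	for ; n > 0; n-- {
//		ys[j] += alpha * xs[i] // xs, ys indexed in elements, strides incx/incy
//		i += incx
//		j += incy
//	}
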
  5351  // func AmdAxpyUnsafeX_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5352  // Requires: SSE
  5353  TEXT ·AmdAxpyUnsafeX_V0A0(SB), NOSPLIT, $0-48
  5354  	MOVSS alpha+0(FP), X0
  5355  	MOVQ  xs+8(FP), AX
  5356  	MOVQ  incx+16(FP), CX
  5357  	MOVQ  ys+24(FP), DX
  5358  	MOVQ  incy+32(FP), BX
  5359  	MOVQ  n+40(FP), SI
  5360  	XORQ  DI, DI
  5361  	XORQ  R8, R8
  5362  	JMP   check_limit
  5363  
  5364  loop:
  5365  	MOVSS (AX)(DI*4), X1
  5366  	MULSS X0, X1
  5367  	ADDSS (DX)(R8*4), X1
  5368  	MOVSS X1, (DX)(R8*4)
  5369  	DECQ  SI
  5370  	ADDQ  CX, DI
  5371  	ADDQ  BX, R8
  5372  
  5373  check_limit:
  5374  	CMPQ SI, $0x00
  5375  	JHI  loop
  5376  	RET
  5377  
  5378  // func AmdAxpyUnsafeX_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5379  // Requires: SSE
  5380  TEXT ·AmdAxpyUnsafeX_V1A0(SB), NOSPLIT, $0-48
  5381  	MOVSS alpha+0(FP), X0
  5382  	MOVQ  xs+8(FP), AX
  5383  	MOVQ  incx+16(FP), CX
  5384  	MOVQ  ys+24(FP), DX
  5385  	MOVQ  incy+32(FP), BX
  5386  	MOVQ  n+40(FP), SI
  5387  	XORQ  DI, DI
  5388  	XORQ  R8, R8
  5389  	JMP   check_limit
  5390  
  5391  loop:
  5392  	MOVSS (AX)(DI*4), X1
  5393  	MULSS X0, X1
  5394  	ADDSS (DX)(R8*4), X1
  5395  	MOVSS X1, (DX)(R8*4)
  5396  	DECQ  SI
  5397  	ADDQ  CX, DI
  5398  	ADDQ  BX, R8
  5399  
  5400  check_limit:
  5401  	CMPQ SI, $0x00
  5402  	JHI  loop
  5403  	RET
  5404  
  5405  // func AmdAxpyUnsafeX_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5406  // Requires: SSE
  5407  TEXT ·AmdAxpyUnsafeX_V2A0(SB), NOSPLIT, $0-48
  5408  	MOVSS alpha+0(FP), X0
  5409  	MOVQ  xs+8(FP), AX
  5410  	MOVQ  incx+16(FP), CX
  5411  	MOVQ  ys+24(FP), DX
  5412  	MOVQ  incy+32(FP), BX
  5413  	MOVQ  n+40(FP), SI
  5414  	XORQ  DI, DI
  5415  	XORQ  R8, R8
  5416  	JMP   check_limit
  5417  
  5418  loop:
  5419  	MOVSS (AX)(DI*4), X1
  5420  	MULSS X0, X1
  5421  	ADDSS (DX)(R8*4), X1
  5422  	MOVSS X1, (DX)(R8*4)
  5423  	DECQ  SI
  5424  	ADDQ  CX, DI
  5425  	ADDQ  BX, R8
  5426  
  5427  check_limit:
  5428  	CMPQ SI, $0x00
  5429  	JHI  loop
  5430  	RET
  5431  
  5432  // func AmdAxpyUnsafeX_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5433  // Requires: SSE
  5434  TEXT ·AmdAxpyUnsafeX_V3A0(SB), NOSPLIT, $0-48
  5435  	MOVSS alpha+0(FP), X0
  5436  	MOVQ  xs+8(FP), AX
  5437  	MOVQ  incx+16(FP), CX
  5438  	MOVQ  ys+24(FP), DX
  5439  	MOVQ  incy+32(FP), BX
  5440  	MOVQ  n+40(FP), SI
  5441  	XORQ  DI, DI
  5442  	XORQ  R8, R8
  5443  	JMP   check_limit
  5444  
  5445  loop:
  5446  	MOVSS (AX)(DI*4), X1
  5447  	MULSS X0, X1
  5448  	ADDSS (DX)(R8*4), X1
  5449  	MOVSS X1, (DX)(R8*4)
  5450  	DECQ  SI
  5451  	ADDQ  CX, DI
  5452  	ADDQ  BX, R8
  5453  
  5454  check_limit:
  5455  	CMPQ SI, $0x00
  5456  	JHI  loop
  5457  	RET
  5458  
  5459  // func AmdAxpyUnsafeX_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5460  // Requires: SSE
  5461  TEXT ·AmdAxpyUnsafeX_V4A0(SB), NOSPLIT, $0-48
  5462  	MOVSS alpha+0(FP), X0
  5463  	MOVQ  xs+8(FP), AX
  5464  	MOVQ  incx+16(FP), CX
  5465  	MOVQ  ys+24(FP), DX
  5466  	MOVQ  incy+32(FP), BX
  5467  	MOVQ  n+40(FP), SI
  5468  	XORQ  DI, DI
  5469  	XORQ  R8, R8
  5470  	JMP   check_limit
  5471  
  5472  loop:
  5473  	MOVSS (AX)(DI*4), X1
  5474  	MULSS X0, X1
  5475  	ADDSS (DX)(R8*4), X1
  5476  	MOVSS X1, (DX)(R8*4)
  5477  	DECQ  SI
  5478  	ADDQ  CX, DI
  5479  	ADDQ  BX, R8
  5480  
  5481  check_limit:
  5482  	CMPQ SI, $0x00
  5483  	JHI  loop
  5484  	RET
  5485  
  5486  // func AmdAxpyUnsafeX_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5487  // Requires: SSE
  5488  TEXT ·AmdAxpyUnsafeX_V5A0(SB), NOSPLIT, $0-48
  5489  	MOVSS alpha+0(FP), X0
  5490  	MOVQ  xs+8(FP), AX
  5491  	MOVQ  incx+16(FP), CX
  5492  	MOVQ  ys+24(FP), DX
  5493  	MOVQ  incy+32(FP), BX
  5494  	MOVQ  n+40(FP), SI
  5495  	XORQ  DI, DI
  5496  	XORQ  R8, R8
  5497  	JMP   check_limit
  5498  
  5499  loop:
  5500  	MOVSS (AX)(DI*4), X1
  5501  	MULSS X0, X1
  5502  	ADDSS (DX)(R8*4), X1
  5503  	MOVSS X1, (DX)(R8*4)
  5504  	DECQ  SI
  5505  	ADDQ  CX, DI
  5506  	ADDQ  BX, R8
  5507  
  5508  check_limit:
  5509  	CMPQ SI, $0x00
  5510  	JHI  loop
  5511  	RET
  5512  
  5513  // func AmdAxpyUnsafeX_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5514  // Requires: SSE
  5515  TEXT ·AmdAxpyUnsafeX_V0A8(SB), NOSPLIT, $0-48
  5516  	MOVSS alpha+0(FP), X0
  5517  	MOVQ  xs+8(FP), AX
  5518  	MOVQ  incx+16(FP), CX
  5519  	MOVQ  ys+24(FP), DX
  5520  	MOVQ  incy+32(FP), BX
  5521  	MOVQ  n+40(FP), SI
  5522  	XORQ  DI, DI
  5523  	XORQ  R8, R8
  5524  	JMP   check_limit
  5525  	PCALIGN $0x08
  5526  
  5527  loop:
  5528  	MOVSS (AX)(DI*4), X1
  5529  	MULSS X0, X1
  5530  	ADDSS (DX)(R8*4), X1
  5531  	MOVSS X1, (DX)(R8*4)
  5532  	DECQ  SI
  5533  	ADDQ  CX, DI
  5534  	ADDQ  BX, R8
  5535  
  5536  check_limit:
  5537  	CMPQ SI, $0x00
  5538  	JHI  loop
  5539  	RET
  5540  
  5541  // func AmdAxpyUnsafeX_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5542  // Requires: SSE
  5543  TEXT ·AmdAxpyUnsafeX_V1A8(SB), NOSPLIT, $0-48
  5544  	MOVSS alpha+0(FP), X0
  5545  	MOVQ  xs+8(FP), AX
  5546  	MOVQ  incx+16(FP), CX
  5547  	MOVQ  ys+24(FP), DX
  5548  	MOVQ  incy+32(FP), BX
  5549  	MOVQ  n+40(FP), SI
  5550  	XORQ  DI, DI
  5551  	XORQ  R8, R8
  5552  	JMP   check_limit
  5553  	PCALIGN $0x08
  5554  
  5555  loop:
  5556  	MOVSS (AX)(DI*4), X1
  5557  	MULSS X0, X1
  5558  	ADDSS (DX)(R8*4), X1
  5559  	MOVSS X1, (DX)(R8*4)
  5560  	DECQ  SI
  5561  	ADDQ  CX, DI
  5562  	ADDQ  BX, R8
  5563  
  5564  check_limit:
  5565  	CMPQ SI, $0x00
  5566  	JHI  loop
  5567  	RET
  5568  
  5569  // func AmdAxpyUnsafeX_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5570  // Requires: SSE
  5571  TEXT ·AmdAxpyUnsafeX_V2A8(SB), NOSPLIT, $0-48
  5572  	MOVSS alpha+0(FP), X0
  5573  	MOVQ  xs+8(FP), AX
  5574  	MOVQ  incx+16(FP), CX
  5575  	MOVQ  ys+24(FP), DX
  5576  	MOVQ  incy+32(FP), BX
  5577  	MOVQ  n+40(FP), SI
  5578  	XORQ  DI, DI
  5579  	XORQ  R8, R8
  5580  	JMP   check_limit
  5581  	PCALIGN $0x08
  5582  
  5583  loop:
  5584  	MOVSS (AX)(DI*4), X1
  5585  	MULSS X0, X1
  5586  	ADDSS (DX)(R8*4), X1
  5587  	MOVSS X1, (DX)(R8*4)
  5588  	DECQ  SI
  5589  	ADDQ  CX, DI
  5590  	ADDQ  BX, R8
  5591  
  5592  check_limit:
  5593  	CMPQ SI, $0x00
  5594  	JHI  loop
  5595  	RET
  5596  
  5597  // func AmdAxpyUnsafeX_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5598  // Requires: SSE
  5599  TEXT ·AmdAxpyUnsafeX_V3A8(SB), NOSPLIT, $0-48
  5600  	MOVSS alpha+0(FP), X0
  5601  	MOVQ  xs+8(FP), AX
  5602  	MOVQ  incx+16(FP), CX
  5603  	MOVQ  ys+24(FP), DX
  5604  	MOVQ  incy+32(FP), BX
  5605  	MOVQ  n+40(FP), SI
  5606  	XORQ  DI, DI
  5607  	XORQ  R8, R8
  5608  	JMP   check_limit
  5609  	PCALIGN $0x08
  5610  
  5611  loop:
  5612  	MOVSS (AX)(DI*4), X1
  5613  	MULSS X0, X1
  5614  	ADDSS (DX)(R8*4), X1
  5615  	MOVSS X1, (DX)(R8*4)
  5616  	DECQ  SI
  5617  	ADDQ  CX, DI
  5618  	ADDQ  BX, R8
  5619  
  5620  check_limit:
  5621  	CMPQ SI, $0x00
  5622  	JHI  loop
  5623  	RET
  5624  
  5625  // func AmdAxpyUnsafeX_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5626  // Requires: SSE
  5627  TEXT ·AmdAxpyUnsafeX_V4A8(SB), NOSPLIT, $0-48
  5628  	MOVSS alpha+0(FP), X0
  5629  	MOVQ  xs+8(FP), AX
  5630  	MOVQ  incx+16(FP), CX
  5631  	MOVQ  ys+24(FP), DX
  5632  	MOVQ  incy+32(FP), BX
  5633  	MOVQ  n+40(FP), SI
  5634  	XORQ  DI, DI
  5635  	XORQ  R8, R8
  5636  	JMP   check_limit
  5637  	PCALIGN $0x08
  5638  
  5639  loop:
  5640  	MOVSS (AX)(DI*4), X1
  5641  	MULSS X0, X1
  5642  	ADDSS (DX)(R8*4), X1
  5643  	MOVSS X1, (DX)(R8*4)
  5644  	DECQ  SI
  5645  	ADDQ  CX, DI
  5646  	ADDQ  BX, R8
  5647  
  5648  check_limit:
  5649  	CMPQ SI, $0x00
  5650  	JHI  loop
  5651  	RET
  5652  
  5653  // func AmdAxpyUnsafeX_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5654  // Requires: SSE
  5655  TEXT ·AmdAxpyUnsafeX_V5A8(SB), NOSPLIT, $0-48
  5656  	MOVSS alpha+0(FP), X0
  5657  	MOVQ  xs+8(FP), AX
  5658  	MOVQ  incx+16(FP), CX
  5659  	MOVQ  ys+24(FP), DX
  5660  	MOVQ  incy+32(FP), BX
  5661  	MOVQ  n+40(FP), SI
  5662  	XORQ  DI, DI
  5663  	XORQ  R8, R8
  5664  	JMP   check_limit
  5665  	PCALIGN $0x08
  5666  
  5667  loop:
  5668  	MOVSS (AX)(DI*4), X1
  5669  	MULSS X0, X1
  5670  	ADDSS (DX)(R8*4), X1
  5671  	MOVSS X1, (DX)(R8*4)
  5672  	DECQ  SI
  5673  	ADDQ  CX, DI
  5674  	ADDQ  BX, R8
  5675  
  5676  check_limit:
  5677  	CMPQ SI, $0x00
  5678  	JHI  loop
  5679  	RET
  5680  
  5681  // func AmdAxpyUnsafeX_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5682  // Requires: SSE
  5683  TEXT ·AmdAxpyUnsafeX_V0A9(SB), NOSPLIT, $0-48
  5684  	MOVSS alpha+0(FP), X0
  5685  	MOVQ  xs+8(FP), AX
  5686  	MOVQ  incx+16(FP), CX
  5687  	MOVQ  ys+24(FP), DX
  5688  	MOVQ  incy+32(FP), BX
  5689  	MOVQ  n+40(FP), SI
  5690  	XORQ  DI, DI
  5691  	XORQ  R8, R8
  5692  	JMP   check_limit
  5693  	PCALIGN $0x08
  5694  	NOP
  5695  
  5696  loop:
  5697  	MOVSS (AX)(DI*4), X1
  5698  	MULSS X0, X1
  5699  	ADDSS (DX)(R8*4), X1
  5700  	MOVSS X1, (DX)(R8*4)
  5701  	DECQ  SI
  5702  	ADDQ  CX, DI
  5703  	ADDQ  BX, R8
  5704  
  5705  check_limit:
  5706  	CMPQ SI, $0x00
  5707  	JHI  loop
  5708  	RET
  5709  
  5710  // func AmdAxpyUnsafeX_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5711  // Requires: SSE
  5712  TEXT ·AmdAxpyUnsafeX_V1A9(SB), NOSPLIT, $0-48
  5713  	MOVSS alpha+0(FP), X0
  5714  	MOVQ  xs+8(FP), AX
  5715  	MOVQ  incx+16(FP), CX
  5716  	MOVQ  ys+24(FP), DX
  5717  	MOVQ  incy+32(FP), BX
  5718  	MOVQ  n+40(FP), SI
  5719  	XORQ  DI, DI
  5720  	XORQ  R8, R8
  5721  	JMP   check_limit
  5722  	PCALIGN $0x08
  5723  	NOP
  5724  
  5725  loop:
  5726  	MOVSS (AX)(DI*4), X1
  5727  	MULSS X0, X1
  5728  	ADDSS (DX)(R8*4), X1
  5729  	MOVSS X1, (DX)(R8*4)
  5730  	DECQ  SI
  5731  	ADDQ  CX, DI
  5732  	ADDQ  BX, R8
  5733  
  5734  check_limit:
  5735  	CMPQ SI, $0x00
  5736  	JHI  loop
  5737  	RET
  5738  
  5739  // func AmdAxpyUnsafeX_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5740  // Requires: SSE
  5741  TEXT ·AmdAxpyUnsafeX_V2A9(SB), NOSPLIT, $0-48
  5742  	MOVSS alpha+0(FP), X0
  5743  	MOVQ  xs+8(FP), AX
  5744  	MOVQ  incx+16(FP), CX
  5745  	MOVQ  ys+24(FP), DX
  5746  	MOVQ  incy+32(FP), BX
  5747  	MOVQ  n+40(FP), SI
  5748  	XORQ  DI, DI
  5749  	XORQ  R8, R8
  5750  	JMP   check_limit
  5751  	PCALIGN $0x08
  5752  	NOP
  5753  
  5754  loop:
  5755  	MOVSS (AX)(DI*4), X1
  5756  	MULSS X0, X1
  5757  	ADDSS (DX)(R8*4), X1
  5758  	MOVSS X1, (DX)(R8*4)
  5759  	DECQ  SI
  5760  	ADDQ  CX, DI
  5761  	ADDQ  BX, R8
  5762  
  5763  check_limit:
  5764  	CMPQ SI, $0x00
  5765  	JHI  loop
  5766  	RET
  5767  
  5768  // func AmdAxpyUnsafeX_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5769  // Requires: SSE
  5770  TEXT ·AmdAxpyUnsafeX_V3A9(SB), NOSPLIT, $0-48
  5771  	MOVSS alpha+0(FP), X0
  5772  	MOVQ  xs+8(FP), AX
  5773  	MOVQ  incx+16(FP), CX
  5774  	MOVQ  ys+24(FP), DX
  5775  	MOVQ  incy+32(FP), BX
  5776  	MOVQ  n+40(FP), SI
  5777  	XORQ  DI, DI
  5778  	XORQ  R8, R8
  5779  	JMP   check_limit
  5780  	PCALIGN $0x08
  5781  	NOP
  5782  
  5783  loop:
  5784  	MOVSS (AX)(DI*4), X1
  5785  	MULSS X0, X1
  5786  	ADDSS (DX)(R8*4), X1
  5787  	MOVSS X1, (DX)(R8*4)
  5788  	DECQ  SI
  5789  	ADDQ  CX, DI
  5790  	ADDQ  BX, R8
  5791  
  5792  check_limit:
  5793  	CMPQ SI, $0x00
  5794  	JHI  loop
  5795  	RET
  5796  
  5797  // func AmdAxpyUnsafeX_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5798  // Requires: SSE
  5799  TEXT ·AmdAxpyUnsafeX_V4A9(SB), NOSPLIT, $0-48
  5800  	MOVSS alpha+0(FP), X0
  5801  	MOVQ  xs+8(FP), AX
  5802  	MOVQ  incx+16(FP), CX
  5803  	MOVQ  ys+24(FP), DX
  5804  	MOVQ  incy+32(FP), BX
  5805  	MOVQ  n+40(FP), SI
  5806  	XORQ  DI, DI
  5807  	XORQ  R8, R8
  5808  	JMP   check_limit
  5809  	PCALIGN $0x08
  5810  	NOP
  5811  
  5812  loop:
  5813  	MOVSS (AX)(DI*4), X1
  5814  	MULSS X0, X1
  5815  	ADDSS (DX)(R8*4), X1
  5816  	MOVSS X1, (DX)(R8*4)
  5817  	DECQ  SI
  5818  	ADDQ  CX, DI
  5819  	ADDQ  BX, R8
  5820  
  5821  check_limit:
  5822  	CMPQ SI, $0x00
  5823  	JHI  loop
  5824  	RET
  5825  
  5826  // func AmdAxpyUnsafeX_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5827  // Requires: SSE
  5828  TEXT ·AmdAxpyUnsafeX_V5A9(SB), NOSPLIT, $0-48
  5829  	MOVSS alpha+0(FP), X0
  5830  	MOVQ  xs+8(FP), AX
  5831  	MOVQ  incx+16(FP), CX
  5832  	MOVQ  ys+24(FP), DX
  5833  	MOVQ  incy+32(FP), BX
  5834  	MOVQ  n+40(FP), SI
  5835  	XORQ  DI, DI
  5836  	XORQ  R8, R8
  5837  	JMP   check_limit
  5838  	PCALIGN $0x08
  5839  	NOP
  5840  
  5841  loop:
  5842  	MOVSS (AX)(DI*4), X1
  5843  	MULSS X0, X1
  5844  	ADDSS (DX)(R8*4), X1
  5845  	MOVSS X1, (DX)(R8*4)
  5846  	DECQ  SI
  5847  	ADDQ  CX, DI
  5848  	ADDQ  BX, R8
  5849  
  5850  check_limit:
  5851  	CMPQ SI, $0x00
  5852  	JHI  loop
  5853  	RET
  5854  
  5855  // func AmdAxpyUnsafeX_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5856  // Requires: SSE
  5857  TEXT ·AmdAxpyUnsafeX_V0A10(SB), NOSPLIT, $0-48
  5858  	MOVSS alpha+0(FP), X0
  5859  	MOVQ  xs+8(FP), AX
  5860  	MOVQ  incx+16(FP), CX
  5861  	MOVQ  ys+24(FP), DX
  5862  	MOVQ  incy+32(FP), BX
  5863  	MOVQ  n+40(FP), SI
  5864  	XORQ  DI, DI
  5865  	XORQ  R8, R8
  5866  	JMP   check_limit
  5867  	PCALIGN $0x08
  5868  	NOP
  5869  	NOP
  5870  
  5871  loop:
  5872  	MOVSS (AX)(DI*4), X1
  5873  	MULSS X0, X1
  5874  	ADDSS (DX)(R8*4), X1
  5875  	MOVSS X1, (DX)(R8*4)
  5876  	DECQ  SI
  5877  	ADDQ  CX, DI
  5878  	ADDQ  BX, R8
  5879  
  5880  check_limit:
  5881  	CMPQ SI, $0x00
  5882  	JHI  loop
  5883  	RET
  5884  
  5885  // func AmdAxpyUnsafeX_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5886  // Requires: SSE
  5887  TEXT ·AmdAxpyUnsafeX_V1A10(SB), NOSPLIT, $0-48
  5888  	MOVSS alpha+0(FP), X0
  5889  	MOVQ  xs+8(FP), AX
  5890  	MOVQ  incx+16(FP), CX
  5891  	MOVQ  ys+24(FP), DX
  5892  	MOVQ  incy+32(FP), BX
  5893  	MOVQ  n+40(FP), SI
  5894  	XORQ  DI, DI
  5895  	XORQ  R8, R8
  5896  	JMP   check_limit
  5897  	PCALIGN $0x08
  5898  	NOP
  5899  	NOP
  5900  
  5901  loop:
  5902  	MOVSS (AX)(DI*4), X1
  5903  	MULSS X0, X1
  5904  	ADDSS (DX)(R8*4), X1
  5905  	MOVSS X1, (DX)(R8*4)
  5906  	DECQ  SI
  5907  	ADDQ  CX, DI
  5908  	ADDQ  BX, R8
  5909  
  5910  check_limit:
  5911  	CMPQ SI, $0x00
  5912  	JHI  loop
  5913  	RET
  5914  
  5915  // func AmdAxpyUnsafeX_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5916  // Requires: SSE
  5917  TEXT ·AmdAxpyUnsafeX_V2A10(SB), NOSPLIT, $0-48
  5918  	MOVSS alpha+0(FP), X0
  5919  	MOVQ  xs+8(FP), AX
  5920  	MOVQ  incx+16(FP), CX
  5921  	MOVQ  ys+24(FP), DX
  5922  	MOVQ  incy+32(FP), BX
  5923  	MOVQ  n+40(FP), SI
  5924  	XORQ  DI, DI
  5925  	XORQ  R8, R8
  5926  	JMP   check_limit
  5927  	PCALIGN $0x08
  5928  	NOP
  5929  	NOP
  5930  
  5931  loop:
  5932  	MOVSS (AX)(DI*4), X1
  5933  	MULSS X0, X1
  5934  	ADDSS (DX)(R8*4), X1
  5935  	MOVSS X1, (DX)(R8*4)
  5936  	DECQ  SI
  5937  	ADDQ  CX, DI
  5938  	ADDQ  BX, R8
  5939  
  5940  check_limit:
  5941  	CMPQ SI, $0x00
  5942  	JHI  loop
  5943  	RET
  5944  
  5945  // func AmdAxpyUnsafeX_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5946  // Requires: SSE
  5947  TEXT ·AmdAxpyUnsafeX_V3A10(SB), NOSPLIT, $0-48
  5948  	MOVSS alpha+0(FP), X0
  5949  	MOVQ  xs+8(FP), AX
  5950  	MOVQ  incx+16(FP), CX
  5951  	MOVQ  ys+24(FP), DX
  5952  	MOVQ  incy+32(FP), BX
  5953  	MOVQ  n+40(FP), SI
  5954  	XORQ  DI, DI
  5955  	XORQ  R8, R8
  5956  	JMP   check_limit
  5957  	PCALIGN $0x08
  5958  	NOP
  5959  	NOP
  5960  
  5961  loop:
  5962  	MOVSS (AX)(DI*4), X1
  5963  	MULSS X0, X1
  5964  	ADDSS (DX)(R8*4), X1
  5965  	MOVSS X1, (DX)(R8*4)
  5966  	DECQ  SI
  5967  	ADDQ  CX, DI
  5968  	ADDQ  BX, R8
  5969  
  5970  check_limit:
  5971  	CMPQ SI, $0x00
  5972  	JHI  loop
  5973  	RET
  5974  
  5975  // func AmdAxpyUnsafeX_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  5976  // Requires: SSE
  5977  TEXT ·AmdAxpyUnsafeX_V4A10(SB), NOSPLIT, $0-48
  5978  	MOVSS alpha+0(FP), X0
  5979  	MOVQ  xs+8(FP), AX
  5980  	MOVQ  incx+16(FP), CX
  5981  	MOVQ  ys+24(FP), DX
  5982  	MOVQ  incy+32(FP), BX
  5983  	MOVQ  n+40(FP), SI
  5984  	XORQ  DI, DI
  5985  	XORQ  R8, R8
  5986  	JMP   check_limit
  5987  	PCALIGN $0x08
  5988  	NOP
  5989  	NOP
  5990  
  5991  loop:
  5992  	MOVSS (AX)(DI*4), X1
  5993  	MULSS X0, X1
  5994  	ADDSS (DX)(R8*4), X1
  5995  	MOVSS X1, (DX)(R8*4)
  5996  	DECQ  SI
  5997  	ADDQ  CX, DI
  5998  	ADDQ  BX, R8
  5999  
  6000  check_limit:
  6001  	CMPQ SI, $0x00
  6002  	JHI  loop
  6003  	RET
  6004  
  6005  // func AmdAxpyUnsafeX_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6006  // Requires: SSE
  6007  TEXT ·AmdAxpyUnsafeX_V5A10(SB), NOSPLIT, $0-48
  6008  	MOVSS alpha+0(FP), X0
  6009  	MOVQ  xs+8(FP), AX
  6010  	MOVQ  incx+16(FP), CX
  6011  	MOVQ  ys+24(FP), DX
  6012  	MOVQ  incy+32(FP), BX
  6013  	MOVQ  n+40(FP), SI
  6014  	XORQ  DI, DI
  6015  	XORQ  R8, R8
  6016  	JMP   check_limit
  6017  	PCALIGN $0x08
  6018  	NOP
  6019  	NOP
  6020  
  6021  loop:
  6022  	MOVSS (AX)(DI*4), X1
  6023  	MULSS X0, X1
  6024  	ADDSS (DX)(R8*4), X1
  6025  	MOVSS X1, (DX)(R8*4)
  6026  	DECQ  SI
  6027  	ADDQ  CX, DI
  6028  	ADDQ  BX, R8
  6029  
  6030  check_limit:
  6031  	CMPQ SI, $0x00
  6032  	JHI  loop
  6033  	RET
  6034  
  6035  // func AmdAxpyUnsafeX_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6036  // Requires: SSE
  6037  TEXT ·AmdAxpyUnsafeX_V0A11(SB), NOSPLIT, $0-48
  6038  	MOVSS alpha+0(FP), X0
  6039  	MOVQ  xs+8(FP), AX
  6040  	MOVQ  incx+16(FP), CX
  6041  	MOVQ  ys+24(FP), DX
  6042  	MOVQ  incy+32(FP), BX
  6043  	MOVQ  n+40(FP), SI
  6044  	XORQ  DI, DI
  6045  	XORQ  R8, R8
  6046  	JMP   check_limit
  6047  	PCALIGN $0x08
  6048  	NOP
  6049  	NOP
  6050  	NOP
  6051  
  6052  loop:
  6053  	MOVSS (AX)(DI*4), X1
  6054  	MULSS X0, X1
  6055  	ADDSS (DX)(R8*4), X1
  6056  	MOVSS X1, (DX)(R8*4)
  6057  	DECQ  SI
  6058  	ADDQ  CX, DI
  6059  	ADDQ  BX, R8
  6060  
  6061  check_limit:
  6062  	CMPQ SI, $0x00
  6063  	JHI  loop
  6064  	RET
  6065  
  6066  // func AmdAxpyUnsafeX_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6067  // Requires: SSE
  6068  TEXT ·AmdAxpyUnsafeX_V1A11(SB), NOSPLIT, $0-48
  6069  	MOVSS alpha+0(FP), X0
  6070  	MOVQ  xs+8(FP), AX
  6071  	MOVQ  incx+16(FP), CX
  6072  	MOVQ  ys+24(FP), DX
  6073  	MOVQ  incy+32(FP), BX
  6074  	MOVQ  n+40(FP), SI
  6075  	XORQ  DI, DI
  6076  	XORQ  R8, R8
  6077  	JMP   check_limit
  6078  	PCALIGN $0x08
  6079  	NOP
  6080  	NOP
  6081  	NOP
  6082  
  6083  loop:
  6084  	MOVSS (AX)(DI*4), X1
  6085  	MULSS X0, X1
  6086  	ADDSS (DX)(R8*4), X1
  6087  	MOVSS X1, (DX)(R8*4)
  6088  	DECQ  SI
  6089  	ADDQ  CX, DI
  6090  	ADDQ  BX, R8
  6091  
  6092  check_limit:
  6093  	CMPQ SI, $0x00
  6094  	JHI  loop
  6095  	RET
  6096  
  6097  // func AmdAxpyUnsafeX_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6098  // Requires: SSE
  6099  TEXT ·AmdAxpyUnsafeX_V2A11(SB), NOSPLIT, $0-48
  6100  	MOVSS alpha+0(FP), X0
  6101  	MOVQ  xs+8(FP), AX
  6102  	MOVQ  incx+16(FP), CX
  6103  	MOVQ  ys+24(FP), DX
  6104  	MOVQ  incy+32(FP), BX
  6105  	MOVQ  n+40(FP), SI
  6106  	XORQ  DI, DI
  6107  	XORQ  R8, R8
  6108  	JMP   check_limit
  6109  	PCALIGN $0x08
  6110  	NOP
  6111  	NOP
  6112  	NOP
  6113  
  6114  loop:
  6115  	MOVSS (AX)(DI*4), X1
  6116  	MULSS X0, X1
  6117  	ADDSS (DX)(R8*4), X1
  6118  	MOVSS X1, (DX)(R8*4)
  6119  	DECQ  SI
  6120  	ADDQ  CX, DI
  6121  	ADDQ  BX, R8
  6122  
  6123  check_limit:
  6124  	CMPQ SI, $0x00
  6125  	JHI  loop
  6126  	RET
  6127  
  6128  // func AmdAxpyUnsafeX_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6129  // Requires: SSE
  6130  TEXT ·AmdAxpyUnsafeX_V3A11(SB), NOSPLIT, $0-48
  6131  	MOVSS alpha+0(FP), X0
  6132  	MOVQ  xs+8(FP), AX
  6133  	MOVQ  incx+16(FP), CX
  6134  	MOVQ  ys+24(FP), DX
  6135  	MOVQ  incy+32(FP), BX
  6136  	MOVQ  n+40(FP), SI
  6137  	XORQ  DI, DI
  6138  	XORQ  R8, R8
  6139  	JMP   check_limit
  6140  	PCALIGN $0x08
  6141  	NOP
  6142  	NOP
  6143  	NOP
  6144  
  6145  loop:
  6146  	MOVSS (AX)(DI*4), X1
  6147  	MULSS X0, X1
  6148  	ADDSS (DX)(R8*4), X1
  6149  	MOVSS X1, (DX)(R8*4)
  6150  	DECQ  SI
  6151  	ADDQ  CX, DI
  6152  	ADDQ  BX, R8
  6153  
  6154  check_limit:
  6155  	CMPQ SI, $0x00
  6156  	JHI  loop
  6157  	RET
  6158  
  6159  // func AmdAxpyUnsafeX_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6160  // Requires: SSE
  6161  TEXT ·AmdAxpyUnsafeX_V4A11(SB), NOSPLIT, $0-48
  6162  	MOVSS alpha+0(FP), X0
  6163  	MOVQ  xs+8(FP), AX
  6164  	MOVQ  incx+16(FP), CX
  6165  	MOVQ  ys+24(FP), DX
  6166  	MOVQ  incy+32(FP), BX
  6167  	MOVQ  n+40(FP), SI
  6168  	XORQ  DI, DI
  6169  	XORQ  R8, R8
  6170  	JMP   check_limit
  6171  	PCALIGN $0x08
  6172  	NOP
  6173  	NOP
  6174  	NOP
  6175  
  6176  loop:
  6177  	MOVSS (AX)(DI*4), X1
  6178  	MULSS X0, X1
  6179  	ADDSS (DX)(R8*4), X1
  6180  	MOVSS X1, (DX)(R8*4)
  6181  	DECQ  SI
  6182  	ADDQ  CX, DI
  6183  	ADDQ  BX, R8
  6184  
  6185  check_limit:
  6186  	CMPQ SI, $0x00
  6187  	JHI  loop
  6188  	RET
  6189  
  6190  // func AmdAxpyUnsafeX_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6191  // Requires: SSE
  6192  TEXT ·AmdAxpyUnsafeX_V5A11(SB), NOSPLIT, $0-48
  6193  	MOVSS alpha+0(FP), X0
  6194  	MOVQ  xs+8(FP), AX
  6195  	MOVQ  incx+16(FP), CX
  6196  	MOVQ  ys+24(FP), DX
  6197  	MOVQ  incy+32(FP), BX
  6198  	MOVQ  n+40(FP), SI
  6199  	XORQ  DI, DI
  6200  	XORQ  R8, R8
  6201  	JMP   check_limit
  6202  	PCALIGN $0x08
  6203  	NOP
  6204  	NOP
  6205  	NOP
  6206  
  6207  loop:
  6208  	MOVSS (AX)(DI*4), X1
  6209  	MULSS X0, X1
  6210  	ADDSS (DX)(R8*4), X1
  6211  	MOVSS X1, (DX)(R8*4)
  6212  	DECQ  SI
  6213  	ADDQ  CX, DI
  6214  	ADDQ  BX, R8
  6215  
  6216  check_limit:
  6217  	CMPQ SI, $0x00
  6218  	JHI  loop
  6219  	RET
  6220  
  6221  // func AmdAxpyUnsafeX_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6222  // Requires: SSE
  6223  TEXT ·AmdAxpyUnsafeX_V0A12(SB), NOSPLIT, $0-48
  6224  	MOVSS alpha+0(FP), X0
  6225  	MOVQ  xs+8(FP), AX
  6226  	MOVQ  incx+16(FP), CX
  6227  	MOVQ  ys+24(FP), DX
  6228  	MOVQ  incy+32(FP), BX
  6229  	MOVQ  n+40(FP), SI
  6230  	XORQ  DI, DI
  6231  	XORQ  R8, R8
  6232  	JMP   check_limit
  6233  	PCALIGN $0x08
  6234  	NOP
  6235  	NOP
  6236  	NOP
  6237  	NOP
  6238  
  6239  loop:
  6240  	MOVSS (AX)(DI*4), X1
  6241  	MULSS X0, X1
  6242  	ADDSS (DX)(R8*4), X1
  6243  	MOVSS X1, (DX)(R8*4)
  6244  	DECQ  SI
  6245  	ADDQ  CX, DI
  6246  	ADDQ  BX, R8
  6247  
  6248  check_limit:
  6249  	CMPQ SI, $0x00
  6250  	JHI  loop
  6251  	RET
  6252  
  6253  // func AmdAxpyUnsafeX_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6254  // Requires: SSE
  6255  TEXT ·AmdAxpyUnsafeX_V1A12(SB), NOSPLIT, $0-48
  6256  	MOVSS alpha+0(FP), X0
  6257  	MOVQ  xs+8(FP), AX
  6258  	MOVQ  incx+16(FP), CX
  6259  	MOVQ  ys+24(FP), DX
  6260  	MOVQ  incy+32(FP), BX
  6261  	MOVQ  n+40(FP), SI
  6262  	XORQ  DI, DI
  6263  	XORQ  R8, R8
  6264  	JMP   check_limit
  6265  	PCALIGN $0x08
  6266  	NOP
  6267  	NOP
  6268  	NOP
  6269  	NOP
  6270  
  6271  loop:
  6272  	MOVSS (AX)(DI*4), X1
  6273  	MULSS X0, X1
  6274  	ADDSS (DX)(R8*4), X1
  6275  	MOVSS X1, (DX)(R8*4)
  6276  	DECQ  SI
  6277  	ADDQ  CX, DI
  6278  	ADDQ  BX, R8
  6279  
  6280  check_limit:
  6281  	CMPQ SI, $0x00
  6282  	JHI  loop
  6283  	RET
  6284  
  6285  // func AmdAxpyUnsafeX_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6286  // Requires: SSE
  6287  TEXT ·AmdAxpyUnsafeX_V2A12(SB), NOSPLIT, $0-48
  6288  	MOVSS alpha+0(FP), X0
  6289  	MOVQ  xs+8(FP), AX
  6290  	MOVQ  incx+16(FP), CX
  6291  	MOVQ  ys+24(FP), DX
  6292  	MOVQ  incy+32(FP), BX
  6293  	MOVQ  n+40(FP), SI
  6294  	XORQ  DI, DI
  6295  	XORQ  R8, R8
  6296  	JMP   check_limit
  6297  	PCALIGN $0x08
  6298  	NOP
  6299  	NOP
  6300  	NOP
  6301  	NOP
  6302  
  6303  loop:
  6304  	MOVSS (AX)(DI*4), X1
  6305  	MULSS X0, X1
  6306  	ADDSS (DX)(R8*4), X1
  6307  	MOVSS X1, (DX)(R8*4)
  6308  	DECQ  SI
  6309  	ADDQ  CX, DI
  6310  	ADDQ  BX, R8
  6311  
  6312  check_limit:
  6313  	CMPQ SI, $0x00
  6314  	JHI  loop
  6315  	RET
  6316  
  6317  // func AmdAxpyUnsafeX_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6318  // Requires: SSE
  6319  TEXT ·AmdAxpyUnsafeX_V3A12(SB), NOSPLIT, $0-48
  6320  	MOVSS alpha+0(FP), X0
  6321  	MOVQ  xs+8(FP), AX
  6322  	MOVQ  incx+16(FP), CX
  6323  	MOVQ  ys+24(FP), DX
  6324  	MOVQ  incy+32(FP), BX
  6325  	MOVQ  n+40(FP), SI
  6326  	XORQ  DI, DI
  6327  	XORQ  R8, R8
  6328  	JMP   check_limit
  6329  	PCALIGN $0x08
  6330  	NOP
  6331  	NOP
  6332  	NOP
  6333  	NOP
  6334  
  6335  loop:
  6336  	MOVSS (AX)(DI*4), X1
  6337  	MULSS X0, X1
  6338  	ADDSS (DX)(R8*4), X1
  6339  	MOVSS X1, (DX)(R8*4)
  6340  	DECQ  SI
  6341  	ADDQ  CX, DI
  6342  	ADDQ  BX, R8
  6343  
  6344  check_limit:
  6345  	CMPQ SI, $0x00
  6346  	JHI  loop
  6347  	RET
  6348  
  6349  // func AmdAxpyUnsafeX_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6350  // Requires: SSE
  6351  TEXT ·AmdAxpyUnsafeX_V4A12(SB), NOSPLIT, $0-48
  6352  	MOVSS alpha+0(FP), X0
  6353  	MOVQ  xs+8(FP), AX
  6354  	MOVQ  incx+16(FP), CX
  6355  	MOVQ  ys+24(FP), DX
  6356  	MOVQ  incy+32(FP), BX
  6357  	MOVQ  n+40(FP), SI
  6358  	XORQ  DI, DI
  6359  	XORQ  R8, R8
  6360  	JMP   check_limit
  6361  	PCALIGN $0x08
  6362  	NOP
  6363  	NOP
  6364  	NOP
  6365  	NOP
  6366  
  6367  loop:
  6368  	MOVSS (AX)(DI*4), X1
  6369  	MULSS X0, X1
  6370  	ADDSS (DX)(R8*4), X1
  6371  	MOVSS X1, (DX)(R8*4)
  6372  	DECQ  SI
  6373  	ADDQ  CX, DI
  6374  	ADDQ  BX, R8
  6375  
  6376  check_limit:
  6377  	CMPQ SI, $0x00
  6378  	JHI  loop
  6379  	RET
  6380  
  6381  // func AmdAxpyUnsafeX_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6382  // Requires: SSE
  6383  TEXT ·AmdAxpyUnsafeX_V5A12(SB), NOSPLIT, $0-48
  6384  	MOVSS alpha+0(FP), X0
  6385  	MOVQ  xs+8(FP), AX
  6386  	MOVQ  incx+16(FP), CX
  6387  	MOVQ  ys+24(FP), DX
  6388  	MOVQ  incy+32(FP), BX
  6389  	MOVQ  n+40(FP), SI
  6390  	XORQ  DI, DI
  6391  	XORQ  R8, R8
  6392  	JMP   check_limit
  6393  	PCALIGN $0x08
  6394  	NOP
  6395  	NOP
  6396  	NOP
  6397  	NOP
  6398  
  6399  loop:
  6400  	MOVSS (AX)(DI*4), X1
  6401  	MULSS X0, X1
  6402  	ADDSS (DX)(R8*4), X1
  6403  	MOVSS X1, (DX)(R8*4)
  6404  	DECQ  SI
  6405  	ADDQ  CX, DI
  6406  	ADDQ  BX, R8
  6407  
  6408  check_limit:
  6409  	CMPQ SI, $0x00
  6410  	JHI  loop
  6411  	RET
  6412  
  6413  // func AmdAxpyUnsafeX_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6414  // Requires: SSE
  6415  TEXT ·AmdAxpyUnsafeX_V0A13(SB), NOSPLIT, $0-48
  6416  	MOVSS alpha+0(FP), X0
  6417  	MOVQ  xs+8(FP), AX
  6418  	MOVQ  incx+16(FP), CX
  6419  	MOVQ  ys+24(FP), DX
  6420  	MOVQ  incy+32(FP), BX
  6421  	MOVQ  n+40(FP), SI
  6422  	XORQ  DI, DI
  6423  	XORQ  R8, R8
  6424  	JMP   check_limit
  6425  	PCALIGN $0x08
  6426  	NOP
  6427  	NOP
  6428  	NOP
  6429  	NOP
  6430  	NOP
  6431  
  6432  loop:
  6433  	MOVSS (AX)(DI*4), X1
  6434  	MULSS X0, X1
  6435  	ADDSS (DX)(R8*4), X1
  6436  	MOVSS X1, (DX)(R8*4)
  6437  	DECQ  SI
  6438  	ADDQ  CX, DI
  6439  	ADDQ  BX, R8
  6440  
  6441  check_limit:
  6442  	CMPQ SI, $0x00
  6443  	JHI  loop
  6444  	RET
  6445  
  6446  // func AmdAxpyUnsafeX_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6447  // Requires: SSE
  6448  TEXT ·AmdAxpyUnsafeX_V1A13(SB), NOSPLIT, $0-48
  6449  	MOVSS alpha+0(FP), X0
  6450  	MOVQ  xs+8(FP), AX
  6451  	MOVQ  incx+16(FP), CX
  6452  	MOVQ  ys+24(FP), DX
  6453  	MOVQ  incy+32(FP), BX
  6454  	MOVQ  n+40(FP), SI
  6455  	XORQ  DI, DI
  6456  	XORQ  R8, R8
  6457  	JMP   check_limit
  6458  	PCALIGN $0x08
  6459  	NOP
  6460  	NOP
  6461  	NOP
  6462  	NOP
  6463  	NOP
  6464  
  6465  loop:
  6466  	MOVSS (AX)(DI*4), X1
  6467  	MULSS X0, X1
  6468  	ADDSS (DX)(R8*4), X1
  6469  	MOVSS X1, (DX)(R8*4)
  6470  	DECQ  SI
  6471  	ADDQ  CX, DI
  6472  	ADDQ  BX, R8
  6473  
  6474  check_limit:
  6475  	CMPQ SI, $0x00
  6476  	JHI  loop
  6477  	RET
  6478  
  6479  // func AmdAxpyUnsafeX_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6480  // Requires: SSE
  6481  TEXT ·AmdAxpyUnsafeX_V2A13(SB), NOSPLIT, $0-48
  6482  	MOVSS alpha+0(FP), X0
  6483  	MOVQ  xs+8(FP), AX
  6484  	MOVQ  incx+16(FP), CX
  6485  	MOVQ  ys+24(FP), DX
  6486  	MOVQ  incy+32(FP), BX
  6487  	MOVQ  n+40(FP), SI
  6488  	XORQ  DI, DI
  6489  	XORQ  R8, R8
  6490  	JMP   check_limit
  6491  	PCALIGN $0x08
  6492  	NOP
  6493  	NOP
  6494  	NOP
  6495  	NOP
  6496  	NOP
  6497  
  6498  loop:
  6499  	MOVSS (AX)(DI*4), X1
  6500  	MULSS X0, X1
  6501  	ADDSS (DX)(R8*4), X1
  6502  	MOVSS X1, (DX)(R8*4)
  6503  	DECQ  SI
  6504  	ADDQ  CX, DI
  6505  	ADDQ  BX, R8
  6506  
  6507  check_limit:
  6508  	CMPQ SI, $0x00
  6509  	JHI  loop
  6510  	RET
  6511  
  6512  // func AmdAxpyUnsafeX_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6513  // Requires: SSE
  6514  TEXT ·AmdAxpyUnsafeX_V3A13(SB), NOSPLIT, $0-48
  6515  	MOVSS alpha+0(FP), X0
  6516  	MOVQ  xs+8(FP), AX
  6517  	MOVQ  incx+16(FP), CX
  6518  	MOVQ  ys+24(FP), DX
  6519  	MOVQ  incy+32(FP), BX
  6520  	MOVQ  n+40(FP), SI
  6521  	XORQ  DI, DI
  6522  	XORQ  R8, R8
  6523  	JMP   check_limit
  6524  	PCALIGN $0x08
  6525  	NOP
  6526  	NOP
  6527  	NOP
  6528  	NOP
  6529  	NOP
  6530  
  6531  loop:
  6532  	MOVSS (AX)(DI*4), X1
  6533  	MULSS X0, X1
  6534  	ADDSS (DX)(R8*4), X1
  6535  	MOVSS X1, (DX)(R8*4)
  6536  	DECQ  SI
  6537  	ADDQ  CX, DI
  6538  	ADDQ  BX, R8
  6539  
  6540  check_limit:
  6541  	CMPQ SI, $0x00
  6542  	JHI  loop
  6543  	RET
  6544  
  6545  // func AmdAxpyUnsafeX_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6546  // Requires: SSE
  6547  TEXT ·AmdAxpyUnsafeX_V4A13(SB), NOSPLIT, $0-48
  6548  	MOVSS alpha+0(FP), X0
  6549  	MOVQ  xs+8(FP), AX
  6550  	MOVQ  incx+16(FP), CX
  6551  	MOVQ  ys+24(FP), DX
  6552  	MOVQ  incy+32(FP), BX
  6553  	MOVQ  n+40(FP), SI
  6554  	XORQ  DI, DI
  6555  	XORQ  R8, R8
  6556  	JMP   check_limit
  6557  	PCALIGN $0x08
  6558  	NOP
  6559  	NOP
  6560  	NOP
  6561  	NOP
  6562  	NOP
  6563  
  6564  loop:
  6565  	MOVSS (AX)(DI*4), X1
  6566  	MULSS X0, X1
  6567  	ADDSS (DX)(R8*4), X1
  6568  	MOVSS X1, (DX)(R8*4)
  6569  	DECQ  SI
  6570  	ADDQ  CX, DI
  6571  	ADDQ  BX, R8
  6572  
  6573  check_limit:
  6574  	CMPQ SI, $0x00
  6575  	JHI  loop
  6576  	RET
  6577  
  6578  // func AmdAxpyUnsafeX_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6579  // Requires: SSE
  6580  TEXT ·AmdAxpyUnsafeX_V5A13(SB), NOSPLIT, $0-48
  6581  	MOVSS alpha+0(FP), X0
  6582  	MOVQ  xs+8(FP), AX
  6583  	MOVQ  incx+16(FP), CX
  6584  	MOVQ  ys+24(FP), DX
  6585  	MOVQ  incy+32(FP), BX
  6586  	MOVQ  n+40(FP), SI
  6587  	XORQ  DI, DI
  6588  	XORQ  R8, R8
  6589  	JMP   check_limit
  6590  	PCALIGN $0x08
  6591  	NOP
  6592  	NOP
  6593  	NOP
  6594  	NOP
  6595  	NOP
  6596  
  6597  loop:
  6598  	MOVSS (AX)(DI*4), X1
  6599  	MULSS X0, X1
  6600  	ADDSS (DX)(R8*4), X1
  6601  	MOVSS X1, (DX)(R8*4)
  6602  	DECQ  SI
  6603  	ADDQ  CX, DI
  6604  	ADDQ  BX, R8
  6605  
  6606  check_limit:
  6607  	CMPQ SI, $0x00
  6608  	JHI  loop
  6609  	RET
  6610  
  6611  // func AmdAxpyUnsafeX_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6612  // Requires: SSE
  6613  TEXT ·AmdAxpyUnsafeX_V0A14(SB), NOSPLIT, $0-48
  6614  	MOVSS alpha+0(FP), X0
  6615  	MOVQ  xs+8(FP), AX
  6616  	MOVQ  incx+16(FP), CX
  6617  	MOVQ  ys+24(FP), DX
  6618  	MOVQ  incy+32(FP), BX
  6619  	MOVQ  n+40(FP), SI
  6620  	XORQ  DI, DI
  6621  	XORQ  R8, R8
  6622  	JMP   check_limit
  6623  	PCALIGN $0x08
  6624  	NOP
  6625  	NOP
  6626  	NOP
  6627  	NOP
  6628  	NOP
  6629  	NOP
  6630  
  6631  loop:
  6632  	MOVSS (AX)(DI*4), X1
  6633  	MULSS X0, X1
  6634  	ADDSS (DX)(R8*4), X1
  6635  	MOVSS X1, (DX)(R8*4)
  6636  	DECQ  SI
  6637  	ADDQ  CX, DI
  6638  	ADDQ  BX, R8
  6639  
  6640  check_limit:
  6641  	CMPQ SI, $0x00
  6642  	JHI  loop
  6643  	RET
  6644  
  6645  // func AmdAxpyUnsafeX_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6646  // Requires: SSE
  6647  TEXT ·AmdAxpyUnsafeX_V1A14(SB), NOSPLIT, $0-48
  6648  	MOVSS alpha+0(FP), X0
  6649  	MOVQ  xs+8(FP), AX
  6650  	MOVQ  incx+16(FP), CX
  6651  	MOVQ  ys+24(FP), DX
  6652  	MOVQ  incy+32(FP), BX
  6653  	MOVQ  n+40(FP), SI
  6654  	XORQ  DI, DI
  6655  	XORQ  R8, R8
  6656  	JMP   check_limit
  6657  	PCALIGN $0x08
  6658  	NOP
  6659  	NOP
  6660  	NOP
  6661  	NOP
  6662  	NOP
  6663  	NOP
  6664  
  6665  loop:
  6666  	MOVSS (AX)(DI*4), X1
  6667  	MULSS X0, X1
  6668  	ADDSS (DX)(R8*4), X1
  6669  	MOVSS X1, (DX)(R8*4)
  6670  	DECQ  SI
  6671  	ADDQ  CX, DI
  6672  	ADDQ  BX, R8
  6673  
  6674  check_limit:
  6675  	CMPQ SI, $0x00
  6676  	JHI  loop
  6677  	RET
  6678  
  6679  // func AmdAxpyUnsafeX_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6680  // Requires: SSE
  6681  TEXT ·AmdAxpyUnsafeX_V2A14(SB), NOSPLIT, $0-48
  6682  	MOVSS alpha+0(FP), X0
  6683  	MOVQ  xs+8(FP), AX
  6684  	MOVQ  incx+16(FP), CX
  6685  	MOVQ  ys+24(FP), DX
  6686  	MOVQ  incy+32(FP), BX
  6687  	MOVQ  n+40(FP), SI
  6688  	XORQ  DI, DI
  6689  	XORQ  R8, R8
  6690  	JMP   check_limit
  6691  	PCALIGN $0x08
  6692  	NOP
  6693  	NOP
  6694  	NOP
  6695  	NOP
  6696  	NOP
  6697  	NOP
  6698  
  6699  loop:
  6700  	MOVSS (AX)(DI*4), X1
  6701  	MULSS X0, X1
  6702  	ADDSS (DX)(R8*4), X1
  6703  	MOVSS X1, (DX)(R8*4)
  6704  	DECQ  SI
  6705  	ADDQ  CX, DI
  6706  	ADDQ  BX, R8
  6707  
  6708  check_limit:
  6709  	CMPQ SI, $0x00
  6710  	JHI  loop
  6711  	RET
  6712  
  6713  // func AmdAxpyUnsafeX_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6714  // Requires: SSE
  6715  TEXT ·AmdAxpyUnsafeX_V3A14(SB), NOSPLIT, $0-48
  6716  	MOVSS alpha+0(FP), X0
  6717  	MOVQ  xs+8(FP), AX
  6718  	MOVQ  incx+16(FP), CX
  6719  	MOVQ  ys+24(FP), DX
  6720  	MOVQ  incy+32(FP), BX
  6721  	MOVQ  n+40(FP), SI
  6722  	XORQ  DI, DI
  6723  	XORQ  R8, R8
  6724  	JMP   check_limit
  6725  	PCALIGN $0x08
  6726  	NOP
  6727  	NOP
  6728  	NOP
  6729  	NOP
  6730  	NOP
  6731  	NOP
  6732  
  6733  loop:
  6734  	MOVSS (AX)(DI*4), X1
  6735  	MULSS X0, X1
  6736  	ADDSS (DX)(R8*4), X1
  6737  	MOVSS X1, (DX)(R8*4)
  6738  	DECQ  SI
  6739  	ADDQ  CX, DI
  6740  	ADDQ  BX, R8
  6741  
  6742  check_limit:
  6743  	CMPQ SI, $0x00
  6744  	JHI  loop
  6745  	RET
  6746  
  6747  // func AmdAxpyUnsafeX_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6748  // Requires: SSE
  6749  TEXT ·AmdAxpyUnsafeX_V4A14(SB), NOSPLIT, $0-48
  6750  	MOVSS alpha+0(FP), X0
  6751  	MOVQ  xs+8(FP), AX
  6752  	MOVQ  incx+16(FP), CX
  6753  	MOVQ  ys+24(FP), DX
  6754  	MOVQ  incy+32(FP), BX
  6755  	MOVQ  n+40(FP), SI
  6756  	XORQ  DI, DI
  6757  	XORQ  R8, R8
  6758  	JMP   check_limit
  6759  	PCALIGN $0x08
  6760  	NOP
  6761  	NOP
  6762  	NOP
  6763  	NOP
  6764  	NOP
  6765  	NOP
  6766  
  6767  loop:
  6768  	MOVSS (AX)(DI*4), X1
  6769  	MULSS X0, X1
  6770  	ADDSS (DX)(R8*4), X1
  6771  	MOVSS X1, (DX)(R8*4)
  6772  	DECQ  SI
  6773  	ADDQ  CX, DI
  6774  	ADDQ  BX, R8
  6775  
  6776  check_limit:
  6777  	CMPQ SI, $0x00
  6778  	JHI  loop
  6779  	RET
  6780  
  6781  // func AmdAxpyUnsafeX_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6782  // Requires: SSE
  6783  TEXT ·AmdAxpyUnsafeX_V5A14(SB), NOSPLIT, $0-48
  6784  	MOVSS alpha+0(FP), X0
  6785  	MOVQ  xs+8(FP), AX
  6786  	MOVQ  incx+16(FP), CX
  6787  	MOVQ  ys+24(FP), DX
  6788  	MOVQ  incy+32(FP), BX
  6789  	MOVQ  n+40(FP), SI
  6790  	XORQ  DI, DI
  6791  	XORQ  R8, R8
  6792  	JMP   check_limit
  6793  	PCALIGN $0x08
  6794  	NOP
  6795  	NOP
  6796  	NOP
  6797  	NOP
  6798  	NOP
  6799  	NOP
  6800  
  6801  loop:
  6802  	MOVSS (AX)(DI*4), X1
  6803  	MULSS X0, X1
  6804  	ADDSS (DX)(R8*4), X1
  6805  	MOVSS X1, (DX)(R8*4)
  6806  	DECQ  SI
  6807  	ADDQ  CX, DI
  6808  	ADDQ  BX, R8
  6809  
  6810  check_limit:
  6811  	CMPQ SI, $0x00
  6812  	JHI  loop
  6813  	RET
  6814  
  6815  // func AmdAxpyUnsafeX_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6816  // Requires: SSE
  6817  TEXT ·AmdAxpyUnsafeX_V0A15(SB), NOSPLIT, $0-48
  6818  	MOVSS alpha+0(FP), X0
  6819  	MOVQ  xs+8(FP), AX
  6820  	MOVQ  incx+16(FP), CX
  6821  	MOVQ  ys+24(FP), DX
  6822  	MOVQ  incy+32(FP), BX
  6823  	MOVQ  n+40(FP), SI
  6824  	XORQ  DI, DI
  6825  	XORQ  R8, R8
  6826  	JMP   check_limit
  6827  	PCALIGN $0x08
  6828  	NOP
  6829  	NOP
  6830  	NOP
  6831  	NOP
  6832  	NOP
  6833  	NOP
  6834  	NOP
  6835  
  6836  loop:
  6837  	MOVSS (AX)(DI*4), X1
  6838  	MULSS X0, X1
  6839  	ADDSS (DX)(R8*4), X1
  6840  	MOVSS X1, (DX)(R8*4)
  6841  	DECQ  SI
  6842  	ADDQ  CX, DI
  6843  	ADDQ  BX, R8
  6844  
  6845  check_limit:
  6846  	CMPQ SI, $0x00
  6847  	JHI  loop
  6848  	RET
  6849  
  6850  // func AmdAxpyUnsafeX_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6851  // Requires: SSE
  6852  TEXT ·AmdAxpyUnsafeX_V1A15(SB), NOSPLIT, $0-48
  6853  	MOVSS alpha+0(FP), X0
  6854  	MOVQ  xs+8(FP), AX
  6855  	MOVQ  incx+16(FP), CX
  6856  	MOVQ  ys+24(FP), DX
  6857  	MOVQ  incy+32(FP), BX
  6858  	MOVQ  n+40(FP), SI
  6859  	XORQ  DI, DI
  6860  	XORQ  R8, R8
  6861  	JMP   check_limit
  6862  	PCALIGN $0x08
  6863  	NOP
  6864  	NOP
  6865  	NOP
  6866  	NOP
  6867  	NOP
  6868  	NOP
  6869  	NOP
  6870  
  6871  loop:
  6872  	MOVSS (AX)(DI*4), X1
  6873  	MULSS X0, X1
  6874  	ADDSS (DX)(R8*4), X1
  6875  	MOVSS X1, (DX)(R8*4)
  6876  	DECQ  SI
  6877  	ADDQ  CX, DI
  6878  	ADDQ  BX, R8
  6879  
  6880  check_limit:
  6881  	CMPQ SI, $0x00
  6882  	JHI  loop
  6883  	RET
  6884  
  6885  // func AmdAxpyUnsafeX_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6886  // Requires: SSE
  6887  TEXT ·AmdAxpyUnsafeX_V2A15(SB), NOSPLIT, $0-48
  6888  	MOVSS alpha+0(FP), X0
  6889  	MOVQ  xs+8(FP), AX
  6890  	MOVQ  incx+16(FP), CX
  6891  	MOVQ  ys+24(FP), DX
  6892  	MOVQ  incy+32(FP), BX
  6893  	MOVQ  n+40(FP), SI
  6894  	XORQ  DI, DI
  6895  	XORQ  R8, R8
  6896  	JMP   check_limit
  6897  	PCALIGN $0x08
  6898  	NOP
  6899  	NOP
  6900  	NOP
  6901  	NOP
  6902  	NOP
  6903  	NOP
  6904  	NOP
  6905  
  6906  loop:
  6907  	MOVSS (AX)(DI*4), X1
  6908  	MULSS X0, X1
  6909  	ADDSS (DX)(R8*4), X1
  6910  	MOVSS X1, (DX)(R8*4)
  6911  	DECQ  SI
  6912  	ADDQ  CX, DI
  6913  	ADDQ  BX, R8
  6914  
  6915  check_limit:
  6916  	CMPQ SI, $0x00
  6917  	JHI  loop
  6918  	RET
  6919  
  6920  // func AmdAxpyUnsafeX_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6921  // Requires: SSE
  6922  TEXT ·AmdAxpyUnsafeX_V3A15(SB), NOSPLIT, $0-48
  6923  	MOVSS alpha+0(FP), X0
  6924  	MOVQ  xs+8(FP), AX
  6925  	MOVQ  incx+16(FP), CX
  6926  	MOVQ  ys+24(FP), DX
  6927  	MOVQ  incy+32(FP), BX
  6928  	MOVQ  n+40(FP), SI
  6929  	XORQ  DI, DI
  6930  	XORQ  R8, R8
  6931  	JMP   check_limit
  6932  	PCALIGN $0x08
  6933  	NOP
  6934  	NOP
  6935  	NOP
  6936  	NOP
  6937  	NOP
  6938  	NOP
  6939  	NOP
  6940  
  6941  loop:
  6942  	MOVSS (AX)(DI*4), X1
  6943  	MULSS X0, X1
  6944  	ADDSS (DX)(R8*4), X1
  6945  	MOVSS X1, (DX)(R8*4)
  6946  	DECQ  SI
  6947  	ADDQ  CX, DI
  6948  	ADDQ  BX, R8
  6949  
  6950  check_limit:
  6951  	CMPQ SI, $0x00
  6952  	JHI  loop
  6953  	RET
  6954  
  6955  // func AmdAxpyUnsafeX_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6956  // Requires: SSE
  6957  TEXT ·AmdAxpyUnsafeX_V4A15(SB), NOSPLIT, $0-48
  6958  	MOVSS alpha+0(FP), X0
  6959  	MOVQ  xs+8(FP), AX
  6960  	MOVQ  incx+16(FP), CX
  6961  	MOVQ  ys+24(FP), DX
  6962  	MOVQ  incy+32(FP), BX
  6963  	MOVQ  n+40(FP), SI
  6964  	XORQ  DI, DI
  6965  	XORQ  R8, R8
  6966  	JMP   check_limit
  6967  	PCALIGN $0x08
  6968  	NOP
  6969  	NOP
  6970  	NOP
  6971  	NOP
  6972  	NOP
  6973  	NOP
  6974  	NOP
  6975  
  6976  loop:
  6977  	MOVSS (AX)(DI*4), X1
  6978  	MULSS X0, X1
  6979  	ADDSS (DX)(R8*4), X1
  6980  	MOVSS X1, (DX)(R8*4)
  6981  	DECQ  SI
  6982  	ADDQ  CX, DI
  6983  	ADDQ  BX, R8
  6984  
  6985  check_limit:
  6986  	CMPQ SI, $0x00
  6987  	JHI  loop
  6988  	RET
  6989  
  6990  // func AmdAxpyUnsafeX_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  6991  // Requires: SSE
  6992  TEXT ·AmdAxpyUnsafeX_V5A15(SB), NOSPLIT, $0-48
  6993  	MOVSS alpha+0(FP), X0
  6994  	MOVQ  xs+8(FP), AX
  6995  	MOVQ  incx+16(FP), CX
  6996  	MOVQ  ys+24(FP), DX
  6997  	MOVQ  incy+32(FP), BX
  6998  	MOVQ  n+40(FP), SI
  6999  	XORQ  DI, DI
  7000  	XORQ  R8, R8
  7001  	JMP   check_limit
  7002  	PCALIGN $0x08
  7003  	NOP
  7004  	NOP
  7005  	NOP
  7006  	NOP
  7007  	NOP
  7008  	NOP
  7009  	NOP
  7010  
  7011  loop:
  7012  	MOVSS (AX)(DI*4), X1
  7013  	MULSS X0, X1
  7014  	ADDSS (DX)(R8*4), X1
  7015  	MOVSS X1, (DX)(R8*4)
  7016  	DECQ  SI
  7017  	ADDQ  CX, DI
  7018  	ADDQ  BX, R8
  7019  
  7020  check_limit:
  7021  	CMPQ SI, $0x00
  7022  	JHI  loop
  7023  	RET
  7024  
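// The A16 variants below request 16-byte loop alignment directly with
// PCALIGN $0x10 instead of padding an 8-byte boundary with NOPs; the loop
// body itself is unchanged from the variants above.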
  7025  // func AmdAxpyUnsafeX_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7026  // Requires: SSE
  7027  TEXT ·AmdAxpyUnsafeX_V0A16(SB), NOSPLIT, $0-48
  7028  	MOVSS alpha+0(FP), X0
  7029  	MOVQ  xs+8(FP), AX
  7030  	MOVQ  incx+16(FP), CX
  7031  	MOVQ  ys+24(FP), DX
  7032  	MOVQ  incy+32(FP), BX
  7033  	MOVQ  n+40(FP), SI
  7034  	XORQ  DI, DI
  7035  	XORQ  R8, R8
  7036  	JMP   check_limit
  7037  	PCALIGN $0x10
  7038  
  7039  loop:
  7040  	MOVSS (AX)(DI*4), X1
  7041  	MULSS X0, X1
  7042  	ADDSS (DX)(R8*4), X1
  7043  	MOVSS X1, (DX)(R8*4)
  7044  	DECQ  SI
  7045  	ADDQ  CX, DI
  7046  	ADDQ  BX, R8
  7047  
  7048  check_limit:
  7049  	CMPQ SI, $0x00
  7050  	JHI  loop
  7051  	RET
  7052  
  7053  // func AmdAxpyUnsafeX_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7054  // Requires: SSE
  7055  TEXT ·AmdAxpyUnsafeX_V1A16(SB), NOSPLIT, $0-48
  7056  	MOVSS alpha+0(FP), X0
  7057  	MOVQ  xs+8(FP), AX
  7058  	MOVQ  incx+16(FP), CX
  7059  	MOVQ  ys+24(FP), DX
  7060  	MOVQ  incy+32(FP), BX
  7061  	MOVQ  n+40(FP), SI
  7062  	XORQ  DI, DI
  7063  	XORQ  R8, R8
  7064  	JMP   check_limit
  7065  	PCALIGN $0x10
  7066  
  7067  loop:
  7068  	MOVSS (AX)(DI*4), X1
  7069  	MULSS X0, X1
  7070  	ADDSS (DX)(R8*4), X1
  7071  	MOVSS X1, (DX)(R8*4)
  7072  	DECQ  SI
  7073  	ADDQ  CX, DI
  7074  	ADDQ  BX, R8
  7075  
  7076  check_limit:
  7077  	CMPQ SI, $0x00
  7078  	JHI  loop
  7079  	RET
  7080  
  7081  // func AmdAxpyUnsafeX_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7082  // Requires: SSE
  7083  TEXT ·AmdAxpyUnsafeX_V2A16(SB), NOSPLIT, $0-48
  7084  	MOVSS alpha+0(FP), X0
  7085  	MOVQ  xs+8(FP), AX
  7086  	MOVQ  incx+16(FP), CX
  7087  	MOVQ  ys+24(FP), DX
  7088  	MOVQ  incy+32(FP), BX
  7089  	MOVQ  n+40(FP), SI
  7090  	XORQ  DI, DI
  7091  	XORQ  R8, R8
  7092  	JMP   check_limit
  7093  	PCALIGN $0x10
  7094  
  7095  loop:
  7096  	MOVSS (AX)(DI*4), X1
  7097  	MULSS X0, X1
  7098  	ADDSS (DX)(R8*4), X1
  7099  	MOVSS X1, (DX)(R8*4)
  7100  	DECQ  SI
  7101  	ADDQ  CX, DI
  7102  	ADDQ  BX, R8
  7103  
  7104  check_limit:
  7105  	CMPQ SI, $0x00
  7106  	JHI  loop
  7107  	RET
  7108  
  7109  // func AmdAxpyUnsafeX_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7110  // Requires: SSE
  7111  TEXT ·AmdAxpyUnsafeX_V3A16(SB), NOSPLIT, $0-48
  7112  	MOVSS alpha+0(FP), X0
  7113  	MOVQ  xs+8(FP), AX
  7114  	MOVQ  incx+16(FP), CX
  7115  	MOVQ  ys+24(FP), DX
  7116  	MOVQ  incy+32(FP), BX
  7117  	MOVQ  n+40(FP), SI
  7118  	XORQ  DI, DI
  7119  	XORQ  R8, R8
  7120  	JMP   check_limit
  7121  	PCALIGN $0x10
  7122  
  7123  loop:
  7124  	MOVSS (AX)(DI*4), X1
  7125  	MULSS X0, X1
  7126  	ADDSS (DX)(R8*4), X1
  7127  	MOVSS X1, (DX)(R8*4)
  7128  	DECQ  SI
  7129  	ADDQ  CX, DI
  7130  	ADDQ  BX, R8
  7131  
  7132  check_limit:
  7133  	CMPQ SI, $0x00
  7134  	JHI  loop
  7135  	RET
  7136  
  7137  // func AmdAxpyUnsafeX_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7138  // Requires: SSE
  7139  TEXT ·AmdAxpyUnsafeX_V4A16(SB), NOSPLIT, $0-48
  7140  	MOVSS alpha+0(FP), X0
  7141  	MOVQ  xs+8(FP), AX
  7142  	MOVQ  incx+16(FP), CX
  7143  	MOVQ  ys+24(FP), DX
  7144  	MOVQ  incy+32(FP), BX
  7145  	MOVQ  n+40(FP), SI
  7146  	XORQ  DI, DI
  7147  	XORQ  R8, R8
  7148  	JMP   check_limit
  7149  	PCALIGN $0x10
  7150  
  7151  loop:
  7152  	MOVSS (AX)(DI*4), X1
  7153  	MULSS X0, X1
  7154  	ADDSS (DX)(R8*4), X1
  7155  	MOVSS X1, (DX)(R8*4)
  7156  	DECQ  SI
  7157  	ADDQ  CX, DI
  7158  	ADDQ  BX, R8
  7159  
  7160  check_limit:
  7161  	CMPQ SI, $0x00
  7162  	JHI  loop
  7163  	RET
  7164  
  7165  // func AmdAxpyUnsafeX_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7166  // Requires: SSE
  7167  TEXT ·AmdAxpyUnsafeX_V5A16(SB), NOSPLIT, $0-48
  7168  	MOVSS alpha+0(FP), X0
  7169  	MOVQ  xs+8(FP), AX
  7170  	MOVQ  incx+16(FP), CX
  7171  	MOVQ  ys+24(FP), DX
  7172  	MOVQ  incy+32(FP), BX
  7173  	MOVQ  n+40(FP), SI
  7174  	XORQ  DI, DI
  7175  	XORQ  R8, R8
  7176  	JMP   check_limit
  7177  	PCALIGN $0x10
  7178  
  7179  loop:
  7180  	MOVSS (AX)(DI*4), X1
  7181  	MULSS X0, X1
  7182  	ADDSS (DX)(R8*4), X1
  7183  	MOVSS X1, (DX)(R8*4)
  7184  	DECQ  SI
  7185  	ADDQ  CX, DI
  7186  	ADDQ  BX, R8
  7187  
  7188  check_limit:
  7189  	CMPQ SI, $0x00
  7190  	JHI  loop
  7191  	RET
  7192  
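// The R4 variants below unroll the scalar loop four times: while more than
// four elements remain, loop_unroll applies the strided ys += alpha*xs update
// to four elements and subtracts 4 from the counter; the plain loop then
// drains the remaining tail one element at a time, so any n is handled.
// Illustrative control flow (not part of the generated output), with step()
// standing for one strided ys[i*incy] += alpha * xs[i*incx] update:
//
//	for ; n > 4; n -= 4 { step(); step(); step(); step() }
//	for ; n > 0; n--   { step() }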
  7193  // func AmdAxpyUnsafeX_V0A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7194  // Requires: SSE
  7195  TEXT ·AmdAxpyUnsafeX_V0A0R4(SB), NOSPLIT, $0-48
  7196  	MOVSS alpha+0(FP), X0
  7197  	MOVQ  xs+8(FP), AX
  7198  	MOVQ  incx+16(FP), CX
  7199  	MOVQ  ys+24(FP), DX
  7200  	MOVQ  incy+32(FP), BX
  7201  	MOVQ  n+40(FP), SI
  7202  	XORQ  DI, DI
  7203  	XORQ  R8, R8
  7204  	JMP   check_limit_unroll
  7205  
  7206  loop_unroll:
  7207  	MOVSS (AX)(DI*4), X1
  7208  	MULSS X0, X1
  7209  	ADDSS (DX)(R8*4), X1
  7210  	MOVSS X1, (DX)(R8*4)
  7211  	ADDQ  CX, DI
  7212  	ADDQ  BX, R8
  7213  	MOVSS (AX)(DI*4), X1
  7214  	MULSS X0, X1
  7215  	ADDSS (DX)(R8*4), X1
  7216  	MOVSS X1, (DX)(R8*4)
  7217  	ADDQ  CX, DI
  7218  	ADDQ  BX, R8
  7219  	MOVSS (AX)(DI*4), X1
  7220  	MULSS X0, X1
  7221  	ADDSS (DX)(R8*4), X1
  7222  	MOVSS X1, (DX)(R8*4)
  7223  	ADDQ  CX, DI
  7224  	ADDQ  BX, R8
  7225  	MOVSS (AX)(DI*4), X1
  7226  	MULSS X0, X1
  7227  	ADDSS (DX)(R8*4), X1
  7228  	MOVSS X1, (DX)(R8*4)
  7229  	ADDQ  CX, DI
  7230  	ADDQ  BX, R8
  7231  	SUBQ  $0x04, SI
  7232  
  7233  check_limit_unroll:
  7234  	CMPQ SI, $0x04
  7235  	JHI  loop_unroll
  7236  	JMP  check_limit
  7237  
  7238  loop:
  7239  	MOVSS (AX)(DI*4), X1
  7240  	MULSS X0, X1
  7241  	ADDSS (DX)(R8*4), X1
  7242  	MOVSS X1, (DX)(R8*4)
  7243  	DECQ  SI
  7244  	ADDQ  CX, DI
  7245  	ADDQ  BX, R8
  7246  
  7247  check_limit:
  7248  	CMPQ SI, $0x00
  7249  	JHI  loop
  7250  	RET
  7251  
  7252  // func AmdAxpyUnsafeX_V1A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7253  // Requires: SSE
  7254  TEXT ·AmdAxpyUnsafeX_V1A0R4(SB), NOSPLIT, $0-48
  7255  	MOVSS alpha+0(FP), X0
  7256  	MOVQ  xs+8(FP), AX
  7257  	MOVQ  incx+16(FP), CX
  7258  	MOVQ  ys+24(FP), DX
  7259  	MOVQ  incy+32(FP), BX
  7260  	MOVQ  n+40(FP), SI
  7261  	XORQ  DI, DI
  7262  	XORQ  R8, R8
  7263  	JMP   check_limit_unroll
  7264  
  7265  loop_unroll:
  7266  	MOVSS (AX)(DI*4), X1
  7267  	MULSS X0, X1
  7268  	ADDSS (DX)(R8*4), X1
  7269  	MOVSS X1, (DX)(R8*4)
  7270  	ADDQ  CX, DI
  7271  	ADDQ  BX, R8
  7272  	MOVSS (AX)(DI*4), X1
  7273  	MULSS X0, X1
  7274  	ADDSS (DX)(R8*4), X1
  7275  	MOVSS X1, (DX)(R8*4)
  7276  	ADDQ  CX, DI
  7277  	ADDQ  BX, R8
  7278  	MOVSS (AX)(DI*4), X1
  7279  	MULSS X0, X1
  7280  	ADDSS (DX)(R8*4), X1
  7281  	MOVSS X1, (DX)(R8*4)
  7282  	ADDQ  CX, DI
  7283  	ADDQ  BX, R8
  7284  	MOVSS (AX)(DI*4), X1
  7285  	MULSS X0, X1
  7286  	ADDSS (DX)(R8*4), X1
  7287  	MOVSS X1, (DX)(R8*4)
  7288  	ADDQ  CX, DI
  7289  	ADDQ  BX, R8
  7290  	SUBQ  $0x04, SI
  7291  
  7292  check_limit_unroll:
  7293  	CMPQ SI, $0x04
  7294  	JHI  loop_unroll
  7295  	JMP  check_limit
  7296  
  7297  loop:
  7298  	MOVSS (AX)(DI*4), X1
  7299  	MULSS X0, X1
  7300  	ADDSS (DX)(R8*4), X1
  7301  	MOVSS X1, (DX)(R8*4)
  7302  	DECQ  SI
  7303  	ADDQ  CX, DI
  7304  	ADDQ  BX, R8
  7305  
  7306  check_limit:
  7307  	CMPQ SI, $0x00
  7308  	JHI  loop
  7309  	RET
  7310  
  7311  // func AmdAxpyUnsafeX_V2A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7312  // Requires: SSE
  7313  TEXT ·AmdAxpyUnsafeX_V2A0R4(SB), NOSPLIT, $0-48
  7314  	MOVSS alpha+0(FP), X0
  7315  	MOVQ  xs+8(FP), AX
  7316  	MOVQ  incx+16(FP), CX
  7317  	MOVQ  ys+24(FP), DX
  7318  	MOVQ  incy+32(FP), BX
  7319  	MOVQ  n+40(FP), SI
  7320  	XORQ  DI, DI
  7321  	XORQ  R8, R8
  7322  	JMP   check_limit_unroll
  7323  
  7324  loop_unroll:
  7325  	MOVSS (AX)(DI*4), X1
  7326  	MULSS X0, X1
  7327  	ADDSS (DX)(R8*4), X1
  7328  	MOVSS X1, (DX)(R8*4)
  7329  	ADDQ  CX, DI
  7330  	ADDQ  BX, R8
  7331  	MOVSS (AX)(DI*4), X1
  7332  	MULSS X0, X1
  7333  	ADDSS (DX)(R8*4), X1
  7334  	MOVSS X1, (DX)(R8*4)
  7335  	ADDQ  CX, DI
  7336  	ADDQ  BX, R8
  7337  	MOVSS (AX)(DI*4), X1
  7338  	MULSS X0, X1
  7339  	ADDSS (DX)(R8*4), X1
  7340  	MOVSS X1, (DX)(R8*4)
  7341  	ADDQ  CX, DI
  7342  	ADDQ  BX, R8
  7343  	MOVSS (AX)(DI*4), X1
  7344  	MULSS X0, X1
  7345  	ADDSS (DX)(R8*4), X1
  7346  	MOVSS X1, (DX)(R8*4)
  7347  	ADDQ  CX, DI
  7348  	ADDQ  BX, R8
  7349  	SUBQ  $0x04, SI
  7350  
  7351  check_limit_unroll:
  7352  	CMPQ SI, $0x04
  7353  	JHI  loop_unroll
  7354  	JMP  check_limit
  7355  
  7356  loop:
  7357  	MOVSS (AX)(DI*4), X1
  7358  	MULSS X0, X1
  7359  	ADDSS (DX)(R8*4), X1
  7360  	MOVSS X1, (DX)(R8*4)
  7361  	DECQ  SI
  7362  	ADDQ  CX, DI
  7363  	ADDQ  BX, R8
  7364  
  7365  check_limit:
  7366  	CMPQ SI, $0x00
  7367  	JHI  loop
  7368  	RET
  7369  
  7370  // func AmdAxpyUnsafeX_V3A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7371  // Requires: SSE
  7372  TEXT ·AmdAxpyUnsafeX_V3A0R4(SB), NOSPLIT, $0-48
  7373  	MOVSS alpha+0(FP), X0
  7374  	MOVQ  xs+8(FP), AX
  7375  	MOVQ  incx+16(FP), CX
  7376  	MOVQ  ys+24(FP), DX
  7377  	MOVQ  incy+32(FP), BX
  7378  	MOVQ  n+40(FP), SI
  7379  	XORQ  DI, DI
  7380  	XORQ  R8, R8
  7381  	JMP   check_limit_unroll
  7382  
  7383  loop_unroll:
  7384  	MOVSS (AX)(DI*4), X1
  7385  	MULSS X0, X1
  7386  	ADDSS (DX)(R8*4), X1
  7387  	MOVSS X1, (DX)(R8*4)
  7388  	ADDQ  CX, DI
  7389  	ADDQ  BX, R8
  7390  	MOVSS (AX)(DI*4), X1
  7391  	MULSS X0, X1
  7392  	ADDSS (DX)(R8*4), X1
  7393  	MOVSS X1, (DX)(R8*4)
  7394  	ADDQ  CX, DI
  7395  	ADDQ  BX, R8
  7396  	MOVSS (AX)(DI*4), X1
  7397  	MULSS X0, X1
  7398  	ADDSS (DX)(R8*4), X1
  7399  	MOVSS X1, (DX)(R8*4)
  7400  	ADDQ  CX, DI
  7401  	ADDQ  BX, R8
  7402  	MOVSS (AX)(DI*4), X1
  7403  	MULSS X0, X1
  7404  	ADDSS (DX)(R8*4), X1
  7405  	MOVSS X1, (DX)(R8*4)
  7406  	ADDQ  CX, DI
  7407  	ADDQ  BX, R8
  7408  	SUBQ  $0x04, SI
  7409  
  7410  check_limit_unroll:
  7411  	CMPQ SI, $0x04
  7412  	JHI  loop_unroll
  7413  	JMP  check_limit
  7414  
  7415  loop:
  7416  	MOVSS (AX)(DI*4), X1
  7417  	MULSS X0, X1
  7418  	ADDSS (DX)(R8*4), X1
  7419  	MOVSS X1, (DX)(R8*4)
  7420  	DECQ  SI
  7421  	ADDQ  CX, DI
  7422  	ADDQ  BX, R8
  7423  
  7424  check_limit:
  7425  	CMPQ SI, $0x00
  7426  	JHI  loop
  7427  	RET
  7428  
  7429  // func AmdAxpyUnsafeX_V4A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7430  // Requires: SSE
  7431  TEXT ·AmdAxpyUnsafeX_V4A0R4(SB), NOSPLIT, $0-48
  7432  	MOVSS alpha+0(FP), X0
  7433  	MOVQ  xs+8(FP), AX
  7434  	MOVQ  incx+16(FP), CX
  7435  	MOVQ  ys+24(FP), DX
  7436  	MOVQ  incy+32(FP), BX
  7437  	MOVQ  n+40(FP), SI
  7438  	XORQ  DI, DI
  7439  	XORQ  R8, R8
  7440  	JMP   check_limit_unroll
  7441  
  7442  loop_unroll:
  7443  	MOVSS (AX)(DI*4), X1
  7444  	MULSS X0, X1
  7445  	ADDSS (DX)(R8*4), X1
  7446  	MOVSS X1, (DX)(R8*4)
  7447  	ADDQ  CX, DI
  7448  	ADDQ  BX, R8
  7449  	MOVSS (AX)(DI*4), X1
  7450  	MULSS X0, X1
  7451  	ADDSS (DX)(R8*4), X1
  7452  	MOVSS X1, (DX)(R8*4)
  7453  	ADDQ  CX, DI
  7454  	ADDQ  BX, R8
  7455  	MOVSS (AX)(DI*4), X1
  7456  	MULSS X0, X1
  7457  	ADDSS (DX)(R8*4), X1
  7458  	MOVSS X1, (DX)(R8*4)
  7459  	ADDQ  CX, DI
  7460  	ADDQ  BX, R8
  7461  	MOVSS (AX)(DI*4), X1
  7462  	MULSS X0, X1
  7463  	ADDSS (DX)(R8*4), X1
  7464  	MOVSS X1, (DX)(R8*4)
  7465  	ADDQ  CX, DI
  7466  	ADDQ  BX, R8
  7467  	SUBQ  $0x04, SI
  7468  
  7469  check_limit_unroll:
  7470  	CMPQ SI, $0x04
  7471  	JHI  loop_unroll
  7472  	JMP  check_limit
  7473  
  7474  loop:
  7475  	MOVSS (AX)(DI*4), X1
  7476  	MULSS X0, X1
  7477  	ADDSS (DX)(R8*4), X1
  7478  	MOVSS X1, (DX)(R8*4)
  7479  	DECQ  SI
  7480  	ADDQ  CX, DI
  7481  	ADDQ  BX, R8
  7482  
  7483  check_limit:
  7484  	CMPQ SI, $0x00
  7485  	JHI  loop
  7486  	RET
  7487  
  7488  // func AmdAxpyUnsafeX_V5A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7489  // Requires: SSE
  7490  TEXT ·AmdAxpyUnsafeX_V5A0R4(SB), NOSPLIT, $0-48
  7491  	MOVSS alpha+0(FP), X0
  7492  	MOVQ  xs+8(FP), AX
  7493  	MOVQ  incx+16(FP), CX
  7494  	MOVQ  ys+24(FP), DX
  7495  	MOVQ  incy+32(FP), BX
  7496  	MOVQ  n+40(FP), SI
  7497  	XORQ  DI, DI
  7498  	XORQ  R8, R8
  7499  	JMP   check_limit_unroll
  7500  
  7501  loop_unroll:
  7502  	MOVSS (AX)(DI*4), X1
  7503  	MULSS X0, X1
  7504  	ADDSS (DX)(R8*4), X1
  7505  	MOVSS X1, (DX)(R8*4)
  7506  	ADDQ  CX, DI
  7507  	ADDQ  BX, R8
  7508  	MOVSS (AX)(DI*4), X1
  7509  	MULSS X0, X1
  7510  	ADDSS (DX)(R8*4), X1
  7511  	MOVSS X1, (DX)(R8*4)
  7512  	ADDQ  CX, DI
  7513  	ADDQ  BX, R8
  7514  	MOVSS (AX)(DI*4), X1
  7515  	MULSS X0, X1
  7516  	ADDSS (DX)(R8*4), X1
  7517  	MOVSS X1, (DX)(R8*4)
  7518  	ADDQ  CX, DI
  7519  	ADDQ  BX, R8
  7520  	MOVSS (AX)(DI*4), X1
  7521  	MULSS X0, X1
  7522  	ADDSS (DX)(R8*4), X1
  7523  	MOVSS X1, (DX)(R8*4)
  7524  	ADDQ  CX, DI
  7525  	ADDQ  BX, R8
  7526  	SUBQ  $0x04, SI
  7527  
  7528  check_limit_unroll:
  7529  	CMPQ SI, $0x04
  7530  	JHI  loop_unroll
  7531  	JMP  check_limit
  7532  
  7533  loop:
  7534  	MOVSS (AX)(DI*4), X1
  7535  	MULSS X0, X1
  7536  	ADDSS (DX)(R8*4), X1
  7537  	MOVSS X1, (DX)(R8*4)
  7538  	DECQ  SI
  7539  	ADDQ  CX, DI
  7540  	ADDQ  BX, R8
  7541  
  7542  check_limit:
  7543  	CMPQ SI, $0x00
  7544  	JHI  loop
  7545  	RET
  7546  
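// The AxR4 variants below combine the four-way unrolling with loop-entry
// padding: PCALIGN $0x08 (plus, in the later groups, one to three extra NOPs)
// aligns loop_unroll, while the loop bodies stay identical to the A0R4
// versions above.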
  7547  // func AmdAxpyUnsafeX_V0A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7548  // Requires: SSE
  7549  TEXT ·AmdAxpyUnsafeX_V0A8R4(SB), NOSPLIT, $0-48
  7550  	MOVSS alpha+0(FP), X0
  7551  	MOVQ  xs+8(FP), AX
  7552  	MOVQ  incx+16(FP), CX
  7553  	MOVQ  ys+24(FP), DX
  7554  	MOVQ  incy+32(FP), BX
  7555  	MOVQ  n+40(FP), SI
  7556  	XORQ  DI, DI
  7557  	XORQ  R8, R8
  7558  	JMP   check_limit_unroll
  7559  	PCALIGN $0x08
  7560  
  7561  loop_unroll:
  7562  	MOVSS (AX)(DI*4), X1
  7563  	MULSS X0, X1
  7564  	ADDSS (DX)(R8*4), X1
  7565  	MOVSS X1, (DX)(R8*4)
  7566  	ADDQ  CX, DI
  7567  	ADDQ  BX, R8
  7568  	MOVSS (AX)(DI*4), X1
  7569  	MULSS X0, X1
  7570  	ADDSS (DX)(R8*4), X1
  7571  	MOVSS X1, (DX)(R8*4)
  7572  	ADDQ  CX, DI
  7573  	ADDQ  BX, R8
  7574  	MOVSS (AX)(DI*4), X1
  7575  	MULSS X0, X1
  7576  	ADDSS (DX)(R8*4), X1
  7577  	MOVSS X1, (DX)(R8*4)
  7578  	ADDQ  CX, DI
  7579  	ADDQ  BX, R8
  7580  	MOVSS (AX)(DI*4), X1
  7581  	MULSS X0, X1
  7582  	ADDSS (DX)(R8*4), X1
  7583  	MOVSS X1, (DX)(R8*4)
  7584  	ADDQ  CX, DI
  7585  	ADDQ  BX, R8
  7586  	SUBQ  $0x04, SI
  7587  
  7588  check_limit_unroll:
  7589  	CMPQ SI, $0x04
  7590  	JHI  loop_unroll
  7591  	JMP  check_limit
  7592  
  7593  loop:
  7594  	MOVSS (AX)(DI*4), X1
  7595  	MULSS X0, X1
  7596  	ADDSS (DX)(R8*4), X1
  7597  	MOVSS X1, (DX)(R8*4)
  7598  	DECQ  SI
  7599  	ADDQ  CX, DI
  7600  	ADDQ  BX, R8
  7601  
  7602  check_limit:
  7603  	CMPQ SI, $0x00
  7604  	JHI  loop
  7605  	RET
  7606  
  7607  // func AmdAxpyUnsafeX_V1A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7608  // Requires: SSE
  7609  TEXT ·AmdAxpyUnsafeX_V1A8R4(SB), NOSPLIT, $0-48
  7610  	MOVSS alpha+0(FP), X0
  7611  	MOVQ  xs+8(FP), AX
  7612  	MOVQ  incx+16(FP), CX
  7613  	MOVQ  ys+24(FP), DX
  7614  	MOVQ  incy+32(FP), BX
  7615  	MOVQ  n+40(FP), SI
  7616  	XORQ  DI, DI
  7617  	XORQ  R8, R8
  7618  	JMP   check_limit_unroll
  7619  	PCALIGN $0x08
  7620  
  7621  loop_unroll:
  7622  	MOVSS (AX)(DI*4), X1
  7623  	MULSS X0, X1
  7624  	ADDSS (DX)(R8*4), X1
  7625  	MOVSS X1, (DX)(R8*4)
  7626  	ADDQ  CX, DI
  7627  	ADDQ  BX, R8
  7628  	MOVSS (AX)(DI*4), X1
  7629  	MULSS X0, X1
  7630  	ADDSS (DX)(R8*4), X1
  7631  	MOVSS X1, (DX)(R8*4)
  7632  	ADDQ  CX, DI
  7633  	ADDQ  BX, R8
  7634  	MOVSS (AX)(DI*4), X1
  7635  	MULSS X0, X1
  7636  	ADDSS (DX)(R8*4), X1
  7637  	MOVSS X1, (DX)(R8*4)
  7638  	ADDQ  CX, DI
  7639  	ADDQ  BX, R8
  7640  	MOVSS (AX)(DI*4), X1
  7641  	MULSS X0, X1
  7642  	ADDSS (DX)(R8*4), X1
  7643  	MOVSS X1, (DX)(R8*4)
  7644  	ADDQ  CX, DI
  7645  	ADDQ  BX, R8
  7646  	SUBQ  $0x04, SI
  7647  
  7648  check_limit_unroll:
  7649  	CMPQ SI, $0x04
  7650  	JHI  loop_unroll
  7651  	JMP  check_limit
  7652  
  7653  loop:
  7654  	MOVSS (AX)(DI*4), X1
  7655  	MULSS X0, X1
  7656  	ADDSS (DX)(R8*4), X1
  7657  	MOVSS X1, (DX)(R8*4)
  7658  	DECQ  SI
  7659  	ADDQ  CX, DI
  7660  	ADDQ  BX, R8
  7661  
  7662  check_limit:
  7663  	CMPQ SI, $0x00
  7664  	JHI  loop
  7665  	RET
  7666  
  7667  // func AmdAxpyUnsafeX_V2A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7668  // Requires: SSE
  7669  TEXT ·AmdAxpyUnsafeX_V2A8R4(SB), NOSPLIT, $0-48
  7670  	MOVSS alpha+0(FP), X0
  7671  	MOVQ  xs+8(FP), AX
  7672  	MOVQ  incx+16(FP), CX
  7673  	MOVQ  ys+24(FP), DX
  7674  	MOVQ  incy+32(FP), BX
  7675  	MOVQ  n+40(FP), SI
  7676  	XORQ  DI, DI
  7677  	XORQ  R8, R8
  7678  	JMP   check_limit_unroll
  7679  	PCALIGN $0x08
  7680  
  7681  loop_unroll:
  7682  	MOVSS (AX)(DI*4), X1
  7683  	MULSS X0, X1
  7684  	ADDSS (DX)(R8*4), X1
  7685  	MOVSS X1, (DX)(R8*4)
  7686  	ADDQ  CX, DI
  7687  	ADDQ  BX, R8
  7688  	MOVSS (AX)(DI*4), X1
  7689  	MULSS X0, X1
  7690  	ADDSS (DX)(R8*4), X1
  7691  	MOVSS X1, (DX)(R8*4)
  7692  	ADDQ  CX, DI
  7693  	ADDQ  BX, R8
  7694  	MOVSS (AX)(DI*4), X1
  7695  	MULSS X0, X1
  7696  	ADDSS (DX)(R8*4), X1
  7697  	MOVSS X1, (DX)(R8*4)
  7698  	ADDQ  CX, DI
  7699  	ADDQ  BX, R8
  7700  	MOVSS (AX)(DI*4), X1
  7701  	MULSS X0, X1
  7702  	ADDSS (DX)(R8*4), X1
  7703  	MOVSS X1, (DX)(R8*4)
  7704  	ADDQ  CX, DI
  7705  	ADDQ  BX, R8
  7706  	SUBQ  $0x04, SI
  7707  
  7708  check_limit_unroll:
  7709  	CMPQ SI, $0x04
  7710  	JHI  loop_unroll
  7711  	JMP  check_limit
  7712  
  7713  loop:
  7714  	MOVSS (AX)(DI*4), X1
  7715  	MULSS X0, X1
  7716  	ADDSS (DX)(R8*4), X1
  7717  	MOVSS X1, (DX)(R8*4)
  7718  	DECQ  SI
  7719  	ADDQ  CX, DI
  7720  	ADDQ  BX, R8
  7721  
  7722  check_limit:
  7723  	CMPQ SI, $0x00
  7724  	JHI  loop
  7725  	RET
  7726  
  7727  // func AmdAxpyUnsafeX_V3A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7728  // Requires: SSE
  7729  TEXT ·AmdAxpyUnsafeX_V3A8R4(SB), NOSPLIT, $0-48
  7730  	MOVSS alpha+0(FP), X0
  7731  	MOVQ  xs+8(FP), AX
  7732  	MOVQ  incx+16(FP), CX
  7733  	MOVQ  ys+24(FP), DX
  7734  	MOVQ  incy+32(FP), BX
  7735  	MOVQ  n+40(FP), SI
  7736  	XORQ  DI, DI
  7737  	XORQ  R8, R8
  7738  	JMP   check_limit_unroll
  7739  	PCALIGN $0x08
  7740  
  7741  loop_unroll:
  7742  	MOVSS (AX)(DI*4), X1
  7743  	MULSS X0, X1
  7744  	ADDSS (DX)(R8*4), X1
  7745  	MOVSS X1, (DX)(R8*4)
  7746  	ADDQ  CX, DI
  7747  	ADDQ  BX, R8
  7748  	MOVSS (AX)(DI*4), X1
  7749  	MULSS X0, X1
  7750  	ADDSS (DX)(R8*4), X1
  7751  	MOVSS X1, (DX)(R8*4)
  7752  	ADDQ  CX, DI
  7753  	ADDQ  BX, R8
  7754  	MOVSS (AX)(DI*4), X1
  7755  	MULSS X0, X1
  7756  	ADDSS (DX)(R8*4), X1
  7757  	MOVSS X1, (DX)(R8*4)
  7758  	ADDQ  CX, DI
  7759  	ADDQ  BX, R8
  7760  	MOVSS (AX)(DI*4), X1
  7761  	MULSS X0, X1
  7762  	ADDSS (DX)(R8*4), X1
  7763  	MOVSS X1, (DX)(R8*4)
  7764  	ADDQ  CX, DI
  7765  	ADDQ  BX, R8
  7766  	SUBQ  $0x04, SI
  7767  
  7768  check_limit_unroll:
  7769  	CMPQ SI, $0x04
  7770  	JHI  loop_unroll
  7771  	JMP  check_limit
  7772  
  7773  loop:
  7774  	MOVSS (AX)(DI*4), X1
  7775  	MULSS X0, X1
  7776  	ADDSS (DX)(R8*4), X1
  7777  	MOVSS X1, (DX)(R8*4)
  7778  	DECQ  SI
  7779  	ADDQ  CX, DI
  7780  	ADDQ  BX, R8
  7781  
  7782  check_limit:
  7783  	CMPQ SI, $0x00
  7784  	JHI  loop
  7785  	RET
  7786  
  7787  // func AmdAxpyUnsafeX_V4A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7788  // Requires: SSE
  7789  TEXT ·AmdAxpyUnsafeX_V4A8R4(SB), NOSPLIT, $0-48
  7790  	MOVSS alpha+0(FP), X0
  7791  	MOVQ  xs+8(FP), AX
  7792  	MOVQ  incx+16(FP), CX
  7793  	MOVQ  ys+24(FP), DX
  7794  	MOVQ  incy+32(FP), BX
  7795  	MOVQ  n+40(FP), SI
  7796  	XORQ  DI, DI
  7797  	XORQ  R8, R8
  7798  	JMP   check_limit_unroll
  7799  	PCALIGN $0x08
  7800  
  7801  loop_unroll:
  7802  	MOVSS (AX)(DI*4), X1
  7803  	MULSS X0, X1
  7804  	ADDSS (DX)(R8*4), X1
  7805  	MOVSS X1, (DX)(R8*4)
  7806  	ADDQ  CX, DI
  7807  	ADDQ  BX, R8
  7808  	MOVSS (AX)(DI*4), X1
  7809  	MULSS X0, X1
  7810  	ADDSS (DX)(R8*4), X1
  7811  	MOVSS X1, (DX)(R8*4)
  7812  	ADDQ  CX, DI
  7813  	ADDQ  BX, R8
  7814  	MOVSS (AX)(DI*4), X1
  7815  	MULSS X0, X1
  7816  	ADDSS (DX)(R8*4), X1
  7817  	MOVSS X1, (DX)(R8*4)
  7818  	ADDQ  CX, DI
  7819  	ADDQ  BX, R8
  7820  	MOVSS (AX)(DI*4), X1
  7821  	MULSS X0, X1
  7822  	ADDSS (DX)(R8*4), X1
  7823  	MOVSS X1, (DX)(R8*4)
  7824  	ADDQ  CX, DI
  7825  	ADDQ  BX, R8
  7826  	SUBQ  $0x04, SI
  7827  
  7828  check_limit_unroll:
  7829  	CMPQ SI, $0x04
  7830  	JHI  loop_unroll
  7831  	JMP  check_limit
  7832  
  7833  loop:
  7834  	MOVSS (AX)(DI*4), X1
  7835  	MULSS X0, X1
  7836  	ADDSS (DX)(R8*4), X1
  7837  	MOVSS X1, (DX)(R8*4)
  7838  	DECQ  SI
  7839  	ADDQ  CX, DI
  7840  	ADDQ  BX, R8
  7841  
  7842  check_limit:
  7843  	CMPQ SI, $0x00
  7844  	JHI  loop
  7845  	RET
  7846  
  7847  // func AmdAxpyUnsafeX_V5A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7848  // Requires: SSE
  7849  TEXT ·AmdAxpyUnsafeX_V5A8R4(SB), NOSPLIT, $0-48
  7850  	MOVSS alpha+0(FP), X0
  7851  	MOVQ  xs+8(FP), AX
  7852  	MOVQ  incx+16(FP), CX
  7853  	MOVQ  ys+24(FP), DX
  7854  	MOVQ  incy+32(FP), BX
  7855  	MOVQ  n+40(FP), SI
  7856  	XORQ  DI, DI
  7857  	XORQ  R8, R8
  7858  	JMP   check_limit_unroll
  7859  	PCALIGN $0x08
  7860  
  7861  loop_unroll:
  7862  	MOVSS (AX)(DI*4), X1
  7863  	MULSS X0, X1
  7864  	ADDSS (DX)(R8*4), X1
  7865  	MOVSS X1, (DX)(R8*4)
  7866  	ADDQ  CX, DI
  7867  	ADDQ  BX, R8
  7868  	MOVSS (AX)(DI*4), X1
  7869  	MULSS X0, X1
  7870  	ADDSS (DX)(R8*4), X1
  7871  	MOVSS X1, (DX)(R8*4)
  7872  	ADDQ  CX, DI
  7873  	ADDQ  BX, R8
  7874  	MOVSS (AX)(DI*4), X1
  7875  	MULSS X0, X1
  7876  	ADDSS (DX)(R8*4), X1
  7877  	MOVSS X1, (DX)(R8*4)
  7878  	ADDQ  CX, DI
  7879  	ADDQ  BX, R8
  7880  	MOVSS (AX)(DI*4), X1
  7881  	MULSS X0, X1
  7882  	ADDSS (DX)(R8*4), X1
  7883  	MOVSS X1, (DX)(R8*4)
  7884  	ADDQ  CX, DI
  7885  	ADDQ  BX, R8
  7886  	SUBQ  $0x04, SI
  7887  
  7888  check_limit_unroll:
  7889  	CMPQ SI, $0x04
  7890  	JHI  loop_unroll
  7891  	JMP  check_limit
  7892  
  7893  loop:
  7894  	MOVSS (AX)(DI*4), X1
  7895  	MULSS X0, X1
  7896  	ADDSS (DX)(R8*4), X1
  7897  	MOVSS X1, (DX)(R8*4)
  7898  	DECQ  SI
  7899  	ADDQ  CX, DI
  7900  	ADDQ  BX, R8
  7901  
  7902  check_limit:
  7903  	CMPQ SI, $0x00
  7904  	JHI  loop
  7905  	RET
  7906  
  7907  // func AmdAxpyUnsafeX_V0A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7908  // Requires: SSE
  7909  TEXT ·AmdAxpyUnsafeX_V0A9R4(SB), NOSPLIT, $0-48
  7910  	MOVSS alpha+0(FP), X0
  7911  	MOVQ  xs+8(FP), AX
  7912  	MOVQ  incx+16(FP), CX
  7913  	MOVQ  ys+24(FP), DX
  7914  	MOVQ  incy+32(FP), BX
  7915  	MOVQ  n+40(FP), SI
  7916  	XORQ  DI, DI
  7917  	XORQ  R8, R8
  7918  	JMP   check_limit_unroll
  7919  	PCALIGN $0x08
  7920  	NOP
  7921  
  7922  loop_unroll:
  7923  	MOVSS (AX)(DI*4), X1
  7924  	MULSS X0, X1
  7925  	ADDSS (DX)(R8*4), X1
  7926  	MOVSS X1, (DX)(R8*4)
  7927  	ADDQ  CX, DI
  7928  	ADDQ  BX, R8
  7929  	MOVSS (AX)(DI*4), X1
  7930  	MULSS X0, X1
  7931  	ADDSS (DX)(R8*4), X1
  7932  	MOVSS X1, (DX)(R8*4)
  7933  	ADDQ  CX, DI
  7934  	ADDQ  BX, R8
  7935  	MOVSS (AX)(DI*4), X1
  7936  	MULSS X0, X1
  7937  	ADDSS (DX)(R8*4), X1
  7938  	MOVSS X1, (DX)(R8*4)
  7939  	ADDQ  CX, DI
  7940  	ADDQ  BX, R8
  7941  	MOVSS (AX)(DI*4), X1
  7942  	MULSS X0, X1
  7943  	ADDSS (DX)(R8*4), X1
  7944  	MOVSS X1, (DX)(R8*4)
  7945  	ADDQ  CX, DI
  7946  	ADDQ  BX, R8
  7947  	SUBQ  $0x04, SI
  7948  
  7949  check_limit_unroll:
  7950  	CMPQ SI, $0x04
  7951  	JHI  loop_unroll
  7952  	JMP  check_limit
  7953  
  7954  loop:
  7955  	MOVSS (AX)(DI*4), X1
  7956  	MULSS X0, X1
  7957  	ADDSS (DX)(R8*4), X1
  7958  	MOVSS X1, (DX)(R8*4)
  7959  	DECQ  SI
  7960  	ADDQ  CX, DI
  7961  	ADDQ  BX, R8
  7962  
  7963  check_limit:
  7964  	CMPQ SI, $0x00
  7965  	JHI  loop
  7966  	RET
  7967  
  7968  // func AmdAxpyUnsafeX_V1A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  7969  // Requires: SSE
  7970  TEXT ·AmdAxpyUnsafeX_V1A9R4(SB), NOSPLIT, $0-48
  7971  	MOVSS alpha+0(FP), X0
  7972  	MOVQ  xs+8(FP), AX
  7973  	MOVQ  incx+16(FP), CX
  7974  	MOVQ  ys+24(FP), DX
  7975  	MOVQ  incy+32(FP), BX
  7976  	MOVQ  n+40(FP), SI
  7977  	XORQ  DI, DI
  7978  	XORQ  R8, R8
  7979  	JMP   check_limit_unroll
  7980  	PCALIGN $0x08
  7981  	NOP
  7982  
  7983  loop_unroll:
  7984  	MOVSS (AX)(DI*4), X1
  7985  	MULSS X0, X1
  7986  	ADDSS (DX)(R8*4), X1
  7987  	MOVSS X1, (DX)(R8*4)
  7988  	ADDQ  CX, DI
  7989  	ADDQ  BX, R8
  7990  	MOVSS (AX)(DI*4), X1
  7991  	MULSS X0, X1
  7992  	ADDSS (DX)(R8*4), X1
  7993  	MOVSS X1, (DX)(R8*4)
  7994  	ADDQ  CX, DI
  7995  	ADDQ  BX, R8
  7996  	MOVSS (AX)(DI*4), X1
  7997  	MULSS X0, X1
  7998  	ADDSS (DX)(R8*4), X1
  7999  	MOVSS X1, (DX)(R8*4)
  8000  	ADDQ  CX, DI
  8001  	ADDQ  BX, R8
  8002  	MOVSS (AX)(DI*4), X1
  8003  	MULSS X0, X1
  8004  	ADDSS (DX)(R8*4), X1
  8005  	MOVSS X1, (DX)(R8*4)
  8006  	ADDQ  CX, DI
  8007  	ADDQ  BX, R8
  8008  	SUBQ  $0x04, SI
  8009  
  8010  check_limit_unroll:
  8011  	CMPQ SI, $0x04
  8012  	JHI  loop_unroll
  8013  	JMP  check_limit
  8014  
  8015  loop:
  8016  	MOVSS (AX)(DI*4), X1
  8017  	MULSS X0, X1
  8018  	ADDSS (DX)(R8*4), X1
  8019  	MOVSS X1, (DX)(R8*4)
  8020  	DECQ  SI
  8021  	ADDQ  CX, DI
  8022  	ADDQ  BX, R8
  8023  
  8024  check_limit:
  8025  	CMPQ SI, $0x00
  8026  	JHI  loop
  8027  	RET
  8028  
  8029  // func AmdAxpyUnsafeX_V2A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8030  // Requires: SSE
  8031  TEXT ·AmdAxpyUnsafeX_V2A9R4(SB), NOSPLIT, $0-48
  8032  	MOVSS alpha+0(FP), X0
  8033  	MOVQ  xs+8(FP), AX
  8034  	MOVQ  incx+16(FP), CX
  8035  	MOVQ  ys+24(FP), DX
  8036  	MOVQ  incy+32(FP), BX
  8037  	MOVQ  n+40(FP), SI
  8038  	XORQ  DI, DI
  8039  	XORQ  R8, R8
  8040  	JMP   check_limit_unroll
  8041  	PCALIGN $0x08
  8042  	NOP
  8043  
  8044  loop_unroll:
  8045  	MOVSS (AX)(DI*4), X1
  8046  	MULSS X0, X1
  8047  	ADDSS (DX)(R8*4), X1
  8048  	MOVSS X1, (DX)(R8*4)
  8049  	ADDQ  CX, DI
  8050  	ADDQ  BX, R8
  8051  	MOVSS (AX)(DI*4), X1
  8052  	MULSS X0, X1
  8053  	ADDSS (DX)(R8*4), X1
  8054  	MOVSS X1, (DX)(R8*4)
  8055  	ADDQ  CX, DI
  8056  	ADDQ  BX, R8
  8057  	MOVSS (AX)(DI*4), X1
  8058  	MULSS X0, X1
  8059  	ADDSS (DX)(R8*4), X1
  8060  	MOVSS X1, (DX)(R8*4)
  8061  	ADDQ  CX, DI
  8062  	ADDQ  BX, R8
  8063  	MOVSS (AX)(DI*4), X1
  8064  	MULSS X0, X1
  8065  	ADDSS (DX)(R8*4), X1
  8066  	MOVSS X1, (DX)(R8*4)
  8067  	ADDQ  CX, DI
  8068  	ADDQ  BX, R8
  8069  	SUBQ  $0x04, SI
  8070  
  8071  check_limit_unroll:
  8072  	CMPQ SI, $0x04
  8073  	JHI  loop_unroll
  8074  	JMP  check_limit
  8075  
  8076  loop:
  8077  	MOVSS (AX)(DI*4), X1
  8078  	MULSS X0, X1
  8079  	ADDSS (DX)(R8*4), X1
  8080  	MOVSS X1, (DX)(R8*4)
  8081  	DECQ  SI
  8082  	ADDQ  CX, DI
  8083  	ADDQ  BX, R8
  8084  
  8085  check_limit:
  8086  	CMPQ SI, $0x00
  8087  	JHI  loop
  8088  	RET
  8089  
  8090  // func AmdAxpyUnsafeX_V3A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8091  // Requires: SSE
  8092  TEXT ·AmdAxpyUnsafeX_V3A9R4(SB), NOSPLIT, $0-48
  8093  	MOVSS alpha+0(FP), X0
  8094  	MOVQ  xs+8(FP), AX
  8095  	MOVQ  incx+16(FP), CX
  8096  	MOVQ  ys+24(FP), DX
  8097  	MOVQ  incy+32(FP), BX
  8098  	MOVQ  n+40(FP), SI
  8099  	XORQ  DI, DI
  8100  	XORQ  R8, R8
  8101  	JMP   check_limit_unroll
  8102  	PCALIGN $0x08
  8103  	NOP
  8104  
  8105  loop_unroll:
  8106  	MOVSS (AX)(DI*4), X1
  8107  	MULSS X0, X1
  8108  	ADDSS (DX)(R8*4), X1
  8109  	MOVSS X1, (DX)(R8*4)
  8110  	ADDQ  CX, DI
  8111  	ADDQ  BX, R8
  8112  	MOVSS (AX)(DI*4), X1
  8113  	MULSS X0, X1
  8114  	ADDSS (DX)(R8*4), X1
  8115  	MOVSS X1, (DX)(R8*4)
  8116  	ADDQ  CX, DI
  8117  	ADDQ  BX, R8
  8118  	MOVSS (AX)(DI*4), X1
  8119  	MULSS X0, X1
  8120  	ADDSS (DX)(R8*4), X1
  8121  	MOVSS X1, (DX)(R8*4)
  8122  	ADDQ  CX, DI
  8123  	ADDQ  BX, R8
  8124  	MOVSS (AX)(DI*4), X1
  8125  	MULSS X0, X1
  8126  	ADDSS (DX)(R8*4), X1
  8127  	MOVSS X1, (DX)(R8*4)
  8128  	ADDQ  CX, DI
  8129  	ADDQ  BX, R8
  8130  	SUBQ  $0x04, SI
  8131  
  8132  check_limit_unroll:
  8133  	CMPQ SI, $0x04
  8134  	JHI  loop_unroll
  8135  	JMP  check_limit
  8136  
  8137  loop:
  8138  	MOVSS (AX)(DI*4), X1
  8139  	MULSS X0, X1
  8140  	ADDSS (DX)(R8*4), X1
  8141  	MOVSS X1, (DX)(R8*4)
  8142  	DECQ  SI
  8143  	ADDQ  CX, DI
  8144  	ADDQ  BX, R8
  8145  
  8146  check_limit:
  8147  	CMPQ SI, $0x00
  8148  	JHI  loop
  8149  	RET
  8150  
  8151  // func AmdAxpyUnsafeX_V4A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8152  // Requires: SSE
  8153  TEXT ·AmdAxpyUnsafeX_V4A9R4(SB), NOSPLIT, $0-48
  8154  	MOVSS alpha+0(FP), X0
  8155  	MOVQ  xs+8(FP), AX
  8156  	MOVQ  incx+16(FP), CX
  8157  	MOVQ  ys+24(FP), DX
  8158  	MOVQ  incy+32(FP), BX
  8159  	MOVQ  n+40(FP), SI
  8160  	XORQ  DI, DI
  8161  	XORQ  R8, R8
  8162  	JMP   check_limit_unroll
  8163  	PCALIGN $0x08
  8164  	NOP
  8165  
  8166  loop_unroll:
  8167  	MOVSS (AX)(DI*4), X1
  8168  	MULSS X0, X1
  8169  	ADDSS (DX)(R8*4), X1
  8170  	MOVSS X1, (DX)(R8*4)
  8171  	ADDQ  CX, DI
  8172  	ADDQ  BX, R8
  8173  	MOVSS (AX)(DI*4), X1
  8174  	MULSS X0, X1
  8175  	ADDSS (DX)(R8*4), X1
  8176  	MOVSS X1, (DX)(R8*4)
  8177  	ADDQ  CX, DI
  8178  	ADDQ  BX, R8
  8179  	MOVSS (AX)(DI*4), X1
  8180  	MULSS X0, X1
  8181  	ADDSS (DX)(R8*4), X1
  8182  	MOVSS X1, (DX)(R8*4)
  8183  	ADDQ  CX, DI
  8184  	ADDQ  BX, R8
  8185  	MOVSS (AX)(DI*4), X1
  8186  	MULSS X0, X1
  8187  	ADDSS (DX)(R8*4), X1
  8188  	MOVSS X1, (DX)(R8*4)
  8189  	ADDQ  CX, DI
  8190  	ADDQ  BX, R8
  8191  	SUBQ  $0x04, SI
  8192  
  8193  check_limit_unroll:
  8194  	CMPQ SI, $0x04
  8195  	JHI  loop_unroll
  8196  	JMP  check_limit
  8197  
  8198  loop:
  8199  	MOVSS (AX)(DI*4), X1
  8200  	MULSS X0, X1
  8201  	ADDSS (DX)(R8*4), X1
  8202  	MOVSS X1, (DX)(R8*4)
  8203  	DECQ  SI
  8204  	ADDQ  CX, DI
  8205  	ADDQ  BX, R8
  8206  
  8207  check_limit:
  8208  	CMPQ SI, $0x00
  8209  	JHI  loop
  8210  	RET
  8211  
  8212  // func AmdAxpyUnsafeX_V5A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8213  // Requires: SSE
  8214  TEXT ·AmdAxpyUnsafeX_V5A9R4(SB), NOSPLIT, $0-48
  8215  	MOVSS alpha+0(FP), X0
  8216  	MOVQ  xs+8(FP), AX
  8217  	MOVQ  incx+16(FP), CX
  8218  	MOVQ  ys+24(FP), DX
  8219  	MOVQ  incy+32(FP), BX
  8220  	MOVQ  n+40(FP), SI
  8221  	XORQ  DI, DI
  8222  	XORQ  R8, R8
  8223  	JMP   check_limit_unroll
  8224  	PCALIGN $0x08
  8225  	NOP
  8226  
  8227  loop_unroll:
  8228  	MOVSS (AX)(DI*4), X1
  8229  	MULSS X0, X1
  8230  	ADDSS (DX)(R8*4), X1
  8231  	MOVSS X1, (DX)(R8*4)
  8232  	ADDQ  CX, DI
  8233  	ADDQ  BX, R8
  8234  	MOVSS (AX)(DI*4), X1
  8235  	MULSS X0, X1
  8236  	ADDSS (DX)(R8*4), X1
  8237  	MOVSS X1, (DX)(R8*4)
  8238  	ADDQ  CX, DI
  8239  	ADDQ  BX, R8
  8240  	MOVSS (AX)(DI*4), X1
  8241  	MULSS X0, X1
  8242  	ADDSS (DX)(R8*4), X1
  8243  	MOVSS X1, (DX)(R8*4)
  8244  	ADDQ  CX, DI
  8245  	ADDQ  BX, R8
  8246  	MOVSS (AX)(DI*4), X1
  8247  	MULSS X0, X1
  8248  	ADDSS (DX)(R8*4), X1
  8249  	MOVSS X1, (DX)(R8*4)
  8250  	ADDQ  CX, DI
  8251  	ADDQ  BX, R8
  8252  	SUBQ  $0x04, SI
  8253  
  8254  check_limit_unroll:
  8255  	CMPQ SI, $0x04
  8256  	JHI  loop_unroll
  8257  	JMP  check_limit
  8258  
  8259  loop:
  8260  	MOVSS (AX)(DI*4), X1
  8261  	MULSS X0, X1
  8262  	ADDSS (DX)(R8*4), X1
  8263  	MOVSS X1, (DX)(R8*4)
  8264  	DECQ  SI
  8265  	ADDQ  CX, DI
  8266  	ADDQ  BX, R8
  8267  
  8268  check_limit:
  8269  	CMPQ SI, $0x00
  8270  	JHI  loop
  8271  	RET
  8272  
  8273  // func AmdAxpyUnsafeX_V0A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8274  // Requires: SSE
  8275  TEXT ·AmdAxpyUnsafeX_V0A10R4(SB), NOSPLIT, $0-48
  8276  	MOVSS alpha+0(FP), X0
  8277  	MOVQ  xs+8(FP), AX
  8278  	MOVQ  incx+16(FP), CX
  8279  	MOVQ  ys+24(FP), DX
  8280  	MOVQ  incy+32(FP), BX
  8281  	MOVQ  n+40(FP), SI
  8282  	XORQ  DI, DI
  8283  	XORQ  R8, R8
  8284  	JMP   check_limit_unroll
  8285  	PCALIGN $0x08
  8286  	NOP
  8287  	NOP
  8288  
  8289  loop_unroll:
  8290  	MOVSS (AX)(DI*4), X1
  8291  	MULSS X0, X1
  8292  	ADDSS (DX)(R8*4), X1
  8293  	MOVSS X1, (DX)(R8*4)
  8294  	ADDQ  CX, DI
  8295  	ADDQ  BX, R8
  8296  	MOVSS (AX)(DI*4), X1
  8297  	MULSS X0, X1
  8298  	ADDSS (DX)(R8*4), X1
  8299  	MOVSS X1, (DX)(R8*4)
  8300  	ADDQ  CX, DI
  8301  	ADDQ  BX, R8
  8302  	MOVSS (AX)(DI*4), X1
  8303  	MULSS X0, X1
  8304  	ADDSS (DX)(R8*4), X1
  8305  	MOVSS X1, (DX)(R8*4)
  8306  	ADDQ  CX, DI
  8307  	ADDQ  BX, R8
  8308  	MOVSS (AX)(DI*4), X1
  8309  	MULSS X0, X1
  8310  	ADDSS (DX)(R8*4), X1
  8311  	MOVSS X1, (DX)(R8*4)
  8312  	ADDQ  CX, DI
  8313  	ADDQ  BX, R8
  8314  	SUBQ  $0x04, SI
  8315  
  8316  check_limit_unroll:
  8317  	CMPQ SI, $0x04
  8318  	JHI  loop_unroll
  8319  	JMP  check_limit
  8320  
  8321  loop:
  8322  	MOVSS (AX)(DI*4), X1
  8323  	MULSS X0, X1
  8324  	ADDSS (DX)(R8*4), X1
  8325  	MOVSS X1, (DX)(R8*4)
  8326  	DECQ  SI
  8327  	ADDQ  CX, DI
  8328  	ADDQ  BX, R8
  8329  
  8330  check_limit:
  8331  	CMPQ SI, $0x00
  8332  	JHI  loop
  8333  	RET
  8334  
  8335  // func AmdAxpyUnsafeX_V1A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8336  // Requires: SSE
  8337  TEXT ·AmdAxpyUnsafeX_V1A10R4(SB), NOSPLIT, $0-48
  8338  	MOVSS alpha+0(FP), X0
  8339  	MOVQ  xs+8(FP), AX
  8340  	MOVQ  incx+16(FP), CX
  8341  	MOVQ  ys+24(FP), DX
  8342  	MOVQ  incy+32(FP), BX
  8343  	MOVQ  n+40(FP), SI
  8344  	XORQ  DI, DI
  8345  	XORQ  R8, R8
  8346  	JMP   check_limit_unroll
  8347  	PCALIGN $0x08
  8348  	NOP
  8349  	NOP
  8350  
  8351  loop_unroll:
  8352  	MOVSS (AX)(DI*4), X1
  8353  	MULSS X0, X1
  8354  	ADDSS (DX)(R8*4), X1
  8355  	MOVSS X1, (DX)(R8*4)
  8356  	ADDQ  CX, DI
  8357  	ADDQ  BX, R8
  8358  	MOVSS (AX)(DI*4), X1
  8359  	MULSS X0, X1
  8360  	ADDSS (DX)(R8*4), X1
  8361  	MOVSS X1, (DX)(R8*4)
  8362  	ADDQ  CX, DI
  8363  	ADDQ  BX, R8
  8364  	MOVSS (AX)(DI*4), X1
  8365  	MULSS X0, X1
  8366  	ADDSS (DX)(R8*4), X1
  8367  	MOVSS X1, (DX)(R8*4)
  8368  	ADDQ  CX, DI
  8369  	ADDQ  BX, R8
  8370  	MOVSS (AX)(DI*4), X1
  8371  	MULSS X0, X1
  8372  	ADDSS (DX)(R8*4), X1
  8373  	MOVSS X1, (DX)(R8*4)
  8374  	ADDQ  CX, DI
  8375  	ADDQ  BX, R8
  8376  	SUBQ  $0x04, SI
  8377  
  8378  check_limit_unroll:
  8379  	CMPQ SI, $0x04
  8380  	JHI  loop_unroll
  8381  	JMP  check_limit
  8382  
  8383  loop:
  8384  	MOVSS (AX)(DI*4), X1
  8385  	MULSS X0, X1
  8386  	ADDSS (DX)(R8*4), X1
  8387  	MOVSS X1, (DX)(R8*4)
  8388  	DECQ  SI
  8389  	ADDQ  CX, DI
  8390  	ADDQ  BX, R8
  8391  
  8392  check_limit:
  8393  	CMPQ SI, $0x00
  8394  	JHI  loop
  8395  	RET
  8396  
  8397  // func AmdAxpyUnsafeX_V2A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8398  // Requires: SSE
  8399  TEXT ·AmdAxpyUnsafeX_V2A10R4(SB), NOSPLIT, $0-48
  8400  	MOVSS alpha+0(FP), X0
  8401  	MOVQ  xs+8(FP), AX
  8402  	MOVQ  incx+16(FP), CX
  8403  	MOVQ  ys+24(FP), DX
  8404  	MOVQ  incy+32(FP), BX
  8405  	MOVQ  n+40(FP), SI
  8406  	XORQ  DI, DI
  8407  	XORQ  R8, R8
  8408  	JMP   check_limit_unroll
  8409  	PCALIGN $0x08
  8410  	NOP
  8411  	NOP
  8412  
  8413  loop_unroll:
  8414  	MOVSS (AX)(DI*4), X1
  8415  	MULSS X0, X1
  8416  	ADDSS (DX)(R8*4), X1
  8417  	MOVSS X1, (DX)(R8*4)
  8418  	ADDQ  CX, DI
  8419  	ADDQ  BX, R8
  8420  	MOVSS (AX)(DI*4), X1
  8421  	MULSS X0, X1
  8422  	ADDSS (DX)(R8*4), X1
  8423  	MOVSS X1, (DX)(R8*4)
  8424  	ADDQ  CX, DI
  8425  	ADDQ  BX, R8
  8426  	MOVSS (AX)(DI*4), X1
  8427  	MULSS X0, X1
  8428  	ADDSS (DX)(R8*4), X1
  8429  	MOVSS X1, (DX)(R8*4)
  8430  	ADDQ  CX, DI
  8431  	ADDQ  BX, R8
  8432  	MOVSS (AX)(DI*4), X1
  8433  	MULSS X0, X1
  8434  	ADDSS (DX)(R8*4), X1
  8435  	MOVSS X1, (DX)(R8*4)
  8436  	ADDQ  CX, DI
  8437  	ADDQ  BX, R8
  8438  	SUBQ  $0x04, SI
  8439  
  8440  check_limit_unroll:
  8441  	CMPQ SI, $0x04
  8442  	JHI  loop_unroll
  8443  	JMP  check_limit
  8444  
  8445  loop:
  8446  	MOVSS (AX)(DI*4), X1
  8447  	MULSS X0, X1
  8448  	ADDSS (DX)(R8*4), X1
  8449  	MOVSS X1, (DX)(R8*4)
  8450  	DECQ  SI
  8451  	ADDQ  CX, DI
  8452  	ADDQ  BX, R8
  8453  
  8454  check_limit:
  8455  	CMPQ SI, $0x00
  8456  	JHI  loop
  8457  	RET
  8458  
  8459  // func AmdAxpyUnsafeX_V3A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8460  // Requires: SSE
  8461  TEXT ·AmdAxpyUnsafeX_V3A10R4(SB), NOSPLIT, $0-48
  8462  	MOVSS alpha+0(FP), X0
  8463  	MOVQ  xs+8(FP), AX
  8464  	MOVQ  incx+16(FP), CX
  8465  	MOVQ  ys+24(FP), DX
  8466  	MOVQ  incy+32(FP), BX
  8467  	MOVQ  n+40(FP), SI
  8468  	XORQ  DI, DI
  8469  	XORQ  R8, R8
  8470  	JMP   check_limit_unroll
  8471  	PCALIGN $0x08
  8472  	NOP
  8473  	NOP
  8474  
  8475  loop_unroll:
  8476  	MOVSS (AX)(DI*4), X1
  8477  	MULSS X0, X1
  8478  	ADDSS (DX)(R8*4), X1
  8479  	MOVSS X1, (DX)(R8*4)
  8480  	ADDQ  CX, DI
  8481  	ADDQ  BX, R8
  8482  	MOVSS (AX)(DI*4), X1
  8483  	MULSS X0, X1
  8484  	ADDSS (DX)(R8*4), X1
  8485  	MOVSS X1, (DX)(R8*4)
  8486  	ADDQ  CX, DI
  8487  	ADDQ  BX, R8
  8488  	MOVSS (AX)(DI*4), X1
  8489  	MULSS X0, X1
  8490  	ADDSS (DX)(R8*4), X1
  8491  	MOVSS X1, (DX)(R8*4)
  8492  	ADDQ  CX, DI
  8493  	ADDQ  BX, R8
  8494  	MOVSS (AX)(DI*4), X1
  8495  	MULSS X0, X1
  8496  	ADDSS (DX)(R8*4), X1
  8497  	MOVSS X1, (DX)(R8*4)
  8498  	ADDQ  CX, DI
  8499  	ADDQ  BX, R8
  8500  	SUBQ  $0x04, SI
  8501  
  8502  check_limit_unroll:
  8503  	CMPQ SI, $0x04
  8504  	JHI  loop_unroll
  8505  	JMP  check_limit
  8506  
  8507  loop:
  8508  	MOVSS (AX)(DI*4), X1
  8509  	MULSS X0, X1
  8510  	ADDSS (DX)(R8*4), X1
  8511  	MOVSS X1, (DX)(R8*4)
  8512  	DECQ  SI
  8513  	ADDQ  CX, DI
  8514  	ADDQ  BX, R8
  8515  
  8516  check_limit:
  8517  	CMPQ SI, $0x00
  8518  	JHI  loop
  8519  	RET
  8520  
  8521  // func AmdAxpyUnsafeX_V4A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8522  // Requires: SSE
  8523  TEXT ·AmdAxpyUnsafeX_V4A10R4(SB), NOSPLIT, $0-48
  8524  	MOVSS alpha+0(FP), X0
  8525  	MOVQ  xs+8(FP), AX
  8526  	MOVQ  incx+16(FP), CX
  8527  	MOVQ  ys+24(FP), DX
  8528  	MOVQ  incy+32(FP), BX
  8529  	MOVQ  n+40(FP), SI
  8530  	XORQ  DI, DI
  8531  	XORQ  R8, R8
  8532  	JMP   check_limit_unroll
  8533  	PCALIGN $0x08
  8534  	NOP
  8535  	NOP
  8536  
  8537  loop_unroll:
  8538  	MOVSS (AX)(DI*4), X1
  8539  	MULSS X0, X1
  8540  	ADDSS (DX)(R8*4), X1
  8541  	MOVSS X1, (DX)(R8*4)
  8542  	ADDQ  CX, DI
  8543  	ADDQ  BX, R8
  8544  	MOVSS (AX)(DI*4), X1
  8545  	MULSS X0, X1
  8546  	ADDSS (DX)(R8*4), X1
  8547  	MOVSS X1, (DX)(R8*4)
  8548  	ADDQ  CX, DI
  8549  	ADDQ  BX, R8
  8550  	MOVSS (AX)(DI*4), X1
  8551  	MULSS X0, X1
  8552  	ADDSS (DX)(R8*4), X1
  8553  	MOVSS X1, (DX)(R8*4)
  8554  	ADDQ  CX, DI
  8555  	ADDQ  BX, R8
  8556  	MOVSS (AX)(DI*4), X1
  8557  	MULSS X0, X1
  8558  	ADDSS (DX)(R8*4), X1
  8559  	MOVSS X1, (DX)(R8*4)
  8560  	ADDQ  CX, DI
  8561  	ADDQ  BX, R8
  8562  	SUBQ  $0x04, SI
  8563  
  8564  check_limit_unroll:
  8565  	CMPQ SI, $0x04
  8566  	JHI  loop_unroll
  8567  	JMP  check_limit
  8568  
  8569  loop:
  8570  	MOVSS (AX)(DI*4), X1
  8571  	MULSS X0, X1
  8572  	ADDSS (DX)(R8*4), X1
  8573  	MOVSS X1, (DX)(R8*4)
  8574  	DECQ  SI
  8575  	ADDQ  CX, DI
  8576  	ADDQ  BX, R8
  8577  
  8578  check_limit:
  8579  	CMPQ SI, $0x00
  8580  	JHI  loop
  8581  	RET
  8582  
  8583  // func AmdAxpyUnsafeX_V5A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8584  // Requires: SSE
  8585  TEXT ·AmdAxpyUnsafeX_V5A10R4(SB), NOSPLIT, $0-48
  8586  	MOVSS alpha+0(FP), X0
  8587  	MOVQ  xs+8(FP), AX
  8588  	MOVQ  incx+16(FP), CX
  8589  	MOVQ  ys+24(FP), DX
  8590  	MOVQ  incy+32(FP), BX
  8591  	MOVQ  n+40(FP), SI
  8592  	XORQ  DI, DI
  8593  	XORQ  R8, R8
  8594  	JMP   check_limit_unroll
  8595  	PCALIGN $0x08
  8596  	NOP
  8597  	NOP
  8598  
  8599  loop_unroll:
  8600  	MOVSS (AX)(DI*4), X1
  8601  	MULSS X0, X1
  8602  	ADDSS (DX)(R8*4), X1
  8603  	MOVSS X1, (DX)(R8*4)
  8604  	ADDQ  CX, DI
  8605  	ADDQ  BX, R8
  8606  	MOVSS (AX)(DI*4), X1
  8607  	MULSS X0, X1
  8608  	ADDSS (DX)(R8*4), X1
  8609  	MOVSS X1, (DX)(R8*4)
  8610  	ADDQ  CX, DI
  8611  	ADDQ  BX, R8
  8612  	MOVSS (AX)(DI*4), X1
  8613  	MULSS X0, X1
  8614  	ADDSS (DX)(R8*4), X1
  8615  	MOVSS X1, (DX)(R8*4)
  8616  	ADDQ  CX, DI
  8617  	ADDQ  BX, R8
  8618  	MOVSS (AX)(DI*4), X1
  8619  	MULSS X0, X1
  8620  	ADDSS (DX)(R8*4), X1
  8621  	MOVSS X1, (DX)(R8*4)
  8622  	ADDQ  CX, DI
  8623  	ADDQ  BX, R8
  8624  	SUBQ  $0x04, SI
  8625  
  8626  check_limit_unroll:
  8627  	CMPQ SI, $0x04
  8628  	JHI  loop_unroll
  8629  	JMP  check_limit
  8630  
  8631  loop:
  8632  	MOVSS (AX)(DI*4), X1
  8633  	MULSS X0, X1
  8634  	ADDSS (DX)(R8*4), X1
  8635  	MOVSS X1, (DX)(R8*4)
  8636  	DECQ  SI
  8637  	ADDQ  CX, DI
  8638  	ADDQ  BX, R8
  8639  
  8640  check_limit:
  8641  	CMPQ SI, $0x00
  8642  	JHI  loop
  8643  	RET
  8644  
  8645  // func AmdAxpyUnsafeX_V0A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8646  // Requires: SSE
  8647  TEXT ·AmdAxpyUnsafeX_V0A11R4(SB), NOSPLIT, $0-48
  8648  	MOVSS alpha+0(FP), X0
  8649  	MOVQ  xs+8(FP), AX
  8650  	MOVQ  incx+16(FP), CX
  8651  	MOVQ  ys+24(FP), DX
  8652  	MOVQ  incy+32(FP), BX
  8653  	MOVQ  n+40(FP), SI
  8654  	XORQ  DI, DI
  8655  	XORQ  R8, R8
  8656  	JMP   check_limit_unroll
  8657  	PCALIGN $0x08
  8658  	NOP
  8659  	NOP
  8660  	NOP
  8661  
  8662  loop_unroll:
  8663  	MOVSS (AX)(DI*4), X1
  8664  	MULSS X0, X1
  8665  	ADDSS (DX)(R8*4), X1
  8666  	MOVSS X1, (DX)(R8*4)
  8667  	ADDQ  CX, DI
  8668  	ADDQ  BX, R8
  8669  	MOVSS (AX)(DI*4), X1
  8670  	MULSS X0, X1
  8671  	ADDSS (DX)(R8*4), X1
  8672  	MOVSS X1, (DX)(R8*4)
  8673  	ADDQ  CX, DI
  8674  	ADDQ  BX, R8
  8675  	MOVSS (AX)(DI*4), X1
  8676  	MULSS X0, X1
  8677  	ADDSS (DX)(R8*4), X1
  8678  	MOVSS X1, (DX)(R8*4)
  8679  	ADDQ  CX, DI
  8680  	ADDQ  BX, R8
  8681  	MOVSS (AX)(DI*4), X1
  8682  	MULSS X0, X1
  8683  	ADDSS (DX)(R8*4), X1
  8684  	MOVSS X1, (DX)(R8*4)
  8685  	ADDQ  CX, DI
  8686  	ADDQ  BX, R8
  8687  	SUBQ  $0x04, SI
  8688  
  8689  check_limit_unroll:
  8690  	CMPQ SI, $0x04
  8691  	JHI  loop_unroll
  8692  	JMP  check_limit
  8693  
  8694  loop:
  8695  	MOVSS (AX)(DI*4), X1
  8696  	MULSS X0, X1
  8697  	ADDSS (DX)(R8*4), X1
  8698  	MOVSS X1, (DX)(R8*4)
  8699  	DECQ  SI
  8700  	ADDQ  CX, DI
  8701  	ADDQ  BX, R8
  8702  
  8703  check_limit:
  8704  	CMPQ SI, $0x00
  8705  	JHI  loop
  8706  	RET
  8707  
  8708  // func AmdAxpyUnsafeX_V1A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8709  // Requires: SSE
  8710  TEXT ·AmdAxpyUnsafeX_V1A11R4(SB), NOSPLIT, $0-48
  8711  	MOVSS alpha+0(FP), X0
  8712  	MOVQ  xs+8(FP), AX
  8713  	MOVQ  incx+16(FP), CX
  8714  	MOVQ  ys+24(FP), DX
  8715  	MOVQ  incy+32(FP), BX
  8716  	MOVQ  n+40(FP), SI
  8717  	XORQ  DI, DI
  8718  	XORQ  R8, R8
  8719  	JMP   check_limit_unroll
  8720  	PCALIGN $0x08
  8721  	NOP
  8722  	NOP
  8723  	NOP
  8724  
  8725  loop_unroll:
  8726  	MOVSS (AX)(DI*4), X1
  8727  	MULSS X0, X1
  8728  	ADDSS (DX)(R8*4), X1
  8729  	MOVSS X1, (DX)(R8*4)
  8730  	ADDQ  CX, DI
  8731  	ADDQ  BX, R8
  8732  	MOVSS (AX)(DI*4), X1
  8733  	MULSS X0, X1
  8734  	ADDSS (DX)(R8*4), X1
  8735  	MOVSS X1, (DX)(R8*4)
  8736  	ADDQ  CX, DI
  8737  	ADDQ  BX, R8
  8738  	MOVSS (AX)(DI*4), X1
  8739  	MULSS X0, X1
  8740  	ADDSS (DX)(R8*4), X1
  8741  	MOVSS X1, (DX)(R8*4)
  8742  	ADDQ  CX, DI
  8743  	ADDQ  BX, R8
  8744  	MOVSS (AX)(DI*4), X1
  8745  	MULSS X0, X1
  8746  	ADDSS (DX)(R8*4), X1
  8747  	MOVSS X1, (DX)(R8*4)
  8748  	ADDQ  CX, DI
  8749  	ADDQ  BX, R8
  8750  	SUBQ  $0x04, SI
  8751  
  8752  check_limit_unroll:
  8753  	CMPQ SI, $0x04
  8754  	JHI  loop_unroll
  8755  	JMP  check_limit
  8756  
  8757  loop:
  8758  	MOVSS (AX)(DI*4), X1
  8759  	MULSS X0, X1
  8760  	ADDSS (DX)(R8*4), X1
  8761  	MOVSS X1, (DX)(R8*4)
  8762  	DECQ  SI
  8763  	ADDQ  CX, DI
  8764  	ADDQ  BX, R8
  8765  
  8766  check_limit:
  8767  	CMPQ SI, $0x00
  8768  	JHI  loop
  8769  	RET
  8770  
  8771  // func AmdAxpyUnsafeX_V2A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8772  // Requires: SSE
  8773  TEXT ·AmdAxpyUnsafeX_V2A11R4(SB), NOSPLIT, $0-48
  8774  	MOVSS alpha+0(FP), X0
  8775  	MOVQ  xs+8(FP), AX
  8776  	MOVQ  incx+16(FP), CX
  8777  	MOVQ  ys+24(FP), DX
  8778  	MOVQ  incy+32(FP), BX
  8779  	MOVQ  n+40(FP), SI
  8780  	XORQ  DI, DI
  8781  	XORQ  R8, R8
  8782  	JMP   check_limit_unroll
  8783  	PCALIGN $0x08
  8784  	NOP
  8785  	NOP
  8786  	NOP
  8787  
  8788  loop_unroll:
  8789  	MOVSS (AX)(DI*4), X1
  8790  	MULSS X0, X1
  8791  	ADDSS (DX)(R8*4), X1
  8792  	MOVSS X1, (DX)(R8*4)
  8793  	ADDQ  CX, DI
  8794  	ADDQ  BX, R8
  8795  	MOVSS (AX)(DI*4), X1
  8796  	MULSS X0, X1
  8797  	ADDSS (DX)(R8*4), X1
  8798  	MOVSS X1, (DX)(R8*4)
  8799  	ADDQ  CX, DI
  8800  	ADDQ  BX, R8
  8801  	MOVSS (AX)(DI*4), X1
  8802  	MULSS X0, X1
  8803  	ADDSS (DX)(R8*4), X1
  8804  	MOVSS X1, (DX)(R8*4)
  8805  	ADDQ  CX, DI
  8806  	ADDQ  BX, R8
  8807  	MOVSS (AX)(DI*4), X1
  8808  	MULSS X0, X1
  8809  	ADDSS (DX)(R8*4), X1
  8810  	MOVSS X1, (DX)(R8*4)
  8811  	ADDQ  CX, DI
  8812  	ADDQ  BX, R8
  8813  	SUBQ  $0x04, SI
  8814  
  8815  check_limit_unroll:
  8816  	CMPQ SI, $0x04
  8817  	JHI  loop_unroll
  8818  	JMP  check_limit
  8819  
  8820  loop:
  8821  	MOVSS (AX)(DI*4), X1
  8822  	MULSS X0, X1
  8823  	ADDSS (DX)(R8*4), X1
  8824  	MOVSS X1, (DX)(R8*4)
  8825  	DECQ  SI
  8826  	ADDQ  CX, DI
  8827  	ADDQ  BX, R8
  8828  
  8829  check_limit:
  8830  	CMPQ SI, $0x00
  8831  	JHI  loop
  8832  	RET
  8833  
  8834  // func AmdAxpyUnsafeX_V3A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8835  // Requires: SSE
  8836  TEXT ·AmdAxpyUnsafeX_V3A11R4(SB), NOSPLIT, $0-48
  8837  	MOVSS alpha+0(FP), X0
  8838  	MOVQ  xs+8(FP), AX
  8839  	MOVQ  incx+16(FP), CX
  8840  	MOVQ  ys+24(FP), DX
  8841  	MOVQ  incy+32(FP), BX
  8842  	MOVQ  n+40(FP), SI
  8843  	XORQ  DI, DI
  8844  	XORQ  R8, R8
  8845  	JMP   check_limit_unroll
  8846  	PCALIGN $0x08
  8847  	NOP
  8848  	NOP
  8849  	NOP
  8850  
  8851  loop_unroll:
  8852  	MOVSS (AX)(DI*4), X1
  8853  	MULSS X0, X1
  8854  	ADDSS (DX)(R8*4), X1
  8855  	MOVSS X1, (DX)(R8*4)
  8856  	ADDQ  CX, DI
  8857  	ADDQ  BX, R8
  8858  	MOVSS (AX)(DI*4), X1
  8859  	MULSS X0, X1
  8860  	ADDSS (DX)(R8*4), X1
  8861  	MOVSS X1, (DX)(R8*4)
  8862  	ADDQ  CX, DI
  8863  	ADDQ  BX, R8
  8864  	MOVSS (AX)(DI*4), X1
  8865  	MULSS X0, X1
  8866  	ADDSS (DX)(R8*4), X1
  8867  	MOVSS X1, (DX)(R8*4)
  8868  	ADDQ  CX, DI
  8869  	ADDQ  BX, R8
  8870  	MOVSS (AX)(DI*4), X1
  8871  	MULSS X0, X1
  8872  	ADDSS (DX)(R8*4), X1
  8873  	MOVSS X1, (DX)(R8*4)
  8874  	ADDQ  CX, DI
  8875  	ADDQ  BX, R8
  8876  	SUBQ  $0x04, SI
  8877  
  8878  check_limit_unroll:
  8879  	CMPQ SI, $0x04
  8880  	JHI  loop_unroll
  8881  	JMP  check_limit
  8882  
  8883  loop:
  8884  	MOVSS (AX)(DI*4), X1
  8885  	MULSS X0, X1
  8886  	ADDSS (DX)(R8*4), X1
  8887  	MOVSS X1, (DX)(R8*4)
  8888  	DECQ  SI
  8889  	ADDQ  CX, DI
  8890  	ADDQ  BX, R8
  8891  
  8892  check_limit:
  8893  	CMPQ SI, $0x00
  8894  	JHI  loop
  8895  	RET
  8896  
  8897  // func AmdAxpyUnsafeX_V4A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8898  // Requires: SSE
  8899  TEXT ·AmdAxpyUnsafeX_V4A11R4(SB), NOSPLIT, $0-48
  8900  	MOVSS alpha+0(FP), X0
  8901  	MOVQ  xs+8(FP), AX
  8902  	MOVQ  incx+16(FP), CX
  8903  	MOVQ  ys+24(FP), DX
  8904  	MOVQ  incy+32(FP), BX
  8905  	MOVQ  n+40(FP), SI
  8906  	XORQ  DI, DI
  8907  	XORQ  R8, R8
  8908  	JMP   check_limit_unroll
  8909  	PCALIGN $0x08
  8910  	NOP
  8911  	NOP
  8912  	NOP
  8913  
  8914  loop_unroll:
  8915  	MOVSS (AX)(DI*4), X1
  8916  	MULSS X0, X1
  8917  	ADDSS (DX)(R8*4), X1
  8918  	MOVSS X1, (DX)(R8*4)
  8919  	ADDQ  CX, DI
  8920  	ADDQ  BX, R8
  8921  	MOVSS (AX)(DI*4), X1
  8922  	MULSS X0, X1
  8923  	ADDSS (DX)(R8*4), X1
  8924  	MOVSS X1, (DX)(R8*4)
  8925  	ADDQ  CX, DI
  8926  	ADDQ  BX, R8
  8927  	MOVSS (AX)(DI*4), X1
  8928  	MULSS X0, X1
  8929  	ADDSS (DX)(R8*4), X1
  8930  	MOVSS X1, (DX)(R8*4)
  8931  	ADDQ  CX, DI
  8932  	ADDQ  BX, R8
  8933  	MOVSS (AX)(DI*4), X1
  8934  	MULSS X0, X1
  8935  	ADDSS (DX)(R8*4), X1
  8936  	MOVSS X1, (DX)(R8*4)
  8937  	ADDQ  CX, DI
  8938  	ADDQ  BX, R8
  8939  	SUBQ  $0x04, SI
  8940  
  8941  check_limit_unroll:
  8942  	CMPQ SI, $0x04
  8943  	JHI  loop_unroll
  8944  	JMP  check_limit
  8945  
  8946  loop:
  8947  	MOVSS (AX)(DI*4), X1
  8948  	MULSS X0, X1
  8949  	ADDSS (DX)(R8*4), X1
  8950  	MOVSS X1, (DX)(R8*4)
  8951  	DECQ  SI
  8952  	ADDQ  CX, DI
  8953  	ADDQ  BX, R8
  8954  
  8955  check_limit:
  8956  	CMPQ SI, $0x00
  8957  	JHI  loop
  8958  	RET
  8959  
  8960  // func AmdAxpyUnsafeX_V5A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  8961  // Requires: SSE
  8962  TEXT ·AmdAxpyUnsafeX_V5A11R4(SB), NOSPLIT, $0-48
  8963  	MOVSS alpha+0(FP), X0
  8964  	MOVQ  xs+8(FP), AX
  8965  	MOVQ  incx+16(FP), CX
  8966  	MOVQ  ys+24(FP), DX
  8967  	MOVQ  incy+32(FP), BX
  8968  	MOVQ  n+40(FP), SI
  8969  	XORQ  DI, DI
  8970  	XORQ  R8, R8
  8971  	JMP   check_limit_unroll
  8972  	PCALIGN $0x08
  8973  	NOP
  8974  	NOP
  8975  	NOP
  8976  
  8977  loop_unroll:
  8978  	MOVSS (AX)(DI*4), X1
  8979  	MULSS X0, X1
  8980  	ADDSS (DX)(R8*4), X1
  8981  	MOVSS X1, (DX)(R8*4)
  8982  	ADDQ  CX, DI
  8983  	ADDQ  BX, R8
  8984  	MOVSS (AX)(DI*4), X1
  8985  	MULSS X0, X1
  8986  	ADDSS (DX)(R8*4), X1
  8987  	MOVSS X1, (DX)(R8*4)
  8988  	ADDQ  CX, DI
  8989  	ADDQ  BX, R8
  8990  	MOVSS (AX)(DI*4), X1
  8991  	MULSS X0, X1
  8992  	ADDSS (DX)(R8*4), X1
  8993  	MOVSS X1, (DX)(R8*4)
  8994  	ADDQ  CX, DI
  8995  	ADDQ  BX, R8
  8996  	MOVSS (AX)(DI*4), X1
  8997  	MULSS X0, X1
  8998  	ADDSS (DX)(R8*4), X1
  8999  	MOVSS X1, (DX)(R8*4)
  9000  	ADDQ  CX, DI
  9001  	ADDQ  BX, R8
  9002  	SUBQ  $0x04, SI
  9003  
  9004  check_limit_unroll:
  9005  	CMPQ SI, $0x04
  9006  	JHI  loop_unroll
  9007  	JMP  check_limit
  9008  
  9009  loop:
  9010  	MOVSS (AX)(DI*4), X1
  9011  	MULSS X0, X1
  9012  	ADDSS (DX)(R8*4), X1
  9013  	MOVSS X1, (DX)(R8*4)
  9014  	DECQ  SI
  9015  	ADDQ  CX, DI
  9016  	ADDQ  BX, R8
  9017  
  9018  check_limit:
  9019  	CMPQ SI, $0x00
  9020  	JHI  loop
  9021  	RET
  9022  
  9023  // func AmdAxpyUnsafeX_V0A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9024  // Requires: SSE
  9025  TEXT ·AmdAxpyUnsafeX_V0A12R4(SB), NOSPLIT, $0-48
  9026  	MOVSS alpha+0(FP), X0
  9027  	MOVQ  xs+8(FP), AX
  9028  	MOVQ  incx+16(FP), CX
  9029  	MOVQ  ys+24(FP), DX
  9030  	MOVQ  incy+32(FP), BX
  9031  	MOVQ  n+40(FP), SI
  9032  	XORQ  DI, DI
  9033  	XORQ  R8, R8
  9034  	JMP   check_limit_unroll
  9035  	PCALIGN $0x08
  9036  	NOP
  9037  	NOP
  9038  	NOP
  9039  	NOP
  9040  
  9041  loop_unroll:
  9042  	MOVSS (AX)(DI*4), X1
  9043  	MULSS X0, X1
  9044  	ADDSS (DX)(R8*4), X1
  9045  	MOVSS X1, (DX)(R8*4)
  9046  	ADDQ  CX, DI
  9047  	ADDQ  BX, R8
  9048  	MOVSS (AX)(DI*4), X1
  9049  	MULSS X0, X1
  9050  	ADDSS (DX)(R8*4), X1
  9051  	MOVSS X1, (DX)(R8*4)
  9052  	ADDQ  CX, DI
  9053  	ADDQ  BX, R8
  9054  	MOVSS (AX)(DI*4), X1
  9055  	MULSS X0, X1
  9056  	ADDSS (DX)(R8*4), X1
  9057  	MOVSS X1, (DX)(R8*4)
  9058  	ADDQ  CX, DI
  9059  	ADDQ  BX, R8
  9060  	MOVSS (AX)(DI*4), X1
  9061  	MULSS X0, X1
  9062  	ADDSS (DX)(R8*4), X1
  9063  	MOVSS X1, (DX)(R8*4)
  9064  	ADDQ  CX, DI
  9065  	ADDQ  BX, R8
  9066  	SUBQ  $0x04, SI
  9067  
  9068  check_limit_unroll:
  9069  	CMPQ SI, $0x04
  9070  	JHI  loop_unroll
  9071  	JMP  check_limit
  9072  
  9073  loop:
  9074  	MOVSS (AX)(DI*4), X1
  9075  	MULSS X0, X1
  9076  	ADDSS (DX)(R8*4), X1
  9077  	MOVSS X1, (DX)(R8*4)
  9078  	DECQ  SI
  9079  	ADDQ  CX, DI
  9080  	ADDQ  BX, R8
  9081  
  9082  check_limit:
  9083  	CMPQ SI, $0x00
  9084  	JHI  loop
  9085  	RET
  9086  
  9087  // func AmdAxpyUnsafeX_V1A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9088  // Requires: SSE
  9089  TEXT ·AmdAxpyUnsafeX_V1A12R4(SB), NOSPLIT, $0-48
  9090  	MOVSS alpha+0(FP), X0
  9091  	MOVQ  xs+8(FP), AX
  9092  	MOVQ  incx+16(FP), CX
  9093  	MOVQ  ys+24(FP), DX
  9094  	MOVQ  incy+32(FP), BX
  9095  	MOVQ  n+40(FP), SI
  9096  	XORQ  DI, DI
  9097  	XORQ  R8, R8
  9098  	JMP   check_limit_unroll
  9099  	PCALIGN $0x08
  9100  	NOP
  9101  	NOP
  9102  	NOP
  9103  	NOP
  9104  
  9105  loop_unroll:
  9106  	MOVSS (AX)(DI*4), X1
  9107  	MULSS X0, X1
  9108  	ADDSS (DX)(R8*4), X1
  9109  	MOVSS X1, (DX)(R8*4)
  9110  	ADDQ  CX, DI
  9111  	ADDQ  BX, R8
  9112  	MOVSS (AX)(DI*4), X1
  9113  	MULSS X0, X1
  9114  	ADDSS (DX)(R8*4), X1
  9115  	MOVSS X1, (DX)(R8*4)
  9116  	ADDQ  CX, DI
  9117  	ADDQ  BX, R8
  9118  	MOVSS (AX)(DI*4), X1
  9119  	MULSS X0, X1
  9120  	ADDSS (DX)(R8*4), X1
  9121  	MOVSS X1, (DX)(R8*4)
  9122  	ADDQ  CX, DI
  9123  	ADDQ  BX, R8
  9124  	MOVSS (AX)(DI*4), X1
  9125  	MULSS X0, X1
  9126  	ADDSS (DX)(R8*4), X1
  9127  	MOVSS X1, (DX)(R8*4)
  9128  	ADDQ  CX, DI
  9129  	ADDQ  BX, R8
  9130  	SUBQ  $0x04, SI
  9131  
  9132  check_limit_unroll:
  9133  	CMPQ SI, $0x04
  9134  	JHI  loop_unroll
  9135  	JMP  check_limit
  9136  
  9137  loop:
  9138  	MOVSS (AX)(DI*4), X1
  9139  	MULSS X0, X1
  9140  	ADDSS (DX)(R8*4), X1
  9141  	MOVSS X1, (DX)(R8*4)
  9142  	DECQ  SI
  9143  	ADDQ  CX, DI
  9144  	ADDQ  BX, R8
  9145  
  9146  check_limit:
  9147  	CMPQ SI, $0x00
  9148  	JHI  loop
  9149  	RET
  9150  
  9151  // func AmdAxpyUnsafeX_V2A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9152  // Requires: SSE
  9153  TEXT ·AmdAxpyUnsafeX_V2A12R4(SB), NOSPLIT, $0-48
  9154  	MOVSS alpha+0(FP), X0
  9155  	MOVQ  xs+8(FP), AX
  9156  	MOVQ  incx+16(FP), CX
  9157  	MOVQ  ys+24(FP), DX
  9158  	MOVQ  incy+32(FP), BX
  9159  	MOVQ  n+40(FP), SI
  9160  	XORQ  DI, DI
  9161  	XORQ  R8, R8
  9162  	JMP   check_limit_unroll
  9163  	PCALIGN $0x08
  9164  	NOP
  9165  	NOP
  9166  	NOP
  9167  	NOP
  9168  
  9169  loop_unroll:
  9170  	MOVSS (AX)(DI*4), X1
  9171  	MULSS X0, X1
  9172  	ADDSS (DX)(R8*4), X1
  9173  	MOVSS X1, (DX)(R8*4)
  9174  	ADDQ  CX, DI
  9175  	ADDQ  BX, R8
  9176  	MOVSS (AX)(DI*4), X1
  9177  	MULSS X0, X1
  9178  	ADDSS (DX)(R8*4), X1
  9179  	MOVSS X1, (DX)(R8*4)
  9180  	ADDQ  CX, DI
  9181  	ADDQ  BX, R8
  9182  	MOVSS (AX)(DI*4), X1
  9183  	MULSS X0, X1
  9184  	ADDSS (DX)(R8*4), X1
  9185  	MOVSS X1, (DX)(R8*4)
  9186  	ADDQ  CX, DI
  9187  	ADDQ  BX, R8
  9188  	MOVSS (AX)(DI*4), X1
  9189  	MULSS X0, X1
  9190  	ADDSS (DX)(R8*4), X1
  9191  	MOVSS X1, (DX)(R8*4)
  9192  	ADDQ  CX, DI
  9193  	ADDQ  BX, R8
  9194  	SUBQ  $0x04, SI
  9195  
  9196  check_limit_unroll:
  9197  	CMPQ SI, $0x04
  9198  	JHI  loop_unroll
  9199  	JMP  check_limit
  9200  
  9201  loop:
  9202  	MOVSS (AX)(DI*4), X1
  9203  	MULSS X0, X1
  9204  	ADDSS (DX)(R8*4), X1
  9205  	MOVSS X1, (DX)(R8*4)
  9206  	DECQ  SI
  9207  	ADDQ  CX, DI
  9208  	ADDQ  BX, R8
  9209  
  9210  check_limit:
  9211  	CMPQ SI, $0x00
  9212  	JHI  loop
  9213  	RET
  9214  
  9215  // func AmdAxpyUnsafeX_V3A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9216  // Requires: SSE
  9217  TEXT ·AmdAxpyUnsafeX_V3A12R4(SB), NOSPLIT, $0-48
  9218  	MOVSS alpha+0(FP), X0
  9219  	MOVQ  xs+8(FP), AX
  9220  	MOVQ  incx+16(FP), CX
  9221  	MOVQ  ys+24(FP), DX
  9222  	MOVQ  incy+32(FP), BX
  9223  	MOVQ  n+40(FP), SI
  9224  	XORQ  DI, DI
  9225  	XORQ  R8, R8
  9226  	JMP   check_limit_unroll
  9227  	PCALIGN $0x08
  9228  	NOP
  9229  	NOP
  9230  	NOP
  9231  	NOP
  9232  
  9233  loop_unroll:
  9234  	MOVSS (AX)(DI*4), X1
  9235  	MULSS X0, X1
  9236  	ADDSS (DX)(R8*4), X1
  9237  	MOVSS X1, (DX)(R8*4)
  9238  	ADDQ  CX, DI
  9239  	ADDQ  BX, R8
  9240  	MOVSS (AX)(DI*4), X1
  9241  	MULSS X0, X1
  9242  	ADDSS (DX)(R8*4), X1
  9243  	MOVSS X1, (DX)(R8*4)
  9244  	ADDQ  CX, DI
  9245  	ADDQ  BX, R8
  9246  	MOVSS (AX)(DI*4), X1
  9247  	MULSS X0, X1
  9248  	ADDSS (DX)(R8*4), X1
  9249  	MOVSS X1, (DX)(R8*4)
  9250  	ADDQ  CX, DI
  9251  	ADDQ  BX, R8
  9252  	MOVSS (AX)(DI*4), X1
  9253  	MULSS X0, X1
  9254  	ADDSS (DX)(R8*4), X1
  9255  	MOVSS X1, (DX)(R8*4)
  9256  	ADDQ  CX, DI
  9257  	ADDQ  BX, R8
  9258  	SUBQ  $0x04, SI
  9259  
  9260  check_limit_unroll:
  9261  	CMPQ SI, $0x04
  9262  	JHI  loop_unroll
  9263  	JMP  check_limit
  9264  
  9265  loop:
  9266  	MOVSS (AX)(DI*4), X1
  9267  	MULSS X0, X1
  9268  	ADDSS (DX)(R8*4), X1
  9269  	MOVSS X1, (DX)(R8*4)
  9270  	DECQ  SI
  9271  	ADDQ  CX, DI
  9272  	ADDQ  BX, R8
  9273  
  9274  check_limit:
  9275  	CMPQ SI, $0x00
  9276  	JHI  loop
  9277  	RET
  9278  
  9279  // func AmdAxpyUnsafeX_V4A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9280  // Requires: SSE
  9281  TEXT ·AmdAxpyUnsafeX_V4A12R4(SB), NOSPLIT, $0-48
  9282  	MOVSS alpha+0(FP), X0
  9283  	MOVQ  xs+8(FP), AX
  9284  	MOVQ  incx+16(FP), CX
  9285  	MOVQ  ys+24(FP), DX
  9286  	MOVQ  incy+32(FP), BX
  9287  	MOVQ  n+40(FP), SI
  9288  	XORQ  DI, DI
  9289  	XORQ  R8, R8
  9290  	JMP   check_limit_unroll
  9291  	PCALIGN $0x08
  9292  	NOP
  9293  	NOP
  9294  	NOP
  9295  	NOP
  9296  
  9297  loop_unroll:
  9298  	MOVSS (AX)(DI*4), X1
  9299  	MULSS X0, X1
  9300  	ADDSS (DX)(R8*4), X1
  9301  	MOVSS X1, (DX)(R8*4)
  9302  	ADDQ  CX, DI
  9303  	ADDQ  BX, R8
  9304  	MOVSS (AX)(DI*4), X1
  9305  	MULSS X0, X1
  9306  	ADDSS (DX)(R8*4), X1
  9307  	MOVSS X1, (DX)(R8*4)
  9308  	ADDQ  CX, DI
  9309  	ADDQ  BX, R8
  9310  	MOVSS (AX)(DI*4), X1
  9311  	MULSS X0, X1
  9312  	ADDSS (DX)(R8*4), X1
  9313  	MOVSS X1, (DX)(R8*4)
  9314  	ADDQ  CX, DI
  9315  	ADDQ  BX, R8
  9316  	MOVSS (AX)(DI*4), X1
  9317  	MULSS X0, X1
  9318  	ADDSS (DX)(R8*4), X1
  9319  	MOVSS X1, (DX)(R8*4)
  9320  	ADDQ  CX, DI
  9321  	ADDQ  BX, R8
  9322  	SUBQ  $0x04, SI
  9323  
  9324  check_limit_unroll:
  9325  	CMPQ SI, $0x04
  9326  	JHI  loop_unroll
  9327  	JMP  check_limit
  9328  
  9329  loop:
  9330  	MOVSS (AX)(DI*4), X1
  9331  	MULSS X0, X1
  9332  	ADDSS (DX)(R8*4), X1
  9333  	MOVSS X1, (DX)(R8*4)
  9334  	DECQ  SI
  9335  	ADDQ  CX, DI
  9336  	ADDQ  BX, R8
  9337  
  9338  check_limit:
  9339  	CMPQ SI, $0x00
  9340  	JHI  loop
  9341  	RET
  9342  
  9343  // func AmdAxpyUnsafeX_V5A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9344  // Requires: SSE
  9345  TEXT ·AmdAxpyUnsafeX_V5A12R4(SB), NOSPLIT, $0-48
  9346  	MOVSS alpha+0(FP), X0
  9347  	MOVQ  xs+8(FP), AX
  9348  	MOVQ  incx+16(FP), CX
  9349  	MOVQ  ys+24(FP), DX
  9350  	MOVQ  incy+32(FP), BX
  9351  	MOVQ  n+40(FP), SI
  9352  	XORQ  DI, DI
  9353  	XORQ  R8, R8
  9354  	JMP   check_limit_unroll
  9355  	PCALIGN $0x08
  9356  	NOP
  9357  	NOP
  9358  	NOP
  9359  	NOP
  9360  
  9361  loop_unroll:
  9362  	MOVSS (AX)(DI*4), X1
  9363  	MULSS X0, X1
  9364  	ADDSS (DX)(R8*4), X1
  9365  	MOVSS X1, (DX)(R8*4)
  9366  	ADDQ  CX, DI
  9367  	ADDQ  BX, R8
  9368  	MOVSS (AX)(DI*4), X1
  9369  	MULSS X0, X1
  9370  	ADDSS (DX)(R8*4), X1
  9371  	MOVSS X1, (DX)(R8*4)
  9372  	ADDQ  CX, DI
  9373  	ADDQ  BX, R8
  9374  	MOVSS (AX)(DI*4), X1
  9375  	MULSS X0, X1
  9376  	ADDSS (DX)(R8*4), X1
  9377  	MOVSS X1, (DX)(R8*4)
  9378  	ADDQ  CX, DI
  9379  	ADDQ  BX, R8
  9380  	MOVSS (AX)(DI*4), X1
  9381  	MULSS X0, X1
  9382  	ADDSS (DX)(R8*4), X1
  9383  	MOVSS X1, (DX)(R8*4)
  9384  	ADDQ  CX, DI
  9385  	ADDQ  BX, R8
  9386  	SUBQ  $0x04, SI
  9387  
  9388  check_limit_unroll:
  9389  	CMPQ SI, $0x04
  9390  	JHI  loop_unroll
  9391  	JMP  check_limit
  9392  
  9393  loop:
  9394  	MOVSS (AX)(DI*4), X1
  9395  	MULSS X0, X1
  9396  	ADDSS (DX)(R8*4), X1
  9397  	MOVSS X1, (DX)(R8*4)
  9398  	DECQ  SI
  9399  	ADDQ  CX, DI
  9400  	ADDQ  BX, R8
  9401  
  9402  check_limit:
  9403  	CMPQ SI, $0x00
  9404  	JHI  loop
  9405  	RET
  9406  
  9407  // func AmdAxpyUnsafeX_V0A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9408  // Requires: SSE
  9409  TEXT ·AmdAxpyUnsafeX_V0A13R4(SB), NOSPLIT, $0-48
  9410  	MOVSS alpha+0(FP), X0
  9411  	MOVQ  xs+8(FP), AX
  9412  	MOVQ  incx+16(FP), CX
  9413  	MOVQ  ys+24(FP), DX
  9414  	MOVQ  incy+32(FP), BX
  9415  	MOVQ  n+40(FP), SI
  9416  	XORQ  DI, DI
  9417  	XORQ  R8, R8
  9418  	JMP   check_limit_unroll
  9419  	PCALIGN $0x08
  9420  	NOP
  9421  	NOP
  9422  	NOP
  9423  	NOP
  9424  	NOP
  9425  
  9426  loop_unroll:
  9427  	MOVSS (AX)(DI*4), X1
  9428  	MULSS X0, X1
  9429  	ADDSS (DX)(R8*4), X1
  9430  	MOVSS X1, (DX)(R8*4)
  9431  	ADDQ  CX, DI
  9432  	ADDQ  BX, R8
  9433  	MOVSS (AX)(DI*4), X1
  9434  	MULSS X0, X1
  9435  	ADDSS (DX)(R8*4), X1
  9436  	MOVSS X1, (DX)(R8*4)
  9437  	ADDQ  CX, DI
  9438  	ADDQ  BX, R8
  9439  	MOVSS (AX)(DI*4), X1
  9440  	MULSS X0, X1
  9441  	ADDSS (DX)(R8*4), X1
  9442  	MOVSS X1, (DX)(R8*4)
  9443  	ADDQ  CX, DI
  9444  	ADDQ  BX, R8
  9445  	MOVSS (AX)(DI*4), X1
  9446  	MULSS X0, X1
  9447  	ADDSS (DX)(R8*4), X1
  9448  	MOVSS X1, (DX)(R8*4)
  9449  	ADDQ  CX, DI
  9450  	ADDQ  BX, R8
  9451  	SUBQ  $0x04, SI
  9452  
  9453  check_limit_unroll:
  9454  	CMPQ SI, $0x04
  9455  	JHI  loop_unroll
  9456  	JMP  check_limit
  9457  
  9458  loop:
  9459  	MOVSS (AX)(DI*4), X1
  9460  	MULSS X0, X1
  9461  	ADDSS (DX)(R8*4), X1
  9462  	MOVSS X1, (DX)(R8*4)
  9463  	DECQ  SI
  9464  	ADDQ  CX, DI
  9465  	ADDQ  BX, R8
  9466  
  9467  check_limit:
  9468  	CMPQ SI, $0x00
  9469  	JHI  loop
  9470  	RET
  9471  
  9472  // func AmdAxpyUnsafeX_V1A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9473  // Requires: SSE
  9474  TEXT ·AmdAxpyUnsafeX_V1A13R4(SB), NOSPLIT, $0-48
  9475  	MOVSS alpha+0(FP), X0
  9476  	MOVQ  xs+8(FP), AX
  9477  	MOVQ  incx+16(FP), CX
  9478  	MOVQ  ys+24(FP), DX
  9479  	MOVQ  incy+32(FP), BX
  9480  	MOVQ  n+40(FP), SI
  9481  	XORQ  DI, DI
  9482  	XORQ  R8, R8
  9483  	JMP   check_limit_unroll
  9484  	PCALIGN $0x08
  9485  	NOP
  9486  	NOP
  9487  	NOP
  9488  	NOP
  9489  	NOP
  9490  
  9491  loop_unroll:
  9492  	MOVSS (AX)(DI*4), X1
  9493  	MULSS X0, X1
  9494  	ADDSS (DX)(R8*4), X1
  9495  	MOVSS X1, (DX)(R8*4)
  9496  	ADDQ  CX, DI
  9497  	ADDQ  BX, R8
  9498  	MOVSS (AX)(DI*4), X1
  9499  	MULSS X0, X1
  9500  	ADDSS (DX)(R8*4), X1
  9501  	MOVSS X1, (DX)(R8*4)
  9502  	ADDQ  CX, DI
  9503  	ADDQ  BX, R8
  9504  	MOVSS (AX)(DI*4), X1
  9505  	MULSS X0, X1
  9506  	ADDSS (DX)(R8*4), X1
  9507  	MOVSS X1, (DX)(R8*4)
  9508  	ADDQ  CX, DI
  9509  	ADDQ  BX, R8
  9510  	MOVSS (AX)(DI*4), X1
  9511  	MULSS X0, X1
  9512  	ADDSS (DX)(R8*4), X1
  9513  	MOVSS X1, (DX)(R8*4)
  9514  	ADDQ  CX, DI
  9515  	ADDQ  BX, R8
  9516  	SUBQ  $0x04, SI
  9517  
  9518  check_limit_unroll:
  9519  	CMPQ SI, $0x04
  9520  	JHI  loop_unroll
  9521  	JMP  check_limit
  9522  
  9523  loop:
  9524  	MOVSS (AX)(DI*4), X1
  9525  	MULSS X0, X1
  9526  	ADDSS (DX)(R8*4), X1
  9527  	MOVSS X1, (DX)(R8*4)
  9528  	DECQ  SI
  9529  	ADDQ  CX, DI
  9530  	ADDQ  BX, R8
  9531  
  9532  check_limit:
  9533  	CMPQ SI, $0x00
  9534  	JHI  loop
  9535  	RET
  9536  
  9537  // func AmdAxpyUnsafeX_V2A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9538  // Requires: SSE
  9539  TEXT ·AmdAxpyUnsafeX_V2A13R4(SB), NOSPLIT, $0-48
  9540  	MOVSS alpha+0(FP), X0
  9541  	MOVQ  xs+8(FP), AX
  9542  	MOVQ  incx+16(FP), CX
  9543  	MOVQ  ys+24(FP), DX
  9544  	MOVQ  incy+32(FP), BX
  9545  	MOVQ  n+40(FP), SI
  9546  	XORQ  DI, DI
  9547  	XORQ  R8, R8
  9548  	JMP   check_limit_unroll
  9549  	PCALIGN $0x08
  9550  	NOP
  9551  	NOP
  9552  	NOP
  9553  	NOP
  9554  	NOP
  9555  
  9556  loop_unroll:
  9557  	MOVSS (AX)(DI*4), X1
  9558  	MULSS X0, X1
  9559  	ADDSS (DX)(R8*4), X1
  9560  	MOVSS X1, (DX)(R8*4)
  9561  	ADDQ  CX, DI
  9562  	ADDQ  BX, R8
  9563  	MOVSS (AX)(DI*4), X1
  9564  	MULSS X0, X1
  9565  	ADDSS (DX)(R8*4), X1
  9566  	MOVSS X1, (DX)(R8*4)
  9567  	ADDQ  CX, DI
  9568  	ADDQ  BX, R8
  9569  	MOVSS (AX)(DI*4), X1
  9570  	MULSS X0, X1
  9571  	ADDSS (DX)(R8*4), X1
  9572  	MOVSS X1, (DX)(R8*4)
  9573  	ADDQ  CX, DI
  9574  	ADDQ  BX, R8
  9575  	MOVSS (AX)(DI*4), X1
  9576  	MULSS X0, X1
  9577  	ADDSS (DX)(R8*4), X1
  9578  	MOVSS X1, (DX)(R8*4)
  9579  	ADDQ  CX, DI
  9580  	ADDQ  BX, R8
  9581  	SUBQ  $0x04, SI
  9582  
  9583  check_limit_unroll:
  9584  	CMPQ SI, $0x04
  9585  	JHI  loop_unroll
  9586  	JMP  check_limit
  9587  
  9588  loop:
  9589  	MOVSS (AX)(DI*4), X1
  9590  	MULSS X0, X1
  9591  	ADDSS (DX)(R8*4), X1
  9592  	MOVSS X1, (DX)(R8*4)
  9593  	DECQ  SI
  9594  	ADDQ  CX, DI
  9595  	ADDQ  BX, R8
  9596  
  9597  check_limit:
  9598  	CMPQ SI, $0x00
  9599  	JHI  loop
  9600  	RET
  9601  
  9602  // func AmdAxpyUnsafeX_V3A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9603  // Requires: SSE
  9604  TEXT ·AmdAxpyUnsafeX_V3A13R4(SB), NOSPLIT, $0-48
  9605  	MOVSS alpha+0(FP), X0
  9606  	MOVQ  xs+8(FP), AX
  9607  	MOVQ  incx+16(FP), CX
  9608  	MOVQ  ys+24(FP), DX
  9609  	MOVQ  incy+32(FP), BX
  9610  	MOVQ  n+40(FP), SI
  9611  	XORQ  DI, DI
  9612  	XORQ  R8, R8
  9613  	JMP   check_limit_unroll
  9614  	PCALIGN $0x08
  9615  	NOP
  9616  	NOP
  9617  	NOP
  9618  	NOP
  9619  	NOP
  9620  
  9621  loop_unroll:
  9622  	MOVSS (AX)(DI*4), X1
  9623  	MULSS X0, X1
  9624  	ADDSS (DX)(R8*4), X1
  9625  	MOVSS X1, (DX)(R8*4)
  9626  	ADDQ  CX, DI
  9627  	ADDQ  BX, R8
  9628  	MOVSS (AX)(DI*4), X1
  9629  	MULSS X0, X1
  9630  	ADDSS (DX)(R8*4), X1
  9631  	MOVSS X1, (DX)(R8*4)
  9632  	ADDQ  CX, DI
  9633  	ADDQ  BX, R8
  9634  	MOVSS (AX)(DI*4), X1
  9635  	MULSS X0, X1
  9636  	ADDSS (DX)(R8*4), X1
  9637  	MOVSS X1, (DX)(R8*4)
  9638  	ADDQ  CX, DI
  9639  	ADDQ  BX, R8
  9640  	MOVSS (AX)(DI*4), X1
  9641  	MULSS X0, X1
  9642  	ADDSS (DX)(R8*4), X1
  9643  	MOVSS X1, (DX)(R8*4)
  9644  	ADDQ  CX, DI
  9645  	ADDQ  BX, R8
  9646  	SUBQ  $0x04, SI
  9647  
  9648  check_limit_unroll:
  9649  	CMPQ SI, $0x04
  9650  	JHI  loop_unroll
  9651  	JMP  check_limit
  9652  
  9653  loop:
  9654  	MOVSS (AX)(DI*4), X1
  9655  	MULSS X0, X1
  9656  	ADDSS (DX)(R8*4), X1
  9657  	MOVSS X1, (DX)(R8*4)
  9658  	DECQ  SI
  9659  	ADDQ  CX, DI
  9660  	ADDQ  BX, R8
  9661  
  9662  check_limit:
  9663  	CMPQ SI, $0x00
  9664  	JHI  loop
  9665  	RET
  9666  
  9667  // func AmdAxpyUnsafeX_V4A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9668  // Requires: SSE
  9669  TEXT ·AmdAxpyUnsafeX_V4A13R4(SB), NOSPLIT, $0-48
  9670  	MOVSS alpha+0(FP), X0
  9671  	MOVQ  xs+8(FP), AX
  9672  	MOVQ  incx+16(FP), CX
  9673  	MOVQ  ys+24(FP), DX
  9674  	MOVQ  incy+32(FP), BX
  9675  	MOVQ  n+40(FP), SI
  9676  	XORQ  DI, DI
  9677  	XORQ  R8, R8
  9678  	JMP   check_limit_unroll
  9679  	PCALIGN $0x08
  9680  	NOP
  9681  	NOP
  9682  	NOP
  9683  	NOP
  9684  	NOP
  9685  
  9686  loop_unroll:
  9687  	MOVSS (AX)(DI*4), X1
  9688  	MULSS X0, X1
  9689  	ADDSS (DX)(R8*4), X1
  9690  	MOVSS X1, (DX)(R8*4)
  9691  	ADDQ  CX, DI
  9692  	ADDQ  BX, R8
  9693  	MOVSS (AX)(DI*4), X1
  9694  	MULSS X0, X1
  9695  	ADDSS (DX)(R8*4), X1
  9696  	MOVSS X1, (DX)(R8*4)
  9697  	ADDQ  CX, DI
  9698  	ADDQ  BX, R8
  9699  	MOVSS (AX)(DI*4), X1
  9700  	MULSS X0, X1
  9701  	ADDSS (DX)(R8*4), X1
  9702  	MOVSS X1, (DX)(R8*4)
  9703  	ADDQ  CX, DI
  9704  	ADDQ  BX, R8
  9705  	MOVSS (AX)(DI*4), X1
  9706  	MULSS X0, X1
  9707  	ADDSS (DX)(R8*4), X1
  9708  	MOVSS X1, (DX)(R8*4)
  9709  	ADDQ  CX, DI
  9710  	ADDQ  BX, R8
  9711  	SUBQ  $0x04, SI
  9712  
  9713  check_limit_unroll:
  9714  	CMPQ SI, $0x04
  9715  	JHI  loop_unroll
  9716  	JMP  check_limit
  9717  
  9718  loop:
  9719  	MOVSS (AX)(DI*4), X1
  9720  	MULSS X0, X1
  9721  	ADDSS (DX)(R8*4), X1
  9722  	MOVSS X1, (DX)(R8*4)
  9723  	DECQ  SI
  9724  	ADDQ  CX, DI
  9725  	ADDQ  BX, R8
  9726  
  9727  check_limit:
  9728  	CMPQ SI, $0x00
  9729  	JHI  loop
  9730  	RET
  9731  
  9732  // func AmdAxpyUnsafeX_V5A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9733  // Requires: SSE
  9734  TEXT ·AmdAxpyUnsafeX_V5A13R4(SB), NOSPLIT, $0-48
  9735  	MOVSS alpha+0(FP), X0
  9736  	MOVQ  xs+8(FP), AX
  9737  	MOVQ  incx+16(FP), CX
  9738  	MOVQ  ys+24(FP), DX
  9739  	MOVQ  incy+32(FP), BX
  9740  	MOVQ  n+40(FP), SI
  9741  	XORQ  DI, DI
  9742  	XORQ  R8, R8
  9743  	JMP   check_limit_unroll
  9744  	PCALIGN $0x08
  9745  	NOP
  9746  	NOP
  9747  	NOP
  9748  	NOP
  9749  	NOP
  9750  
  9751  loop_unroll:
  9752  	MOVSS (AX)(DI*4), X1
  9753  	MULSS X0, X1
  9754  	ADDSS (DX)(R8*4), X1
  9755  	MOVSS X1, (DX)(R8*4)
  9756  	ADDQ  CX, DI
  9757  	ADDQ  BX, R8
  9758  	MOVSS (AX)(DI*4), X1
  9759  	MULSS X0, X1
  9760  	ADDSS (DX)(R8*4), X1
  9761  	MOVSS X1, (DX)(R8*4)
  9762  	ADDQ  CX, DI
  9763  	ADDQ  BX, R8
  9764  	MOVSS (AX)(DI*4), X1
  9765  	MULSS X0, X1
  9766  	ADDSS (DX)(R8*4), X1
  9767  	MOVSS X1, (DX)(R8*4)
  9768  	ADDQ  CX, DI
  9769  	ADDQ  BX, R8
  9770  	MOVSS (AX)(DI*4), X1
  9771  	MULSS X0, X1
  9772  	ADDSS (DX)(R8*4), X1
  9773  	MOVSS X1, (DX)(R8*4)
  9774  	ADDQ  CX, DI
  9775  	ADDQ  BX, R8
  9776  	SUBQ  $0x04, SI
  9777  
  9778  check_limit_unroll:
  9779  	CMPQ SI, $0x04
  9780  	JHI  loop_unroll
  9781  	JMP  check_limit
  9782  
  9783  loop:
  9784  	MOVSS (AX)(DI*4), X1
  9785  	MULSS X0, X1
  9786  	ADDSS (DX)(R8*4), X1
  9787  	MOVSS X1, (DX)(R8*4)
  9788  	DECQ  SI
  9789  	ADDQ  CX, DI
  9790  	ADDQ  BX, R8
  9791  
  9792  check_limit:
  9793  	CMPQ SI, $0x00
  9794  	JHI  loop
  9795  	RET
  9796  
  9797  // func AmdAxpyUnsafeX_V0A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9798  // Requires: SSE
  9799  TEXT ·AmdAxpyUnsafeX_V0A14R4(SB), NOSPLIT, $0-48
  9800  	MOVSS alpha+0(FP), X0
  9801  	MOVQ  xs+8(FP), AX
  9802  	MOVQ  incx+16(FP), CX
  9803  	MOVQ  ys+24(FP), DX
  9804  	MOVQ  incy+32(FP), BX
  9805  	MOVQ  n+40(FP), SI
  9806  	XORQ  DI, DI
  9807  	XORQ  R8, R8
  9808  	JMP   check_limit_unroll
  9809  	PCALIGN $0x08
  9810  	NOP
  9811  	NOP
  9812  	NOP
  9813  	NOP
  9814  	NOP
  9815  	NOP
  9816  
  9817  loop_unroll:
  9818  	MOVSS (AX)(DI*4), X1
  9819  	MULSS X0, X1
  9820  	ADDSS (DX)(R8*4), X1
  9821  	MOVSS X1, (DX)(R8*4)
  9822  	ADDQ  CX, DI
  9823  	ADDQ  BX, R8
  9824  	MOVSS (AX)(DI*4), X1
  9825  	MULSS X0, X1
  9826  	ADDSS (DX)(R8*4), X1
  9827  	MOVSS X1, (DX)(R8*4)
  9828  	ADDQ  CX, DI
  9829  	ADDQ  BX, R8
  9830  	MOVSS (AX)(DI*4), X1
  9831  	MULSS X0, X1
  9832  	ADDSS (DX)(R8*4), X1
  9833  	MOVSS X1, (DX)(R8*4)
  9834  	ADDQ  CX, DI
  9835  	ADDQ  BX, R8
  9836  	MOVSS (AX)(DI*4), X1
  9837  	MULSS X0, X1
  9838  	ADDSS (DX)(R8*4), X1
  9839  	MOVSS X1, (DX)(R8*4)
  9840  	ADDQ  CX, DI
  9841  	ADDQ  BX, R8
  9842  	SUBQ  $0x04, SI
  9843  
  9844  check_limit_unroll:
  9845  	CMPQ SI, $0x04
  9846  	JHI  loop_unroll
  9847  	JMP  check_limit
  9848  
  9849  loop:
  9850  	MOVSS (AX)(DI*4), X1
  9851  	MULSS X0, X1
  9852  	ADDSS (DX)(R8*4), X1
  9853  	MOVSS X1, (DX)(R8*4)
  9854  	DECQ  SI
  9855  	ADDQ  CX, DI
  9856  	ADDQ  BX, R8
  9857  
  9858  check_limit:
  9859  	CMPQ SI, $0x00
  9860  	JHI  loop
  9861  	RET
  9862  
  9863  // func AmdAxpyUnsafeX_V1A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9864  // Requires: SSE
  9865  TEXT ·AmdAxpyUnsafeX_V1A14R4(SB), NOSPLIT, $0-48
  9866  	MOVSS alpha+0(FP), X0
  9867  	MOVQ  xs+8(FP), AX
  9868  	MOVQ  incx+16(FP), CX
  9869  	MOVQ  ys+24(FP), DX
  9870  	MOVQ  incy+32(FP), BX
  9871  	MOVQ  n+40(FP), SI
  9872  	XORQ  DI, DI
  9873  	XORQ  R8, R8
  9874  	JMP   check_limit_unroll
  9875  	PCALIGN $0x08
  9876  	NOP
  9877  	NOP
  9878  	NOP
  9879  	NOP
  9880  	NOP
  9881  	NOP
  9882  
  9883  loop_unroll:
  9884  	MOVSS (AX)(DI*4), X1
  9885  	MULSS X0, X1
  9886  	ADDSS (DX)(R8*4), X1
  9887  	MOVSS X1, (DX)(R8*4)
  9888  	ADDQ  CX, DI
  9889  	ADDQ  BX, R8
  9890  	MOVSS (AX)(DI*4), X1
  9891  	MULSS X0, X1
  9892  	ADDSS (DX)(R8*4), X1
  9893  	MOVSS X1, (DX)(R8*4)
  9894  	ADDQ  CX, DI
  9895  	ADDQ  BX, R8
  9896  	MOVSS (AX)(DI*4), X1
  9897  	MULSS X0, X1
  9898  	ADDSS (DX)(R8*4), X1
  9899  	MOVSS X1, (DX)(R8*4)
  9900  	ADDQ  CX, DI
  9901  	ADDQ  BX, R8
  9902  	MOVSS (AX)(DI*4), X1
  9903  	MULSS X0, X1
  9904  	ADDSS (DX)(R8*4), X1
  9905  	MOVSS X1, (DX)(R8*4)
  9906  	ADDQ  CX, DI
  9907  	ADDQ  BX, R8
  9908  	SUBQ  $0x04, SI
  9909  
  9910  check_limit_unroll:
  9911  	CMPQ SI, $0x04
  9912  	JHI  loop_unroll
  9913  	JMP  check_limit
  9914  
  9915  loop:
  9916  	MOVSS (AX)(DI*4), X1
  9917  	MULSS X0, X1
  9918  	ADDSS (DX)(R8*4), X1
  9919  	MOVSS X1, (DX)(R8*4)
  9920  	DECQ  SI
  9921  	ADDQ  CX, DI
  9922  	ADDQ  BX, R8
  9923  
  9924  check_limit:
  9925  	CMPQ SI, $0x00
  9926  	JHI  loop
  9927  	RET
  9928  
  9929  // func AmdAxpyUnsafeX_V2A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9930  // Requires: SSE
  9931  TEXT ·AmdAxpyUnsafeX_V2A14R4(SB), NOSPLIT, $0-48
  9932  	MOVSS alpha+0(FP), X0
  9933  	MOVQ  xs+8(FP), AX
  9934  	MOVQ  incx+16(FP), CX
  9935  	MOVQ  ys+24(FP), DX
  9936  	MOVQ  incy+32(FP), BX
  9937  	MOVQ  n+40(FP), SI
  9938  	XORQ  DI, DI
  9939  	XORQ  R8, R8
  9940  	JMP   check_limit_unroll
  9941  	PCALIGN $0x08
  9942  	NOP
  9943  	NOP
  9944  	NOP
  9945  	NOP
  9946  	NOP
  9947  	NOP
  9948  
  9949  loop_unroll:
  9950  	MOVSS (AX)(DI*4), X1
  9951  	MULSS X0, X1
  9952  	ADDSS (DX)(R8*4), X1
  9953  	MOVSS X1, (DX)(R8*4)
  9954  	ADDQ  CX, DI
  9955  	ADDQ  BX, R8
  9956  	MOVSS (AX)(DI*4), X1
  9957  	MULSS X0, X1
  9958  	ADDSS (DX)(R8*4), X1
  9959  	MOVSS X1, (DX)(R8*4)
  9960  	ADDQ  CX, DI
  9961  	ADDQ  BX, R8
  9962  	MOVSS (AX)(DI*4), X1
  9963  	MULSS X0, X1
  9964  	ADDSS (DX)(R8*4), X1
  9965  	MOVSS X1, (DX)(R8*4)
  9966  	ADDQ  CX, DI
  9967  	ADDQ  BX, R8
  9968  	MOVSS (AX)(DI*4), X1
  9969  	MULSS X0, X1
  9970  	ADDSS (DX)(R8*4), X1
  9971  	MOVSS X1, (DX)(R8*4)
  9972  	ADDQ  CX, DI
  9973  	ADDQ  BX, R8
  9974  	SUBQ  $0x04, SI
  9975  
  9976  check_limit_unroll:
  9977  	CMPQ SI, $0x04
  9978  	JHI  loop_unroll
  9979  	JMP  check_limit
  9980  
  9981  loop:
  9982  	MOVSS (AX)(DI*4), X1
  9983  	MULSS X0, X1
  9984  	ADDSS (DX)(R8*4), X1
  9985  	MOVSS X1, (DX)(R8*4)
  9986  	DECQ  SI
  9987  	ADDQ  CX, DI
  9988  	ADDQ  BX, R8
  9989  
  9990  check_limit:
  9991  	CMPQ SI, $0x00
  9992  	JHI  loop
  9993  	RET
  9994  
  9995  // func AmdAxpyUnsafeX_V3A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
  9996  // Requires: SSE
  9997  TEXT ·AmdAxpyUnsafeX_V3A14R4(SB), NOSPLIT, $0-48
  9998  	MOVSS alpha+0(FP), X0
  9999  	MOVQ  xs+8(FP), AX
 10000  	MOVQ  incx+16(FP), CX
 10001  	MOVQ  ys+24(FP), DX
 10002  	MOVQ  incy+32(FP), BX
 10003  	MOVQ  n+40(FP), SI
 10004  	XORQ  DI, DI
 10005  	XORQ  R8, R8
 10006  	JMP   check_limit_unroll
 10007  	PCALIGN $0x08
 10008  	NOP
 10009  	NOP
 10010  	NOP
 10011  	NOP
 10012  	NOP
 10013  	NOP
 10014  
 10015  loop_unroll:
 10016  	MOVSS (AX)(DI*4), X1
 10017  	MULSS X0, X1
 10018  	ADDSS (DX)(R8*4), X1
 10019  	MOVSS X1, (DX)(R8*4)
 10020  	ADDQ  CX, DI
 10021  	ADDQ  BX, R8
 10022  	MOVSS (AX)(DI*4), X1
 10023  	MULSS X0, X1
 10024  	ADDSS (DX)(R8*4), X1
 10025  	MOVSS X1, (DX)(R8*4)
 10026  	ADDQ  CX, DI
 10027  	ADDQ  BX, R8
 10028  	MOVSS (AX)(DI*4), X1
 10029  	MULSS X0, X1
 10030  	ADDSS (DX)(R8*4), X1
 10031  	MOVSS X1, (DX)(R8*4)
 10032  	ADDQ  CX, DI
 10033  	ADDQ  BX, R8
 10034  	MOVSS (AX)(DI*4), X1
 10035  	MULSS X0, X1
 10036  	ADDSS (DX)(R8*4), X1
 10037  	MOVSS X1, (DX)(R8*4)
 10038  	ADDQ  CX, DI
 10039  	ADDQ  BX, R8
 10040  	SUBQ  $0x04, SI
 10041  
 10042  check_limit_unroll:
 10043  	CMPQ SI, $0x04
 10044  	JHI  loop_unroll
 10045  	JMP  check_limit
 10046  
 10047  loop:
 10048  	MOVSS (AX)(DI*4), X1
 10049  	MULSS X0, X1
 10050  	ADDSS (DX)(R8*4), X1
 10051  	MOVSS X1, (DX)(R8*4)
 10052  	DECQ  SI
 10053  	ADDQ  CX, DI
 10054  	ADDQ  BX, R8
 10055  
 10056  check_limit:
 10057  	CMPQ SI, $0x00
 10058  	JHI  loop
 10059  	RET
 10060  
 10061  // func AmdAxpyUnsafeX_V4A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10062  // Requires: SSE
 10063  TEXT ·AmdAxpyUnsafeX_V4A14R4(SB), NOSPLIT, $0-48
 10064  	MOVSS alpha+0(FP), X0
 10065  	MOVQ  xs+8(FP), AX
 10066  	MOVQ  incx+16(FP), CX
 10067  	MOVQ  ys+24(FP), DX
 10068  	MOVQ  incy+32(FP), BX
 10069  	MOVQ  n+40(FP), SI
 10070  	XORQ  DI, DI
 10071  	XORQ  R8, R8
 10072  	JMP   check_limit_unroll
 10073  	PCALIGN $0x08
 10074  	NOP
 10075  	NOP
 10076  	NOP
 10077  	NOP
 10078  	NOP
 10079  	NOP
 10080  
 10081  loop_unroll:
 10082  	MOVSS (AX)(DI*4), X1
 10083  	MULSS X0, X1
 10084  	ADDSS (DX)(R8*4), X1
 10085  	MOVSS X1, (DX)(R8*4)
 10086  	ADDQ  CX, DI
 10087  	ADDQ  BX, R8
 10088  	MOVSS (AX)(DI*4), X1
 10089  	MULSS X0, X1
 10090  	ADDSS (DX)(R8*4), X1
 10091  	MOVSS X1, (DX)(R8*4)
 10092  	ADDQ  CX, DI
 10093  	ADDQ  BX, R8
 10094  	MOVSS (AX)(DI*4), X1
 10095  	MULSS X0, X1
 10096  	ADDSS (DX)(R8*4), X1
 10097  	MOVSS X1, (DX)(R8*4)
 10098  	ADDQ  CX, DI
 10099  	ADDQ  BX, R8
 10100  	MOVSS (AX)(DI*4), X1
 10101  	MULSS X0, X1
 10102  	ADDSS (DX)(R8*4), X1
 10103  	MOVSS X1, (DX)(R8*4)
 10104  	ADDQ  CX, DI
 10105  	ADDQ  BX, R8
 10106  	SUBQ  $0x04, SI
 10107  
 10108  check_limit_unroll:
 10109  	CMPQ SI, $0x04
 10110  	JHI  loop_unroll
 10111  	JMP  check_limit
 10112  
 10113  loop:
 10114  	MOVSS (AX)(DI*4), X1
 10115  	MULSS X0, X1
 10116  	ADDSS (DX)(R8*4), X1
 10117  	MOVSS X1, (DX)(R8*4)
 10118  	DECQ  SI
 10119  	ADDQ  CX, DI
 10120  	ADDQ  BX, R8
 10121  
 10122  check_limit:
 10123  	CMPQ SI, $0x00
 10124  	JHI  loop
 10125  	RET
 10126  
 10127  // func AmdAxpyUnsafeX_V5A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10128  // Requires: SSE
 10129  TEXT ·AmdAxpyUnsafeX_V5A14R4(SB), NOSPLIT, $0-48
 10130  	MOVSS alpha+0(FP), X0
 10131  	MOVQ  xs+8(FP), AX
 10132  	MOVQ  incx+16(FP), CX
 10133  	MOVQ  ys+24(FP), DX
 10134  	MOVQ  incy+32(FP), BX
 10135  	MOVQ  n+40(FP), SI
 10136  	XORQ  DI, DI
 10137  	XORQ  R8, R8
 10138  	JMP   check_limit_unroll
 10139  	PCALIGN $0x08
 10140  	NOP
 10141  	NOP
 10142  	NOP
 10143  	NOP
 10144  	NOP
 10145  	NOP
 10146  
 10147  loop_unroll:
 10148  	MOVSS (AX)(DI*4), X1
 10149  	MULSS X0, X1
 10150  	ADDSS (DX)(R8*4), X1
 10151  	MOVSS X1, (DX)(R8*4)
 10152  	ADDQ  CX, DI
 10153  	ADDQ  BX, R8
 10154  	MOVSS (AX)(DI*4), X1
 10155  	MULSS X0, X1
 10156  	ADDSS (DX)(R8*4), X1
 10157  	MOVSS X1, (DX)(R8*4)
 10158  	ADDQ  CX, DI
 10159  	ADDQ  BX, R8
 10160  	MOVSS (AX)(DI*4), X1
 10161  	MULSS X0, X1
 10162  	ADDSS (DX)(R8*4), X1
 10163  	MOVSS X1, (DX)(R8*4)
 10164  	ADDQ  CX, DI
 10165  	ADDQ  BX, R8
 10166  	MOVSS (AX)(DI*4), X1
 10167  	MULSS X0, X1
 10168  	ADDSS (DX)(R8*4), X1
 10169  	MOVSS X1, (DX)(R8*4)
 10170  	ADDQ  CX, DI
 10171  	ADDQ  BX, R8
 10172  	SUBQ  $0x04, SI
 10173  
 10174  check_limit_unroll:
 10175  	CMPQ SI, $0x04
 10176  	JHI  loop_unroll
 10177  	JMP  check_limit
 10178  
 10179  loop:
 10180  	MOVSS (AX)(DI*4), X1
 10181  	MULSS X0, X1
 10182  	ADDSS (DX)(R8*4), X1
 10183  	MOVSS X1, (DX)(R8*4)
 10184  	DECQ  SI
 10185  	ADDQ  CX, DI
 10186  	ADDQ  BX, R8
 10187  
 10188  check_limit:
 10189  	CMPQ SI, $0x00
 10190  	JHI  loop
 10191  	RET
 10192  
 10193  // func AmdAxpyUnsafeX_V0A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10194  // Requires: SSE
 10195  TEXT ·AmdAxpyUnsafeX_V0A15R4(SB), NOSPLIT, $0-48
 10196  	MOVSS alpha+0(FP), X0
 10197  	MOVQ  xs+8(FP), AX
 10198  	MOVQ  incx+16(FP), CX
 10199  	MOVQ  ys+24(FP), DX
 10200  	MOVQ  incy+32(FP), BX
 10201  	MOVQ  n+40(FP), SI
 10202  	XORQ  DI, DI
 10203  	XORQ  R8, R8
 10204  	JMP   check_limit_unroll
 10205  	PCALIGN $0x08
 10206  	NOP
 10207  	NOP
 10208  	NOP
 10209  	NOP
 10210  	NOP
 10211  	NOP
 10212  	NOP
 10213  
 10214  loop_unroll:
 10215  	MOVSS (AX)(DI*4), X1
 10216  	MULSS X0, X1
 10217  	ADDSS (DX)(R8*4), X1
 10218  	MOVSS X1, (DX)(R8*4)
 10219  	ADDQ  CX, DI
 10220  	ADDQ  BX, R8
 10221  	MOVSS (AX)(DI*4), X1
 10222  	MULSS X0, X1
 10223  	ADDSS (DX)(R8*4), X1
 10224  	MOVSS X1, (DX)(R8*4)
 10225  	ADDQ  CX, DI
 10226  	ADDQ  BX, R8
 10227  	MOVSS (AX)(DI*4), X1
 10228  	MULSS X0, X1
 10229  	ADDSS (DX)(R8*4), X1
 10230  	MOVSS X1, (DX)(R8*4)
 10231  	ADDQ  CX, DI
 10232  	ADDQ  BX, R8
 10233  	MOVSS (AX)(DI*4), X1
 10234  	MULSS X0, X1
 10235  	ADDSS (DX)(R8*4), X1
 10236  	MOVSS X1, (DX)(R8*4)
 10237  	ADDQ  CX, DI
 10238  	ADDQ  BX, R8
 10239  	SUBQ  $0x04, SI
 10240  
 10241  check_limit_unroll:
 10242  	CMPQ SI, $0x04
 10243  	JHI  loop_unroll
 10244  	JMP  check_limit
 10245  
 10246  loop:
 10247  	MOVSS (AX)(DI*4), X1
 10248  	MULSS X0, X1
 10249  	ADDSS (DX)(R8*4), X1
 10250  	MOVSS X1, (DX)(R8*4)
 10251  	DECQ  SI
 10252  	ADDQ  CX, DI
 10253  	ADDQ  BX, R8
 10254  
 10255  check_limit:
 10256  	CMPQ SI, $0x00
 10257  	JHI  loop
 10258  	RET
 10259  
 10260  // func AmdAxpyUnsafeX_V1A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10261  // Requires: SSE
 10262  TEXT ·AmdAxpyUnsafeX_V1A15R4(SB), NOSPLIT, $0-48
 10263  	MOVSS alpha+0(FP), X0
 10264  	MOVQ  xs+8(FP), AX
 10265  	MOVQ  incx+16(FP), CX
 10266  	MOVQ  ys+24(FP), DX
 10267  	MOVQ  incy+32(FP), BX
 10268  	MOVQ  n+40(FP), SI
 10269  	XORQ  DI, DI
 10270  	XORQ  R8, R8
 10271  	JMP   check_limit_unroll
 10272  	PCALIGN $0x08
 10273  	NOP
 10274  	NOP
 10275  	NOP
 10276  	NOP
 10277  	NOP
 10278  	NOP
 10279  	NOP
 10280  
 10281  loop_unroll:
 10282  	MOVSS (AX)(DI*4), X1
 10283  	MULSS X0, X1
 10284  	ADDSS (DX)(R8*4), X1
 10285  	MOVSS X1, (DX)(R8*4)
 10286  	ADDQ  CX, DI
 10287  	ADDQ  BX, R8
 10288  	MOVSS (AX)(DI*4), X1
 10289  	MULSS X0, X1
 10290  	ADDSS (DX)(R8*4), X1
 10291  	MOVSS X1, (DX)(R8*4)
 10292  	ADDQ  CX, DI
 10293  	ADDQ  BX, R8
 10294  	MOVSS (AX)(DI*4), X1
 10295  	MULSS X0, X1
 10296  	ADDSS (DX)(R8*4), X1
 10297  	MOVSS X1, (DX)(R8*4)
 10298  	ADDQ  CX, DI
 10299  	ADDQ  BX, R8
 10300  	MOVSS (AX)(DI*4), X1
 10301  	MULSS X0, X1
 10302  	ADDSS (DX)(R8*4), X1
 10303  	MOVSS X1, (DX)(R8*4)
 10304  	ADDQ  CX, DI
 10305  	ADDQ  BX, R8
 10306  	SUBQ  $0x04, SI
 10307  
 10308  check_limit_unroll:
 10309  	CMPQ SI, $0x04
 10310  	JHI  loop_unroll
 10311  	JMP  check_limit
 10312  
 10313  loop:
 10314  	MOVSS (AX)(DI*4), X1
 10315  	MULSS X0, X1
 10316  	ADDSS (DX)(R8*4), X1
 10317  	MOVSS X1, (DX)(R8*4)
 10318  	DECQ  SI
 10319  	ADDQ  CX, DI
 10320  	ADDQ  BX, R8
 10321  
 10322  check_limit:
 10323  	CMPQ SI, $0x00
 10324  	JHI  loop
 10325  	RET
 10326  
 10327  // func AmdAxpyUnsafeX_V2A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10328  // Requires: SSE
 10329  TEXT ·AmdAxpyUnsafeX_V2A15R4(SB), NOSPLIT, $0-48
 10330  	MOVSS alpha+0(FP), X0
 10331  	MOVQ  xs+8(FP), AX
 10332  	MOVQ  incx+16(FP), CX
 10333  	MOVQ  ys+24(FP), DX
 10334  	MOVQ  incy+32(FP), BX
 10335  	MOVQ  n+40(FP), SI
 10336  	XORQ  DI, DI
 10337  	XORQ  R8, R8
 10338  	JMP   check_limit_unroll
 10339  	PCALIGN $0x08
 10340  	NOP
 10341  	NOP
 10342  	NOP
 10343  	NOP
 10344  	NOP
 10345  	NOP
 10346  	NOP
 10347  
 10348  loop_unroll:
 10349  	MOVSS (AX)(DI*4), X1
 10350  	MULSS X0, X1
 10351  	ADDSS (DX)(R8*4), X1
 10352  	MOVSS X1, (DX)(R8*4)
 10353  	ADDQ  CX, DI
 10354  	ADDQ  BX, R8
 10355  	MOVSS (AX)(DI*4), X1
 10356  	MULSS X0, X1
 10357  	ADDSS (DX)(R8*4), X1
 10358  	MOVSS X1, (DX)(R8*4)
 10359  	ADDQ  CX, DI
 10360  	ADDQ  BX, R8
 10361  	MOVSS (AX)(DI*4), X1
 10362  	MULSS X0, X1
 10363  	ADDSS (DX)(R8*4), X1
 10364  	MOVSS X1, (DX)(R8*4)
 10365  	ADDQ  CX, DI
 10366  	ADDQ  BX, R8
 10367  	MOVSS (AX)(DI*4), X1
 10368  	MULSS X0, X1
 10369  	ADDSS (DX)(R8*4), X1
 10370  	MOVSS X1, (DX)(R8*4)
 10371  	ADDQ  CX, DI
 10372  	ADDQ  BX, R8
 10373  	SUBQ  $0x04, SI
 10374  
 10375  check_limit_unroll:
 10376  	CMPQ SI, $0x04
 10377  	JHI  loop_unroll
 10378  	JMP  check_limit
 10379  
 10380  loop:
 10381  	MOVSS (AX)(DI*4), X1
 10382  	MULSS X0, X1
 10383  	ADDSS (DX)(R8*4), X1
 10384  	MOVSS X1, (DX)(R8*4)
 10385  	DECQ  SI
 10386  	ADDQ  CX, DI
 10387  	ADDQ  BX, R8
 10388  
 10389  check_limit:
 10390  	CMPQ SI, $0x00
 10391  	JHI  loop
 10392  	RET
 10393  
 10394  // func AmdAxpyUnsafeX_V3A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10395  // Requires: SSE
 10396  TEXT ·AmdAxpyUnsafeX_V3A15R4(SB), NOSPLIT, $0-48
 10397  	MOVSS alpha+0(FP), X0
 10398  	MOVQ  xs+8(FP), AX
 10399  	MOVQ  incx+16(FP), CX
 10400  	MOVQ  ys+24(FP), DX
 10401  	MOVQ  incy+32(FP), BX
 10402  	MOVQ  n+40(FP), SI
 10403  	XORQ  DI, DI
 10404  	XORQ  R8, R8
 10405  	JMP   check_limit_unroll
 10406  	PCALIGN $0x08
 10407  	NOP
 10408  	NOP
 10409  	NOP
 10410  	NOP
 10411  	NOP
 10412  	NOP
 10413  	NOP
 10414  
 10415  loop_unroll:
 10416  	MOVSS (AX)(DI*4), X1
 10417  	MULSS X0, X1
 10418  	ADDSS (DX)(R8*4), X1
 10419  	MOVSS X1, (DX)(R8*4)
 10420  	ADDQ  CX, DI
 10421  	ADDQ  BX, R8
 10422  	MOVSS (AX)(DI*4), X1
 10423  	MULSS X0, X1
 10424  	ADDSS (DX)(R8*4), X1
 10425  	MOVSS X1, (DX)(R8*4)
 10426  	ADDQ  CX, DI
 10427  	ADDQ  BX, R8
 10428  	MOVSS (AX)(DI*4), X1
 10429  	MULSS X0, X1
 10430  	ADDSS (DX)(R8*4), X1
 10431  	MOVSS X1, (DX)(R8*4)
 10432  	ADDQ  CX, DI
 10433  	ADDQ  BX, R8
 10434  	MOVSS (AX)(DI*4), X1
 10435  	MULSS X0, X1
 10436  	ADDSS (DX)(R8*4), X1
 10437  	MOVSS X1, (DX)(R8*4)
 10438  	ADDQ  CX, DI
 10439  	ADDQ  BX, R8
 10440  	SUBQ  $0x04, SI
 10441  
 10442  check_limit_unroll:
 10443  	CMPQ SI, $0x04
 10444  	JHI  loop_unroll
 10445  	JMP  check_limit
 10446  
 10447  loop:
 10448  	MOVSS (AX)(DI*4), X1
 10449  	MULSS X0, X1
 10450  	ADDSS (DX)(R8*4), X1
 10451  	MOVSS X1, (DX)(R8*4)
 10452  	DECQ  SI
 10453  	ADDQ  CX, DI
 10454  	ADDQ  BX, R8
 10455  
 10456  check_limit:
 10457  	CMPQ SI, $0x00
 10458  	JHI  loop
 10459  	RET
 10460  
 10461  // func AmdAxpyUnsafeX_V4A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10462  // Requires: SSE
 10463  TEXT ·AmdAxpyUnsafeX_V4A15R4(SB), NOSPLIT, $0-48
 10464  	MOVSS alpha+0(FP), X0
 10465  	MOVQ  xs+8(FP), AX
 10466  	MOVQ  incx+16(FP), CX
 10467  	MOVQ  ys+24(FP), DX
 10468  	MOVQ  incy+32(FP), BX
 10469  	MOVQ  n+40(FP), SI
 10470  	XORQ  DI, DI
 10471  	XORQ  R8, R8
 10472  	JMP   check_limit_unroll
 10473  	PCALIGN $0x08
 10474  	NOP
 10475  	NOP
 10476  	NOP
 10477  	NOP
 10478  	NOP
 10479  	NOP
 10480  	NOP
 10481  
 10482  loop_unroll:
 10483  	MOVSS (AX)(DI*4), X1
 10484  	MULSS X0, X1
 10485  	ADDSS (DX)(R8*4), X1
 10486  	MOVSS X1, (DX)(R8*4)
 10487  	ADDQ  CX, DI
 10488  	ADDQ  BX, R8
 10489  	MOVSS (AX)(DI*4), X1
 10490  	MULSS X0, X1
 10491  	ADDSS (DX)(R8*4), X1
 10492  	MOVSS X1, (DX)(R8*4)
 10493  	ADDQ  CX, DI
 10494  	ADDQ  BX, R8
 10495  	MOVSS (AX)(DI*4), X1
 10496  	MULSS X0, X1
 10497  	ADDSS (DX)(R8*4), X1
 10498  	MOVSS X1, (DX)(R8*4)
 10499  	ADDQ  CX, DI
 10500  	ADDQ  BX, R8
 10501  	MOVSS (AX)(DI*4), X1
 10502  	MULSS X0, X1
 10503  	ADDSS (DX)(R8*4), X1
 10504  	MOVSS X1, (DX)(R8*4)
 10505  	ADDQ  CX, DI
 10506  	ADDQ  BX, R8
 10507  	SUBQ  $0x04, SI
 10508  
 10509  check_limit_unroll:
 10510  	CMPQ SI, $0x04
 10511  	JHI  loop_unroll
 10512  	JMP  check_limit
 10513  
 10514  loop:
 10515  	MOVSS (AX)(DI*4), X1
 10516  	MULSS X0, X1
 10517  	ADDSS (DX)(R8*4), X1
 10518  	MOVSS X1, (DX)(R8*4)
 10519  	DECQ  SI
 10520  	ADDQ  CX, DI
 10521  	ADDQ  BX, R8
 10522  
 10523  check_limit:
 10524  	CMPQ SI, $0x00
 10525  	JHI  loop
 10526  	RET
 10527  
 10528  // func AmdAxpyUnsafeX_V5A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10529  // Requires: SSE
 10530  TEXT ·AmdAxpyUnsafeX_V5A15R4(SB), NOSPLIT, $0-48
 10531  	MOVSS alpha+0(FP), X0
 10532  	MOVQ  xs+8(FP), AX
 10533  	MOVQ  incx+16(FP), CX
 10534  	MOVQ  ys+24(FP), DX
 10535  	MOVQ  incy+32(FP), BX
 10536  	MOVQ  n+40(FP), SI
 10537  	XORQ  DI, DI
 10538  	XORQ  R8, R8
 10539  	JMP   check_limit_unroll
 10540  	PCALIGN $0x08
 10541  	NOP
 10542  	NOP
 10543  	NOP
 10544  	NOP
 10545  	NOP
 10546  	NOP
 10547  	NOP
 10548  
 10549  loop_unroll:
 10550  	MOVSS (AX)(DI*4), X1
 10551  	MULSS X0, X1
 10552  	ADDSS (DX)(R8*4), X1
 10553  	MOVSS X1, (DX)(R8*4)
 10554  	ADDQ  CX, DI
 10555  	ADDQ  BX, R8
 10556  	MOVSS (AX)(DI*4), X1
 10557  	MULSS X0, X1
 10558  	ADDSS (DX)(R8*4), X1
 10559  	MOVSS X1, (DX)(R8*4)
 10560  	ADDQ  CX, DI
 10561  	ADDQ  BX, R8
 10562  	MOVSS (AX)(DI*4), X1
 10563  	MULSS X0, X1
 10564  	ADDSS (DX)(R8*4), X1
 10565  	MOVSS X1, (DX)(R8*4)
 10566  	ADDQ  CX, DI
 10567  	ADDQ  BX, R8
 10568  	MOVSS (AX)(DI*4), X1
 10569  	MULSS X0, X1
 10570  	ADDSS (DX)(R8*4), X1
 10571  	MOVSS X1, (DX)(R8*4)
 10572  	ADDQ  CX, DI
 10573  	ADDQ  BX, R8
 10574  	SUBQ  $0x04, SI
 10575  
 10576  check_limit_unroll:
 10577  	CMPQ SI, $0x04
 10578  	JHI  loop_unroll
 10579  	JMP  check_limit
 10580  
 10581  loop:
 10582  	MOVSS (AX)(DI*4), X1
 10583  	MULSS X0, X1
 10584  	ADDSS (DX)(R8*4), X1
 10585  	MOVSS X1, (DX)(R8*4)
 10586  	DECQ  SI
 10587  	ADDQ  CX, DI
 10588  	ADDQ  BX, R8
 10589  
 10590  check_limit:
 10591  	CMPQ SI, $0x00
 10592  	JHI  loop
 10593  	RET
 10594  
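// In the A16 group below only the padding ahead of loop_unroll changes: the
// PCALIGN $0x08 plus trailing NOPs used above is replaced by a single
// PCALIGN $0x10, i.e. the unrolled loop appears to be aligned to a 16-byte
// boundary with no extra NOP bytes. The loop bodies themselves are unchanged.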
 10595  // func AmdAxpyUnsafeX_V0A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10596  // Requires: SSE
 10597  TEXT ·AmdAxpyUnsafeX_V0A16R4(SB), NOSPLIT, $0-48
 10598  	MOVSS alpha+0(FP), X0
 10599  	MOVQ  xs+8(FP), AX
 10600  	MOVQ  incx+16(FP), CX
 10601  	MOVQ  ys+24(FP), DX
 10602  	MOVQ  incy+32(FP), BX
 10603  	MOVQ  n+40(FP), SI
 10604  	XORQ  DI, DI
 10605  	XORQ  R8, R8
 10606  	JMP   check_limit_unroll
 10607  	PCALIGN $0x10
 10608  
 10609  loop_unroll:
 10610  	MOVSS (AX)(DI*4), X1
 10611  	MULSS X0, X1
 10612  	ADDSS (DX)(R8*4), X1
 10613  	MOVSS X1, (DX)(R8*4)
 10614  	ADDQ  CX, DI
 10615  	ADDQ  BX, R8
 10616  	MOVSS (AX)(DI*4), X1
 10617  	MULSS X0, X1
 10618  	ADDSS (DX)(R8*4), X1
 10619  	MOVSS X1, (DX)(R8*4)
 10620  	ADDQ  CX, DI
 10621  	ADDQ  BX, R8
 10622  	MOVSS (AX)(DI*4), X1
 10623  	MULSS X0, X1
 10624  	ADDSS (DX)(R8*4), X1
 10625  	MOVSS X1, (DX)(R8*4)
 10626  	ADDQ  CX, DI
 10627  	ADDQ  BX, R8
 10628  	MOVSS (AX)(DI*4), X1
 10629  	MULSS X0, X1
 10630  	ADDSS (DX)(R8*4), X1
 10631  	MOVSS X1, (DX)(R8*4)
 10632  	ADDQ  CX, DI
 10633  	ADDQ  BX, R8
 10634  	SUBQ  $0x04, SI
 10635  
 10636  check_limit_unroll:
 10637  	CMPQ SI, $0x04
 10638  	JHI  loop_unroll
 10639  	JMP  check_limit
 10640  
 10641  loop:
 10642  	MOVSS (AX)(DI*4), X1
 10643  	MULSS X0, X1
 10644  	ADDSS (DX)(R8*4), X1
 10645  	MOVSS X1, (DX)(R8*4)
 10646  	DECQ  SI
 10647  	ADDQ  CX, DI
 10648  	ADDQ  BX, R8
 10649  
 10650  check_limit:
 10651  	CMPQ SI, $0x00
 10652  	JHI  loop
 10653  	RET
 10654  
 10655  // func AmdAxpyUnsafeX_V1A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10656  // Requires: SSE
 10657  TEXT ·AmdAxpyUnsafeX_V1A16R4(SB), NOSPLIT, $0-48
 10658  	MOVSS alpha+0(FP), X0
 10659  	MOVQ  xs+8(FP), AX
 10660  	MOVQ  incx+16(FP), CX
 10661  	MOVQ  ys+24(FP), DX
 10662  	MOVQ  incy+32(FP), BX
 10663  	MOVQ  n+40(FP), SI
 10664  	XORQ  DI, DI
 10665  	XORQ  R8, R8
 10666  	JMP   check_limit_unroll
 10667  	PCALIGN $0x10
 10668  
 10669  loop_unroll:
 10670  	MOVSS (AX)(DI*4), X1
 10671  	MULSS X0, X1
 10672  	ADDSS (DX)(R8*4), X1
 10673  	MOVSS X1, (DX)(R8*4)
 10674  	ADDQ  CX, DI
 10675  	ADDQ  BX, R8
 10676  	MOVSS (AX)(DI*4), X1
 10677  	MULSS X0, X1
 10678  	ADDSS (DX)(R8*4), X1
 10679  	MOVSS X1, (DX)(R8*4)
 10680  	ADDQ  CX, DI
 10681  	ADDQ  BX, R8
 10682  	MOVSS (AX)(DI*4), X1
 10683  	MULSS X0, X1
 10684  	ADDSS (DX)(R8*4), X1
 10685  	MOVSS X1, (DX)(R8*4)
 10686  	ADDQ  CX, DI
 10687  	ADDQ  BX, R8
 10688  	MOVSS (AX)(DI*4), X1
 10689  	MULSS X0, X1
 10690  	ADDSS (DX)(R8*4), X1
 10691  	MOVSS X1, (DX)(R8*4)
 10692  	ADDQ  CX, DI
 10693  	ADDQ  BX, R8
 10694  	SUBQ  $0x04, SI
 10695  
 10696  check_limit_unroll:
 10697  	CMPQ SI, $0x04
 10698  	JHI  loop_unroll
 10699  	JMP  check_limit
 10700  
 10701  loop:
 10702  	MOVSS (AX)(DI*4), X1
 10703  	MULSS X0, X1
 10704  	ADDSS (DX)(R8*4), X1
 10705  	MOVSS X1, (DX)(R8*4)
 10706  	DECQ  SI
 10707  	ADDQ  CX, DI
 10708  	ADDQ  BX, R8
 10709  
 10710  check_limit:
 10711  	CMPQ SI, $0x00
 10712  	JHI  loop
 10713  	RET
 10714  
 10715  // func AmdAxpyUnsafeX_V2A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10716  // Requires: SSE
 10717  TEXT ·AmdAxpyUnsafeX_V2A16R4(SB), NOSPLIT, $0-48
 10718  	MOVSS alpha+0(FP), X0
 10719  	MOVQ  xs+8(FP), AX
 10720  	MOVQ  incx+16(FP), CX
 10721  	MOVQ  ys+24(FP), DX
 10722  	MOVQ  incy+32(FP), BX
 10723  	MOVQ  n+40(FP), SI
 10724  	XORQ  DI, DI
 10725  	XORQ  R8, R8
 10726  	JMP   check_limit_unroll
 10727  	PCALIGN $0x10
 10728  
 10729  loop_unroll:
 10730  	MOVSS (AX)(DI*4), X1
 10731  	MULSS X0, X1
 10732  	ADDSS (DX)(R8*4), X1
 10733  	MOVSS X1, (DX)(R8*4)
 10734  	ADDQ  CX, DI
 10735  	ADDQ  BX, R8
 10736  	MOVSS (AX)(DI*4), X1
 10737  	MULSS X0, X1
 10738  	ADDSS (DX)(R8*4), X1
 10739  	MOVSS X1, (DX)(R8*4)
 10740  	ADDQ  CX, DI
 10741  	ADDQ  BX, R8
 10742  	MOVSS (AX)(DI*4), X1
 10743  	MULSS X0, X1
 10744  	ADDSS (DX)(R8*4), X1
 10745  	MOVSS X1, (DX)(R8*4)
 10746  	ADDQ  CX, DI
 10747  	ADDQ  BX, R8
 10748  	MOVSS (AX)(DI*4), X1
 10749  	MULSS X0, X1
 10750  	ADDSS (DX)(R8*4), X1
 10751  	MOVSS X1, (DX)(R8*4)
 10752  	ADDQ  CX, DI
 10753  	ADDQ  BX, R8
 10754  	SUBQ  $0x04, SI
 10755  
 10756  check_limit_unroll:
 10757  	CMPQ SI, $0x04
 10758  	JHI  loop_unroll
 10759  	JMP  check_limit
 10760  
 10761  loop:
 10762  	MOVSS (AX)(DI*4), X1
 10763  	MULSS X0, X1
 10764  	ADDSS (DX)(R8*4), X1
 10765  	MOVSS X1, (DX)(R8*4)
 10766  	DECQ  SI
 10767  	ADDQ  CX, DI
 10768  	ADDQ  BX, R8
 10769  
 10770  check_limit:
 10771  	CMPQ SI, $0x00
 10772  	JHI  loop
 10773  	RET
 10774  
 10775  // func AmdAxpyUnsafeX_V3A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10776  // Requires: SSE
 10777  TEXT ·AmdAxpyUnsafeX_V3A16R4(SB), NOSPLIT, $0-48
 10778  	MOVSS alpha+0(FP), X0
 10779  	MOVQ  xs+8(FP), AX
 10780  	MOVQ  incx+16(FP), CX
 10781  	MOVQ  ys+24(FP), DX
 10782  	MOVQ  incy+32(FP), BX
 10783  	MOVQ  n+40(FP), SI
 10784  	XORQ  DI, DI
 10785  	XORQ  R8, R8
 10786  	JMP   check_limit_unroll
 10787  	PCALIGN $0x10
 10788  
 10789  loop_unroll:
 10790  	MOVSS (AX)(DI*4), X1
 10791  	MULSS X0, X1
 10792  	ADDSS (DX)(R8*4), X1
 10793  	MOVSS X1, (DX)(R8*4)
 10794  	ADDQ  CX, DI
 10795  	ADDQ  BX, R8
 10796  	MOVSS (AX)(DI*4), X1
 10797  	MULSS X0, X1
 10798  	ADDSS (DX)(R8*4), X1
 10799  	MOVSS X1, (DX)(R8*4)
 10800  	ADDQ  CX, DI
 10801  	ADDQ  BX, R8
 10802  	MOVSS (AX)(DI*4), X1
 10803  	MULSS X0, X1
 10804  	ADDSS (DX)(R8*4), X1
 10805  	MOVSS X1, (DX)(R8*4)
 10806  	ADDQ  CX, DI
 10807  	ADDQ  BX, R8
 10808  	MOVSS (AX)(DI*4), X1
 10809  	MULSS X0, X1
 10810  	ADDSS (DX)(R8*4), X1
 10811  	MOVSS X1, (DX)(R8*4)
 10812  	ADDQ  CX, DI
 10813  	ADDQ  BX, R8
 10814  	SUBQ  $0x04, SI
 10815  
 10816  check_limit_unroll:
 10817  	CMPQ SI, $0x04
 10818  	JHI  loop_unroll
 10819  	JMP  check_limit
 10820  
 10821  loop:
 10822  	MOVSS (AX)(DI*4), X1
 10823  	MULSS X0, X1
 10824  	ADDSS (DX)(R8*4), X1
 10825  	MOVSS X1, (DX)(R8*4)
 10826  	DECQ  SI
 10827  	ADDQ  CX, DI
 10828  	ADDQ  BX, R8
 10829  
 10830  check_limit:
 10831  	CMPQ SI, $0x00
 10832  	JHI  loop
 10833  	RET
 10834  
 10835  // func AmdAxpyUnsafeX_V4A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10836  // Requires: SSE
 10837  TEXT ·AmdAxpyUnsafeX_V4A16R4(SB), NOSPLIT, $0-48
 10838  	MOVSS alpha+0(FP), X0
 10839  	MOVQ  xs+8(FP), AX
 10840  	MOVQ  incx+16(FP), CX
 10841  	MOVQ  ys+24(FP), DX
 10842  	MOVQ  incy+32(FP), BX
 10843  	MOVQ  n+40(FP), SI
 10844  	XORQ  DI, DI
 10845  	XORQ  R8, R8
 10846  	JMP   check_limit_unroll
 10847  	PCALIGN $0x10
 10848  
 10849  loop_unroll:
 10850  	MOVSS (AX)(DI*4), X1
 10851  	MULSS X0, X1
 10852  	ADDSS (DX)(R8*4), X1
 10853  	MOVSS X1, (DX)(R8*4)
 10854  	ADDQ  CX, DI
 10855  	ADDQ  BX, R8
 10856  	MOVSS (AX)(DI*4), X1
 10857  	MULSS X0, X1
 10858  	ADDSS (DX)(R8*4), X1
 10859  	MOVSS X1, (DX)(R8*4)
 10860  	ADDQ  CX, DI
 10861  	ADDQ  BX, R8
 10862  	MOVSS (AX)(DI*4), X1
 10863  	MULSS X0, X1
 10864  	ADDSS (DX)(R8*4), X1
 10865  	MOVSS X1, (DX)(R8*4)
 10866  	ADDQ  CX, DI
 10867  	ADDQ  BX, R8
 10868  	MOVSS (AX)(DI*4), X1
 10869  	MULSS X0, X1
 10870  	ADDSS (DX)(R8*4), X1
 10871  	MOVSS X1, (DX)(R8*4)
 10872  	ADDQ  CX, DI
 10873  	ADDQ  BX, R8
 10874  	SUBQ  $0x04, SI
 10875  
 10876  check_limit_unroll:
 10877  	CMPQ SI, $0x04
 10878  	JHI  loop_unroll
 10879  	JMP  check_limit
 10880  
 10881  loop:
 10882  	MOVSS (AX)(DI*4), X1
 10883  	MULSS X0, X1
 10884  	ADDSS (DX)(R8*4), X1
 10885  	MOVSS X1, (DX)(R8*4)
 10886  	DECQ  SI
 10887  	ADDQ  CX, DI
 10888  	ADDQ  BX, R8
 10889  
 10890  check_limit:
 10891  	CMPQ SI, $0x00
 10892  	JHI  loop
 10893  	RET
 10894  
 10895  // func AmdAxpyUnsafeX_V5A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10896  // Requires: SSE
 10897  TEXT ·AmdAxpyUnsafeX_V5A16R4(SB), NOSPLIT, $0-48
 10898  	MOVSS alpha+0(FP), X0
 10899  	MOVQ  xs+8(FP), AX
 10900  	MOVQ  incx+16(FP), CX
 10901  	MOVQ  ys+24(FP), DX
 10902  	MOVQ  incy+32(FP), BX
 10903  	MOVQ  n+40(FP), SI
 10904  	XORQ  DI, DI
 10905  	XORQ  R8, R8
 10906  	JMP   check_limit_unroll
 10907  	PCALIGN $0x10
 10908  
 10909  loop_unroll:
 10910  	MOVSS (AX)(DI*4), X1
 10911  	MULSS X0, X1
 10912  	ADDSS (DX)(R8*4), X1
 10913  	MOVSS X1, (DX)(R8*4)
 10914  	ADDQ  CX, DI
 10915  	ADDQ  BX, R8
 10916  	MOVSS (AX)(DI*4), X1
 10917  	MULSS X0, X1
 10918  	ADDSS (DX)(R8*4), X1
 10919  	MOVSS X1, (DX)(R8*4)
 10920  	ADDQ  CX, DI
 10921  	ADDQ  BX, R8
 10922  	MOVSS (AX)(DI*4), X1
 10923  	MULSS X0, X1
 10924  	ADDSS (DX)(R8*4), X1
 10925  	MOVSS X1, (DX)(R8*4)
 10926  	ADDQ  CX, DI
 10927  	ADDQ  BX, R8
 10928  	MOVSS (AX)(DI*4), X1
 10929  	MULSS X0, X1
 10930  	ADDSS (DX)(R8*4), X1
 10931  	MOVSS X1, (DX)(R8*4)
 10932  	ADDQ  CX, DI
 10933  	ADDQ  BX, R8
 10934  	SUBQ  $0x04, SI
 10935  
 10936  check_limit_unroll:
 10937  	CMPQ SI, $0x04
 10938  	JHI  loop_unroll
 10939  	JMP  check_limit
 10940  
 10941  loop:
 10942  	MOVSS (AX)(DI*4), X1
 10943  	MULSS X0, X1
 10944  	ADDSS (DX)(R8*4), X1
 10945  	MOVSS X1, (DX)(R8*4)
 10946  	DECQ  SI
 10947  	ADDQ  CX, DI
 10948  	ADDQ  BX, R8
 10949  
 10950  check_limit:
 10951  	CMPQ SI, $0x00
 10952  	JHI  loop
 10953  	RET
 10954  
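// The R8 kernels that follow switch to an 8-way unrolled main loop: the
// multiply-add body repeats eight times per iteration and the counter is
// adjusted and tested against 8 (SUBQ $0x08 / CMPQ SI, $0x08) instead of 4,
// while the scalar tail loop is unchanged. The A0 suffix here appears to mean
// that no alignment padding is emitted before loop_unroll.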
 10955  // func AmdAxpyUnsafeX_V0A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 10956  // Requires: SSE
 10957  TEXT ·AmdAxpyUnsafeX_V0A0R8(SB), NOSPLIT, $0-48
 10958  	MOVSS alpha+0(FP), X0
 10959  	MOVQ  xs+8(FP), AX
 10960  	MOVQ  incx+16(FP), CX
 10961  	MOVQ  ys+24(FP), DX
 10962  	MOVQ  incy+32(FP), BX
 10963  	MOVQ  n+40(FP), SI
 10964  	XORQ  DI, DI
 10965  	XORQ  R8, R8
 10966  	JMP   check_limit_unroll
 10967  
 10968  loop_unroll:
 10969  	MOVSS (AX)(DI*4), X1
 10970  	MULSS X0, X1
 10971  	ADDSS (DX)(R8*4), X1
 10972  	MOVSS X1, (DX)(R8*4)
 10973  	ADDQ  CX, DI
 10974  	ADDQ  BX, R8
 10975  	MOVSS (AX)(DI*4), X1
 10976  	MULSS X0, X1
 10977  	ADDSS (DX)(R8*4), X1
 10978  	MOVSS X1, (DX)(R8*4)
 10979  	ADDQ  CX, DI
 10980  	ADDQ  BX, R8
 10981  	MOVSS (AX)(DI*4), X1
 10982  	MULSS X0, X1
 10983  	ADDSS (DX)(R8*4), X1
 10984  	MOVSS X1, (DX)(R8*4)
 10985  	ADDQ  CX, DI
 10986  	ADDQ  BX, R8
 10987  	MOVSS (AX)(DI*4), X1
 10988  	MULSS X0, X1
 10989  	ADDSS (DX)(R8*4), X1
 10990  	MOVSS X1, (DX)(R8*4)
 10991  	ADDQ  CX, DI
 10992  	ADDQ  BX, R8
 10993  	MOVSS (AX)(DI*4), X1
 10994  	MULSS X0, X1
 10995  	ADDSS (DX)(R8*4), X1
 10996  	MOVSS X1, (DX)(R8*4)
 10997  	ADDQ  CX, DI
 10998  	ADDQ  BX, R8
 10999  	MOVSS (AX)(DI*4), X1
 11000  	MULSS X0, X1
 11001  	ADDSS (DX)(R8*4), X1
 11002  	MOVSS X1, (DX)(R8*4)
 11003  	ADDQ  CX, DI
 11004  	ADDQ  BX, R8
 11005  	MOVSS (AX)(DI*4), X1
 11006  	MULSS X0, X1
 11007  	ADDSS (DX)(R8*4), X1
 11008  	MOVSS X1, (DX)(R8*4)
 11009  	ADDQ  CX, DI
 11010  	ADDQ  BX, R8
 11011  	MOVSS (AX)(DI*4), X1
 11012  	MULSS X0, X1
 11013  	ADDSS (DX)(R8*4), X1
 11014  	MOVSS X1, (DX)(R8*4)
 11015  	ADDQ  CX, DI
 11016  	ADDQ  BX, R8
 11017  	SUBQ  $0x08, SI
 11018  
 11019  check_limit_unroll:
 11020  	CMPQ SI, $0x08
 11021  	JHI  loop_unroll
 11022  	JMP  check_limit
 11023  
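// Scalar tail: the unrolled loop above only runs while more than 8 elements
// remain, so up to 8 trailing elements are handled here one at a time.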
 11024  loop:
 11025  	MOVSS (AX)(DI*4), X1
 11026  	MULSS X0, X1
 11027  	ADDSS (DX)(R8*4), X1
 11028  	MOVSS X1, (DX)(R8*4)
 11029  	DECQ  SI
 11030  	ADDQ  CX, DI
 11031  	ADDQ  BX, R8
 11032  
 11033  check_limit:
 11034  	CMPQ SI, $0x00
 11035  	JHI  loop
 11036  	RET
 11037  
 11038  // func AmdAxpyUnsafeX_V1A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11039  // Requires: SSE
 11040  TEXT ·AmdAxpyUnsafeX_V1A0R8(SB), NOSPLIT, $0-48
 11041  	MOVSS alpha+0(FP), X0
 11042  	MOVQ  xs+8(FP), AX
 11043  	MOVQ  incx+16(FP), CX
 11044  	MOVQ  ys+24(FP), DX
 11045  	MOVQ  incy+32(FP), BX
 11046  	MOVQ  n+40(FP), SI
 11047  	XORQ  DI, DI
 11048  	XORQ  R8, R8
 11049  	JMP   check_limit_unroll
 11050  
 11051  loop_unroll:
 11052  	MOVSS (AX)(DI*4), X1
 11053  	MULSS X0, X1
 11054  	ADDSS (DX)(R8*4), X1
 11055  	MOVSS X1, (DX)(R8*4)
 11056  	ADDQ  CX, DI
 11057  	ADDQ  BX, R8
 11058  	MOVSS (AX)(DI*4), X1
 11059  	MULSS X0, X1
 11060  	ADDSS (DX)(R8*4), X1
 11061  	MOVSS X1, (DX)(R8*4)
 11062  	ADDQ  CX, DI
 11063  	ADDQ  BX, R8
 11064  	MOVSS (AX)(DI*4), X1
 11065  	MULSS X0, X1
 11066  	ADDSS (DX)(R8*4), X1
 11067  	MOVSS X1, (DX)(R8*4)
 11068  	ADDQ  CX, DI
 11069  	ADDQ  BX, R8
 11070  	MOVSS (AX)(DI*4), X1
 11071  	MULSS X0, X1
 11072  	ADDSS (DX)(R8*4), X1
 11073  	MOVSS X1, (DX)(R8*4)
 11074  	ADDQ  CX, DI
 11075  	ADDQ  BX, R8
 11076  	MOVSS (AX)(DI*4), X1
 11077  	MULSS X0, X1
 11078  	ADDSS (DX)(R8*4), X1
 11079  	MOVSS X1, (DX)(R8*4)
 11080  	ADDQ  CX, DI
 11081  	ADDQ  BX, R8
 11082  	MOVSS (AX)(DI*4), X1
 11083  	MULSS X0, X1
 11084  	ADDSS (DX)(R8*4), X1
 11085  	MOVSS X1, (DX)(R8*4)
 11086  	ADDQ  CX, DI
 11087  	ADDQ  BX, R8
 11088  	MOVSS (AX)(DI*4), X1
 11089  	MULSS X0, X1
 11090  	ADDSS (DX)(R8*4), X1
 11091  	MOVSS X1, (DX)(R8*4)
 11092  	ADDQ  CX, DI
 11093  	ADDQ  BX, R8
 11094  	MOVSS (AX)(DI*4), X1
 11095  	MULSS X0, X1
 11096  	ADDSS (DX)(R8*4), X1
 11097  	MOVSS X1, (DX)(R8*4)
 11098  	ADDQ  CX, DI
 11099  	ADDQ  BX, R8
 11100  	SUBQ  $0x08, SI
 11101  
 11102  check_limit_unroll:
 11103  	CMPQ SI, $0x08
 11104  	JHI  loop_unroll
 11105  	JMP  check_limit
 11106  
 11107  loop:
 11108  	MOVSS (AX)(DI*4), X1
 11109  	MULSS X0, X1
 11110  	ADDSS (DX)(R8*4), X1
 11111  	MOVSS X1, (DX)(R8*4)
 11112  	DECQ  SI
 11113  	ADDQ  CX, DI
 11114  	ADDQ  BX, R8
 11115  
 11116  check_limit:
 11117  	CMPQ SI, $0x00
 11118  	JHI  loop
 11119  	RET
 11120  
 11121  // func AmdAxpyUnsafeX_V2A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11122  // Requires: SSE
 11123  TEXT ·AmdAxpyUnsafeX_V2A0R8(SB), NOSPLIT, $0-48
 11124  	MOVSS alpha+0(FP), X0
 11125  	MOVQ  xs+8(FP), AX
 11126  	MOVQ  incx+16(FP), CX
 11127  	MOVQ  ys+24(FP), DX
 11128  	MOVQ  incy+32(FP), BX
 11129  	MOVQ  n+40(FP), SI
 11130  	XORQ  DI, DI
 11131  	XORQ  R8, R8
 11132  	JMP   check_limit_unroll
 11133  
 11134  loop_unroll:
 11135  	MOVSS (AX)(DI*4), X1
 11136  	MULSS X0, X1
 11137  	ADDSS (DX)(R8*4), X1
 11138  	MOVSS X1, (DX)(R8*4)
 11139  	ADDQ  CX, DI
 11140  	ADDQ  BX, R8
 11141  	MOVSS (AX)(DI*4), X1
 11142  	MULSS X0, X1
 11143  	ADDSS (DX)(R8*4), X1
 11144  	MOVSS X1, (DX)(R8*4)
 11145  	ADDQ  CX, DI
 11146  	ADDQ  BX, R8
 11147  	MOVSS (AX)(DI*4), X1
 11148  	MULSS X0, X1
 11149  	ADDSS (DX)(R8*4), X1
 11150  	MOVSS X1, (DX)(R8*4)
 11151  	ADDQ  CX, DI
 11152  	ADDQ  BX, R8
 11153  	MOVSS (AX)(DI*4), X1
 11154  	MULSS X0, X1
 11155  	ADDSS (DX)(R8*4), X1
 11156  	MOVSS X1, (DX)(R8*4)
 11157  	ADDQ  CX, DI
 11158  	ADDQ  BX, R8
 11159  	MOVSS (AX)(DI*4), X1
 11160  	MULSS X0, X1
 11161  	ADDSS (DX)(R8*4), X1
 11162  	MOVSS X1, (DX)(R8*4)
 11163  	ADDQ  CX, DI
 11164  	ADDQ  BX, R8
 11165  	MOVSS (AX)(DI*4), X1
 11166  	MULSS X0, X1
 11167  	ADDSS (DX)(R8*4), X1
 11168  	MOVSS X1, (DX)(R8*4)
 11169  	ADDQ  CX, DI
 11170  	ADDQ  BX, R8
 11171  	MOVSS (AX)(DI*4), X1
 11172  	MULSS X0, X1
 11173  	ADDSS (DX)(R8*4), X1
 11174  	MOVSS X1, (DX)(R8*4)
 11175  	ADDQ  CX, DI
 11176  	ADDQ  BX, R8
 11177  	MOVSS (AX)(DI*4), X1
 11178  	MULSS X0, X1
 11179  	ADDSS (DX)(R8*4), X1
 11180  	MOVSS X1, (DX)(R8*4)
 11181  	ADDQ  CX, DI
 11182  	ADDQ  BX, R8
 11183  	SUBQ  $0x08, SI
 11184  
 11185  check_limit_unroll:
 11186  	CMPQ SI, $0x08
 11187  	JHI  loop_unroll
 11188  	JMP  check_limit
 11189  
 11190  loop:
 11191  	MOVSS (AX)(DI*4), X1
 11192  	MULSS X0, X1
 11193  	ADDSS (DX)(R8*4), X1
 11194  	MOVSS X1, (DX)(R8*4)
 11195  	DECQ  SI
 11196  	ADDQ  CX, DI
 11197  	ADDQ  BX, R8
 11198  
 11199  check_limit:
 11200  	CMPQ SI, $0x00
 11201  	JHI  loop
 11202  	RET
 11203  
 11204  // func AmdAxpyUnsafeX_V3A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11205  // Requires: SSE
 11206  TEXT ·AmdAxpyUnsafeX_V3A0R8(SB), NOSPLIT, $0-48
 11207  	MOVSS alpha+0(FP), X0
 11208  	MOVQ  xs+8(FP), AX
 11209  	MOVQ  incx+16(FP), CX
 11210  	MOVQ  ys+24(FP), DX
 11211  	MOVQ  incy+32(FP), BX
 11212  	MOVQ  n+40(FP), SI
 11213  	XORQ  DI, DI
 11214  	XORQ  R8, R8
 11215  	JMP   check_limit_unroll
 11216  
 11217  loop_unroll:
 11218  	MOVSS (AX)(DI*4), X1
 11219  	MULSS X0, X1
 11220  	ADDSS (DX)(R8*4), X1
 11221  	MOVSS X1, (DX)(R8*4)
 11222  	ADDQ  CX, DI
 11223  	ADDQ  BX, R8
 11224  	MOVSS (AX)(DI*4), X1
 11225  	MULSS X0, X1
 11226  	ADDSS (DX)(R8*4), X1
 11227  	MOVSS X1, (DX)(R8*4)
 11228  	ADDQ  CX, DI
 11229  	ADDQ  BX, R8
 11230  	MOVSS (AX)(DI*4), X1
 11231  	MULSS X0, X1
 11232  	ADDSS (DX)(R8*4), X1
 11233  	MOVSS X1, (DX)(R8*4)
 11234  	ADDQ  CX, DI
 11235  	ADDQ  BX, R8
 11236  	MOVSS (AX)(DI*4), X1
 11237  	MULSS X0, X1
 11238  	ADDSS (DX)(R8*4), X1
 11239  	MOVSS X1, (DX)(R8*4)
 11240  	ADDQ  CX, DI
 11241  	ADDQ  BX, R8
 11242  	MOVSS (AX)(DI*4), X1
 11243  	MULSS X0, X1
 11244  	ADDSS (DX)(R8*4), X1
 11245  	MOVSS X1, (DX)(R8*4)
 11246  	ADDQ  CX, DI
 11247  	ADDQ  BX, R8
 11248  	MOVSS (AX)(DI*4), X1
 11249  	MULSS X0, X1
 11250  	ADDSS (DX)(R8*4), X1
 11251  	MOVSS X1, (DX)(R8*4)
 11252  	ADDQ  CX, DI
 11253  	ADDQ  BX, R8
 11254  	MOVSS (AX)(DI*4), X1
 11255  	MULSS X0, X1
 11256  	ADDSS (DX)(R8*4), X1
 11257  	MOVSS X1, (DX)(R8*4)
 11258  	ADDQ  CX, DI
 11259  	ADDQ  BX, R8
 11260  	MOVSS (AX)(DI*4), X1
 11261  	MULSS X0, X1
 11262  	ADDSS (DX)(R8*4), X1
 11263  	MOVSS X1, (DX)(R8*4)
 11264  	ADDQ  CX, DI
 11265  	ADDQ  BX, R8
 11266  	SUBQ  $0x08, SI
 11267  
 11268  check_limit_unroll:
 11269  	CMPQ SI, $0x08
 11270  	JHI  loop_unroll
 11271  	JMP  check_limit
 11272  
 11273  loop:
 11274  	MOVSS (AX)(DI*4), X1
 11275  	MULSS X0, X1
 11276  	ADDSS (DX)(R8*4), X1
 11277  	MOVSS X1, (DX)(R8*4)
 11278  	DECQ  SI
 11279  	ADDQ  CX, DI
 11280  	ADDQ  BX, R8
 11281  
 11282  check_limit:
 11283  	CMPQ SI, $0x00
 11284  	JHI  loop
 11285  	RET
 11286  
 11287  // func AmdAxpyUnsafeX_V4A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11288  // Requires: SSE
 11289  TEXT ·AmdAxpyUnsafeX_V4A0R8(SB), NOSPLIT, $0-48
 11290  	MOVSS alpha+0(FP), X0
 11291  	MOVQ  xs+8(FP), AX
 11292  	MOVQ  incx+16(FP), CX
 11293  	MOVQ  ys+24(FP), DX
 11294  	MOVQ  incy+32(FP), BX
 11295  	MOVQ  n+40(FP), SI
 11296  	XORQ  DI, DI
 11297  	XORQ  R8, R8
 11298  	JMP   check_limit_unroll
 11299  
 11300  loop_unroll:
 11301  	MOVSS (AX)(DI*4), X1
 11302  	MULSS X0, X1
 11303  	ADDSS (DX)(R8*4), X1
 11304  	MOVSS X1, (DX)(R8*4)
 11305  	ADDQ  CX, DI
 11306  	ADDQ  BX, R8
 11307  	MOVSS (AX)(DI*4), X1
 11308  	MULSS X0, X1
 11309  	ADDSS (DX)(R8*4), X1
 11310  	MOVSS X1, (DX)(R8*4)
 11311  	ADDQ  CX, DI
 11312  	ADDQ  BX, R8
 11313  	MOVSS (AX)(DI*4), X1
 11314  	MULSS X0, X1
 11315  	ADDSS (DX)(R8*4), X1
 11316  	MOVSS X1, (DX)(R8*4)
 11317  	ADDQ  CX, DI
 11318  	ADDQ  BX, R8
 11319  	MOVSS (AX)(DI*4), X1
 11320  	MULSS X0, X1
 11321  	ADDSS (DX)(R8*4), X1
 11322  	MOVSS X1, (DX)(R8*4)
 11323  	ADDQ  CX, DI
 11324  	ADDQ  BX, R8
 11325  	MOVSS (AX)(DI*4), X1
 11326  	MULSS X0, X1
 11327  	ADDSS (DX)(R8*4), X1
 11328  	MOVSS X1, (DX)(R8*4)
 11329  	ADDQ  CX, DI
 11330  	ADDQ  BX, R8
 11331  	MOVSS (AX)(DI*4), X1
 11332  	MULSS X0, X1
 11333  	ADDSS (DX)(R8*4), X1
 11334  	MOVSS X1, (DX)(R8*4)
 11335  	ADDQ  CX, DI
 11336  	ADDQ  BX, R8
 11337  	MOVSS (AX)(DI*4), X1
 11338  	MULSS X0, X1
 11339  	ADDSS (DX)(R8*4), X1
 11340  	MOVSS X1, (DX)(R8*4)
 11341  	ADDQ  CX, DI
 11342  	ADDQ  BX, R8
 11343  	MOVSS (AX)(DI*4), X1
 11344  	MULSS X0, X1
 11345  	ADDSS (DX)(R8*4), X1
 11346  	MOVSS X1, (DX)(R8*4)
 11347  	ADDQ  CX, DI
 11348  	ADDQ  BX, R8
 11349  	SUBQ  $0x08, SI
 11350  
 11351  check_limit_unroll:
 11352  	CMPQ SI, $0x08
 11353  	JHI  loop_unroll
 11354  	JMP  check_limit
 11355  
 11356  loop:
 11357  	MOVSS (AX)(DI*4), X1
 11358  	MULSS X0, X1
 11359  	ADDSS (DX)(R8*4), X1
 11360  	MOVSS X1, (DX)(R8*4)
 11361  	DECQ  SI
 11362  	ADDQ  CX, DI
 11363  	ADDQ  BX, R8
 11364  
 11365  check_limit:
 11366  	CMPQ SI, $0x00
 11367  	JHI  loop
 11368  	RET
 11369  
 11370  // func AmdAxpyUnsafeX_V5A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11371  // Requires: SSE
 11372  TEXT ·AmdAxpyUnsafeX_V5A0R8(SB), NOSPLIT, $0-48
 11373  	MOVSS alpha+0(FP), X0
 11374  	MOVQ  xs+8(FP), AX
 11375  	MOVQ  incx+16(FP), CX
 11376  	MOVQ  ys+24(FP), DX
 11377  	MOVQ  incy+32(FP), BX
 11378  	MOVQ  n+40(FP), SI
 11379  	XORQ  DI, DI
 11380  	XORQ  R8, R8
 11381  	JMP   check_limit_unroll
 11382  
 11383  loop_unroll:
 11384  	MOVSS (AX)(DI*4), X1
 11385  	MULSS X0, X1
 11386  	ADDSS (DX)(R8*4), X1
 11387  	MOVSS X1, (DX)(R8*4)
 11388  	ADDQ  CX, DI
 11389  	ADDQ  BX, R8
 11390  	MOVSS (AX)(DI*4), X1
 11391  	MULSS X0, X1
 11392  	ADDSS (DX)(R8*4), X1
 11393  	MOVSS X1, (DX)(R8*4)
 11394  	ADDQ  CX, DI
 11395  	ADDQ  BX, R8
 11396  	MOVSS (AX)(DI*4), X1
 11397  	MULSS X0, X1
 11398  	ADDSS (DX)(R8*4), X1
 11399  	MOVSS X1, (DX)(R8*4)
 11400  	ADDQ  CX, DI
 11401  	ADDQ  BX, R8
 11402  	MOVSS (AX)(DI*4), X1
 11403  	MULSS X0, X1
 11404  	ADDSS (DX)(R8*4), X1
 11405  	MOVSS X1, (DX)(R8*4)
 11406  	ADDQ  CX, DI
 11407  	ADDQ  BX, R8
 11408  	MOVSS (AX)(DI*4), X1
 11409  	MULSS X0, X1
 11410  	ADDSS (DX)(R8*4), X1
 11411  	MOVSS X1, (DX)(R8*4)
 11412  	ADDQ  CX, DI
 11413  	ADDQ  BX, R8
 11414  	MOVSS (AX)(DI*4), X1
 11415  	MULSS X0, X1
 11416  	ADDSS (DX)(R8*4), X1
 11417  	MOVSS X1, (DX)(R8*4)
 11418  	ADDQ  CX, DI
 11419  	ADDQ  BX, R8
 11420  	MOVSS (AX)(DI*4), X1
 11421  	MULSS X0, X1
 11422  	ADDSS (DX)(R8*4), X1
 11423  	MOVSS X1, (DX)(R8*4)
 11424  	ADDQ  CX, DI
 11425  	ADDQ  BX, R8
 11426  	MOVSS (AX)(DI*4), X1
 11427  	MULSS X0, X1
 11428  	ADDSS (DX)(R8*4), X1
 11429  	MOVSS X1, (DX)(R8*4)
 11430  	ADDQ  CX, DI
 11431  	ADDQ  BX, R8
 11432  	SUBQ  $0x08, SI
 11433  
 11434  check_limit_unroll:
 11435  	CMPQ SI, $0x08
 11436  	JHI  loop_unroll
 11437  	JMP  check_limit
 11438  
 11439  loop:
 11440  	MOVSS (AX)(DI*4), X1
 11441  	MULSS X0, X1
 11442  	ADDSS (DX)(R8*4), X1
 11443  	MOVSS X1, (DX)(R8*4)
 11444  	DECQ  SI
 11445  	ADDQ  CX, DI
 11446  	ADDQ  BX, R8
 11447  
 11448  check_limit:
 11449  	CMPQ SI, $0x00
 11450  	JHI  loop
 11451  	RET
 11452  
 11453  // func AmdAxpyUnsafeX_V0A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11454  // Requires: SSE
 11455  TEXT ·AmdAxpyUnsafeX_V0A8R8(SB), NOSPLIT, $0-48
 11456  	MOVSS alpha+0(FP), X0
 11457  	MOVQ  xs+8(FP), AX
 11458  	MOVQ  incx+16(FP), CX
 11459  	MOVQ  ys+24(FP), DX
 11460  	MOVQ  incy+32(FP), BX
 11461  	MOVQ  n+40(FP), SI
 11462  	XORQ  DI, DI
 11463  	XORQ  R8, R8
 11464  	JMP   check_limit_unroll
 11465  	PCALIGN $0x08
 11466  
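// The A8 variants align the start of the unrolled loop to an 8-byte
// boundary via the PCALIGN directive above.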
 11467  loop_unroll:
 11468  	MOVSS (AX)(DI*4), X1
 11469  	MULSS X0, X1
 11470  	ADDSS (DX)(R8*4), X1
 11471  	MOVSS X1, (DX)(R8*4)
 11472  	ADDQ  CX, DI
 11473  	ADDQ  BX, R8
 11474  	MOVSS (AX)(DI*4), X1
 11475  	MULSS X0, X1
 11476  	ADDSS (DX)(R8*4), X1
 11477  	MOVSS X1, (DX)(R8*4)
 11478  	ADDQ  CX, DI
 11479  	ADDQ  BX, R8
 11480  	MOVSS (AX)(DI*4), X1
 11481  	MULSS X0, X1
 11482  	ADDSS (DX)(R8*4), X1
 11483  	MOVSS X1, (DX)(R8*4)
 11484  	ADDQ  CX, DI
 11485  	ADDQ  BX, R8
 11486  	MOVSS (AX)(DI*4), X1
 11487  	MULSS X0, X1
 11488  	ADDSS (DX)(R8*4), X1
 11489  	MOVSS X1, (DX)(R8*4)
 11490  	ADDQ  CX, DI
 11491  	ADDQ  BX, R8
 11492  	MOVSS (AX)(DI*4), X1
 11493  	MULSS X0, X1
 11494  	ADDSS (DX)(R8*4), X1
 11495  	MOVSS X1, (DX)(R8*4)
 11496  	ADDQ  CX, DI
 11497  	ADDQ  BX, R8
 11498  	MOVSS (AX)(DI*4), X1
 11499  	MULSS X0, X1
 11500  	ADDSS (DX)(R8*4), X1
 11501  	MOVSS X1, (DX)(R8*4)
 11502  	ADDQ  CX, DI
 11503  	ADDQ  BX, R8
 11504  	MOVSS (AX)(DI*4), X1
 11505  	MULSS X0, X1
 11506  	ADDSS (DX)(R8*4), X1
 11507  	MOVSS X1, (DX)(R8*4)
 11508  	ADDQ  CX, DI
 11509  	ADDQ  BX, R8
 11510  	MOVSS (AX)(DI*4), X1
 11511  	MULSS X0, X1
 11512  	ADDSS (DX)(R8*4), X1
 11513  	MOVSS X1, (DX)(R8*4)
 11514  	ADDQ  CX, DI
 11515  	ADDQ  BX, R8
 11516  	SUBQ  $0x08, SI
 11517  
 11518  check_limit_unroll:
 11519  	CMPQ SI, $0x08
 11520  	JHI  loop_unroll
 11521  	JMP  check_limit
 11522  
 11523  loop:
 11524  	MOVSS (AX)(DI*4), X1
 11525  	MULSS X0, X1
 11526  	ADDSS (DX)(R8*4), X1
 11527  	MOVSS X1, (DX)(R8*4)
 11528  	DECQ  SI
 11529  	ADDQ  CX, DI
 11530  	ADDQ  BX, R8
 11531  
 11532  check_limit:
 11533  	CMPQ SI, $0x00
 11534  	JHI  loop
 11535  	RET
 11536  
 11537  // func AmdAxpyUnsafeX_V1A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11538  // Requires: SSE
 11539  TEXT ·AmdAxpyUnsafeX_V1A8R8(SB), NOSPLIT, $0-48
 11540  	MOVSS alpha+0(FP), X0
 11541  	MOVQ  xs+8(FP), AX
 11542  	MOVQ  incx+16(FP), CX
 11543  	MOVQ  ys+24(FP), DX
 11544  	MOVQ  incy+32(FP), BX
 11545  	MOVQ  n+40(FP), SI
 11546  	XORQ  DI, DI
 11547  	XORQ  R8, R8
 11548  	JMP   check_limit_unroll
 11549  	PCALIGN $0x08
 11550  
 11551  loop_unroll:
 11552  	MOVSS (AX)(DI*4), X1
 11553  	MULSS X0, X1
 11554  	ADDSS (DX)(R8*4), X1
 11555  	MOVSS X1, (DX)(R8*4)
 11556  	ADDQ  CX, DI
 11557  	ADDQ  BX, R8
 11558  	MOVSS (AX)(DI*4), X1
 11559  	MULSS X0, X1
 11560  	ADDSS (DX)(R8*4), X1
 11561  	MOVSS X1, (DX)(R8*4)
 11562  	ADDQ  CX, DI
 11563  	ADDQ  BX, R8
 11564  	MOVSS (AX)(DI*4), X1
 11565  	MULSS X0, X1
 11566  	ADDSS (DX)(R8*4), X1
 11567  	MOVSS X1, (DX)(R8*4)
 11568  	ADDQ  CX, DI
 11569  	ADDQ  BX, R8
 11570  	MOVSS (AX)(DI*4), X1
 11571  	MULSS X0, X1
 11572  	ADDSS (DX)(R8*4), X1
 11573  	MOVSS X1, (DX)(R8*4)
 11574  	ADDQ  CX, DI
 11575  	ADDQ  BX, R8
 11576  	MOVSS (AX)(DI*4), X1
 11577  	MULSS X0, X1
 11578  	ADDSS (DX)(R8*4), X1
 11579  	MOVSS X1, (DX)(R8*4)
 11580  	ADDQ  CX, DI
 11581  	ADDQ  BX, R8
 11582  	MOVSS (AX)(DI*4), X1
 11583  	MULSS X0, X1
 11584  	ADDSS (DX)(R8*4), X1
 11585  	MOVSS X1, (DX)(R8*4)
 11586  	ADDQ  CX, DI
 11587  	ADDQ  BX, R8
 11588  	MOVSS (AX)(DI*4), X1
 11589  	MULSS X0, X1
 11590  	ADDSS (DX)(R8*4), X1
 11591  	MOVSS X1, (DX)(R8*4)
 11592  	ADDQ  CX, DI
 11593  	ADDQ  BX, R8
 11594  	MOVSS (AX)(DI*4), X1
 11595  	MULSS X0, X1
 11596  	ADDSS (DX)(R8*4), X1
 11597  	MOVSS X1, (DX)(R8*4)
 11598  	ADDQ  CX, DI
 11599  	ADDQ  BX, R8
 11600  	SUBQ  $0x08, SI
 11601  
 11602  check_limit_unroll:
 11603  	CMPQ SI, $0x08
 11604  	JHI  loop_unroll
 11605  	JMP  check_limit
 11606  
 11607  loop:
 11608  	MOVSS (AX)(DI*4), X1
 11609  	MULSS X0, X1
 11610  	ADDSS (DX)(R8*4), X1
 11611  	MOVSS X1, (DX)(R8*4)
 11612  	DECQ  SI
 11613  	ADDQ  CX, DI
 11614  	ADDQ  BX, R8
 11615  
 11616  check_limit:
 11617  	CMPQ SI, $0x00
 11618  	JHI  loop
 11619  	RET
 11620  
 11621  // func AmdAxpyUnsafeX_V2A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11622  // Requires: SSE
 11623  TEXT ·AmdAxpyUnsafeX_V2A8R8(SB), NOSPLIT, $0-48
 11624  	MOVSS alpha+0(FP), X0
 11625  	MOVQ  xs+8(FP), AX
 11626  	MOVQ  incx+16(FP), CX
 11627  	MOVQ  ys+24(FP), DX
 11628  	MOVQ  incy+32(FP), BX
 11629  	MOVQ  n+40(FP), SI
 11630  	XORQ  DI, DI
 11631  	XORQ  R8, R8
 11632  	JMP   check_limit_unroll
 11633  	PCALIGN $0x08
 11634  
 11635  loop_unroll:
 11636  	MOVSS (AX)(DI*4), X1
 11637  	MULSS X0, X1
 11638  	ADDSS (DX)(R8*4), X1
 11639  	MOVSS X1, (DX)(R8*4)
 11640  	ADDQ  CX, DI
 11641  	ADDQ  BX, R8
 11642  	MOVSS (AX)(DI*4), X1
 11643  	MULSS X0, X1
 11644  	ADDSS (DX)(R8*4), X1
 11645  	MOVSS X1, (DX)(R8*4)
 11646  	ADDQ  CX, DI
 11647  	ADDQ  BX, R8
 11648  	MOVSS (AX)(DI*4), X1
 11649  	MULSS X0, X1
 11650  	ADDSS (DX)(R8*4), X1
 11651  	MOVSS X1, (DX)(R8*4)
 11652  	ADDQ  CX, DI
 11653  	ADDQ  BX, R8
 11654  	MOVSS (AX)(DI*4), X1
 11655  	MULSS X0, X1
 11656  	ADDSS (DX)(R8*4), X1
 11657  	MOVSS X1, (DX)(R8*4)
 11658  	ADDQ  CX, DI
 11659  	ADDQ  BX, R8
 11660  	MOVSS (AX)(DI*4), X1
 11661  	MULSS X0, X1
 11662  	ADDSS (DX)(R8*4), X1
 11663  	MOVSS X1, (DX)(R8*4)
 11664  	ADDQ  CX, DI
 11665  	ADDQ  BX, R8
 11666  	MOVSS (AX)(DI*4), X1
 11667  	MULSS X0, X1
 11668  	ADDSS (DX)(R8*4), X1
 11669  	MOVSS X1, (DX)(R8*4)
 11670  	ADDQ  CX, DI
 11671  	ADDQ  BX, R8
 11672  	MOVSS (AX)(DI*4), X1
 11673  	MULSS X0, X1
 11674  	ADDSS (DX)(R8*4), X1
 11675  	MOVSS X1, (DX)(R8*4)
 11676  	ADDQ  CX, DI
 11677  	ADDQ  BX, R8
 11678  	MOVSS (AX)(DI*4), X1
 11679  	MULSS X0, X1
 11680  	ADDSS (DX)(R8*4), X1
 11681  	MOVSS X1, (DX)(R8*4)
 11682  	ADDQ  CX, DI
 11683  	ADDQ  BX, R8
 11684  	SUBQ  $0x08, SI
 11685  
 11686  check_limit_unroll:
 11687  	CMPQ SI, $0x08
 11688  	JHI  loop_unroll
 11689  	JMP  check_limit
 11690  
 11691  loop:
 11692  	MOVSS (AX)(DI*4), X1
 11693  	MULSS X0, X1
 11694  	ADDSS (DX)(R8*4), X1
 11695  	MOVSS X1, (DX)(R8*4)
 11696  	DECQ  SI
 11697  	ADDQ  CX, DI
 11698  	ADDQ  BX, R8
 11699  
 11700  check_limit:
 11701  	CMPQ SI, $0x00
 11702  	JHI  loop
 11703  	RET
 11704  
 11705  // func AmdAxpyUnsafeX_V3A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11706  // Requires: SSE
 11707  TEXT ·AmdAxpyUnsafeX_V3A8R8(SB), NOSPLIT, $0-48
 11708  	MOVSS alpha+0(FP), X0
 11709  	MOVQ  xs+8(FP), AX
 11710  	MOVQ  incx+16(FP), CX
 11711  	MOVQ  ys+24(FP), DX
 11712  	MOVQ  incy+32(FP), BX
 11713  	MOVQ  n+40(FP), SI
 11714  	XORQ  DI, DI
 11715  	XORQ  R8, R8
 11716  	JMP   check_limit_unroll
 11717  	PCALIGN $0x08
 11718  
 11719  loop_unroll:
 11720  	MOVSS (AX)(DI*4), X1
 11721  	MULSS X0, X1
 11722  	ADDSS (DX)(R8*4), X1
 11723  	MOVSS X1, (DX)(R8*4)
 11724  	ADDQ  CX, DI
 11725  	ADDQ  BX, R8
 11726  	MOVSS (AX)(DI*4), X1
 11727  	MULSS X0, X1
 11728  	ADDSS (DX)(R8*4), X1
 11729  	MOVSS X1, (DX)(R8*4)
 11730  	ADDQ  CX, DI
 11731  	ADDQ  BX, R8
 11732  	MOVSS (AX)(DI*4), X1
 11733  	MULSS X0, X1
 11734  	ADDSS (DX)(R8*4), X1
 11735  	MOVSS X1, (DX)(R8*4)
 11736  	ADDQ  CX, DI
 11737  	ADDQ  BX, R8
 11738  	MOVSS (AX)(DI*4), X1
 11739  	MULSS X0, X1
 11740  	ADDSS (DX)(R8*4), X1
 11741  	MOVSS X1, (DX)(R8*4)
 11742  	ADDQ  CX, DI
 11743  	ADDQ  BX, R8
 11744  	MOVSS (AX)(DI*4), X1
 11745  	MULSS X0, X1
 11746  	ADDSS (DX)(R8*4), X1
 11747  	MOVSS X1, (DX)(R8*4)
 11748  	ADDQ  CX, DI
 11749  	ADDQ  BX, R8
 11750  	MOVSS (AX)(DI*4), X1
 11751  	MULSS X0, X1
 11752  	ADDSS (DX)(R8*4), X1
 11753  	MOVSS X1, (DX)(R8*4)
 11754  	ADDQ  CX, DI
 11755  	ADDQ  BX, R8
 11756  	MOVSS (AX)(DI*4), X1
 11757  	MULSS X0, X1
 11758  	ADDSS (DX)(R8*4), X1
 11759  	MOVSS X1, (DX)(R8*4)
 11760  	ADDQ  CX, DI
 11761  	ADDQ  BX, R8
 11762  	MOVSS (AX)(DI*4), X1
 11763  	MULSS X0, X1
 11764  	ADDSS (DX)(R8*4), X1
 11765  	MOVSS X1, (DX)(R8*4)
 11766  	ADDQ  CX, DI
 11767  	ADDQ  BX, R8
 11768  	SUBQ  $0x08, SI
 11769  
 11770  check_limit_unroll:
 11771  	CMPQ SI, $0x08
 11772  	JHI  loop_unroll
 11773  	JMP  check_limit
 11774  
 11775  loop:
 11776  	MOVSS (AX)(DI*4), X1
 11777  	MULSS X0, X1
 11778  	ADDSS (DX)(R8*4), X1
 11779  	MOVSS X1, (DX)(R8*4)
 11780  	DECQ  SI
 11781  	ADDQ  CX, DI
 11782  	ADDQ  BX, R8
 11783  
 11784  check_limit:
 11785  	CMPQ SI, $0x00
 11786  	JHI  loop
 11787  	RET
 11788  
 11789  // func AmdAxpyUnsafeX_V4A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11790  // Requires: SSE
 11791  TEXT ·AmdAxpyUnsafeX_V4A8R8(SB), NOSPLIT, $0-48
 11792  	MOVSS alpha+0(FP), X0
 11793  	MOVQ  xs+8(FP), AX
 11794  	MOVQ  incx+16(FP), CX
 11795  	MOVQ  ys+24(FP), DX
 11796  	MOVQ  incy+32(FP), BX
 11797  	MOVQ  n+40(FP), SI
 11798  	XORQ  DI, DI
 11799  	XORQ  R8, R8
 11800  	JMP   check_limit_unroll
 11801  	PCALIGN $0x08
 11802  
 11803  loop_unroll:
 11804  	MOVSS (AX)(DI*4), X1
 11805  	MULSS X0, X1
 11806  	ADDSS (DX)(R8*4), X1
 11807  	MOVSS X1, (DX)(R8*4)
 11808  	ADDQ  CX, DI
 11809  	ADDQ  BX, R8
 11810  	MOVSS (AX)(DI*4), X1
 11811  	MULSS X0, X1
 11812  	ADDSS (DX)(R8*4), X1
 11813  	MOVSS X1, (DX)(R8*4)
 11814  	ADDQ  CX, DI
 11815  	ADDQ  BX, R8
 11816  	MOVSS (AX)(DI*4), X1
 11817  	MULSS X0, X1
 11818  	ADDSS (DX)(R8*4), X1
 11819  	MOVSS X1, (DX)(R8*4)
 11820  	ADDQ  CX, DI
 11821  	ADDQ  BX, R8
 11822  	MOVSS (AX)(DI*4), X1
 11823  	MULSS X0, X1
 11824  	ADDSS (DX)(R8*4), X1
 11825  	MOVSS X1, (DX)(R8*4)
 11826  	ADDQ  CX, DI
 11827  	ADDQ  BX, R8
 11828  	MOVSS (AX)(DI*4), X1
 11829  	MULSS X0, X1
 11830  	ADDSS (DX)(R8*4), X1
 11831  	MOVSS X1, (DX)(R8*4)
 11832  	ADDQ  CX, DI
 11833  	ADDQ  BX, R8
 11834  	MOVSS (AX)(DI*4), X1
 11835  	MULSS X0, X1
 11836  	ADDSS (DX)(R8*4), X1
 11837  	MOVSS X1, (DX)(R8*4)
 11838  	ADDQ  CX, DI
 11839  	ADDQ  BX, R8
 11840  	MOVSS (AX)(DI*4), X1
 11841  	MULSS X0, X1
 11842  	ADDSS (DX)(R8*4), X1
 11843  	MOVSS X1, (DX)(R8*4)
 11844  	ADDQ  CX, DI
 11845  	ADDQ  BX, R8
 11846  	MOVSS (AX)(DI*4), X1
 11847  	MULSS X0, X1
 11848  	ADDSS (DX)(R8*4), X1
 11849  	MOVSS X1, (DX)(R8*4)
 11850  	ADDQ  CX, DI
 11851  	ADDQ  BX, R8
 11852  	SUBQ  $0x08, SI
 11853  
 11854  check_limit_unroll:
 11855  	CMPQ SI, $0x08
 11856  	JHI  loop_unroll
 11857  	JMP  check_limit
 11858  
 11859  loop:
 11860  	MOVSS (AX)(DI*4), X1
 11861  	MULSS X0, X1
 11862  	ADDSS (DX)(R8*4), X1
 11863  	MOVSS X1, (DX)(R8*4)
 11864  	DECQ  SI
 11865  	ADDQ  CX, DI
 11866  	ADDQ  BX, R8
 11867  
 11868  check_limit:
 11869  	CMPQ SI, $0x00
 11870  	JHI  loop
 11871  	RET
 11872  
 11873  // func AmdAxpyUnsafeX_V5A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11874  // Requires: SSE
 11875  TEXT ·AmdAxpyUnsafeX_V5A8R8(SB), NOSPLIT, $0-48
 11876  	MOVSS alpha+0(FP), X0
 11877  	MOVQ  xs+8(FP), AX
 11878  	MOVQ  incx+16(FP), CX
 11879  	MOVQ  ys+24(FP), DX
 11880  	MOVQ  incy+32(FP), BX
 11881  	MOVQ  n+40(FP), SI
 11882  	XORQ  DI, DI
 11883  	XORQ  R8, R8
 11884  	JMP   check_limit_unroll
 11885  	PCALIGN $0x08
 11886  
 11887  loop_unroll:
 11888  	MOVSS (AX)(DI*4), X1
 11889  	MULSS X0, X1
 11890  	ADDSS (DX)(R8*4), X1
 11891  	MOVSS X1, (DX)(R8*4)
 11892  	ADDQ  CX, DI
 11893  	ADDQ  BX, R8
 11894  	MOVSS (AX)(DI*4), X1
 11895  	MULSS X0, X1
 11896  	ADDSS (DX)(R8*4), X1
 11897  	MOVSS X1, (DX)(R8*4)
 11898  	ADDQ  CX, DI
 11899  	ADDQ  BX, R8
 11900  	MOVSS (AX)(DI*4), X1
 11901  	MULSS X0, X1
 11902  	ADDSS (DX)(R8*4), X1
 11903  	MOVSS X1, (DX)(R8*4)
 11904  	ADDQ  CX, DI
 11905  	ADDQ  BX, R8
 11906  	MOVSS (AX)(DI*4), X1
 11907  	MULSS X0, X1
 11908  	ADDSS (DX)(R8*4), X1
 11909  	MOVSS X1, (DX)(R8*4)
 11910  	ADDQ  CX, DI
 11911  	ADDQ  BX, R8
 11912  	MOVSS (AX)(DI*4), X1
 11913  	MULSS X0, X1
 11914  	ADDSS (DX)(R8*4), X1
 11915  	MOVSS X1, (DX)(R8*4)
 11916  	ADDQ  CX, DI
 11917  	ADDQ  BX, R8
 11918  	MOVSS (AX)(DI*4), X1
 11919  	MULSS X0, X1
 11920  	ADDSS (DX)(R8*4), X1
 11921  	MOVSS X1, (DX)(R8*4)
 11922  	ADDQ  CX, DI
 11923  	ADDQ  BX, R8
 11924  	MOVSS (AX)(DI*4), X1
 11925  	MULSS X0, X1
 11926  	ADDSS (DX)(R8*4), X1
 11927  	MOVSS X1, (DX)(R8*4)
 11928  	ADDQ  CX, DI
 11929  	ADDQ  BX, R8
 11930  	MOVSS (AX)(DI*4), X1
 11931  	MULSS X0, X1
 11932  	ADDSS (DX)(R8*4), X1
 11933  	MOVSS X1, (DX)(R8*4)
 11934  	ADDQ  CX, DI
 11935  	ADDQ  BX, R8
 11936  	SUBQ  $0x08, SI
 11937  
 11938  check_limit_unroll:
 11939  	CMPQ SI, $0x08
 11940  	JHI  loop_unroll
 11941  	JMP  check_limit
 11942  
 11943  loop:
 11944  	MOVSS (AX)(DI*4), X1
 11945  	MULSS X0, X1
 11946  	ADDSS (DX)(R8*4), X1
 11947  	MOVSS X1, (DX)(R8*4)
 11948  	DECQ  SI
 11949  	ADDQ  CX, DI
 11950  	ADDQ  BX, R8
 11951  
 11952  check_limit:
 11953  	CMPQ SI, $0x00
 11954  	JHI  loop
 11955  	RET
 11956  
 11957  // func AmdAxpyUnsafeX_V0A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 11958  // Requires: SSE
 11959  TEXT ·AmdAxpyUnsafeX_V0A9R8(SB), NOSPLIT, $0-48
 11960  	MOVSS alpha+0(FP), X0
 11961  	MOVQ  xs+8(FP), AX
 11962  	MOVQ  incx+16(FP), CX
 11963  	MOVQ  ys+24(FP), DX
 11964  	MOVQ  incy+32(FP), BX
 11965  	MOVQ  n+40(FP), SI
 11966  	XORQ  DI, DI
 11967  	XORQ  R8, R8
 11968  	JMP   check_limit_unroll
 11969  	PCALIGN $0x08
 11970  	NOP
 11971  
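// The A9, A10, and A11 variants follow PCALIGN $0x08 with one, two, or
// three extra NOPs; judging by the naming this is meant to nudge the loop
// entry relative to the 8-byte boundary, but that intent is inferred from
// the generated code rather than stated in it.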
 11972  loop_unroll:
 11973  	MOVSS (AX)(DI*4), X1
 11974  	MULSS X0, X1
 11975  	ADDSS (DX)(R8*4), X1
 11976  	MOVSS X1, (DX)(R8*4)
 11977  	ADDQ  CX, DI
 11978  	ADDQ  BX, R8
 11979  	MOVSS (AX)(DI*4), X1
 11980  	MULSS X0, X1
 11981  	ADDSS (DX)(R8*4), X1
 11982  	MOVSS X1, (DX)(R8*4)
 11983  	ADDQ  CX, DI
 11984  	ADDQ  BX, R8
 11985  	MOVSS (AX)(DI*4), X1
 11986  	MULSS X0, X1
 11987  	ADDSS (DX)(R8*4), X1
 11988  	MOVSS X1, (DX)(R8*4)
 11989  	ADDQ  CX, DI
 11990  	ADDQ  BX, R8
 11991  	MOVSS (AX)(DI*4), X1
 11992  	MULSS X0, X1
 11993  	ADDSS (DX)(R8*4), X1
 11994  	MOVSS X1, (DX)(R8*4)
 11995  	ADDQ  CX, DI
 11996  	ADDQ  BX, R8
 11997  	MOVSS (AX)(DI*4), X1
 11998  	MULSS X0, X1
 11999  	ADDSS (DX)(R8*4), X1
 12000  	MOVSS X1, (DX)(R8*4)
 12001  	ADDQ  CX, DI
 12002  	ADDQ  BX, R8
 12003  	MOVSS (AX)(DI*4), X1
 12004  	MULSS X0, X1
 12005  	ADDSS (DX)(R8*4), X1
 12006  	MOVSS X1, (DX)(R8*4)
 12007  	ADDQ  CX, DI
 12008  	ADDQ  BX, R8
 12009  	MOVSS (AX)(DI*4), X1
 12010  	MULSS X0, X1
 12011  	ADDSS (DX)(R8*4), X1
 12012  	MOVSS X1, (DX)(R8*4)
 12013  	ADDQ  CX, DI
 12014  	ADDQ  BX, R8
 12015  	MOVSS (AX)(DI*4), X1
 12016  	MULSS X0, X1
 12017  	ADDSS (DX)(R8*4), X1
 12018  	MOVSS X1, (DX)(R8*4)
 12019  	ADDQ  CX, DI
 12020  	ADDQ  BX, R8
 12021  	SUBQ  $0x08, SI
 12022  
 12023  check_limit_unroll:
 12024  	CMPQ SI, $0x08
 12025  	JHI  loop_unroll
 12026  	JMP  check_limit
 12027  
 12028  loop:
 12029  	MOVSS (AX)(DI*4), X1
 12030  	MULSS X0, X1
 12031  	ADDSS (DX)(R8*4), X1
 12032  	MOVSS X1, (DX)(R8*4)
 12033  	DECQ  SI
 12034  	ADDQ  CX, DI
 12035  	ADDQ  BX, R8
 12036  
 12037  check_limit:
 12038  	CMPQ SI, $0x00
 12039  	JHI  loop
 12040  	RET
 12041  
 12042  // func AmdAxpyUnsafeX_V1A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12043  // Requires: SSE
 12044  TEXT ·AmdAxpyUnsafeX_V1A9R8(SB), NOSPLIT, $0-48
 12045  	MOVSS alpha+0(FP), X0
 12046  	MOVQ  xs+8(FP), AX
 12047  	MOVQ  incx+16(FP), CX
 12048  	MOVQ  ys+24(FP), DX
 12049  	MOVQ  incy+32(FP), BX
 12050  	MOVQ  n+40(FP), SI
 12051  	XORQ  DI, DI
 12052  	XORQ  R8, R8
 12053  	JMP   check_limit_unroll
 12054  	PCALIGN $0x08
 12055  	NOP
 12056  
 12057  loop_unroll:
 12058  	MOVSS (AX)(DI*4), X1
 12059  	MULSS X0, X1
 12060  	ADDSS (DX)(R8*4), X1
 12061  	MOVSS X1, (DX)(R8*4)
 12062  	ADDQ  CX, DI
 12063  	ADDQ  BX, R8
 12064  	MOVSS (AX)(DI*4), X1
 12065  	MULSS X0, X1
 12066  	ADDSS (DX)(R8*4), X1
 12067  	MOVSS X1, (DX)(R8*4)
 12068  	ADDQ  CX, DI
 12069  	ADDQ  BX, R8
 12070  	MOVSS (AX)(DI*4), X1
 12071  	MULSS X0, X1
 12072  	ADDSS (DX)(R8*4), X1
 12073  	MOVSS X1, (DX)(R8*4)
 12074  	ADDQ  CX, DI
 12075  	ADDQ  BX, R8
 12076  	MOVSS (AX)(DI*4), X1
 12077  	MULSS X0, X1
 12078  	ADDSS (DX)(R8*4), X1
 12079  	MOVSS X1, (DX)(R8*4)
 12080  	ADDQ  CX, DI
 12081  	ADDQ  BX, R8
 12082  	MOVSS (AX)(DI*4), X1
 12083  	MULSS X0, X1
 12084  	ADDSS (DX)(R8*4), X1
 12085  	MOVSS X1, (DX)(R8*4)
 12086  	ADDQ  CX, DI
 12087  	ADDQ  BX, R8
 12088  	MOVSS (AX)(DI*4), X1
 12089  	MULSS X0, X1
 12090  	ADDSS (DX)(R8*4), X1
 12091  	MOVSS X1, (DX)(R8*4)
 12092  	ADDQ  CX, DI
 12093  	ADDQ  BX, R8
 12094  	MOVSS (AX)(DI*4), X1
 12095  	MULSS X0, X1
 12096  	ADDSS (DX)(R8*4), X1
 12097  	MOVSS X1, (DX)(R8*4)
 12098  	ADDQ  CX, DI
 12099  	ADDQ  BX, R8
 12100  	MOVSS (AX)(DI*4), X1
 12101  	MULSS X0, X1
 12102  	ADDSS (DX)(R8*4), X1
 12103  	MOVSS X1, (DX)(R8*4)
 12104  	ADDQ  CX, DI
 12105  	ADDQ  BX, R8
 12106  	SUBQ  $0x08, SI
 12107  
 12108  check_limit_unroll:
 12109  	CMPQ SI, $0x08
 12110  	JHI  loop_unroll
 12111  	JMP  check_limit
 12112  
 12113  loop:
 12114  	MOVSS (AX)(DI*4), X1
 12115  	MULSS X0, X1
 12116  	ADDSS (DX)(R8*4), X1
 12117  	MOVSS X1, (DX)(R8*4)
 12118  	DECQ  SI
 12119  	ADDQ  CX, DI
 12120  	ADDQ  BX, R8
 12121  
 12122  check_limit:
 12123  	CMPQ SI, $0x00
 12124  	JHI  loop
 12125  	RET
 12126  
 12127  // func AmdAxpyUnsafeX_V2A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12128  // Requires: SSE
 12129  TEXT ·AmdAxpyUnsafeX_V2A9R8(SB), NOSPLIT, $0-48
 12130  	MOVSS alpha+0(FP), X0
 12131  	MOVQ  xs+8(FP), AX
 12132  	MOVQ  incx+16(FP), CX
 12133  	MOVQ  ys+24(FP), DX
 12134  	MOVQ  incy+32(FP), BX
 12135  	MOVQ  n+40(FP), SI
 12136  	XORQ  DI, DI
 12137  	XORQ  R8, R8
 12138  	JMP   check_limit_unroll
 12139  	PCALIGN $0x08
 12140  	NOP
 12141  
 12142  loop_unroll:
 12143  	MOVSS (AX)(DI*4), X1
 12144  	MULSS X0, X1
 12145  	ADDSS (DX)(R8*4), X1
 12146  	MOVSS X1, (DX)(R8*4)
 12147  	ADDQ  CX, DI
 12148  	ADDQ  BX, R8
 12149  	MOVSS (AX)(DI*4), X1
 12150  	MULSS X0, X1
 12151  	ADDSS (DX)(R8*4), X1
 12152  	MOVSS X1, (DX)(R8*4)
 12153  	ADDQ  CX, DI
 12154  	ADDQ  BX, R8
 12155  	MOVSS (AX)(DI*4), X1
 12156  	MULSS X0, X1
 12157  	ADDSS (DX)(R8*4), X1
 12158  	MOVSS X1, (DX)(R8*4)
 12159  	ADDQ  CX, DI
 12160  	ADDQ  BX, R8
 12161  	MOVSS (AX)(DI*4), X1
 12162  	MULSS X0, X1
 12163  	ADDSS (DX)(R8*4), X1
 12164  	MOVSS X1, (DX)(R8*4)
 12165  	ADDQ  CX, DI
 12166  	ADDQ  BX, R8
 12167  	MOVSS (AX)(DI*4), X1
 12168  	MULSS X0, X1
 12169  	ADDSS (DX)(R8*4), X1
 12170  	MOVSS X1, (DX)(R8*4)
 12171  	ADDQ  CX, DI
 12172  	ADDQ  BX, R8
 12173  	MOVSS (AX)(DI*4), X1
 12174  	MULSS X0, X1
 12175  	ADDSS (DX)(R8*4), X1
 12176  	MOVSS X1, (DX)(R8*4)
 12177  	ADDQ  CX, DI
 12178  	ADDQ  BX, R8
 12179  	MOVSS (AX)(DI*4), X1
 12180  	MULSS X0, X1
 12181  	ADDSS (DX)(R8*4), X1
 12182  	MOVSS X1, (DX)(R8*4)
 12183  	ADDQ  CX, DI
 12184  	ADDQ  BX, R8
 12185  	MOVSS (AX)(DI*4), X1
 12186  	MULSS X0, X1
 12187  	ADDSS (DX)(R8*4), X1
 12188  	MOVSS X1, (DX)(R8*4)
 12189  	ADDQ  CX, DI
 12190  	ADDQ  BX, R8
 12191  	SUBQ  $0x08, SI
 12192  
 12193  check_limit_unroll:
 12194  	CMPQ SI, $0x08
 12195  	JHI  loop_unroll
 12196  	JMP  check_limit
 12197  
 12198  loop:
 12199  	MOVSS (AX)(DI*4), X1
 12200  	MULSS X0, X1
 12201  	ADDSS (DX)(R8*4), X1
 12202  	MOVSS X1, (DX)(R8*4)
 12203  	DECQ  SI
 12204  	ADDQ  CX, DI
 12205  	ADDQ  BX, R8
 12206  
 12207  check_limit:
 12208  	CMPQ SI, $0x00
 12209  	JHI  loop
 12210  	RET
 12211  
 12212  // func AmdAxpyUnsafeX_V3A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12213  // Requires: SSE
 12214  TEXT ·AmdAxpyUnsafeX_V3A9R8(SB), NOSPLIT, $0-48
 12215  	MOVSS alpha+0(FP), X0
 12216  	MOVQ  xs+8(FP), AX
 12217  	MOVQ  incx+16(FP), CX
 12218  	MOVQ  ys+24(FP), DX
 12219  	MOVQ  incy+32(FP), BX
 12220  	MOVQ  n+40(FP), SI
 12221  	XORQ  DI, DI
 12222  	XORQ  R8, R8
 12223  	JMP   check_limit_unroll
 12224  	PCALIGN $0x08
 12225  	NOP
 12226  
 12227  loop_unroll:
 12228  	MOVSS (AX)(DI*4), X1
 12229  	MULSS X0, X1
 12230  	ADDSS (DX)(R8*4), X1
 12231  	MOVSS X1, (DX)(R8*4)
 12232  	ADDQ  CX, DI
 12233  	ADDQ  BX, R8
 12234  	MOVSS (AX)(DI*4), X1
 12235  	MULSS X0, X1
 12236  	ADDSS (DX)(R8*4), X1
 12237  	MOVSS X1, (DX)(R8*4)
 12238  	ADDQ  CX, DI
 12239  	ADDQ  BX, R8
 12240  	MOVSS (AX)(DI*4), X1
 12241  	MULSS X0, X1
 12242  	ADDSS (DX)(R8*4), X1
 12243  	MOVSS X1, (DX)(R8*4)
 12244  	ADDQ  CX, DI
 12245  	ADDQ  BX, R8
 12246  	MOVSS (AX)(DI*4), X1
 12247  	MULSS X0, X1
 12248  	ADDSS (DX)(R8*4), X1
 12249  	MOVSS X1, (DX)(R8*4)
 12250  	ADDQ  CX, DI
 12251  	ADDQ  BX, R8
 12252  	MOVSS (AX)(DI*4), X1
 12253  	MULSS X0, X1
 12254  	ADDSS (DX)(R8*4), X1
 12255  	MOVSS X1, (DX)(R8*4)
 12256  	ADDQ  CX, DI
 12257  	ADDQ  BX, R8
 12258  	MOVSS (AX)(DI*4), X1
 12259  	MULSS X0, X1
 12260  	ADDSS (DX)(R8*4), X1
 12261  	MOVSS X1, (DX)(R8*4)
 12262  	ADDQ  CX, DI
 12263  	ADDQ  BX, R8
 12264  	MOVSS (AX)(DI*4), X1
 12265  	MULSS X0, X1
 12266  	ADDSS (DX)(R8*4), X1
 12267  	MOVSS X1, (DX)(R8*4)
 12268  	ADDQ  CX, DI
 12269  	ADDQ  BX, R8
 12270  	MOVSS (AX)(DI*4), X1
 12271  	MULSS X0, X1
 12272  	ADDSS (DX)(R8*4), X1
 12273  	MOVSS X1, (DX)(R8*4)
 12274  	ADDQ  CX, DI
 12275  	ADDQ  BX, R8
 12276  	SUBQ  $0x08, SI
 12277  
 12278  check_limit_unroll:
 12279  	CMPQ SI, $0x08
 12280  	JHI  loop_unroll
 12281  	JMP  check_limit
 12282  
 12283  loop:
 12284  	MOVSS (AX)(DI*4), X1
 12285  	MULSS X0, X1
 12286  	ADDSS (DX)(R8*4), X1
 12287  	MOVSS X1, (DX)(R8*4)
 12288  	DECQ  SI
 12289  	ADDQ  CX, DI
 12290  	ADDQ  BX, R8
 12291  
 12292  check_limit:
 12293  	CMPQ SI, $0x00
 12294  	JHI  loop
 12295  	RET
 12296  
 12297  // func AmdAxpyUnsafeX_V4A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12298  // Requires: SSE
 12299  TEXT ·AmdAxpyUnsafeX_V4A9R8(SB), NOSPLIT, $0-48
 12300  	MOVSS alpha+0(FP), X0
 12301  	MOVQ  xs+8(FP), AX
 12302  	MOVQ  incx+16(FP), CX
 12303  	MOVQ  ys+24(FP), DX
 12304  	MOVQ  incy+32(FP), BX
 12305  	MOVQ  n+40(FP), SI
 12306  	XORQ  DI, DI
 12307  	XORQ  R8, R8
 12308  	JMP   check_limit_unroll
 12309  	PCALIGN $0x08
 12310  	NOP
 12311  
 12312  loop_unroll:
 12313  	MOVSS (AX)(DI*4), X1
 12314  	MULSS X0, X1
 12315  	ADDSS (DX)(R8*4), X1
 12316  	MOVSS X1, (DX)(R8*4)
 12317  	ADDQ  CX, DI
 12318  	ADDQ  BX, R8
 12319  	MOVSS (AX)(DI*4), X1
 12320  	MULSS X0, X1
 12321  	ADDSS (DX)(R8*4), X1
 12322  	MOVSS X1, (DX)(R8*4)
 12323  	ADDQ  CX, DI
 12324  	ADDQ  BX, R8
 12325  	MOVSS (AX)(DI*4), X1
 12326  	MULSS X0, X1
 12327  	ADDSS (DX)(R8*4), X1
 12328  	MOVSS X1, (DX)(R8*4)
 12329  	ADDQ  CX, DI
 12330  	ADDQ  BX, R8
 12331  	MOVSS (AX)(DI*4), X1
 12332  	MULSS X0, X1
 12333  	ADDSS (DX)(R8*4), X1
 12334  	MOVSS X1, (DX)(R8*4)
 12335  	ADDQ  CX, DI
 12336  	ADDQ  BX, R8
 12337  	MOVSS (AX)(DI*4), X1
 12338  	MULSS X0, X1
 12339  	ADDSS (DX)(R8*4), X1
 12340  	MOVSS X1, (DX)(R8*4)
 12341  	ADDQ  CX, DI
 12342  	ADDQ  BX, R8
 12343  	MOVSS (AX)(DI*4), X1
 12344  	MULSS X0, X1
 12345  	ADDSS (DX)(R8*4), X1
 12346  	MOVSS X1, (DX)(R8*4)
 12347  	ADDQ  CX, DI
 12348  	ADDQ  BX, R8
 12349  	MOVSS (AX)(DI*4), X1
 12350  	MULSS X0, X1
 12351  	ADDSS (DX)(R8*4), X1
 12352  	MOVSS X1, (DX)(R8*4)
 12353  	ADDQ  CX, DI
 12354  	ADDQ  BX, R8
 12355  	MOVSS (AX)(DI*4), X1
 12356  	MULSS X0, X1
 12357  	ADDSS (DX)(R8*4), X1
 12358  	MOVSS X1, (DX)(R8*4)
 12359  	ADDQ  CX, DI
 12360  	ADDQ  BX, R8
 12361  	SUBQ  $0x08, SI
 12362  
 12363  check_limit_unroll:
 12364  	CMPQ SI, $0x08
 12365  	JHI  loop_unroll
 12366  	JMP  check_limit
 12367  
 12368  loop:
 12369  	MOVSS (AX)(DI*4), X1
 12370  	MULSS X0, X1
 12371  	ADDSS (DX)(R8*4), X1
 12372  	MOVSS X1, (DX)(R8*4)
 12373  	DECQ  SI
 12374  	ADDQ  CX, DI
 12375  	ADDQ  BX, R8
 12376  
 12377  check_limit:
 12378  	CMPQ SI, $0x00
 12379  	JHI  loop
 12380  	RET
 12381  
 12382  // func AmdAxpyUnsafeX_V5A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12383  // Requires: SSE
 12384  TEXT ·AmdAxpyUnsafeX_V5A9R8(SB), NOSPLIT, $0-48
 12385  	MOVSS alpha+0(FP), X0
 12386  	MOVQ  xs+8(FP), AX
 12387  	MOVQ  incx+16(FP), CX
 12388  	MOVQ  ys+24(FP), DX
 12389  	MOVQ  incy+32(FP), BX
 12390  	MOVQ  n+40(FP), SI
 12391  	XORQ  DI, DI
 12392  	XORQ  R8, R8
 12393  	JMP   check_limit_unroll
 12394  	PCALIGN $0x08
 12395  	NOP
 12396  
 12397  loop_unroll:
 12398  	MOVSS (AX)(DI*4), X1
 12399  	MULSS X0, X1
 12400  	ADDSS (DX)(R8*4), X1
 12401  	MOVSS X1, (DX)(R8*4)
 12402  	ADDQ  CX, DI
 12403  	ADDQ  BX, R8
 12404  	MOVSS (AX)(DI*4), X1
 12405  	MULSS X0, X1
 12406  	ADDSS (DX)(R8*4), X1
 12407  	MOVSS X1, (DX)(R8*4)
 12408  	ADDQ  CX, DI
 12409  	ADDQ  BX, R8
 12410  	MOVSS (AX)(DI*4), X1
 12411  	MULSS X0, X1
 12412  	ADDSS (DX)(R8*4), X1
 12413  	MOVSS X1, (DX)(R8*4)
 12414  	ADDQ  CX, DI
 12415  	ADDQ  BX, R8
 12416  	MOVSS (AX)(DI*4), X1
 12417  	MULSS X0, X1
 12418  	ADDSS (DX)(R8*4), X1
 12419  	MOVSS X1, (DX)(R8*4)
 12420  	ADDQ  CX, DI
 12421  	ADDQ  BX, R8
 12422  	MOVSS (AX)(DI*4), X1
 12423  	MULSS X0, X1
 12424  	ADDSS (DX)(R8*4), X1
 12425  	MOVSS X1, (DX)(R8*4)
 12426  	ADDQ  CX, DI
 12427  	ADDQ  BX, R8
 12428  	MOVSS (AX)(DI*4), X1
 12429  	MULSS X0, X1
 12430  	ADDSS (DX)(R8*4), X1
 12431  	MOVSS X1, (DX)(R8*4)
 12432  	ADDQ  CX, DI
 12433  	ADDQ  BX, R8
 12434  	MOVSS (AX)(DI*4), X1
 12435  	MULSS X0, X1
 12436  	ADDSS (DX)(R8*4), X1
 12437  	MOVSS X1, (DX)(R8*4)
 12438  	ADDQ  CX, DI
 12439  	ADDQ  BX, R8
 12440  	MOVSS (AX)(DI*4), X1
 12441  	MULSS X0, X1
 12442  	ADDSS (DX)(R8*4), X1
 12443  	MOVSS X1, (DX)(R8*4)
 12444  	ADDQ  CX, DI
 12445  	ADDQ  BX, R8
 12446  	SUBQ  $0x08, SI
 12447  
 12448  check_limit_unroll:
 12449  	CMPQ SI, $0x08
 12450  	JHI  loop_unroll
 12451  	JMP  check_limit
 12452  
 12453  loop:
 12454  	MOVSS (AX)(DI*4), X1
 12455  	MULSS X0, X1
 12456  	ADDSS (DX)(R8*4), X1
 12457  	MOVSS X1, (DX)(R8*4)
 12458  	DECQ  SI
 12459  	ADDQ  CX, DI
 12460  	ADDQ  BX, R8
 12461  
 12462  check_limit:
 12463  	CMPQ SI, $0x00
 12464  	JHI  loop
 12465  	RET
 12466  
 12467  // func AmdAxpyUnsafeX_V0A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12468  // Requires: SSE
 12469  TEXT ·AmdAxpyUnsafeX_V0A10R8(SB), NOSPLIT, $0-48
 12470  	MOVSS alpha+0(FP), X0
 12471  	MOVQ  xs+8(FP), AX
 12472  	MOVQ  incx+16(FP), CX
 12473  	MOVQ  ys+24(FP), DX
 12474  	MOVQ  incy+32(FP), BX
 12475  	MOVQ  n+40(FP), SI
 12476  	XORQ  DI, DI
 12477  	XORQ  R8, R8
 12478  	JMP   check_limit_unroll
 12479  	PCALIGN $0x08
 12480  	NOP
 12481  	NOP
 12482  
 12483  loop_unroll:
 12484  	MOVSS (AX)(DI*4), X1
 12485  	MULSS X0, X1
 12486  	ADDSS (DX)(R8*4), X1
 12487  	MOVSS X1, (DX)(R8*4)
 12488  	ADDQ  CX, DI
 12489  	ADDQ  BX, R8
 12490  	MOVSS (AX)(DI*4), X1
 12491  	MULSS X0, X1
 12492  	ADDSS (DX)(R8*4), X1
 12493  	MOVSS X1, (DX)(R8*4)
 12494  	ADDQ  CX, DI
 12495  	ADDQ  BX, R8
 12496  	MOVSS (AX)(DI*4), X1
 12497  	MULSS X0, X1
 12498  	ADDSS (DX)(R8*4), X1
 12499  	MOVSS X1, (DX)(R8*4)
 12500  	ADDQ  CX, DI
 12501  	ADDQ  BX, R8
 12502  	MOVSS (AX)(DI*4), X1
 12503  	MULSS X0, X1
 12504  	ADDSS (DX)(R8*4), X1
 12505  	MOVSS X1, (DX)(R8*4)
 12506  	ADDQ  CX, DI
 12507  	ADDQ  BX, R8
 12508  	MOVSS (AX)(DI*4), X1
 12509  	MULSS X0, X1
 12510  	ADDSS (DX)(R8*4), X1
 12511  	MOVSS X1, (DX)(R8*4)
 12512  	ADDQ  CX, DI
 12513  	ADDQ  BX, R8
 12514  	MOVSS (AX)(DI*4), X1
 12515  	MULSS X0, X1
 12516  	ADDSS (DX)(R8*4), X1
 12517  	MOVSS X1, (DX)(R8*4)
 12518  	ADDQ  CX, DI
 12519  	ADDQ  BX, R8
 12520  	MOVSS (AX)(DI*4), X1
 12521  	MULSS X0, X1
 12522  	ADDSS (DX)(R8*4), X1
 12523  	MOVSS X1, (DX)(R8*4)
 12524  	ADDQ  CX, DI
 12525  	ADDQ  BX, R8
 12526  	MOVSS (AX)(DI*4), X1
 12527  	MULSS X0, X1
 12528  	ADDSS (DX)(R8*4), X1
 12529  	MOVSS X1, (DX)(R8*4)
 12530  	ADDQ  CX, DI
 12531  	ADDQ  BX, R8
 12532  	SUBQ  $0x08, SI
 12533  
 12534  check_limit_unroll:
 12535  	CMPQ SI, $0x08
 12536  	JHI  loop_unroll
 12537  	JMP  check_limit
 12538  
 12539  loop:
 12540  	MOVSS (AX)(DI*4), X1
 12541  	MULSS X0, X1
 12542  	ADDSS (DX)(R8*4), X1
 12543  	MOVSS X1, (DX)(R8*4)
 12544  	DECQ  SI
 12545  	ADDQ  CX, DI
 12546  	ADDQ  BX, R8
 12547  
 12548  check_limit:
 12549  	CMPQ SI, $0x00
 12550  	JHI  loop
 12551  	RET
 12552  
 12553  // func AmdAxpyUnsafeX_V1A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12554  // Requires: SSE
 12555  TEXT ·AmdAxpyUnsafeX_V1A10R8(SB), NOSPLIT, $0-48
 12556  	MOVSS alpha+0(FP), X0
 12557  	MOVQ  xs+8(FP), AX
 12558  	MOVQ  incx+16(FP), CX
 12559  	MOVQ  ys+24(FP), DX
 12560  	MOVQ  incy+32(FP), BX
 12561  	MOVQ  n+40(FP), SI
 12562  	XORQ  DI, DI
 12563  	XORQ  R8, R8
 12564  	JMP   check_limit_unroll
 12565  	PCALIGN $0x08
 12566  	NOP
 12567  	NOP
 12568  
 12569  loop_unroll:
 12570  	MOVSS (AX)(DI*4), X1
 12571  	MULSS X0, X1
 12572  	ADDSS (DX)(R8*4), X1
 12573  	MOVSS X1, (DX)(R8*4)
 12574  	ADDQ  CX, DI
 12575  	ADDQ  BX, R8
 12576  	MOVSS (AX)(DI*4), X1
 12577  	MULSS X0, X1
 12578  	ADDSS (DX)(R8*4), X1
 12579  	MOVSS X1, (DX)(R8*4)
 12580  	ADDQ  CX, DI
 12581  	ADDQ  BX, R8
 12582  	MOVSS (AX)(DI*4), X1
 12583  	MULSS X0, X1
 12584  	ADDSS (DX)(R8*4), X1
 12585  	MOVSS X1, (DX)(R8*4)
 12586  	ADDQ  CX, DI
 12587  	ADDQ  BX, R8
 12588  	MOVSS (AX)(DI*4), X1
 12589  	MULSS X0, X1
 12590  	ADDSS (DX)(R8*4), X1
 12591  	MOVSS X1, (DX)(R8*4)
 12592  	ADDQ  CX, DI
 12593  	ADDQ  BX, R8
 12594  	MOVSS (AX)(DI*4), X1
 12595  	MULSS X0, X1
 12596  	ADDSS (DX)(R8*4), X1
 12597  	MOVSS X1, (DX)(R8*4)
 12598  	ADDQ  CX, DI
 12599  	ADDQ  BX, R8
 12600  	MOVSS (AX)(DI*4), X1
 12601  	MULSS X0, X1
 12602  	ADDSS (DX)(R8*4), X1
 12603  	MOVSS X1, (DX)(R8*4)
 12604  	ADDQ  CX, DI
 12605  	ADDQ  BX, R8
 12606  	MOVSS (AX)(DI*4), X1
 12607  	MULSS X0, X1
 12608  	ADDSS (DX)(R8*4), X1
 12609  	MOVSS X1, (DX)(R8*4)
 12610  	ADDQ  CX, DI
 12611  	ADDQ  BX, R8
 12612  	MOVSS (AX)(DI*4), X1
 12613  	MULSS X0, X1
 12614  	ADDSS (DX)(R8*4), X1
 12615  	MOVSS X1, (DX)(R8*4)
 12616  	ADDQ  CX, DI
 12617  	ADDQ  BX, R8
 12618  	SUBQ  $0x08, SI
 12619  
 12620  check_limit_unroll:
 12621  	CMPQ SI, $0x08
 12622  	JHI  loop_unroll
 12623  	JMP  check_limit
 12624  
 12625  loop:
 12626  	MOVSS (AX)(DI*4), X1
 12627  	MULSS X0, X1
 12628  	ADDSS (DX)(R8*4), X1
 12629  	MOVSS X1, (DX)(R8*4)
 12630  	DECQ  SI
 12631  	ADDQ  CX, DI
 12632  	ADDQ  BX, R8
 12633  
 12634  check_limit:
 12635  	CMPQ SI, $0x00
 12636  	JHI  loop
 12637  	RET
 12638  
 12639  // func AmdAxpyUnsafeX_V2A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12640  // Requires: SSE
 12641  TEXT ·AmdAxpyUnsafeX_V2A10R8(SB), NOSPLIT, $0-48
 12642  	MOVSS alpha+0(FP), X0
 12643  	MOVQ  xs+8(FP), AX
 12644  	MOVQ  incx+16(FP), CX
 12645  	MOVQ  ys+24(FP), DX
 12646  	MOVQ  incy+32(FP), BX
 12647  	MOVQ  n+40(FP), SI
 12648  	XORQ  DI, DI
 12649  	XORQ  R8, R8
 12650  	JMP   check_limit_unroll
 12651  	PCALIGN $0x08
 12652  	NOP
 12653  	NOP
 12654  
 12655  loop_unroll:
 12656  	MOVSS (AX)(DI*4), X1
 12657  	MULSS X0, X1
 12658  	ADDSS (DX)(R8*4), X1
 12659  	MOVSS X1, (DX)(R8*4)
 12660  	ADDQ  CX, DI
 12661  	ADDQ  BX, R8
 12662  	MOVSS (AX)(DI*4), X1
 12663  	MULSS X0, X1
 12664  	ADDSS (DX)(R8*4), X1
 12665  	MOVSS X1, (DX)(R8*4)
 12666  	ADDQ  CX, DI
 12667  	ADDQ  BX, R8
 12668  	MOVSS (AX)(DI*4), X1
 12669  	MULSS X0, X1
 12670  	ADDSS (DX)(R8*4), X1
 12671  	MOVSS X1, (DX)(R8*4)
 12672  	ADDQ  CX, DI
 12673  	ADDQ  BX, R8
 12674  	MOVSS (AX)(DI*4), X1
 12675  	MULSS X0, X1
 12676  	ADDSS (DX)(R8*4), X1
 12677  	MOVSS X1, (DX)(R8*4)
 12678  	ADDQ  CX, DI
 12679  	ADDQ  BX, R8
 12680  	MOVSS (AX)(DI*4), X1
 12681  	MULSS X0, X1
 12682  	ADDSS (DX)(R8*4), X1
 12683  	MOVSS X1, (DX)(R8*4)
 12684  	ADDQ  CX, DI
 12685  	ADDQ  BX, R8
 12686  	MOVSS (AX)(DI*4), X1
 12687  	MULSS X0, X1
 12688  	ADDSS (DX)(R8*4), X1
 12689  	MOVSS X1, (DX)(R8*4)
 12690  	ADDQ  CX, DI
 12691  	ADDQ  BX, R8
 12692  	MOVSS (AX)(DI*4), X1
 12693  	MULSS X0, X1
 12694  	ADDSS (DX)(R8*4), X1
 12695  	MOVSS X1, (DX)(R8*4)
 12696  	ADDQ  CX, DI
 12697  	ADDQ  BX, R8
 12698  	MOVSS (AX)(DI*4), X1
 12699  	MULSS X0, X1
 12700  	ADDSS (DX)(R8*4), X1
 12701  	MOVSS X1, (DX)(R8*4)
 12702  	ADDQ  CX, DI
 12703  	ADDQ  BX, R8
 12704  	SUBQ  $0x08, SI
 12705  
 12706  check_limit_unroll:
 12707  	CMPQ SI, $0x08
 12708  	JHI  loop_unroll
 12709  	JMP  check_limit
 12710  
 12711  loop:
 12712  	MOVSS (AX)(DI*4), X1
 12713  	MULSS X0, X1
 12714  	ADDSS (DX)(R8*4), X1
 12715  	MOVSS X1, (DX)(R8*4)
 12716  	DECQ  SI
 12717  	ADDQ  CX, DI
 12718  	ADDQ  BX, R8
 12719  
 12720  check_limit:
 12721  	CMPQ SI, $0x00
 12722  	JHI  loop
 12723  	RET
 12724  
 12725  // func AmdAxpyUnsafeX_V3A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12726  // Requires: SSE
 12727  TEXT ·AmdAxpyUnsafeX_V3A10R8(SB), NOSPLIT, $0-48
 12728  	MOVSS alpha+0(FP), X0
 12729  	MOVQ  xs+8(FP), AX
 12730  	MOVQ  incx+16(FP), CX
 12731  	MOVQ  ys+24(FP), DX
 12732  	MOVQ  incy+32(FP), BX
 12733  	MOVQ  n+40(FP), SI
 12734  	XORQ  DI, DI
 12735  	XORQ  R8, R8
 12736  	JMP   check_limit_unroll
 12737  	PCALIGN $0x08
 12738  	NOP
 12739  	NOP
 12740  
 12741  loop_unroll:
 12742  	MOVSS (AX)(DI*4), X1
 12743  	MULSS X0, X1
 12744  	ADDSS (DX)(R8*4), X1
 12745  	MOVSS X1, (DX)(R8*4)
 12746  	ADDQ  CX, DI
 12747  	ADDQ  BX, R8
 12748  	MOVSS (AX)(DI*4), X1
 12749  	MULSS X0, X1
 12750  	ADDSS (DX)(R8*4), X1
 12751  	MOVSS X1, (DX)(R8*4)
 12752  	ADDQ  CX, DI
 12753  	ADDQ  BX, R8
 12754  	MOVSS (AX)(DI*4), X1
 12755  	MULSS X0, X1
 12756  	ADDSS (DX)(R8*4), X1
 12757  	MOVSS X1, (DX)(R8*4)
 12758  	ADDQ  CX, DI
 12759  	ADDQ  BX, R8
 12760  	MOVSS (AX)(DI*4), X1
 12761  	MULSS X0, X1
 12762  	ADDSS (DX)(R8*4), X1
 12763  	MOVSS X1, (DX)(R8*4)
 12764  	ADDQ  CX, DI
 12765  	ADDQ  BX, R8
 12766  	MOVSS (AX)(DI*4), X1
 12767  	MULSS X0, X1
 12768  	ADDSS (DX)(R8*4), X1
 12769  	MOVSS X1, (DX)(R8*4)
 12770  	ADDQ  CX, DI
 12771  	ADDQ  BX, R8
 12772  	MOVSS (AX)(DI*4), X1
 12773  	MULSS X0, X1
 12774  	ADDSS (DX)(R8*4), X1
 12775  	MOVSS X1, (DX)(R8*4)
 12776  	ADDQ  CX, DI
 12777  	ADDQ  BX, R8
 12778  	MOVSS (AX)(DI*4), X1
 12779  	MULSS X0, X1
 12780  	ADDSS (DX)(R8*4), X1
 12781  	MOVSS X1, (DX)(R8*4)
 12782  	ADDQ  CX, DI
 12783  	ADDQ  BX, R8
 12784  	MOVSS (AX)(DI*4), X1
 12785  	MULSS X0, X1
 12786  	ADDSS (DX)(R8*4), X1
 12787  	MOVSS X1, (DX)(R8*4)
 12788  	ADDQ  CX, DI
 12789  	ADDQ  BX, R8
 12790  	SUBQ  $0x08, SI
 12791  
 12792  check_limit_unroll:
 12793  	CMPQ SI, $0x08
 12794  	JHI  loop_unroll
 12795  	JMP  check_limit
 12796  
 12797  loop:
 12798  	MOVSS (AX)(DI*4), X1
 12799  	MULSS X0, X1
 12800  	ADDSS (DX)(R8*4), X1
 12801  	MOVSS X1, (DX)(R8*4)
 12802  	DECQ  SI
 12803  	ADDQ  CX, DI
 12804  	ADDQ  BX, R8
 12805  
 12806  check_limit:
 12807  	CMPQ SI, $0x00
 12808  	JHI  loop
 12809  	RET
 12810  
 12811  // func AmdAxpyUnsafeX_V4A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12812  // Requires: SSE
 12813  TEXT ·AmdAxpyUnsafeX_V4A10R8(SB), NOSPLIT, $0-48
 12814  	MOVSS alpha+0(FP), X0
 12815  	MOVQ  xs+8(FP), AX
 12816  	MOVQ  incx+16(FP), CX
 12817  	MOVQ  ys+24(FP), DX
 12818  	MOVQ  incy+32(FP), BX
 12819  	MOVQ  n+40(FP), SI
 12820  	XORQ  DI, DI
 12821  	XORQ  R8, R8
 12822  	JMP   check_limit_unroll
 12823  	PCALIGN $0x08
 12824  	NOP
 12825  	NOP
 12826  
 12827  loop_unroll:
 12828  	MOVSS (AX)(DI*4), X1
 12829  	MULSS X0, X1
 12830  	ADDSS (DX)(R8*4), X1
 12831  	MOVSS X1, (DX)(R8*4)
 12832  	ADDQ  CX, DI
 12833  	ADDQ  BX, R8
 12834  	MOVSS (AX)(DI*4), X1
 12835  	MULSS X0, X1
 12836  	ADDSS (DX)(R8*4), X1
 12837  	MOVSS X1, (DX)(R8*4)
 12838  	ADDQ  CX, DI
 12839  	ADDQ  BX, R8
 12840  	MOVSS (AX)(DI*4), X1
 12841  	MULSS X0, X1
 12842  	ADDSS (DX)(R8*4), X1
 12843  	MOVSS X1, (DX)(R8*4)
 12844  	ADDQ  CX, DI
 12845  	ADDQ  BX, R8
 12846  	MOVSS (AX)(DI*4), X1
 12847  	MULSS X0, X1
 12848  	ADDSS (DX)(R8*4), X1
 12849  	MOVSS X1, (DX)(R8*4)
 12850  	ADDQ  CX, DI
 12851  	ADDQ  BX, R8
 12852  	MOVSS (AX)(DI*4), X1
 12853  	MULSS X0, X1
 12854  	ADDSS (DX)(R8*4), X1
 12855  	MOVSS X1, (DX)(R8*4)
 12856  	ADDQ  CX, DI
 12857  	ADDQ  BX, R8
 12858  	MOVSS (AX)(DI*4), X1
 12859  	MULSS X0, X1
 12860  	ADDSS (DX)(R8*4), X1
 12861  	MOVSS X1, (DX)(R8*4)
 12862  	ADDQ  CX, DI
 12863  	ADDQ  BX, R8
 12864  	MOVSS (AX)(DI*4), X1
 12865  	MULSS X0, X1
 12866  	ADDSS (DX)(R8*4), X1
 12867  	MOVSS X1, (DX)(R8*4)
 12868  	ADDQ  CX, DI
 12869  	ADDQ  BX, R8
 12870  	MOVSS (AX)(DI*4), X1
 12871  	MULSS X0, X1
 12872  	ADDSS (DX)(R8*4), X1
 12873  	MOVSS X1, (DX)(R8*4)
 12874  	ADDQ  CX, DI
 12875  	ADDQ  BX, R8
 12876  	SUBQ  $0x08, SI
 12877  
 12878  check_limit_unroll:
 12879  	CMPQ SI, $0x08
 12880  	JHI  loop_unroll
 12881  	JMP  check_limit
 12882  
 12883  loop:
 12884  	MOVSS (AX)(DI*4), X1
 12885  	MULSS X0, X1
 12886  	ADDSS (DX)(R8*4), X1
 12887  	MOVSS X1, (DX)(R8*4)
 12888  	DECQ  SI
 12889  	ADDQ  CX, DI
 12890  	ADDQ  BX, R8
 12891  
 12892  check_limit:
 12893  	CMPQ SI, $0x00
 12894  	JHI  loop
 12895  	RET
 12896  
 12897  // func AmdAxpyUnsafeX_V5A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12898  // Requires: SSE
 12899  TEXT ·AmdAxpyUnsafeX_V5A10R8(SB), NOSPLIT, $0-48
 12900  	MOVSS alpha+0(FP), X0
 12901  	MOVQ  xs+8(FP), AX
 12902  	MOVQ  incx+16(FP), CX
 12903  	MOVQ  ys+24(FP), DX
 12904  	MOVQ  incy+32(FP), BX
 12905  	MOVQ  n+40(FP), SI
 12906  	XORQ  DI, DI
 12907  	XORQ  R8, R8
 12908  	JMP   check_limit_unroll
 12909  	PCALIGN $0x08
 12910  	NOP
 12911  	NOP
 12912  
 12913  loop_unroll:
 12914  	MOVSS (AX)(DI*4), X1
 12915  	MULSS X0, X1
 12916  	ADDSS (DX)(R8*4), X1
 12917  	MOVSS X1, (DX)(R8*4)
 12918  	ADDQ  CX, DI
 12919  	ADDQ  BX, R8
 12920  	MOVSS (AX)(DI*4), X1
 12921  	MULSS X0, X1
 12922  	ADDSS (DX)(R8*4), X1
 12923  	MOVSS X1, (DX)(R8*4)
 12924  	ADDQ  CX, DI
 12925  	ADDQ  BX, R8
 12926  	MOVSS (AX)(DI*4), X1
 12927  	MULSS X0, X1
 12928  	ADDSS (DX)(R8*4), X1
 12929  	MOVSS X1, (DX)(R8*4)
 12930  	ADDQ  CX, DI
 12931  	ADDQ  BX, R8
 12932  	MOVSS (AX)(DI*4), X1
 12933  	MULSS X0, X1
 12934  	ADDSS (DX)(R8*4), X1
 12935  	MOVSS X1, (DX)(R8*4)
 12936  	ADDQ  CX, DI
 12937  	ADDQ  BX, R8
 12938  	MOVSS (AX)(DI*4), X1
 12939  	MULSS X0, X1
 12940  	ADDSS (DX)(R8*4), X1
 12941  	MOVSS X1, (DX)(R8*4)
 12942  	ADDQ  CX, DI
 12943  	ADDQ  BX, R8
 12944  	MOVSS (AX)(DI*4), X1
 12945  	MULSS X0, X1
 12946  	ADDSS (DX)(R8*4), X1
 12947  	MOVSS X1, (DX)(R8*4)
 12948  	ADDQ  CX, DI
 12949  	ADDQ  BX, R8
 12950  	MOVSS (AX)(DI*4), X1
 12951  	MULSS X0, X1
 12952  	ADDSS (DX)(R8*4), X1
 12953  	MOVSS X1, (DX)(R8*4)
 12954  	ADDQ  CX, DI
 12955  	ADDQ  BX, R8
 12956  	MOVSS (AX)(DI*4), X1
 12957  	MULSS X0, X1
 12958  	ADDSS (DX)(R8*4), X1
 12959  	MOVSS X1, (DX)(R8*4)
 12960  	ADDQ  CX, DI
 12961  	ADDQ  BX, R8
 12962  	SUBQ  $0x08, SI
 12963  
 12964  check_limit_unroll:
 12965  	CMPQ SI, $0x08
 12966  	JHI  loop_unroll
 12967  	JMP  check_limit
 12968  
 12969  loop:
 12970  	MOVSS (AX)(DI*4), X1
 12971  	MULSS X0, X1
 12972  	ADDSS (DX)(R8*4), X1
 12973  	MOVSS X1, (DX)(R8*4)
 12974  	DECQ  SI
 12975  	ADDQ  CX, DI
 12976  	ADDQ  BX, R8
 12977  
 12978  check_limit:
 12979  	CMPQ SI, $0x00
 12980  	JHI  loop
 12981  	RET
 12982  
 12983  // func AmdAxpyUnsafeX_V0A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 12984  // Requires: SSE
 12985  TEXT ·AmdAxpyUnsafeX_V0A11R8(SB), NOSPLIT, $0-48
 12986  	MOVSS alpha+0(FP), X0
 12987  	MOVQ  xs+8(FP), AX
 12988  	MOVQ  incx+16(FP), CX
 12989  	MOVQ  ys+24(FP), DX
 12990  	MOVQ  incy+32(FP), BX
 12991  	MOVQ  n+40(FP), SI
 12992  	XORQ  DI, DI
 12993  	XORQ  R8, R8
 12994  	JMP   check_limit_unroll
 12995  	PCALIGN $0x08
 12996  	NOP
 12997  	NOP
 12998  	NOP
 12999  
 13000  loop_unroll:
 13001  	MOVSS (AX)(DI*4), X1
 13002  	MULSS X0, X1
 13003  	ADDSS (DX)(R8*4), X1
 13004  	MOVSS X1, (DX)(R8*4)
 13005  	ADDQ  CX, DI
 13006  	ADDQ  BX, R8
 13007  	MOVSS (AX)(DI*4), X1
 13008  	MULSS X0, X1
 13009  	ADDSS (DX)(R8*4), X1
 13010  	MOVSS X1, (DX)(R8*4)
 13011  	ADDQ  CX, DI
 13012  	ADDQ  BX, R8
 13013  	MOVSS (AX)(DI*4), X1
 13014  	MULSS X0, X1
 13015  	ADDSS (DX)(R8*4), X1
 13016  	MOVSS X1, (DX)(R8*4)
 13017  	ADDQ  CX, DI
 13018  	ADDQ  BX, R8
 13019  	MOVSS (AX)(DI*4), X1
 13020  	MULSS X0, X1
 13021  	ADDSS (DX)(R8*4), X1
 13022  	MOVSS X1, (DX)(R8*4)
 13023  	ADDQ  CX, DI
 13024  	ADDQ  BX, R8
 13025  	MOVSS (AX)(DI*4), X1
 13026  	MULSS X0, X1
 13027  	ADDSS (DX)(R8*4), X1
 13028  	MOVSS X1, (DX)(R8*4)
 13029  	ADDQ  CX, DI
 13030  	ADDQ  BX, R8
 13031  	MOVSS (AX)(DI*4), X1
 13032  	MULSS X0, X1
 13033  	ADDSS (DX)(R8*4), X1
 13034  	MOVSS X1, (DX)(R8*4)
 13035  	ADDQ  CX, DI
 13036  	ADDQ  BX, R8
 13037  	MOVSS (AX)(DI*4), X1
 13038  	MULSS X0, X1
 13039  	ADDSS (DX)(R8*4), X1
 13040  	MOVSS X1, (DX)(R8*4)
 13041  	ADDQ  CX, DI
 13042  	ADDQ  BX, R8
 13043  	MOVSS (AX)(DI*4), X1
 13044  	MULSS X0, X1
 13045  	ADDSS (DX)(R8*4), X1
 13046  	MOVSS X1, (DX)(R8*4)
 13047  	ADDQ  CX, DI
 13048  	ADDQ  BX, R8
 13049  	SUBQ  $0x08, SI
 13050  
 13051  check_limit_unroll:
 13052  	CMPQ SI, $0x08
 13053  	JHI  loop_unroll
 13054  	JMP  check_limit
 13055  
 13056  loop:
 13057  	MOVSS (AX)(DI*4), X1
 13058  	MULSS X0, X1
 13059  	ADDSS (DX)(R8*4), X1
 13060  	MOVSS X1, (DX)(R8*4)
 13061  	DECQ  SI
 13062  	ADDQ  CX, DI
 13063  	ADDQ  BX, R8
 13064  
 13065  check_limit:
 13066  	CMPQ SI, $0x00
 13067  	JHI  loop
 13068  	RET
 13069  
 13070  // func AmdAxpyUnsafeX_V1A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13071  // Requires: SSE
 13072  TEXT ·AmdAxpyUnsafeX_V1A11R8(SB), NOSPLIT, $0-48
 13073  	MOVSS alpha+0(FP), X0
 13074  	MOVQ  xs+8(FP), AX
 13075  	MOVQ  incx+16(FP), CX
 13076  	MOVQ  ys+24(FP), DX
 13077  	MOVQ  incy+32(FP), BX
 13078  	MOVQ  n+40(FP), SI
 13079  	XORQ  DI, DI
 13080  	XORQ  R8, R8
 13081  	JMP   check_limit_unroll
 13082  	PCALIGN $0x08
 13083  	NOP
 13084  	NOP
 13085  	NOP
 13086  
 13087  loop_unroll:
 13088  	MOVSS (AX)(DI*4), X1
 13089  	MULSS X0, X1
 13090  	ADDSS (DX)(R8*4), X1
 13091  	MOVSS X1, (DX)(R8*4)
 13092  	ADDQ  CX, DI
 13093  	ADDQ  BX, R8
 13094  	MOVSS (AX)(DI*4), X1
 13095  	MULSS X0, X1
 13096  	ADDSS (DX)(R8*4), X1
 13097  	MOVSS X1, (DX)(R8*4)
 13098  	ADDQ  CX, DI
 13099  	ADDQ  BX, R8
 13100  	MOVSS (AX)(DI*4), X1
 13101  	MULSS X0, X1
 13102  	ADDSS (DX)(R8*4), X1
 13103  	MOVSS X1, (DX)(R8*4)
 13104  	ADDQ  CX, DI
 13105  	ADDQ  BX, R8
 13106  	MOVSS (AX)(DI*4), X1
 13107  	MULSS X0, X1
 13108  	ADDSS (DX)(R8*4), X1
 13109  	MOVSS X1, (DX)(R8*4)
 13110  	ADDQ  CX, DI
 13111  	ADDQ  BX, R8
 13112  	MOVSS (AX)(DI*4), X1
 13113  	MULSS X0, X1
 13114  	ADDSS (DX)(R8*4), X1
 13115  	MOVSS X1, (DX)(R8*4)
 13116  	ADDQ  CX, DI
 13117  	ADDQ  BX, R8
 13118  	MOVSS (AX)(DI*4), X1
 13119  	MULSS X0, X1
 13120  	ADDSS (DX)(R8*4), X1
 13121  	MOVSS X1, (DX)(R8*4)
 13122  	ADDQ  CX, DI
 13123  	ADDQ  BX, R8
 13124  	MOVSS (AX)(DI*4), X1
 13125  	MULSS X0, X1
 13126  	ADDSS (DX)(R8*4), X1
 13127  	MOVSS X1, (DX)(R8*4)
 13128  	ADDQ  CX, DI
 13129  	ADDQ  BX, R8
 13130  	MOVSS (AX)(DI*4), X1
 13131  	MULSS X0, X1
 13132  	ADDSS (DX)(R8*4), X1
 13133  	MOVSS X1, (DX)(R8*4)
 13134  	ADDQ  CX, DI
 13135  	ADDQ  BX, R8
 13136  	SUBQ  $0x08, SI
 13137  
 13138  check_limit_unroll:
 13139  	CMPQ SI, $0x08
 13140  	JHI  loop_unroll
 13141  	JMP  check_limit
 13142  
 13143  loop:
 13144  	MOVSS (AX)(DI*4), X1
 13145  	MULSS X0, X1
 13146  	ADDSS (DX)(R8*4), X1
 13147  	MOVSS X1, (DX)(R8*4)
 13148  	DECQ  SI
 13149  	ADDQ  CX, DI
 13150  	ADDQ  BX, R8
 13151  
 13152  check_limit:
 13153  	CMPQ SI, $0x00
 13154  	JHI  loop
 13155  	RET
 13156  
 13157  // func AmdAxpyUnsafeX_V2A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13158  // Requires: SSE
 13159  TEXT ·AmdAxpyUnsafeX_V2A11R8(SB), NOSPLIT, $0-48
 13160  	MOVSS alpha+0(FP), X0
 13161  	MOVQ  xs+8(FP), AX
 13162  	MOVQ  incx+16(FP), CX
 13163  	MOVQ  ys+24(FP), DX
 13164  	MOVQ  incy+32(FP), BX
 13165  	MOVQ  n+40(FP), SI
 13166  	XORQ  DI, DI
 13167  	XORQ  R8, R8
 13168  	JMP   check_limit_unroll
 13169  	PCALIGN $0x08
 13170  	NOP
 13171  	NOP
 13172  	NOP
 13173  
 13174  loop_unroll:
 13175  	MOVSS (AX)(DI*4), X1
 13176  	MULSS X0, X1
 13177  	ADDSS (DX)(R8*4), X1
 13178  	MOVSS X1, (DX)(R8*4)
 13179  	ADDQ  CX, DI
 13180  	ADDQ  BX, R8
 13181  	MOVSS (AX)(DI*4), X1
 13182  	MULSS X0, X1
 13183  	ADDSS (DX)(R8*4), X1
 13184  	MOVSS X1, (DX)(R8*4)
 13185  	ADDQ  CX, DI
 13186  	ADDQ  BX, R8
 13187  	MOVSS (AX)(DI*4), X1
 13188  	MULSS X0, X1
 13189  	ADDSS (DX)(R8*4), X1
 13190  	MOVSS X1, (DX)(R8*4)
 13191  	ADDQ  CX, DI
 13192  	ADDQ  BX, R8
 13193  	MOVSS (AX)(DI*4), X1
 13194  	MULSS X0, X1
 13195  	ADDSS (DX)(R8*4), X1
 13196  	MOVSS X1, (DX)(R8*4)
 13197  	ADDQ  CX, DI
 13198  	ADDQ  BX, R8
 13199  	MOVSS (AX)(DI*4), X1
 13200  	MULSS X0, X1
 13201  	ADDSS (DX)(R8*4), X1
 13202  	MOVSS X1, (DX)(R8*4)
 13203  	ADDQ  CX, DI
 13204  	ADDQ  BX, R8
 13205  	MOVSS (AX)(DI*4), X1
 13206  	MULSS X0, X1
 13207  	ADDSS (DX)(R8*4), X1
 13208  	MOVSS X1, (DX)(R8*4)
 13209  	ADDQ  CX, DI
 13210  	ADDQ  BX, R8
 13211  	MOVSS (AX)(DI*4), X1
 13212  	MULSS X0, X1
 13213  	ADDSS (DX)(R8*4), X1
 13214  	MOVSS X1, (DX)(R8*4)
 13215  	ADDQ  CX, DI
 13216  	ADDQ  BX, R8
 13217  	MOVSS (AX)(DI*4), X1
 13218  	MULSS X0, X1
 13219  	ADDSS (DX)(R8*4), X1
 13220  	MOVSS X1, (DX)(R8*4)
 13221  	ADDQ  CX, DI
 13222  	ADDQ  BX, R8
 13223  	SUBQ  $0x08, SI
 13224  
 13225  check_limit_unroll:
 13226  	CMPQ SI, $0x08
 13227  	JHI  loop_unroll
 13228  	JMP  check_limit
 13229  
 13230  loop:
 13231  	MOVSS (AX)(DI*4), X1
 13232  	MULSS X0, X1
 13233  	ADDSS (DX)(R8*4), X1
 13234  	MOVSS X1, (DX)(R8*4)
 13235  	DECQ  SI
 13236  	ADDQ  CX, DI
 13237  	ADDQ  BX, R8
 13238  
 13239  check_limit:
 13240  	CMPQ SI, $0x00
 13241  	JHI  loop
 13242  	RET
 13243  
 13244  // func AmdAxpyUnsafeX_V3A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13245  // Requires: SSE
 13246  TEXT ·AmdAxpyUnsafeX_V3A11R8(SB), NOSPLIT, $0-48
 13247  	MOVSS alpha+0(FP), X0
 13248  	MOVQ  xs+8(FP), AX
 13249  	MOVQ  incx+16(FP), CX
 13250  	MOVQ  ys+24(FP), DX
 13251  	MOVQ  incy+32(FP), BX
 13252  	MOVQ  n+40(FP), SI
 13253  	XORQ  DI, DI
 13254  	XORQ  R8, R8
 13255  	JMP   check_limit_unroll
 13256  	PCALIGN $0x08
 13257  	NOP
 13258  	NOP
 13259  	NOP
 13260  
 13261  loop_unroll:
 13262  	MOVSS (AX)(DI*4), X1
 13263  	MULSS X0, X1
 13264  	ADDSS (DX)(R8*4), X1
 13265  	MOVSS X1, (DX)(R8*4)
 13266  	ADDQ  CX, DI
 13267  	ADDQ  BX, R8
 13268  	MOVSS (AX)(DI*4), X1
 13269  	MULSS X0, X1
 13270  	ADDSS (DX)(R8*4), X1
 13271  	MOVSS X1, (DX)(R8*4)
 13272  	ADDQ  CX, DI
 13273  	ADDQ  BX, R8
 13274  	MOVSS (AX)(DI*4), X1
 13275  	MULSS X0, X1
 13276  	ADDSS (DX)(R8*4), X1
 13277  	MOVSS X1, (DX)(R8*4)
 13278  	ADDQ  CX, DI
 13279  	ADDQ  BX, R8
 13280  	MOVSS (AX)(DI*4), X1
 13281  	MULSS X0, X1
 13282  	ADDSS (DX)(R8*4), X1
 13283  	MOVSS X1, (DX)(R8*4)
 13284  	ADDQ  CX, DI
 13285  	ADDQ  BX, R8
 13286  	MOVSS (AX)(DI*4), X1
 13287  	MULSS X0, X1
 13288  	ADDSS (DX)(R8*4), X1
 13289  	MOVSS X1, (DX)(R8*4)
 13290  	ADDQ  CX, DI
 13291  	ADDQ  BX, R8
 13292  	MOVSS (AX)(DI*4), X1
 13293  	MULSS X0, X1
 13294  	ADDSS (DX)(R8*4), X1
 13295  	MOVSS X1, (DX)(R8*4)
 13296  	ADDQ  CX, DI
 13297  	ADDQ  BX, R8
 13298  	MOVSS (AX)(DI*4), X1
 13299  	MULSS X0, X1
 13300  	ADDSS (DX)(R8*4), X1
 13301  	MOVSS X1, (DX)(R8*4)
 13302  	ADDQ  CX, DI
 13303  	ADDQ  BX, R8
 13304  	MOVSS (AX)(DI*4), X1
 13305  	MULSS X0, X1
 13306  	ADDSS (DX)(R8*4), X1
 13307  	MOVSS X1, (DX)(R8*4)
 13308  	ADDQ  CX, DI
 13309  	ADDQ  BX, R8
 13310  	SUBQ  $0x08, SI
 13311  
 13312  check_limit_unroll:
 13313  	CMPQ SI, $0x08
 13314  	JHI  loop_unroll
 13315  	JMP  check_limit
 13316  
 13317  loop:
 13318  	MOVSS (AX)(DI*4), X1
 13319  	MULSS X0, X1
 13320  	ADDSS (DX)(R8*4), X1
 13321  	MOVSS X1, (DX)(R8*4)
 13322  	DECQ  SI
 13323  	ADDQ  CX, DI
 13324  	ADDQ  BX, R8
 13325  
 13326  check_limit:
 13327  	CMPQ SI, $0x00
 13328  	JHI  loop
 13329  	RET
 13330  
 13331  // func AmdAxpyUnsafeX_V4A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13332  // Requires: SSE
 13333  TEXT ·AmdAxpyUnsafeX_V4A11R8(SB), NOSPLIT, $0-48
 13334  	MOVSS alpha+0(FP), X0
 13335  	MOVQ  xs+8(FP), AX
 13336  	MOVQ  incx+16(FP), CX
 13337  	MOVQ  ys+24(FP), DX
 13338  	MOVQ  incy+32(FP), BX
 13339  	MOVQ  n+40(FP), SI
 13340  	XORQ  DI, DI
 13341  	XORQ  R8, R8
 13342  	JMP   check_limit_unroll
 13343  	PCALIGN $0x08
 13344  	NOP
 13345  	NOP
 13346  	NOP
 13347  
 13348  loop_unroll:
 13349  	MOVSS (AX)(DI*4), X1
 13350  	MULSS X0, X1
 13351  	ADDSS (DX)(R8*4), X1
 13352  	MOVSS X1, (DX)(R8*4)
 13353  	ADDQ  CX, DI
 13354  	ADDQ  BX, R8
 13355  	MOVSS (AX)(DI*4), X1
 13356  	MULSS X0, X1
 13357  	ADDSS (DX)(R8*4), X1
 13358  	MOVSS X1, (DX)(R8*4)
 13359  	ADDQ  CX, DI
 13360  	ADDQ  BX, R8
 13361  	MOVSS (AX)(DI*4), X1
 13362  	MULSS X0, X1
 13363  	ADDSS (DX)(R8*4), X1
 13364  	MOVSS X1, (DX)(R8*4)
 13365  	ADDQ  CX, DI
 13366  	ADDQ  BX, R8
 13367  	MOVSS (AX)(DI*4), X1
 13368  	MULSS X0, X1
 13369  	ADDSS (DX)(R8*4), X1
 13370  	MOVSS X1, (DX)(R8*4)
 13371  	ADDQ  CX, DI
 13372  	ADDQ  BX, R8
 13373  	MOVSS (AX)(DI*4), X1
 13374  	MULSS X0, X1
 13375  	ADDSS (DX)(R8*4), X1
 13376  	MOVSS X1, (DX)(R8*4)
 13377  	ADDQ  CX, DI
 13378  	ADDQ  BX, R8
 13379  	MOVSS (AX)(DI*4), X1
 13380  	MULSS X0, X1
 13381  	ADDSS (DX)(R8*4), X1
 13382  	MOVSS X1, (DX)(R8*4)
 13383  	ADDQ  CX, DI
 13384  	ADDQ  BX, R8
 13385  	MOVSS (AX)(DI*4), X1
 13386  	MULSS X0, X1
 13387  	ADDSS (DX)(R8*4), X1
 13388  	MOVSS X1, (DX)(R8*4)
 13389  	ADDQ  CX, DI
 13390  	ADDQ  BX, R8
 13391  	MOVSS (AX)(DI*4), X1
 13392  	MULSS X0, X1
 13393  	ADDSS (DX)(R8*4), X1
 13394  	MOVSS X1, (DX)(R8*4)
 13395  	ADDQ  CX, DI
 13396  	ADDQ  BX, R8
 13397  	SUBQ  $0x08, SI
 13398  
 13399  check_limit_unroll:
 13400  	CMPQ SI, $0x08
 13401  	JHI  loop_unroll
 13402  	JMP  check_limit
 13403  
 13404  loop:
 13405  	MOVSS (AX)(DI*4), X1
 13406  	MULSS X0, X1
 13407  	ADDSS (DX)(R8*4), X1
 13408  	MOVSS X1, (DX)(R8*4)
 13409  	DECQ  SI
 13410  	ADDQ  CX, DI
 13411  	ADDQ  BX, R8
 13412  
 13413  check_limit:
 13414  	CMPQ SI, $0x00
 13415  	JHI  loop
 13416  	RET
 13417  
 13418  // func AmdAxpyUnsafeX_V5A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13419  // Requires: SSE
 13420  TEXT ·AmdAxpyUnsafeX_V5A11R8(SB), NOSPLIT, $0-48
 13421  	MOVSS alpha+0(FP), X0
 13422  	MOVQ  xs+8(FP), AX
 13423  	MOVQ  incx+16(FP), CX
 13424  	MOVQ  ys+24(FP), DX
 13425  	MOVQ  incy+32(FP), BX
 13426  	MOVQ  n+40(FP), SI
 13427  	XORQ  DI, DI
 13428  	XORQ  R8, R8
 13429  	JMP   check_limit_unroll
 13430  	PCALIGN $0x08
 13431  	NOP
 13432  	NOP
 13433  	NOP
 13434  
 13435  loop_unroll:
 13436  	MOVSS (AX)(DI*4), X1
 13437  	MULSS X0, X1
 13438  	ADDSS (DX)(R8*4), X1
 13439  	MOVSS X1, (DX)(R8*4)
 13440  	ADDQ  CX, DI
 13441  	ADDQ  BX, R8
 13442  	MOVSS (AX)(DI*4), X1
 13443  	MULSS X0, X1
 13444  	ADDSS (DX)(R8*4), X1
 13445  	MOVSS X1, (DX)(R8*4)
 13446  	ADDQ  CX, DI
 13447  	ADDQ  BX, R8
 13448  	MOVSS (AX)(DI*4), X1
 13449  	MULSS X0, X1
 13450  	ADDSS (DX)(R8*4), X1
 13451  	MOVSS X1, (DX)(R8*4)
 13452  	ADDQ  CX, DI
 13453  	ADDQ  BX, R8
 13454  	MOVSS (AX)(DI*4), X1
 13455  	MULSS X0, X1
 13456  	ADDSS (DX)(R8*4), X1
 13457  	MOVSS X1, (DX)(R8*4)
 13458  	ADDQ  CX, DI
 13459  	ADDQ  BX, R8
 13460  	MOVSS (AX)(DI*4), X1
 13461  	MULSS X0, X1
 13462  	ADDSS (DX)(R8*4), X1
 13463  	MOVSS X1, (DX)(R8*4)
 13464  	ADDQ  CX, DI
 13465  	ADDQ  BX, R8
 13466  	MOVSS (AX)(DI*4), X1
 13467  	MULSS X0, X1
 13468  	ADDSS (DX)(R8*4), X1
 13469  	MOVSS X1, (DX)(R8*4)
 13470  	ADDQ  CX, DI
 13471  	ADDQ  BX, R8
 13472  	MOVSS (AX)(DI*4), X1
 13473  	MULSS X0, X1
 13474  	ADDSS (DX)(R8*4), X1
 13475  	MOVSS X1, (DX)(R8*4)
 13476  	ADDQ  CX, DI
 13477  	ADDQ  BX, R8
 13478  	MOVSS (AX)(DI*4), X1
 13479  	MULSS X0, X1
 13480  	ADDSS (DX)(R8*4), X1
 13481  	MOVSS X1, (DX)(R8*4)
 13482  	ADDQ  CX, DI
 13483  	ADDQ  BX, R8
 13484  	SUBQ  $0x08, SI
 13485  
 13486  check_limit_unroll:
 13487  	CMPQ SI, $0x08
 13488  	JHI  loop_unroll
 13489  	JMP  check_limit
 13490  
 13491  loop:
 13492  	MOVSS (AX)(DI*4), X1
 13493  	MULSS X0, X1
 13494  	ADDSS (DX)(R8*4), X1
 13495  	MOVSS X1, (DX)(R8*4)
 13496  	DECQ  SI
 13497  	ADDQ  CX, DI
 13498  	ADDQ  BX, R8
 13499  
 13500  check_limit:
 13501  	CMPQ SI, $0x00
 13502  	JHI  loop
 13503  	RET
 13504  
 13505  // func AmdAxpyUnsafeX_V0A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13506  // Requires: SSE
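// Same kernel as the A11 variants above; only the never-executed padding
// before loop_unroll changes (four NOPs after PCALIGN instead of three).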
 13507  TEXT ·AmdAxpyUnsafeX_V0A12R8(SB), NOSPLIT, $0-48
 13508  	MOVSS alpha+0(FP), X0
 13509  	MOVQ  xs+8(FP), AX
 13510  	MOVQ  incx+16(FP), CX
 13511  	MOVQ  ys+24(FP), DX
 13512  	MOVQ  incy+32(FP), BX
 13513  	MOVQ  n+40(FP), SI
 13514  	XORQ  DI, DI
 13515  	XORQ  R8, R8
 13516  	JMP   check_limit_unroll
 13517  	PCALIGN $0x08
 13518  	NOP
 13519  	NOP
 13520  	NOP
 13521  	NOP
 13522  
 13523  loop_unroll:
 13524  	MOVSS (AX)(DI*4), X1
 13525  	MULSS X0, X1
 13526  	ADDSS (DX)(R8*4), X1
 13527  	MOVSS X1, (DX)(R8*4)
 13528  	ADDQ  CX, DI
 13529  	ADDQ  BX, R8
 13530  	MOVSS (AX)(DI*4), X1
 13531  	MULSS X0, X1
 13532  	ADDSS (DX)(R8*4), X1
 13533  	MOVSS X1, (DX)(R8*4)
 13534  	ADDQ  CX, DI
 13535  	ADDQ  BX, R8
 13536  	MOVSS (AX)(DI*4), X1
 13537  	MULSS X0, X1
 13538  	ADDSS (DX)(R8*4), X1
 13539  	MOVSS X1, (DX)(R8*4)
 13540  	ADDQ  CX, DI
 13541  	ADDQ  BX, R8
 13542  	MOVSS (AX)(DI*4), X1
 13543  	MULSS X0, X1
 13544  	ADDSS (DX)(R8*4), X1
 13545  	MOVSS X1, (DX)(R8*4)
 13546  	ADDQ  CX, DI
 13547  	ADDQ  BX, R8
 13548  	MOVSS (AX)(DI*4), X1
 13549  	MULSS X0, X1
 13550  	ADDSS (DX)(R8*4), X1
 13551  	MOVSS X1, (DX)(R8*4)
 13552  	ADDQ  CX, DI
 13553  	ADDQ  BX, R8
 13554  	MOVSS (AX)(DI*4), X1
 13555  	MULSS X0, X1
 13556  	ADDSS (DX)(R8*4), X1
 13557  	MOVSS X1, (DX)(R8*4)
 13558  	ADDQ  CX, DI
 13559  	ADDQ  BX, R8
 13560  	MOVSS (AX)(DI*4), X1
 13561  	MULSS X0, X1
 13562  	ADDSS (DX)(R8*4), X1
 13563  	MOVSS X1, (DX)(R8*4)
 13564  	ADDQ  CX, DI
 13565  	ADDQ  BX, R8
 13566  	MOVSS (AX)(DI*4), X1
 13567  	MULSS X0, X1
 13568  	ADDSS (DX)(R8*4), X1
 13569  	MOVSS X1, (DX)(R8*4)
 13570  	ADDQ  CX, DI
 13571  	ADDQ  BX, R8
 13572  	SUBQ  $0x08, SI
 13573  
 13574  check_limit_unroll:
 13575  	CMPQ SI, $0x08
 13576  	JHI  loop_unroll
 13577  	JMP  check_limit
 13578  
 13579  loop:
 13580  	MOVSS (AX)(DI*4), X1
 13581  	MULSS X0, X1
 13582  	ADDSS (DX)(R8*4), X1
 13583  	MOVSS X1, (DX)(R8*4)
 13584  	DECQ  SI
 13585  	ADDQ  CX, DI
 13586  	ADDQ  BX, R8
 13587  
 13588  check_limit:
 13589  	CMPQ SI, $0x00
 13590  	JHI  loop
 13591  	RET
 13592  
 13593  // func AmdAxpyUnsafeX_V1A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13594  // Requires: SSE
 13595  TEXT ·AmdAxpyUnsafeX_V1A12R8(SB), NOSPLIT, $0-48
 13596  	MOVSS alpha+0(FP), X0
 13597  	MOVQ  xs+8(FP), AX
 13598  	MOVQ  incx+16(FP), CX
 13599  	MOVQ  ys+24(FP), DX
 13600  	MOVQ  incy+32(FP), BX
 13601  	MOVQ  n+40(FP), SI
 13602  	XORQ  DI, DI
 13603  	XORQ  R8, R8
 13604  	JMP   check_limit_unroll
 13605  	PCALIGN $0x08
 13606  	NOP
 13607  	NOP
 13608  	NOP
 13609  	NOP
 13610  
 13611  loop_unroll:
 13612  	MOVSS (AX)(DI*4), X1
 13613  	MULSS X0, X1
 13614  	ADDSS (DX)(R8*4), X1
 13615  	MOVSS X1, (DX)(R8*4)
 13616  	ADDQ  CX, DI
 13617  	ADDQ  BX, R8
 13618  	MOVSS (AX)(DI*4), X1
 13619  	MULSS X0, X1
 13620  	ADDSS (DX)(R8*4), X1
 13621  	MOVSS X1, (DX)(R8*4)
 13622  	ADDQ  CX, DI
 13623  	ADDQ  BX, R8
 13624  	MOVSS (AX)(DI*4), X1
 13625  	MULSS X0, X1
 13626  	ADDSS (DX)(R8*4), X1
 13627  	MOVSS X1, (DX)(R8*4)
 13628  	ADDQ  CX, DI
 13629  	ADDQ  BX, R8
 13630  	MOVSS (AX)(DI*4), X1
 13631  	MULSS X0, X1
 13632  	ADDSS (DX)(R8*4), X1
 13633  	MOVSS X1, (DX)(R8*4)
 13634  	ADDQ  CX, DI
 13635  	ADDQ  BX, R8
 13636  	MOVSS (AX)(DI*4), X1
 13637  	MULSS X0, X1
 13638  	ADDSS (DX)(R8*4), X1
 13639  	MOVSS X1, (DX)(R8*4)
 13640  	ADDQ  CX, DI
 13641  	ADDQ  BX, R8
 13642  	MOVSS (AX)(DI*4), X1
 13643  	MULSS X0, X1
 13644  	ADDSS (DX)(R8*4), X1
 13645  	MOVSS X1, (DX)(R8*4)
 13646  	ADDQ  CX, DI
 13647  	ADDQ  BX, R8
 13648  	MOVSS (AX)(DI*4), X1
 13649  	MULSS X0, X1
 13650  	ADDSS (DX)(R8*4), X1
 13651  	MOVSS X1, (DX)(R8*4)
 13652  	ADDQ  CX, DI
 13653  	ADDQ  BX, R8
 13654  	MOVSS (AX)(DI*4), X1
 13655  	MULSS X0, X1
 13656  	ADDSS (DX)(R8*4), X1
 13657  	MOVSS X1, (DX)(R8*4)
 13658  	ADDQ  CX, DI
 13659  	ADDQ  BX, R8
 13660  	SUBQ  $0x08, SI
 13661  
 13662  check_limit_unroll:
 13663  	CMPQ SI, $0x08
 13664  	JHI  loop_unroll
 13665  	JMP  check_limit
 13666  
 13667  loop:
 13668  	MOVSS (AX)(DI*4), X1
 13669  	MULSS X0, X1
 13670  	ADDSS (DX)(R8*4), X1
 13671  	MOVSS X1, (DX)(R8*4)
 13672  	DECQ  SI
 13673  	ADDQ  CX, DI
 13674  	ADDQ  BX, R8
 13675  
 13676  check_limit:
 13677  	CMPQ SI, $0x00
 13678  	JHI  loop
 13679  	RET
 13680  
 13681  // func AmdAxpyUnsafeX_V2A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13682  // Requires: SSE
 13683  TEXT ·AmdAxpyUnsafeX_V2A12R8(SB), NOSPLIT, $0-48
 13684  	MOVSS alpha+0(FP), X0
 13685  	MOVQ  xs+8(FP), AX
 13686  	MOVQ  incx+16(FP), CX
 13687  	MOVQ  ys+24(FP), DX
 13688  	MOVQ  incy+32(FP), BX
 13689  	MOVQ  n+40(FP), SI
 13690  	XORQ  DI, DI
 13691  	XORQ  R8, R8
 13692  	JMP   check_limit_unroll
 13693  	PCALIGN $0x08
 13694  	NOP
 13695  	NOP
 13696  	NOP
 13697  	NOP
 13698  
 13699  loop_unroll:
 13700  	MOVSS (AX)(DI*4), X1
 13701  	MULSS X0, X1
 13702  	ADDSS (DX)(R8*4), X1
 13703  	MOVSS X1, (DX)(R8*4)
 13704  	ADDQ  CX, DI
 13705  	ADDQ  BX, R8
 13706  	MOVSS (AX)(DI*4), X1
 13707  	MULSS X0, X1
 13708  	ADDSS (DX)(R8*4), X1
 13709  	MOVSS X1, (DX)(R8*4)
 13710  	ADDQ  CX, DI
 13711  	ADDQ  BX, R8
 13712  	MOVSS (AX)(DI*4), X1
 13713  	MULSS X0, X1
 13714  	ADDSS (DX)(R8*4), X1
 13715  	MOVSS X1, (DX)(R8*4)
 13716  	ADDQ  CX, DI
 13717  	ADDQ  BX, R8
 13718  	MOVSS (AX)(DI*4), X1
 13719  	MULSS X0, X1
 13720  	ADDSS (DX)(R8*4), X1
 13721  	MOVSS X1, (DX)(R8*4)
 13722  	ADDQ  CX, DI
 13723  	ADDQ  BX, R8
 13724  	MOVSS (AX)(DI*4), X1
 13725  	MULSS X0, X1
 13726  	ADDSS (DX)(R8*4), X1
 13727  	MOVSS X1, (DX)(R8*4)
 13728  	ADDQ  CX, DI
 13729  	ADDQ  BX, R8
 13730  	MOVSS (AX)(DI*4), X1
 13731  	MULSS X0, X1
 13732  	ADDSS (DX)(R8*4), X1
 13733  	MOVSS X1, (DX)(R8*4)
 13734  	ADDQ  CX, DI
 13735  	ADDQ  BX, R8
 13736  	MOVSS (AX)(DI*4), X1
 13737  	MULSS X0, X1
 13738  	ADDSS (DX)(R8*4), X1
 13739  	MOVSS X1, (DX)(R8*4)
 13740  	ADDQ  CX, DI
 13741  	ADDQ  BX, R8
 13742  	MOVSS (AX)(DI*4), X1
 13743  	MULSS X0, X1
 13744  	ADDSS (DX)(R8*4), X1
 13745  	MOVSS X1, (DX)(R8*4)
 13746  	ADDQ  CX, DI
 13747  	ADDQ  BX, R8
 13748  	SUBQ  $0x08, SI
 13749  
 13750  check_limit_unroll:
 13751  	CMPQ SI, $0x08
 13752  	JHI  loop_unroll
 13753  	JMP  check_limit
 13754  
 13755  loop:
 13756  	MOVSS (AX)(DI*4), X1
 13757  	MULSS X0, X1
 13758  	ADDSS (DX)(R8*4), X1
 13759  	MOVSS X1, (DX)(R8*4)
 13760  	DECQ  SI
 13761  	ADDQ  CX, DI
 13762  	ADDQ  BX, R8
 13763  
 13764  check_limit:
 13765  	CMPQ SI, $0x00
 13766  	JHI  loop
 13767  	RET
 13768  
 13769  // func AmdAxpyUnsafeX_V3A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13770  // Requires: SSE
 13771  TEXT ·AmdAxpyUnsafeX_V3A12R8(SB), NOSPLIT, $0-48
 13772  	MOVSS alpha+0(FP), X0
 13773  	MOVQ  xs+8(FP), AX
 13774  	MOVQ  incx+16(FP), CX
 13775  	MOVQ  ys+24(FP), DX
 13776  	MOVQ  incy+32(FP), BX
 13777  	MOVQ  n+40(FP), SI
 13778  	XORQ  DI, DI
 13779  	XORQ  R8, R8
 13780  	JMP   check_limit_unroll
 13781  	PCALIGN $0x08
 13782  	NOP
 13783  	NOP
 13784  	NOP
 13785  	NOP
 13786  
 13787  loop_unroll:
 13788  	MOVSS (AX)(DI*4), X1
 13789  	MULSS X0, X1
 13790  	ADDSS (DX)(R8*4), X1
 13791  	MOVSS X1, (DX)(R8*4)
 13792  	ADDQ  CX, DI
 13793  	ADDQ  BX, R8
 13794  	MOVSS (AX)(DI*4), X1
 13795  	MULSS X0, X1
 13796  	ADDSS (DX)(R8*4), X1
 13797  	MOVSS X1, (DX)(R8*4)
 13798  	ADDQ  CX, DI
 13799  	ADDQ  BX, R8
 13800  	MOVSS (AX)(DI*4), X1
 13801  	MULSS X0, X1
 13802  	ADDSS (DX)(R8*4), X1
 13803  	MOVSS X1, (DX)(R8*4)
 13804  	ADDQ  CX, DI
 13805  	ADDQ  BX, R8
 13806  	MOVSS (AX)(DI*4), X1
 13807  	MULSS X0, X1
 13808  	ADDSS (DX)(R8*4), X1
 13809  	MOVSS X1, (DX)(R8*4)
 13810  	ADDQ  CX, DI
 13811  	ADDQ  BX, R8
 13812  	MOVSS (AX)(DI*4), X1
 13813  	MULSS X0, X1
 13814  	ADDSS (DX)(R8*4), X1
 13815  	MOVSS X1, (DX)(R8*4)
 13816  	ADDQ  CX, DI
 13817  	ADDQ  BX, R8
 13818  	MOVSS (AX)(DI*4), X1
 13819  	MULSS X0, X1
 13820  	ADDSS (DX)(R8*4), X1
 13821  	MOVSS X1, (DX)(R8*4)
 13822  	ADDQ  CX, DI
 13823  	ADDQ  BX, R8
 13824  	MOVSS (AX)(DI*4), X1
 13825  	MULSS X0, X1
 13826  	ADDSS (DX)(R8*4), X1
 13827  	MOVSS X1, (DX)(R8*4)
 13828  	ADDQ  CX, DI
 13829  	ADDQ  BX, R8
 13830  	MOVSS (AX)(DI*4), X1
 13831  	MULSS X0, X1
 13832  	ADDSS (DX)(R8*4), X1
 13833  	MOVSS X1, (DX)(R8*4)
 13834  	ADDQ  CX, DI
 13835  	ADDQ  BX, R8
 13836  	SUBQ  $0x08, SI
 13837  
 13838  check_limit_unroll:
 13839  	CMPQ SI, $0x08
 13840  	JHI  loop_unroll
 13841  	JMP  check_limit
 13842  
 13843  loop:
 13844  	MOVSS (AX)(DI*4), X1
 13845  	MULSS X0, X1
 13846  	ADDSS (DX)(R8*4), X1
 13847  	MOVSS X1, (DX)(R8*4)
 13848  	DECQ  SI
 13849  	ADDQ  CX, DI
 13850  	ADDQ  BX, R8
 13851  
 13852  check_limit:
 13853  	CMPQ SI, $0x00
 13854  	JHI  loop
 13855  	RET
 13856  
 13857  // func AmdAxpyUnsafeX_V4A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13858  // Requires: SSE
 13859  TEXT ·AmdAxpyUnsafeX_V4A12R8(SB), NOSPLIT, $0-48
 13860  	MOVSS alpha+0(FP), X0
 13861  	MOVQ  xs+8(FP), AX
 13862  	MOVQ  incx+16(FP), CX
 13863  	MOVQ  ys+24(FP), DX
 13864  	MOVQ  incy+32(FP), BX
 13865  	MOVQ  n+40(FP), SI
 13866  	XORQ  DI, DI
 13867  	XORQ  R8, R8
 13868  	JMP   check_limit_unroll
 13869  	PCALIGN $0x08
 13870  	NOP
 13871  	NOP
 13872  	NOP
 13873  	NOP
 13874  
 13875  loop_unroll:
 13876  	MOVSS (AX)(DI*4), X1
 13877  	MULSS X0, X1
 13878  	ADDSS (DX)(R8*4), X1
 13879  	MOVSS X1, (DX)(R8*4)
 13880  	ADDQ  CX, DI
 13881  	ADDQ  BX, R8
 13882  	MOVSS (AX)(DI*4), X1
 13883  	MULSS X0, X1
 13884  	ADDSS (DX)(R8*4), X1
 13885  	MOVSS X1, (DX)(R8*4)
 13886  	ADDQ  CX, DI
 13887  	ADDQ  BX, R8
 13888  	MOVSS (AX)(DI*4), X1
 13889  	MULSS X0, X1
 13890  	ADDSS (DX)(R8*4), X1
 13891  	MOVSS X1, (DX)(R8*4)
 13892  	ADDQ  CX, DI
 13893  	ADDQ  BX, R8
 13894  	MOVSS (AX)(DI*4), X1
 13895  	MULSS X0, X1
 13896  	ADDSS (DX)(R8*4), X1
 13897  	MOVSS X1, (DX)(R8*4)
 13898  	ADDQ  CX, DI
 13899  	ADDQ  BX, R8
 13900  	MOVSS (AX)(DI*4), X1
 13901  	MULSS X0, X1
 13902  	ADDSS (DX)(R8*4), X1
 13903  	MOVSS X1, (DX)(R8*4)
 13904  	ADDQ  CX, DI
 13905  	ADDQ  BX, R8
 13906  	MOVSS (AX)(DI*4), X1
 13907  	MULSS X0, X1
 13908  	ADDSS (DX)(R8*4), X1
 13909  	MOVSS X1, (DX)(R8*4)
 13910  	ADDQ  CX, DI
 13911  	ADDQ  BX, R8
 13912  	MOVSS (AX)(DI*4), X1
 13913  	MULSS X0, X1
 13914  	ADDSS (DX)(R8*4), X1
 13915  	MOVSS X1, (DX)(R8*4)
 13916  	ADDQ  CX, DI
 13917  	ADDQ  BX, R8
 13918  	MOVSS (AX)(DI*4), X1
 13919  	MULSS X0, X1
 13920  	ADDSS (DX)(R8*4), X1
 13921  	MOVSS X1, (DX)(R8*4)
 13922  	ADDQ  CX, DI
 13923  	ADDQ  BX, R8
 13924  	SUBQ  $0x08, SI
 13925  
 13926  check_limit_unroll:
 13927  	CMPQ SI, $0x08
 13928  	JHI  loop_unroll
 13929  	JMP  check_limit
 13930  
 13931  loop:
 13932  	MOVSS (AX)(DI*4), X1
 13933  	MULSS X0, X1
 13934  	ADDSS (DX)(R8*4), X1
 13935  	MOVSS X1, (DX)(R8*4)
 13936  	DECQ  SI
 13937  	ADDQ  CX, DI
 13938  	ADDQ  BX, R8
 13939  
 13940  check_limit:
 13941  	CMPQ SI, $0x00
 13942  	JHI  loop
 13943  	RET
 13944  
 13945  // func AmdAxpyUnsafeX_V5A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 13946  // Requires: SSE
 13947  TEXT ·AmdAxpyUnsafeX_V5A12R8(SB), NOSPLIT, $0-48
 13948  	MOVSS alpha+0(FP), X0
 13949  	MOVQ  xs+8(FP), AX
 13950  	MOVQ  incx+16(FP), CX
 13951  	MOVQ  ys+24(FP), DX
 13952  	MOVQ  incy+32(FP), BX
 13953  	MOVQ  n+40(FP), SI
 13954  	XORQ  DI, DI
 13955  	XORQ  R8, R8
 13956  	JMP   check_limit_unroll
 13957  	PCALIGN $0x08
 13958  	NOP
 13959  	NOP
 13960  	NOP
 13961  	NOP
 13962  
 13963  loop_unroll:
 13964  	MOVSS (AX)(DI*4), X1
 13965  	MULSS X0, X1
 13966  	ADDSS (DX)(R8*4), X1
 13967  	MOVSS X1, (DX)(R8*4)
 13968  	ADDQ  CX, DI
 13969  	ADDQ  BX, R8
 13970  	MOVSS (AX)(DI*4), X1
 13971  	MULSS X0, X1
 13972  	ADDSS (DX)(R8*4), X1
 13973  	MOVSS X1, (DX)(R8*4)
 13974  	ADDQ  CX, DI
 13975  	ADDQ  BX, R8
 13976  	MOVSS (AX)(DI*4), X1
 13977  	MULSS X0, X1
 13978  	ADDSS (DX)(R8*4), X1
 13979  	MOVSS X1, (DX)(R8*4)
 13980  	ADDQ  CX, DI
 13981  	ADDQ  BX, R8
 13982  	MOVSS (AX)(DI*4), X1
 13983  	MULSS X0, X1
 13984  	ADDSS (DX)(R8*4), X1
 13985  	MOVSS X1, (DX)(R8*4)
 13986  	ADDQ  CX, DI
 13987  	ADDQ  BX, R8
 13988  	MOVSS (AX)(DI*4), X1
 13989  	MULSS X0, X1
 13990  	ADDSS (DX)(R8*4), X1
 13991  	MOVSS X1, (DX)(R8*4)
 13992  	ADDQ  CX, DI
 13993  	ADDQ  BX, R8
 13994  	MOVSS (AX)(DI*4), X1
 13995  	MULSS X0, X1
 13996  	ADDSS (DX)(R8*4), X1
 13997  	MOVSS X1, (DX)(R8*4)
 13998  	ADDQ  CX, DI
 13999  	ADDQ  BX, R8
 14000  	MOVSS (AX)(DI*4), X1
 14001  	MULSS X0, X1
 14002  	ADDSS (DX)(R8*4), X1
 14003  	MOVSS X1, (DX)(R8*4)
 14004  	ADDQ  CX, DI
 14005  	ADDQ  BX, R8
 14006  	MOVSS (AX)(DI*4), X1
 14007  	MULSS X0, X1
 14008  	ADDSS (DX)(R8*4), X1
 14009  	MOVSS X1, (DX)(R8*4)
 14010  	ADDQ  CX, DI
 14011  	ADDQ  BX, R8
 14012  	SUBQ  $0x08, SI
 14013  
 14014  check_limit_unroll:
 14015  	CMPQ SI, $0x08
 14016  	JHI  loop_unroll
 14017  	JMP  check_limit
 14018  
 14019  loop:
 14020  	MOVSS (AX)(DI*4), X1
 14021  	MULSS X0, X1
 14022  	ADDSS (DX)(R8*4), X1
 14023  	MOVSS X1, (DX)(R8*4)
 14024  	DECQ  SI
 14025  	ADDQ  CX, DI
 14026  	ADDQ  BX, R8
 14027  
 14028  check_limit:
 14029  	CMPQ SI, $0x00
 14030  	JHI  loop
 14031  	RET
 14032  
 14033  // func AmdAxpyUnsafeX_V0A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14034  // Requires: SSE
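// Same kernel again; the A13 variants pad loop_unroll with five NOPs after
// PCALIGN.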
 14035  TEXT ·AmdAxpyUnsafeX_V0A13R8(SB), NOSPLIT, $0-48
 14036  	MOVSS alpha+0(FP), X0
 14037  	MOVQ  xs+8(FP), AX
 14038  	MOVQ  incx+16(FP), CX
 14039  	MOVQ  ys+24(FP), DX
 14040  	MOVQ  incy+32(FP), BX
 14041  	MOVQ  n+40(FP), SI
 14042  	XORQ  DI, DI
 14043  	XORQ  R8, R8
 14044  	JMP   check_limit_unroll
 14045  	PCALIGN $0x08
 14046  	NOP
 14047  	NOP
 14048  	NOP
 14049  	NOP
 14050  	NOP
 14051  
 14052  loop_unroll:
 14053  	MOVSS (AX)(DI*4), X1
 14054  	MULSS X0, X1
 14055  	ADDSS (DX)(R8*4), X1
 14056  	MOVSS X1, (DX)(R8*4)
 14057  	ADDQ  CX, DI
 14058  	ADDQ  BX, R8
 14059  	MOVSS (AX)(DI*4), X1
 14060  	MULSS X0, X1
 14061  	ADDSS (DX)(R8*4), X1
 14062  	MOVSS X1, (DX)(R8*4)
 14063  	ADDQ  CX, DI
 14064  	ADDQ  BX, R8
 14065  	MOVSS (AX)(DI*4), X1
 14066  	MULSS X0, X1
 14067  	ADDSS (DX)(R8*4), X1
 14068  	MOVSS X1, (DX)(R8*4)
 14069  	ADDQ  CX, DI
 14070  	ADDQ  BX, R8
 14071  	MOVSS (AX)(DI*4), X1
 14072  	MULSS X0, X1
 14073  	ADDSS (DX)(R8*4), X1
 14074  	MOVSS X1, (DX)(R8*4)
 14075  	ADDQ  CX, DI
 14076  	ADDQ  BX, R8
 14077  	MOVSS (AX)(DI*4), X1
 14078  	MULSS X0, X1
 14079  	ADDSS (DX)(R8*4), X1
 14080  	MOVSS X1, (DX)(R8*4)
 14081  	ADDQ  CX, DI
 14082  	ADDQ  BX, R8
 14083  	MOVSS (AX)(DI*4), X1
 14084  	MULSS X0, X1
 14085  	ADDSS (DX)(R8*4), X1
 14086  	MOVSS X1, (DX)(R8*4)
 14087  	ADDQ  CX, DI
 14088  	ADDQ  BX, R8
 14089  	MOVSS (AX)(DI*4), X1
 14090  	MULSS X0, X1
 14091  	ADDSS (DX)(R8*4), X1
 14092  	MOVSS X1, (DX)(R8*4)
 14093  	ADDQ  CX, DI
 14094  	ADDQ  BX, R8
 14095  	MOVSS (AX)(DI*4), X1
 14096  	MULSS X0, X1
 14097  	ADDSS (DX)(R8*4), X1
 14098  	MOVSS X1, (DX)(R8*4)
 14099  	ADDQ  CX, DI
 14100  	ADDQ  BX, R8
 14101  	SUBQ  $0x08, SI
 14102  
 14103  check_limit_unroll:
 14104  	CMPQ SI, $0x08
 14105  	JHI  loop_unroll
 14106  	JMP  check_limit
 14107  
 14108  loop:
 14109  	MOVSS (AX)(DI*4), X1
 14110  	MULSS X0, X1
 14111  	ADDSS (DX)(R8*4), X1
 14112  	MOVSS X1, (DX)(R8*4)
 14113  	DECQ  SI
 14114  	ADDQ  CX, DI
 14115  	ADDQ  BX, R8
 14116  
 14117  check_limit:
 14118  	CMPQ SI, $0x00
 14119  	JHI  loop
 14120  	RET
 14121  
 14122  // func AmdAxpyUnsafeX_V1A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14123  // Requires: SSE
 14124  TEXT ·AmdAxpyUnsafeX_V1A13R8(SB), NOSPLIT, $0-48
 14125  	MOVSS alpha+0(FP), X0
 14126  	MOVQ  xs+8(FP), AX
 14127  	MOVQ  incx+16(FP), CX
 14128  	MOVQ  ys+24(FP), DX
 14129  	MOVQ  incy+32(FP), BX
 14130  	MOVQ  n+40(FP), SI
 14131  	XORQ  DI, DI
 14132  	XORQ  R8, R8
 14133  	JMP   check_limit_unroll
 14134  	PCALIGN $0x08
 14135  	NOP
 14136  	NOP
 14137  	NOP
 14138  	NOP
 14139  	NOP
 14140  
 14141  loop_unroll:
 14142  	MOVSS (AX)(DI*4), X1
 14143  	MULSS X0, X1
 14144  	ADDSS (DX)(R8*4), X1
 14145  	MOVSS X1, (DX)(R8*4)
 14146  	ADDQ  CX, DI
 14147  	ADDQ  BX, R8
 14148  	MOVSS (AX)(DI*4), X1
 14149  	MULSS X0, X1
 14150  	ADDSS (DX)(R8*4), X1
 14151  	MOVSS X1, (DX)(R8*4)
 14152  	ADDQ  CX, DI
 14153  	ADDQ  BX, R8
 14154  	MOVSS (AX)(DI*4), X1
 14155  	MULSS X0, X1
 14156  	ADDSS (DX)(R8*4), X1
 14157  	MOVSS X1, (DX)(R8*4)
 14158  	ADDQ  CX, DI
 14159  	ADDQ  BX, R8
 14160  	MOVSS (AX)(DI*4), X1
 14161  	MULSS X0, X1
 14162  	ADDSS (DX)(R8*4), X1
 14163  	MOVSS X1, (DX)(R8*4)
 14164  	ADDQ  CX, DI
 14165  	ADDQ  BX, R8
 14166  	MOVSS (AX)(DI*4), X1
 14167  	MULSS X0, X1
 14168  	ADDSS (DX)(R8*4), X1
 14169  	MOVSS X1, (DX)(R8*4)
 14170  	ADDQ  CX, DI
 14171  	ADDQ  BX, R8
 14172  	MOVSS (AX)(DI*4), X1
 14173  	MULSS X0, X1
 14174  	ADDSS (DX)(R8*4), X1
 14175  	MOVSS X1, (DX)(R8*4)
 14176  	ADDQ  CX, DI
 14177  	ADDQ  BX, R8
 14178  	MOVSS (AX)(DI*4), X1
 14179  	MULSS X0, X1
 14180  	ADDSS (DX)(R8*4), X1
 14181  	MOVSS X1, (DX)(R8*4)
 14182  	ADDQ  CX, DI
 14183  	ADDQ  BX, R8
 14184  	MOVSS (AX)(DI*4), X1
 14185  	MULSS X0, X1
 14186  	ADDSS (DX)(R8*4), X1
 14187  	MOVSS X1, (DX)(R8*4)
 14188  	ADDQ  CX, DI
 14189  	ADDQ  BX, R8
 14190  	SUBQ  $0x08, SI
 14191  
 14192  check_limit_unroll:
 14193  	CMPQ SI, $0x08
 14194  	JHI  loop_unroll
 14195  	JMP  check_limit
 14196  
 14197  loop:
 14198  	MOVSS (AX)(DI*4), X1
 14199  	MULSS X0, X1
 14200  	ADDSS (DX)(R8*4), X1
 14201  	MOVSS X1, (DX)(R8*4)
 14202  	DECQ  SI
 14203  	ADDQ  CX, DI
 14204  	ADDQ  BX, R8
 14205  
 14206  check_limit:
 14207  	CMPQ SI, $0x00
 14208  	JHI  loop
 14209  	RET
 14210  
 14211  // func AmdAxpyUnsafeX_V2A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14212  // Requires: SSE
 14213  TEXT ·AmdAxpyUnsafeX_V2A13R8(SB), NOSPLIT, $0-48
 14214  	MOVSS alpha+0(FP), X0
 14215  	MOVQ  xs+8(FP), AX
 14216  	MOVQ  incx+16(FP), CX
 14217  	MOVQ  ys+24(FP), DX
 14218  	MOVQ  incy+32(FP), BX
 14219  	MOVQ  n+40(FP), SI
 14220  	XORQ  DI, DI
 14221  	XORQ  R8, R8
 14222  	JMP   check_limit_unroll
 14223  	PCALIGN $0x08
 14224  	NOP
 14225  	NOP
 14226  	NOP
 14227  	NOP
 14228  	NOP
 14229  
 14230  loop_unroll:
 14231  	MOVSS (AX)(DI*4), X1
 14232  	MULSS X0, X1
 14233  	ADDSS (DX)(R8*4), X1
 14234  	MOVSS X1, (DX)(R8*4)
 14235  	ADDQ  CX, DI
 14236  	ADDQ  BX, R8
 14237  	MOVSS (AX)(DI*4), X1
 14238  	MULSS X0, X1
 14239  	ADDSS (DX)(R8*4), X1
 14240  	MOVSS X1, (DX)(R8*4)
 14241  	ADDQ  CX, DI
 14242  	ADDQ  BX, R8
 14243  	MOVSS (AX)(DI*4), X1
 14244  	MULSS X0, X1
 14245  	ADDSS (DX)(R8*4), X1
 14246  	MOVSS X1, (DX)(R8*4)
 14247  	ADDQ  CX, DI
 14248  	ADDQ  BX, R8
 14249  	MOVSS (AX)(DI*4), X1
 14250  	MULSS X0, X1
 14251  	ADDSS (DX)(R8*4), X1
 14252  	MOVSS X1, (DX)(R8*4)
 14253  	ADDQ  CX, DI
 14254  	ADDQ  BX, R8
 14255  	MOVSS (AX)(DI*4), X1
 14256  	MULSS X0, X1
 14257  	ADDSS (DX)(R8*4), X1
 14258  	MOVSS X1, (DX)(R8*4)
 14259  	ADDQ  CX, DI
 14260  	ADDQ  BX, R8
 14261  	MOVSS (AX)(DI*4), X1
 14262  	MULSS X0, X1
 14263  	ADDSS (DX)(R8*4), X1
 14264  	MOVSS X1, (DX)(R8*4)
 14265  	ADDQ  CX, DI
 14266  	ADDQ  BX, R8
 14267  	MOVSS (AX)(DI*4), X1
 14268  	MULSS X0, X1
 14269  	ADDSS (DX)(R8*4), X1
 14270  	MOVSS X1, (DX)(R8*4)
 14271  	ADDQ  CX, DI
 14272  	ADDQ  BX, R8
 14273  	MOVSS (AX)(DI*4), X1
 14274  	MULSS X0, X1
 14275  	ADDSS (DX)(R8*4), X1
 14276  	MOVSS X1, (DX)(R8*4)
 14277  	ADDQ  CX, DI
 14278  	ADDQ  BX, R8
 14279  	SUBQ  $0x08, SI
 14280  
 14281  check_limit_unroll:
 14282  	CMPQ SI, $0x08
 14283  	JHI  loop_unroll
 14284  	JMP  check_limit
 14285  
 14286  loop:
 14287  	MOVSS (AX)(DI*4), X1
 14288  	MULSS X0, X1
 14289  	ADDSS (DX)(R8*4), X1
 14290  	MOVSS X1, (DX)(R8*4)
 14291  	DECQ  SI
 14292  	ADDQ  CX, DI
 14293  	ADDQ  BX, R8
 14294  
 14295  check_limit:
 14296  	CMPQ SI, $0x00
 14297  	JHI  loop
 14298  	RET
 14299  
 14300  // func AmdAxpyUnsafeX_V3A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14301  // Requires: SSE
 14302  TEXT ·AmdAxpyUnsafeX_V3A13R8(SB), NOSPLIT, $0-48
 14303  	MOVSS alpha+0(FP), X0
 14304  	MOVQ  xs+8(FP), AX
 14305  	MOVQ  incx+16(FP), CX
 14306  	MOVQ  ys+24(FP), DX
 14307  	MOVQ  incy+32(FP), BX
 14308  	MOVQ  n+40(FP), SI
 14309  	XORQ  DI, DI
 14310  	XORQ  R8, R8
 14311  	JMP   check_limit_unroll
 14312  	PCALIGN $0x08
 14313  	NOP
 14314  	NOP
 14315  	NOP
 14316  	NOP
 14317  	NOP
 14318  
 14319  loop_unroll:
 14320  	MOVSS (AX)(DI*4), X1
 14321  	MULSS X0, X1
 14322  	ADDSS (DX)(R8*4), X1
 14323  	MOVSS X1, (DX)(R8*4)
 14324  	ADDQ  CX, DI
 14325  	ADDQ  BX, R8
 14326  	MOVSS (AX)(DI*4), X1
 14327  	MULSS X0, X1
 14328  	ADDSS (DX)(R8*4), X1
 14329  	MOVSS X1, (DX)(R8*4)
 14330  	ADDQ  CX, DI
 14331  	ADDQ  BX, R8
 14332  	MOVSS (AX)(DI*4), X1
 14333  	MULSS X0, X1
 14334  	ADDSS (DX)(R8*4), X1
 14335  	MOVSS X1, (DX)(R8*4)
 14336  	ADDQ  CX, DI
 14337  	ADDQ  BX, R8
 14338  	MOVSS (AX)(DI*4), X1
 14339  	MULSS X0, X1
 14340  	ADDSS (DX)(R8*4), X1
 14341  	MOVSS X1, (DX)(R8*4)
 14342  	ADDQ  CX, DI
 14343  	ADDQ  BX, R8
 14344  	MOVSS (AX)(DI*4), X1
 14345  	MULSS X0, X1
 14346  	ADDSS (DX)(R8*4), X1
 14347  	MOVSS X1, (DX)(R8*4)
 14348  	ADDQ  CX, DI
 14349  	ADDQ  BX, R8
 14350  	MOVSS (AX)(DI*4), X1
 14351  	MULSS X0, X1
 14352  	ADDSS (DX)(R8*4), X1
 14353  	MOVSS X1, (DX)(R8*4)
 14354  	ADDQ  CX, DI
 14355  	ADDQ  BX, R8
 14356  	MOVSS (AX)(DI*4), X1
 14357  	MULSS X0, X1
 14358  	ADDSS (DX)(R8*4), X1
 14359  	MOVSS X1, (DX)(R8*4)
 14360  	ADDQ  CX, DI
 14361  	ADDQ  BX, R8
 14362  	MOVSS (AX)(DI*4), X1
 14363  	MULSS X0, X1
 14364  	ADDSS (DX)(R8*4), X1
 14365  	MOVSS X1, (DX)(R8*4)
 14366  	ADDQ  CX, DI
 14367  	ADDQ  BX, R8
 14368  	SUBQ  $0x08, SI
 14369  
 14370  check_limit_unroll:
 14371  	CMPQ SI, $0x08
 14372  	JHI  loop_unroll
 14373  	JMP  check_limit
 14374  
 14375  loop:
 14376  	MOVSS (AX)(DI*4), X1
 14377  	MULSS X0, X1
 14378  	ADDSS (DX)(R8*4), X1
 14379  	MOVSS X1, (DX)(R8*4)
 14380  	DECQ  SI
 14381  	ADDQ  CX, DI
 14382  	ADDQ  BX, R8
 14383  
 14384  check_limit:
 14385  	CMPQ SI, $0x00
 14386  	JHI  loop
 14387  	RET
 14388  
 14389  // func AmdAxpyUnsafeX_V4A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14390  // Requires: SSE
 14391  TEXT ·AmdAxpyUnsafeX_V4A13R8(SB), NOSPLIT, $0-48
 14392  	MOVSS alpha+0(FP), X0
 14393  	MOVQ  xs+8(FP), AX
 14394  	MOVQ  incx+16(FP), CX
 14395  	MOVQ  ys+24(FP), DX
 14396  	MOVQ  incy+32(FP), BX
 14397  	MOVQ  n+40(FP), SI
 14398  	XORQ  DI, DI
 14399  	XORQ  R8, R8
 14400  	JMP   check_limit_unroll
 14401  	PCALIGN $0x08
 14402  	NOP
 14403  	NOP
 14404  	NOP
 14405  	NOP
 14406  	NOP
 14407  
 14408  loop_unroll:
 14409  	MOVSS (AX)(DI*4), X1
 14410  	MULSS X0, X1
 14411  	ADDSS (DX)(R8*4), X1
 14412  	MOVSS X1, (DX)(R8*4)
 14413  	ADDQ  CX, DI
 14414  	ADDQ  BX, R8
 14415  	MOVSS (AX)(DI*4), X1
 14416  	MULSS X0, X1
 14417  	ADDSS (DX)(R8*4), X1
 14418  	MOVSS X1, (DX)(R8*4)
 14419  	ADDQ  CX, DI
 14420  	ADDQ  BX, R8
 14421  	MOVSS (AX)(DI*4), X1
 14422  	MULSS X0, X1
 14423  	ADDSS (DX)(R8*4), X1
 14424  	MOVSS X1, (DX)(R8*4)
 14425  	ADDQ  CX, DI
 14426  	ADDQ  BX, R8
 14427  	MOVSS (AX)(DI*4), X1
 14428  	MULSS X0, X1
 14429  	ADDSS (DX)(R8*4), X1
 14430  	MOVSS X1, (DX)(R8*4)
 14431  	ADDQ  CX, DI
 14432  	ADDQ  BX, R8
 14433  	MOVSS (AX)(DI*4), X1
 14434  	MULSS X0, X1
 14435  	ADDSS (DX)(R8*4), X1
 14436  	MOVSS X1, (DX)(R8*4)
 14437  	ADDQ  CX, DI
 14438  	ADDQ  BX, R8
 14439  	MOVSS (AX)(DI*4), X1
 14440  	MULSS X0, X1
 14441  	ADDSS (DX)(R8*4), X1
 14442  	MOVSS X1, (DX)(R8*4)
 14443  	ADDQ  CX, DI
 14444  	ADDQ  BX, R8
 14445  	MOVSS (AX)(DI*4), X1
 14446  	MULSS X0, X1
 14447  	ADDSS (DX)(R8*4), X1
 14448  	MOVSS X1, (DX)(R8*4)
 14449  	ADDQ  CX, DI
 14450  	ADDQ  BX, R8
 14451  	MOVSS (AX)(DI*4), X1
 14452  	MULSS X0, X1
 14453  	ADDSS (DX)(R8*4), X1
 14454  	MOVSS X1, (DX)(R8*4)
 14455  	ADDQ  CX, DI
 14456  	ADDQ  BX, R8
 14457  	SUBQ  $0x08, SI
 14458  
 14459  check_limit_unroll:
 14460  	CMPQ SI, $0x08
 14461  	JHI  loop_unroll
 14462  	JMP  check_limit
 14463  
 14464  loop:
 14465  	MOVSS (AX)(DI*4), X1
 14466  	MULSS X0, X1
 14467  	ADDSS (DX)(R8*4), X1
 14468  	MOVSS X1, (DX)(R8*4)
 14469  	DECQ  SI
 14470  	ADDQ  CX, DI
 14471  	ADDQ  BX, R8
 14472  
 14473  check_limit:
 14474  	CMPQ SI, $0x00
 14475  	JHI  loop
 14476  	RET
 14477  
 14478  // func AmdAxpyUnsafeX_V5A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14479  // Requires: SSE
 14480  TEXT ·AmdAxpyUnsafeX_V5A13R8(SB), NOSPLIT, $0-48
 14481  	MOVSS alpha+0(FP), X0
 14482  	MOVQ  xs+8(FP), AX
 14483  	MOVQ  incx+16(FP), CX
 14484  	MOVQ  ys+24(FP), DX
 14485  	MOVQ  incy+32(FP), BX
 14486  	MOVQ  n+40(FP), SI
 14487  	XORQ  DI, DI
 14488  	XORQ  R8, R8
 14489  	JMP   check_limit_unroll
 14490  	PCALIGN $0x08
 14491  	NOP
 14492  	NOP
 14493  	NOP
 14494  	NOP
 14495  	NOP
 14496  
 14497  loop_unroll:
 14498  	MOVSS (AX)(DI*4), X1
 14499  	MULSS X0, X1
 14500  	ADDSS (DX)(R8*4), X1
 14501  	MOVSS X1, (DX)(R8*4)
 14502  	ADDQ  CX, DI
 14503  	ADDQ  BX, R8
 14504  	MOVSS (AX)(DI*4), X1
 14505  	MULSS X0, X1
 14506  	ADDSS (DX)(R8*4), X1
 14507  	MOVSS X1, (DX)(R8*4)
 14508  	ADDQ  CX, DI
 14509  	ADDQ  BX, R8
 14510  	MOVSS (AX)(DI*4), X1
 14511  	MULSS X0, X1
 14512  	ADDSS (DX)(R8*4), X1
 14513  	MOVSS X1, (DX)(R8*4)
 14514  	ADDQ  CX, DI
 14515  	ADDQ  BX, R8
 14516  	MOVSS (AX)(DI*4), X1
 14517  	MULSS X0, X1
 14518  	ADDSS (DX)(R8*4), X1
 14519  	MOVSS X1, (DX)(R8*4)
 14520  	ADDQ  CX, DI
 14521  	ADDQ  BX, R8
 14522  	MOVSS (AX)(DI*4), X1
 14523  	MULSS X0, X1
 14524  	ADDSS (DX)(R8*4), X1
 14525  	MOVSS X1, (DX)(R8*4)
 14526  	ADDQ  CX, DI
 14527  	ADDQ  BX, R8
 14528  	MOVSS (AX)(DI*4), X1
 14529  	MULSS X0, X1
 14530  	ADDSS (DX)(R8*4), X1
 14531  	MOVSS X1, (DX)(R8*4)
 14532  	ADDQ  CX, DI
 14533  	ADDQ  BX, R8
 14534  	MOVSS (AX)(DI*4), X1
 14535  	MULSS X0, X1
 14536  	ADDSS (DX)(R8*4), X1
 14537  	MOVSS X1, (DX)(R8*4)
 14538  	ADDQ  CX, DI
 14539  	ADDQ  BX, R8
 14540  	MOVSS (AX)(DI*4), X1
 14541  	MULSS X0, X1
 14542  	ADDSS (DX)(R8*4), X1
 14543  	MOVSS X1, (DX)(R8*4)
 14544  	ADDQ  CX, DI
 14545  	ADDQ  BX, R8
 14546  	SUBQ  $0x08, SI
 14547  
 14548  check_limit_unroll:
 14549  	CMPQ SI, $0x08
 14550  	JHI  loop_unroll
 14551  	JMP  check_limit
 14552  
 14553  loop:
 14554  	MOVSS (AX)(DI*4), X1
 14555  	MULSS X0, X1
 14556  	ADDSS (DX)(R8*4), X1
 14557  	MOVSS X1, (DX)(R8*4)
 14558  	DECQ  SI
 14559  	ADDQ  CX, DI
 14560  	ADDQ  BX, R8
 14561  
 14562  check_limit:
 14563  	CMPQ SI, $0x00
 14564  	JHI  loop
 14565  	RET
 14566  
 14567  // func AmdAxpyUnsafeX_V0A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14568  // Requires: SSE
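// Same kernel again; the A14 variants pad loop_unroll with six NOPs after
// PCALIGN.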
 14569  TEXT ·AmdAxpyUnsafeX_V0A14R8(SB), NOSPLIT, $0-48
 14570  	MOVSS alpha+0(FP), X0
 14571  	MOVQ  xs+8(FP), AX
 14572  	MOVQ  incx+16(FP), CX
 14573  	MOVQ  ys+24(FP), DX
 14574  	MOVQ  incy+32(FP), BX
 14575  	MOVQ  n+40(FP), SI
 14576  	XORQ  DI, DI
 14577  	XORQ  R8, R8
 14578  	JMP   check_limit_unroll
 14579  	PCALIGN $0x08
 14580  	NOP
 14581  	NOP
 14582  	NOP
 14583  	NOP
 14584  	NOP
 14585  	NOP
 14586  
 14587  loop_unroll:
 14588  	MOVSS (AX)(DI*4), X1
 14589  	MULSS X0, X1
 14590  	ADDSS (DX)(R8*4), X1
 14591  	MOVSS X1, (DX)(R8*4)
 14592  	ADDQ  CX, DI
 14593  	ADDQ  BX, R8
 14594  	MOVSS (AX)(DI*4), X1
 14595  	MULSS X0, X1
 14596  	ADDSS (DX)(R8*4), X1
 14597  	MOVSS X1, (DX)(R8*4)
 14598  	ADDQ  CX, DI
 14599  	ADDQ  BX, R8
 14600  	MOVSS (AX)(DI*4), X1
 14601  	MULSS X0, X1
 14602  	ADDSS (DX)(R8*4), X1
 14603  	MOVSS X1, (DX)(R8*4)
 14604  	ADDQ  CX, DI
 14605  	ADDQ  BX, R8
 14606  	MOVSS (AX)(DI*4), X1
 14607  	MULSS X0, X1
 14608  	ADDSS (DX)(R8*4), X1
 14609  	MOVSS X1, (DX)(R8*4)
 14610  	ADDQ  CX, DI
 14611  	ADDQ  BX, R8
 14612  	MOVSS (AX)(DI*4), X1
 14613  	MULSS X0, X1
 14614  	ADDSS (DX)(R8*4), X1
 14615  	MOVSS X1, (DX)(R8*4)
 14616  	ADDQ  CX, DI
 14617  	ADDQ  BX, R8
 14618  	MOVSS (AX)(DI*4), X1
 14619  	MULSS X0, X1
 14620  	ADDSS (DX)(R8*4), X1
 14621  	MOVSS X1, (DX)(R8*4)
 14622  	ADDQ  CX, DI
 14623  	ADDQ  BX, R8
 14624  	MOVSS (AX)(DI*4), X1
 14625  	MULSS X0, X1
 14626  	ADDSS (DX)(R8*4), X1
 14627  	MOVSS X1, (DX)(R8*4)
 14628  	ADDQ  CX, DI
 14629  	ADDQ  BX, R8
 14630  	MOVSS (AX)(DI*4), X1
 14631  	MULSS X0, X1
 14632  	ADDSS (DX)(R8*4), X1
 14633  	MOVSS X1, (DX)(R8*4)
 14634  	ADDQ  CX, DI
 14635  	ADDQ  BX, R8
 14636  	SUBQ  $0x08, SI
 14637  
 14638  check_limit_unroll:
 14639  	CMPQ SI, $0x08
 14640  	JHI  loop_unroll
 14641  	JMP  check_limit
 14642  
 14643  loop:
 14644  	MOVSS (AX)(DI*4), X1
 14645  	MULSS X0, X1
 14646  	ADDSS (DX)(R8*4), X1
 14647  	MOVSS X1, (DX)(R8*4)
 14648  	DECQ  SI
 14649  	ADDQ  CX, DI
 14650  	ADDQ  BX, R8
 14651  
 14652  check_limit:
 14653  	CMPQ SI, $0x00
 14654  	JHI  loop
 14655  	RET
 14656  
 14657  // func AmdAxpyUnsafeX_V1A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14658  // Requires: SSE
 14659  TEXT ·AmdAxpyUnsafeX_V1A14R8(SB), NOSPLIT, $0-48
 14660  	MOVSS alpha+0(FP), X0
 14661  	MOVQ  xs+8(FP), AX
 14662  	MOVQ  incx+16(FP), CX
 14663  	MOVQ  ys+24(FP), DX
 14664  	MOVQ  incy+32(FP), BX
 14665  	MOVQ  n+40(FP), SI
 14666  	XORQ  DI, DI
 14667  	XORQ  R8, R8
 14668  	JMP   check_limit_unroll
 14669  	PCALIGN $0x08
 14670  	NOP
 14671  	NOP
 14672  	NOP
 14673  	NOP
 14674  	NOP
 14675  	NOP
 14676  
 14677  loop_unroll:
 14678  	MOVSS (AX)(DI*4), X1
 14679  	MULSS X0, X1
 14680  	ADDSS (DX)(R8*4), X1
 14681  	MOVSS X1, (DX)(R8*4)
 14682  	ADDQ  CX, DI
 14683  	ADDQ  BX, R8
 14684  	MOVSS (AX)(DI*4), X1
 14685  	MULSS X0, X1
 14686  	ADDSS (DX)(R8*4), X1
 14687  	MOVSS X1, (DX)(R8*4)
 14688  	ADDQ  CX, DI
 14689  	ADDQ  BX, R8
 14690  	MOVSS (AX)(DI*4), X1
 14691  	MULSS X0, X1
 14692  	ADDSS (DX)(R8*4), X1
 14693  	MOVSS X1, (DX)(R8*4)
 14694  	ADDQ  CX, DI
 14695  	ADDQ  BX, R8
 14696  	MOVSS (AX)(DI*4), X1
 14697  	MULSS X0, X1
 14698  	ADDSS (DX)(R8*4), X1
 14699  	MOVSS X1, (DX)(R8*4)
 14700  	ADDQ  CX, DI
 14701  	ADDQ  BX, R8
 14702  	MOVSS (AX)(DI*4), X1
 14703  	MULSS X0, X1
 14704  	ADDSS (DX)(R8*4), X1
 14705  	MOVSS X1, (DX)(R8*4)
 14706  	ADDQ  CX, DI
 14707  	ADDQ  BX, R8
 14708  	MOVSS (AX)(DI*4), X1
 14709  	MULSS X0, X1
 14710  	ADDSS (DX)(R8*4), X1
 14711  	MOVSS X1, (DX)(R8*4)
 14712  	ADDQ  CX, DI
 14713  	ADDQ  BX, R8
 14714  	MOVSS (AX)(DI*4), X1
 14715  	MULSS X0, X1
 14716  	ADDSS (DX)(R8*4), X1
 14717  	MOVSS X1, (DX)(R8*4)
 14718  	ADDQ  CX, DI
 14719  	ADDQ  BX, R8
 14720  	MOVSS (AX)(DI*4), X1
 14721  	MULSS X0, X1
 14722  	ADDSS (DX)(R8*4), X1
 14723  	MOVSS X1, (DX)(R8*4)
 14724  	ADDQ  CX, DI
 14725  	ADDQ  BX, R8
 14726  	SUBQ  $0x08, SI
 14727  
 14728  check_limit_unroll:
 14729  	CMPQ SI, $0x08
 14730  	JHI  loop_unroll
 14731  	JMP  check_limit
 14732  
 14733  loop:
 14734  	MOVSS (AX)(DI*4), X1
 14735  	MULSS X0, X1
 14736  	ADDSS (DX)(R8*4), X1
 14737  	MOVSS X1, (DX)(R8*4)
 14738  	DECQ  SI
 14739  	ADDQ  CX, DI
 14740  	ADDQ  BX, R8
 14741  
 14742  check_limit:
 14743  	CMPQ SI, $0x00
 14744  	JHI  loop
 14745  	RET
 14746  
 14747  // func AmdAxpyUnsafeX_V2A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14748  // Requires: SSE
 14749  TEXT ·AmdAxpyUnsafeX_V2A14R8(SB), NOSPLIT, $0-48
 14750  	MOVSS alpha+0(FP), X0
 14751  	MOVQ  xs+8(FP), AX
 14752  	MOVQ  incx+16(FP), CX
 14753  	MOVQ  ys+24(FP), DX
 14754  	MOVQ  incy+32(FP), BX
 14755  	MOVQ  n+40(FP), SI
 14756  	XORQ  DI, DI
 14757  	XORQ  R8, R8
 14758  	JMP   check_limit_unroll
 14759  	PCALIGN $0x08
 14760  	NOP
 14761  	NOP
 14762  	NOP
 14763  	NOP
 14764  	NOP
 14765  	NOP
 14766  
 14767  loop_unroll:
 14768  	MOVSS (AX)(DI*4), X1
 14769  	MULSS X0, X1
 14770  	ADDSS (DX)(R8*4), X1
 14771  	MOVSS X1, (DX)(R8*4)
 14772  	ADDQ  CX, DI
 14773  	ADDQ  BX, R8
 14774  	MOVSS (AX)(DI*4), X1
 14775  	MULSS X0, X1
 14776  	ADDSS (DX)(R8*4), X1
 14777  	MOVSS X1, (DX)(R8*4)
 14778  	ADDQ  CX, DI
 14779  	ADDQ  BX, R8
 14780  	MOVSS (AX)(DI*4), X1
 14781  	MULSS X0, X1
 14782  	ADDSS (DX)(R8*4), X1
 14783  	MOVSS X1, (DX)(R8*4)
 14784  	ADDQ  CX, DI
 14785  	ADDQ  BX, R8
 14786  	MOVSS (AX)(DI*4), X1
 14787  	MULSS X0, X1
 14788  	ADDSS (DX)(R8*4), X1
 14789  	MOVSS X1, (DX)(R8*4)
 14790  	ADDQ  CX, DI
 14791  	ADDQ  BX, R8
 14792  	MOVSS (AX)(DI*4), X1
 14793  	MULSS X0, X1
 14794  	ADDSS (DX)(R8*4), X1
 14795  	MOVSS X1, (DX)(R8*4)
 14796  	ADDQ  CX, DI
 14797  	ADDQ  BX, R8
 14798  	MOVSS (AX)(DI*4), X1
 14799  	MULSS X0, X1
 14800  	ADDSS (DX)(R8*4), X1
 14801  	MOVSS X1, (DX)(R8*4)
 14802  	ADDQ  CX, DI
 14803  	ADDQ  BX, R8
 14804  	MOVSS (AX)(DI*4), X1
 14805  	MULSS X0, X1
 14806  	ADDSS (DX)(R8*4), X1
 14807  	MOVSS X1, (DX)(R8*4)
 14808  	ADDQ  CX, DI
 14809  	ADDQ  BX, R8
 14810  	MOVSS (AX)(DI*4), X1
 14811  	MULSS X0, X1
 14812  	ADDSS (DX)(R8*4), X1
 14813  	MOVSS X1, (DX)(R8*4)
 14814  	ADDQ  CX, DI
 14815  	ADDQ  BX, R8
 14816  	SUBQ  $0x08, SI
 14817  
 14818  check_limit_unroll:
 14819  	CMPQ SI, $0x08
 14820  	JHI  loop_unroll
 14821  	JMP  check_limit
 14822  
 14823  loop:
 14824  	MOVSS (AX)(DI*4), X1
 14825  	MULSS X0, X1
 14826  	ADDSS (DX)(R8*4), X1
 14827  	MOVSS X1, (DX)(R8*4)
 14828  	DECQ  SI
 14829  	ADDQ  CX, DI
 14830  	ADDQ  BX, R8
 14831  
 14832  check_limit:
 14833  	CMPQ SI, $0x00
 14834  	JHI  loop
 14835  	RET
 14836  
 14837  // func AmdAxpyUnsafeX_V3A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14838  // Requires: SSE
 14839  TEXT ·AmdAxpyUnsafeX_V3A14R8(SB), NOSPLIT, $0-48
 14840  	MOVSS alpha+0(FP), X0
 14841  	MOVQ  xs+8(FP), AX
 14842  	MOVQ  incx+16(FP), CX
 14843  	MOVQ  ys+24(FP), DX
 14844  	MOVQ  incy+32(FP), BX
 14845  	MOVQ  n+40(FP), SI
 14846  	XORQ  DI, DI
 14847  	XORQ  R8, R8
 14848  	JMP   check_limit_unroll
 14849  	PCALIGN $0x08
 14850  	NOP
 14851  	NOP
 14852  	NOP
 14853  	NOP
 14854  	NOP
 14855  	NOP
 14856  
 14857  loop_unroll:
 14858  	MOVSS (AX)(DI*4), X1
 14859  	MULSS X0, X1
 14860  	ADDSS (DX)(R8*4), X1
 14861  	MOVSS X1, (DX)(R8*4)
 14862  	ADDQ  CX, DI
 14863  	ADDQ  BX, R8
 14864  	MOVSS (AX)(DI*4), X1
 14865  	MULSS X0, X1
 14866  	ADDSS (DX)(R8*4), X1
 14867  	MOVSS X1, (DX)(R8*4)
 14868  	ADDQ  CX, DI
 14869  	ADDQ  BX, R8
 14870  	MOVSS (AX)(DI*4), X1
 14871  	MULSS X0, X1
 14872  	ADDSS (DX)(R8*4), X1
 14873  	MOVSS X1, (DX)(R8*4)
 14874  	ADDQ  CX, DI
 14875  	ADDQ  BX, R8
 14876  	MOVSS (AX)(DI*4), X1
 14877  	MULSS X0, X1
 14878  	ADDSS (DX)(R8*4), X1
 14879  	MOVSS X1, (DX)(R8*4)
 14880  	ADDQ  CX, DI
 14881  	ADDQ  BX, R8
 14882  	MOVSS (AX)(DI*4), X1
 14883  	MULSS X0, X1
 14884  	ADDSS (DX)(R8*4), X1
 14885  	MOVSS X1, (DX)(R8*4)
 14886  	ADDQ  CX, DI
 14887  	ADDQ  BX, R8
 14888  	MOVSS (AX)(DI*4), X1
 14889  	MULSS X0, X1
 14890  	ADDSS (DX)(R8*4), X1
 14891  	MOVSS X1, (DX)(R8*4)
 14892  	ADDQ  CX, DI
 14893  	ADDQ  BX, R8
 14894  	MOVSS (AX)(DI*4), X1
 14895  	MULSS X0, X1
 14896  	ADDSS (DX)(R8*4), X1
 14897  	MOVSS X1, (DX)(R8*4)
 14898  	ADDQ  CX, DI
 14899  	ADDQ  BX, R8
 14900  	MOVSS (AX)(DI*4), X1
 14901  	MULSS X0, X1
 14902  	ADDSS (DX)(R8*4), X1
 14903  	MOVSS X1, (DX)(R8*4)
 14904  	ADDQ  CX, DI
 14905  	ADDQ  BX, R8
 14906  	SUBQ  $0x08, SI
 14907  
 14908  check_limit_unroll:
 14909  	CMPQ SI, $0x08
 14910  	JHI  loop_unroll
 14911  	JMP  check_limit
 14912  
 14913  loop:
 14914  	MOVSS (AX)(DI*4), X1
 14915  	MULSS X0, X1
 14916  	ADDSS (DX)(R8*4), X1
 14917  	MOVSS X1, (DX)(R8*4)
 14918  	DECQ  SI
 14919  	ADDQ  CX, DI
 14920  	ADDQ  BX, R8
 14921  
 14922  check_limit:
 14923  	CMPQ SI, $0x00
 14924  	JHI  loop
 14925  	RET
 14926  
 14927  // func AmdAxpyUnsafeX_V4A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 14928  // Requires: SSE
 14929  TEXT ·AmdAxpyUnsafeX_V4A14R8(SB), NOSPLIT, $0-48
 14930  	MOVSS alpha+0(FP), X0
 14931  	MOVQ  xs+8(FP), AX
 14932  	MOVQ  incx+16(FP), CX
 14933  	MOVQ  ys+24(FP), DX
 14934  	MOVQ  incy+32(FP), BX
 14935  	MOVQ  n+40(FP), SI
 14936  	XORQ  DI, DI
 14937  	XORQ  R8, R8
 14938  	JMP   check_limit_unroll
 14939  	PCALIGN $0x08
 14940  	NOP
 14941  	NOP
 14942  	NOP
 14943  	NOP
 14944  	NOP
 14945  	NOP
 14946  
 14947  loop_unroll:
 14948  	MOVSS (AX)(DI*4), X1
 14949  	MULSS X0, X1
 14950  	ADDSS (DX)(R8*4), X1
 14951  	MOVSS X1, (DX)(R8*4)
 14952  	ADDQ  CX, DI
 14953  	ADDQ  BX, R8
 14954  	MOVSS (AX)(DI*4), X1
 14955  	MULSS X0, X1
 14956  	ADDSS (DX)(R8*4), X1
 14957  	MOVSS X1, (DX)(R8*4)
 14958  	ADDQ  CX, DI
 14959  	ADDQ  BX, R8
 14960  	MOVSS (AX)(DI*4), X1
 14961  	MULSS X0, X1
 14962  	ADDSS (DX)(R8*4), X1
 14963  	MOVSS X1, (DX)(R8*4)
 14964  	ADDQ  CX, DI
 14965  	ADDQ  BX, R8
 14966  	MOVSS (AX)(DI*4), X1
 14967  	MULSS X0, X1
 14968  	ADDSS (DX)(R8*4), X1
 14969  	MOVSS X1, (DX)(R8*4)
 14970  	ADDQ  CX, DI
 14971  	ADDQ  BX, R8
 14972  	MOVSS (AX)(DI*4), X1
 14973  	MULSS X0, X1
 14974  	ADDSS (DX)(R8*4), X1
 14975  	MOVSS X1, (DX)(R8*4)
 14976  	ADDQ  CX, DI
 14977  	ADDQ  BX, R8
 14978  	MOVSS (AX)(DI*4), X1
 14979  	MULSS X0, X1
 14980  	ADDSS (DX)(R8*4), X1
 14981  	MOVSS X1, (DX)(R8*4)
 14982  	ADDQ  CX, DI
 14983  	ADDQ  BX, R8
 14984  	MOVSS (AX)(DI*4), X1
 14985  	MULSS X0, X1
 14986  	ADDSS (DX)(R8*4), X1
 14987  	MOVSS X1, (DX)(R8*4)
 14988  	ADDQ  CX, DI
 14989  	ADDQ  BX, R8
 14990  	MOVSS (AX)(DI*4), X1
 14991  	MULSS X0, X1
 14992  	ADDSS (DX)(R8*4), X1
 14993  	MOVSS X1, (DX)(R8*4)
 14994  	ADDQ  CX, DI
 14995  	ADDQ  BX, R8
 14996  	SUBQ  $0x08, SI
 14997  
 14998  check_limit_unroll:
 14999  	CMPQ SI, $0x08
 15000  	JHI  loop_unroll
 15001  	JMP  check_limit
 15002  
 15003  loop:
 15004  	MOVSS (AX)(DI*4), X1
 15005  	MULSS X0, X1
 15006  	ADDSS (DX)(R8*4), X1
 15007  	MOVSS X1, (DX)(R8*4)
 15008  	DECQ  SI
 15009  	ADDQ  CX, DI
 15010  	ADDQ  BX, R8
 15011  
 15012  check_limit:
 15013  	CMPQ SI, $0x00
 15014  	JHI  loop
 15015  	RET
 15016  
 15017  // func AmdAxpyUnsafeX_V5A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15018  // Requires: SSE
 15019  TEXT ·AmdAxpyUnsafeX_V5A14R8(SB), NOSPLIT, $0-48
 15020  	MOVSS alpha+0(FP), X0
 15021  	MOVQ  xs+8(FP), AX
 15022  	MOVQ  incx+16(FP), CX
 15023  	MOVQ  ys+24(FP), DX
 15024  	MOVQ  incy+32(FP), BX
 15025  	MOVQ  n+40(FP), SI
 15026  	XORQ  DI, DI
 15027  	XORQ  R8, R8
 15028  	JMP   check_limit_unroll
 15029  	PCALIGN $0x08
 15030  	NOP
 15031  	NOP
 15032  	NOP
 15033  	NOP
 15034  	NOP
 15035  	NOP
 15036  
 15037  loop_unroll:
 15038  	MOVSS (AX)(DI*4), X1
 15039  	MULSS X0, X1
 15040  	ADDSS (DX)(R8*4), X1
 15041  	MOVSS X1, (DX)(R8*4)
 15042  	ADDQ  CX, DI
 15043  	ADDQ  BX, R8
 15044  	MOVSS (AX)(DI*4), X1
 15045  	MULSS X0, X1
 15046  	ADDSS (DX)(R8*4), X1
 15047  	MOVSS X1, (DX)(R8*4)
 15048  	ADDQ  CX, DI
 15049  	ADDQ  BX, R8
 15050  	MOVSS (AX)(DI*4), X1
 15051  	MULSS X0, X1
 15052  	ADDSS (DX)(R8*4), X1
 15053  	MOVSS X1, (DX)(R8*4)
 15054  	ADDQ  CX, DI
 15055  	ADDQ  BX, R8
 15056  	MOVSS (AX)(DI*4), X1
 15057  	MULSS X0, X1
 15058  	ADDSS (DX)(R8*4), X1
 15059  	MOVSS X1, (DX)(R8*4)
 15060  	ADDQ  CX, DI
 15061  	ADDQ  BX, R8
 15062  	MOVSS (AX)(DI*4), X1
 15063  	MULSS X0, X1
 15064  	ADDSS (DX)(R8*4), X1
 15065  	MOVSS X1, (DX)(R8*4)
 15066  	ADDQ  CX, DI
 15067  	ADDQ  BX, R8
 15068  	MOVSS (AX)(DI*4), X1
 15069  	MULSS X0, X1
 15070  	ADDSS (DX)(R8*4), X1
 15071  	MOVSS X1, (DX)(R8*4)
 15072  	ADDQ  CX, DI
 15073  	ADDQ  BX, R8
 15074  	MOVSS (AX)(DI*4), X1
 15075  	MULSS X0, X1
 15076  	ADDSS (DX)(R8*4), X1
 15077  	MOVSS X1, (DX)(R8*4)
 15078  	ADDQ  CX, DI
 15079  	ADDQ  BX, R8
 15080  	MOVSS (AX)(DI*4), X1
 15081  	MULSS X0, X1
 15082  	ADDSS (DX)(R8*4), X1
 15083  	MOVSS X1, (DX)(R8*4)
 15084  	ADDQ  CX, DI
 15085  	ADDQ  BX, R8
 15086  	SUBQ  $0x08, SI
 15087  
 15088  check_limit_unroll:
 15089  	CMPQ SI, $0x08
 15090  	JHI  loop_unroll
 15091  	JMP  check_limit
 15092  
 15093  loop:
 15094  	MOVSS (AX)(DI*4), X1
 15095  	MULSS X0, X1
 15096  	ADDSS (DX)(R8*4), X1
 15097  	MOVSS X1, (DX)(R8*4)
 15098  	DECQ  SI
 15099  	ADDQ  CX, DI
 15100  	ADDQ  BX, R8
 15101  
 15102  check_limit:
 15103  	CMPQ SI, $0x00
 15104  	JHI  loop
 15105  	RET
 15106  
 15107  // func AmdAxpyUnsafeX_V0A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15108  // Requires: SSE
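// Same kernel again; the A15 variants pad loop_unroll with seven NOPs after
// PCALIGN.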
 15109  TEXT ·AmdAxpyUnsafeX_V0A15R8(SB), NOSPLIT, $0-48
 15110  	MOVSS alpha+0(FP), X0
 15111  	MOVQ  xs+8(FP), AX
 15112  	MOVQ  incx+16(FP), CX
 15113  	MOVQ  ys+24(FP), DX
 15114  	MOVQ  incy+32(FP), BX
 15115  	MOVQ  n+40(FP), SI
 15116  	XORQ  DI, DI
 15117  	XORQ  R8, R8
 15118  	JMP   check_limit_unroll
 15119  	PCALIGN $0x08
 15120  	NOP
 15121  	NOP
 15122  	NOP
 15123  	NOP
 15124  	NOP
 15125  	NOP
 15126  	NOP
 15127  
 15128  loop_unroll:
 15129  	MOVSS (AX)(DI*4), X1
 15130  	MULSS X0, X1
 15131  	ADDSS (DX)(R8*4), X1
 15132  	MOVSS X1, (DX)(R8*4)
 15133  	ADDQ  CX, DI
 15134  	ADDQ  BX, R8
 15135  	MOVSS (AX)(DI*4), X1
 15136  	MULSS X0, X1
 15137  	ADDSS (DX)(R8*4), X1
 15138  	MOVSS X1, (DX)(R8*4)
 15139  	ADDQ  CX, DI
 15140  	ADDQ  BX, R8
 15141  	MOVSS (AX)(DI*4), X1
 15142  	MULSS X0, X1
 15143  	ADDSS (DX)(R8*4), X1
 15144  	MOVSS X1, (DX)(R8*4)
 15145  	ADDQ  CX, DI
 15146  	ADDQ  BX, R8
 15147  	MOVSS (AX)(DI*4), X1
 15148  	MULSS X0, X1
 15149  	ADDSS (DX)(R8*4), X1
 15150  	MOVSS X1, (DX)(R8*4)
 15151  	ADDQ  CX, DI
 15152  	ADDQ  BX, R8
 15153  	MOVSS (AX)(DI*4), X1
 15154  	MULSS X0, X1
 15155  	ADDSS (DX)(R8*4), X1
 15156  	MOVSS X1, (DX)(R8*4)
 15157  	ADDQ  CX, DI
 15158  	ADDQ  BX, R8
 15159  	MOVSS (AX)(DI*4), X1
 15160  	MULSS X0, X1
 15161  	ADDSS (DX)(R8*4), X1
 15162  	MOVSS X1, (DX)(R8*4)
 15163  	ADDQ  CX, DI
 15164  	ADDQ  BX, R8
 15165  	MOVSS (AX)(DI*4), X1
 15166  	MULSS X0, X1
 15167  	ADDSS (DX)(R8*4), X1
 15168  	MOVSS X1, (DX)(R8*4)
 15169  	ADDQ  CX, DI
 15170  	ADDQ  BX, R8
 15171  	MOVSS (AX)(DI*4), X1
 15172  	MULSS X0, X1
 15173  	ADDSS (DX)(R8*4), X1
 15174  	MOVSS X1, (DX)(R8*4)
 15175  	ADDQ  CX, DI
 15176  	ADDQ  BX, R8
 15177  	SUBQ  $0x08, SI
 15178  
 15179  check_limit_unroll:
 15180  	CMPQ SI, $0x08
 15181  	JHI  loop_unroll
 15182  	JMP  check_limit
 15183  
 15184  loop:
 15185  	MOVSS (AX)(DI*4), X1
 15186  	MULSS X0, X1
 15187  	ADDSS (DX)(R8*4), X1
 15188  	MOVSS X1, (DX)(R8*4)
 15189  	DECQ  SI
 15190  	ADDQ  CX, DI
 15191  	ADDQ  BX, R8
 15192  
 15193  check_limit:
 15194  	CMPQ SI, $0x00
 15195  	JHI  loop
 15196  	RET
 15197  
 15198  // func AmdAxpyUnsafeX_V1A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15199  // Requires: SSE
 15200  TEXT ·AmdAxpyUnsafeX_V1A15R8(SB), NOSPLIT, $0-48
 15201  	MOVSS alpha+0(FP), X0
 15202  	MOVQ  xs+8(FP), AX
 15203  	MOVQ  incx+16(FP), CX
 15204  	MOVQ  ys+24(FP), DX
 15205  	MOVQ  incy+32(FP), BX
 15206  	MOVQ  n+40(FP), SI
 15207  	XORQ  DI, DI
 15208  	XORQ  R8, R8
 15209  	JMP   check_limit_unroll
 15210  	PCALIGN $0x08
 15211  	NOP
 15212  	NOP
 15213  	NOP
 15214  	NOP
 15215  	NOP
 15216  	NOP
 15217  	NOP
 15218  
 15219  loop_unroll:
 15220  	MOVSS (AX)(DI*4), X1
 15221  	MULSS X0, X1
 15222  	ADDSS (DX)(R8*4), X1
 15223  	MOVSS X1, (DX)(R8*4)
 15224  	ADDQ  CX, DI
 15225  	ADDQ  BX, R8
 15226  	MOVSS (AX)(DI*4), X1
 15227  	MULSS X0, X1
 15228  	ADDSS (DX)(R8*4), X1
 15229  	MOVSS X1, (DX)(R8*4)
 15230  	ADDQ  CX, DI
 15231  	ADDQ  BX, R8
 15232  	MOVSS (AX)(DI*4), X1
 15233  	MULSS X0, X1
 15234  	ADDSS (DX)(R8*4), X1
 15235  	MOVSS X1, (DX)(R8*4)
 15236  	ADDQ  CX, DI
 15237  	ADDQ  BX, R8
 15238  	MOVSS (AX)(DI*4), X1
 15239  	MULSS X0, X1
 15240  	ADDSS (DX)(R8*4), X1
 15241  	MOVSS X1, (DX)(R8*4)
 15242  	ADDQ  CX, DI
 15243  	ADDQ  BX, R8
 15244  	MOVSS (AX)(DI*4), X1
 15245  	MULSS X0, X1
 15246  	ADDSS (DX)(R8*4), X1
 15247  	MOVSS X1, (DX)(R8*4)
 15248  	ADDQ  CX, DI
 15249  	ADDQ  BX, R8
 15250  	MOVSS (AX)(DI*4), X1
 15251  	MULSS X0, X1
 15252  	ADDSS (DX)(R8*4), X1
 15253  	MOVSS X1, (DX)(R8*4)
 15254  	ADDQ  CX, DI
 15255  	ADDQ  BX, R8
 15256  	MOVSS (AX)(DI*4), X1
 15257  	MULSS X0, X1
 15258  	ADDSS (DX)(R8*4), X1
 15259  	MOVSS X1, (DX)(R8*4)
 15260  	ADDQ  CX, DI
 15261  	ADDQ  BX, R8
 15262  	MOVSS (AX)(DI*4), X1
 15263  	MULSS X0, X1
 15264  	ADDSS (DX)(R8*4), X1
 15265  	MOVSS X1, (DX)(R8*4)
 15266  	ADDQ  CX, DI
 15267  	ADDQ  BX, R8
 15268  	SUBQ  $0x08, SI
 15269  
 15270  check_limit_unroll:
 15271  	CMPQ SI, $0x08
 15272  	JHI  loop_unroll
 15273  	JMP  check_limit
 15274  
 15275  loop:
 15276  	MOVSS (AX)(DI*4), X1
 15277  	MULSS X0, X1
 15278  	ADDSS (DX)(R8*4), X1
 15279  	MOVSS X1, (DX)(R8*4)
 15280  	DECQ  SI
 15281  	ADDQ  CX, DI
 15282  	ADDQ  BX, R8
 15283  
 15284  check_limit:
 15285  	CMPQ SI, $0x00
 15286  	JHI  loop
 15287  	RET
 15288  
 15289  // func AmdAxpyUnsafeX_V2A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15290  // Requires: SSE
 15291  TEXT ·AmdAxpyUnsafeX_V2A15R8(SB), NOSPLIT, $0-48
 15292  	MOVSS alpha+0(FP), X0
 15293  	MOVQ  xs+8(FP), AX
 15294  	MOVQ  incx+16(FP), CX
 15295  	MOVQ  ys+24(FP), DX
 15296  	MOVQ  incy+32(FP), BX
 15297  	MOVQ  n+40(FP), SI
 15298  	XORQ  DI, DI
 15299  	XORQ  R8, R8
 15300  	JMP   check_limit_unroll
 15301  	PCALIGN $0x08
 15302  	NOP
 15303  	NOP
 15304  	NOP
 15305  	NOP
 15306  	NOP
 15307  	NOP
 15308  	NOP
 15309  
 15310  loop_unroll:
 15311  	MOVSS (AX)(DI*4), X1
 15312  	MULSS X0, X1
 15313  	ADDSS (DX)(R8*4), X1
 15314  	MOVSS X1, (DX)(R8*4)
 15315  	ADDQ  CX, DI
 15316  	ADDQ  BX, R8
 15317  	MOVSS (AX)(DI*4), X1
 15318  	MULSS X0, X1
 15319  	ADDSS (DX)(R8*4), X1
 15320  	MOVSS X1, (DX)(R8*4)
 15321  	ADDQ  CX, DI
 15322  	ADDQ  BX, R8
 15323  	MOVSS (AX)(DI*4), X1
 15324  	MULSS X0, X1
 15325  	ADDSS (DX)(R8*4), X1
 15326  	MOVSS X1, (DX)(R8*4)
 15327  	ADDQ  CX, DI
 15328  	ADDQ  BX, R8
 15329  	MOVSS (AX)(DI*4), X1
 15330  	MULSS X0, X1
 15331  	ADDSS (DX)(R8*4), X1
 15332  	MOVSS X1, (DX)(R8*4)
 15333  	ADDQ  CX, DI
 15334  	ADDQ  BX, R8
 15335  	MOVSS (AX)(DI*4), X1
 15336  	MULSS X0, X1
 15337  	ADDSS (DX)(R8*4), X1
 15338  	MOVSS X1, (DX)(R8*4)
 15339  	ADDQ  CX, DI
 15340  	ADDQ  BX, R8
 15341  	MOVSS (AX)(DI*4), X1
 15342  	MULSS X0, X1
 15343  	ADDSS (DX)(R8*4), X1
 15344  	MOVSS X1, (DX)(R8*4)
 15345  	ADDQ  CX, DI
 15346  	ADDQ  BX, R8
 15347  	MOVSS (AX)(DI*4), X1
 15348  	MULSS X0, X1
 15349  	ADDSS (DX)(R8*4), X1
 15350  	MOVSS X1, (DX)(R8*4)
 15351  	ADDQ  CX, DI
 15352  	ADDQ  BX, R8
 15353  	MOVSS (AX)(DI*4), X1
 15354  	MULSS X0, X1
 15355  	ADDSS (DX)(R8*4), X1
 15356  	MOVSS X1, (DX)(R8*4)
 15357  	ADDQ  CX, DI
 15358  	ADDQ  BX, R8
 15359  	SUBQ  $0x08, SI
 15360  
 15361  check_limit_unroll:
 15362  	CMPQ SI, $0x08
 15363  	JHI  loop_unroll
 15364  	JMP  check_limit
 15365  
 15366  loop:
 15367  	MOVSS (AX)(DI*4), X1
 15368  	MULSS X0, X1
 15369  	ADDSS (DX)(R8*4), X1
 15370  	MOVSS X1, (DX)(R8*4)
 15371  	DECQ  SI
 15372  	ADDQ  CX, DI
 15373  	ADDQ  BX, R8
 15374  
 15375  check_limit:
 15376  	CMPQ SI, $0x00
 15377  	JHI  loop
 15378  	RET
 15379  
 15380  // func AmdAxpyUnsafeX_V3A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15381  // Requires: SSE
 15382  TEXT ·AmdAxpyUnsafeX_V3A15R8(SB), NOSPLIT, $0-48
 15383  	MOVSS alpha+0(FP), X0
 15384  	MOVQ  xs+8(FP), AX
 15385  	MOVQ  incx+16(FP), CX
 15386  	MOVQ  ys+24(FP), DX
 15387  	MOVQ  incy+32(FP), BX
 15388  	MOVQ  n+40(FP), SI
 15389  	XORQ  DI, DI
 15390  	XORQ  R8, R8
 15391  	JMP   check_limit_unroll
 15392  	PCALIGN $0x08
 15393  	NOP
 15394  	NOP
 15395  	NOP
 15396  	NOP
 15397  	NOP
 15398  	NOP
 15399  	NOP
 15400  
 15401  loop_unroll:
 15402  	MOVSS (AX)(DI*4), X1
 15403  	MULSS X0, X1
 15404  	ADDSS (DX)(R8*4), X1
 15405  	MOVSS X1, (DX)(R8*4)
 15406  	ADDQ  CX, DI
 15407  	ADDQ  BX, R8
 15408  	MOVSS (AX)(DI*4), X1
 15409  	MULSS X0, X1
 15410  	ADDSS (DX)(R8*4), X1
 15411  	MOVSS X1, (DX)(R8*4)
 15412  	ADDQ  CX, DI
 15413  	ADDQ  BX, R8
 15414  	MOVSS (AX)(DI*4), X1
 15415  	MULSS X0, X1
 15416  	ADDSS (DX)(R8*4), X1
 15417  	MOVSS X1, (DX)(R8*4)
 15418  	ADDQ  CX, DI
 15419  	ADDQ  BX, R8
 15420  	MOVSS (AX)(DI*4), X1
 15421  	MULSS X0, X1
 15422  	ADDSS (DX)(R8*4), X1
 15423  	MOVSS X1, (DX)(R8*4)
 15424  	ADDQ  CX, DI
 15425  	ADDQ  BX, R8
 15426  	MOVSS (AX)(DI*4), X1
 15427  	MULSS X0, X1
 15428  	ADDSS (DX)(R8*4), X1
 15429  	MOVSS X1, (DX)(R8*4)
 15430  	ADDQ  CX, DI
 15431  	ADDQ  BX, R8
 15432  	MOVSS (AX)(DI*4), X1
 15433  	MULSS X0, X1
 15434  	ADDSS (DX)(R8*4), X1
 15435  	MOVSS X1, (DX)(R8*4)
 15436  	ADDQ  CX, DI
 15437  	ADDQ  BX, R8
 15438  	MOVSS (AX)(DI*4), X1
 15439  	MULSS X0, X1
 15440  	ADDSS (DX)(R8*4), X1
 15441  	MOVSS X1, (DX)(R8*4)
 15442  	ADDQ  CX, DI
 15443  	ADDQ  BX, R8
 15444  	MOVSS (AX)(DI*4), X1
 15445  	MULSS X0, X1
 15446  	ADDSS (DX)(R8*4), X1
 15447  	MOVSS X1, (DX)(R8*4)
 15448  	ADDQ  CX, DI
 15449  	ADDQ  BX, R8
 15450  	SUBQ  $0x08, SI
 15451  
 15452  check_limit_unroll:
 15453  	CMPQ SI, $0x08
 15454  	JHI  loop_unroll
 15455  	JMP  check_limit
 15456  
 15457  loop:
 15458  	MOVSS (AX)(DI*4), X1
 15459  	MULSS X0, X1
 15460  	ADDSS (DX)(R8*4), X1
 15461  	MOVSS X1, (DX)(R8*4)
 15462  	DECQ  SI
 15463  	ADDQ  CX, DI
 15464  	ADDQ  BX, R8
 15465  
 15466  check_limit:
 15467  	CMPQ SI, $0x00
 15468  	JHI  loop
 15469  	RET
 15470  
 15471  // func AmdAxpyUnsafeX_V4A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15472  // Requires: SSE
 15473  TEXT ·AmdAxpyUnsafeX_V4A15R8(SB), NOSPLIT, $0-48
 15474  	MOVSS alpha+0(FP), X0
 15475  	MOVQ  xs+8(FP), AX
 15476  	MOVQ  incx+16(FP), CX
 15477  	MOVQ  ys+24(FP), DX
 15478  	MOVQ  incy+32(FP), BX
 15479  	MOVQ  n+40(FP), SI
 15480  	XORQ  DI, DI
 15481  	XORQ  R8, R8
 15482  	JMP   check_limit_unroll
 15483  	PCALIGN $0x08
 15484  	NOP
 15485  	NOP
 15486  	NOP
 15487  	NOP
 15488  	NOP
 15489  	NOP
 15490  	NOP
 15491  
 15492  loop_unroll:
 15493  	MOVSS (AX)(DI*4), X1
 15494  	MULSS X0, X1
 15495  	ADDSS (DX)(R8*4), X1
 15496  	MOVSS X1, (DX)(R8*4)
 15497  	ADDQ  CX, DI
 15498  	ADDQ  BX, R8
 15499  	MOVSS (AX)(DI*4), X1
 15500  	MULSS X0, X1
 15501  	ADDSS (DX)(R8*4), X1
 15502  	MOVSS X1, (DX)(R8*4)
 15503  	ADDQ  CX, DI
 15504  	ADDQ  BX, R8
 15505  	MOVSS (AX)(DI*4), X1
 15506  	MULSS X0, X1
 15507  	ADDSS (DX)(R8*4), X1
 15508  	MOVSS X1, (DX)(R8*4)
 15509  	ADDQ  CX, DI
 15510  	ADDQ  BX, R8
 15511  	MOVSS (AX)(DI*4), X1
 15512  	MULSS X0, X1
 15513  	ADDSS (DX)(R8*4), X1
 15514  	MOVSS X1, (DX)(R8*4)
 15515  	ADDQ  CX, DI
 15516  	ADDQ  BX, R8
 15517  	MOVSS (AX)(DI*4), X1
 15518  	MULSS X0, X1
 15519  	ADDSS (DX)(R8*4), X1
 15520  	MOVSS X1, (DX)(R8*4)
 15521  	ADDQ  CX, DI
 15522  	ADDQ  BX, R8
 15523  	MOVSS (AX)(DI*4), X1
 15524  	MULSS X0, X1
 15525  	ADDSS (DX)(R8*4), X1
 15526  	MOVSS X1, (DX)(R8*4)
 15527  	ADDQ  CX, DI
 15528  	ADDQ  BX, R8
 15529  	MOVSS (AX)(DI*4), X1
 15530  	MULSS X0, X1
 15531  	ADDSS (DX)(R8*4), X1
 15532  	MOVSS X1, (DX)(R8*4)
 15533  	ADDQ  CX, DI
 15534  	ADDQ  BX, R8
 15535  	MOVSS (AX)(DI*4), X1
 15536  	MULSS X0, X1
 15537  	ADDSS (DX)(R8*4), X1
 15538  	MOVSS X1, (DX)(R8*4)
 15539  	ADDQ  CX, DI
 15540  	ADDQ  BX, R8
 15541  	SUBQ  $0x08, SI
 15542  
 15543  check_limit_unroll:
 15544  	CMPQ SI, $0x08
 15545  	JHI  loop_unroll
 15546  	JMP  check_limit
 15547  
 15548  loop:
 15549  	MOVSS (AX)(DI*4), X1
 15550  	MULSS X0, X1
 15551  	ADDSS (DX)(R8*4), X1
 15552  	MOVSS X1, (DX)(R8*4)
 15553  	DECQ  SI
 15554  	ADDQ  CX, DI
 15555  	ADDQ  BX, R8
 15556  
 15557  check_limit:
 15558  	CMPQ SI, $0x00
 15559  	JHI  loop
 15560  	RET
 15561  
 15562  // func AmdAxpyUnsafeX_V5A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15563  // Requires: SSE
 15564  TEXT ·AmdAxpyUnsafeX_V5A15R8(SB), NOSPLIT, $0-48
 15565  	MOVSS alpha+0(FP), X0
 15566  	MOVQ  xs+8(FP), AX
 15567  	MOVQ  incx+16(FP), CX
 15568  	MOVQ  ys+24(FP), DX
 15569  	MOVQ  incy+32(FP), BX
 15570  	MOVQ  n+40(FP), SI
 15571  	XORQ  DI, DI
 15572  	XORQ  R8, R8
 15573  	JMP   check_limit_unroll
 15574  	PCALIGN $0x08
 15575  	NOP
 15576  	NOP
 15577  	NOP
 15578  	NOP
 15579  	NOP
 15580  	NOP
 15581  	NOP
 15582  
 15583  loop_unroll:
 15584  	MOVSS (AX)(DI*4), X1
 15585  	MULSS X0, X1
 15586  	ADDSS (DX)(R8*4), X1
 15587  	MOVSS X1, (DX)(R8*4)
 15588  	ADDQ  CX, DI
 15589  	ADDQ  BX, R8
 15590  	MOVSS (AX)(DI*4), X1
 15591  	MULSS X0, X1
 15592  	ADDSS (DX)(R8*4), X1
 15593  	MOVSS X1, (DX)(R8*4)
 15594  	ADDQ  CX, DI
 15595  	ADDQ  BX, R8
 15596  	MOVSS (AX)(DI*4), X1
 15597  	MULSS X0, X1
 15598  	ADDSS (DX)(R8*4), X1
 15599  	MOVSS X1, (DX)(R8*4)
 15600  	ADDQ  CX, DI
 15601  	ADDQ  BX, R8
 15602  	MOVSS (AX)(DI*4), X1
 15603  	MULSS X0, X1
 15604  	ADDSS (DX)(R8*4), X1
 15605  	MOVSS X1, (DX)(R8*4)
 15606  	ADDQ  CX, DI
 15607  	ADDQ  BX, R8
 15608  	MOVSS (AX)(DI*4), X1
 15609  	MULSS X0, X1
 15610  	ADDSS (DX)(R8*4), X1
 15611  	MOVSS X1, (DX)(R8*4)
 15612  	ADDQ  CX, DI
 15613  	ADDQ  BX, R8
 15614  	MOVSS (AX)(DI*4), X1
 15615  	MULSS X0, X1
 15616  	ADDSS (DX)(R8*4), X1
 15617  	MOVSS X1, (DX)(R8*4)
 15618  	ADDQ  CX, DI
 15619  	ADDQ  BX, R8
 15620  	MOVSS (AX)(DI*4), X1
 15621  	MULSS X0, X1
 15622  	ADDSS (DX)(R8*4), X1
 15623  	MOVSS X1, (DX)(R8*4)
 15624  	ADDQ  CX, DI
 15625  	ADDQ  BX, R8
 15626  	MOVSS (AX)(DI*4), X1
 15627  	MULSS X0, X1
 15628  	ADDSS (DX)(R8*4), X1
 15629  	MOVSS X1, (DX)(R8*4)
 15630  	ADDQ  CX, DI
 15631  	ADDQ  BX, R8
 15632  	SUBQ  $0x08, SI
 15633  
 15634  check_limit_unroll:
 15635  	CMPQ SI, $0x08
 15636  	JHI  loop_unroll
 15637  	JMP  check_limit
 15638  
 15639  loop:
 15640  	MOVSS (AX)(DI*4), X1
 15641  	MULSS X0, X1
 15642  	ADDSS (DX)(R8*4), X1
 15643  	MOVSS X1, (DX)(R8*4)
 15644  	DECQ  SI
 15645  	ADDQ  CX, DI
 15646  	ADDQ  BX, R8
 15647  
 15648  check_limit:
 15649  	CMPQ SI, $0x00
 15650  	JHI  loop
 15651  	RET
 15652  
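// The *A16R8 variants below differ from the *A15R8 variants above only
// in the padding before the unrolled loop: PCALIGN $0x10 aligns the
// loop head to a 16-byte boundary, where the A15 variants used PCALIGN
// $0x08 followed by seven NOPs.  The loop bodies are identical.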
 15653  // func AmdAxpyUnsafeX_V0A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15654  // Requires: SSE
 15655  TEXT ·AmdAxpyUnsafeX_V0A16R8(SB), NOSPLIT, $0-48
 15656  	MOVSS alpha+0(FP), X0
 15657  	MOVQ  xs+8(FP), AX
 15658  	MOVQ  incx+16(FP), CX
 15659  	MOVQ  ys+24(FP), DX
 15660  	MOVQ  incy+32(FP), BX
 15661  	MOVQ  n+40(FP), SI
 15662  	XORQ  DI, DI
 15663  	XORQ  R8, R8
 15664  	JMP   check_limit_unroll
 15665  	PCALIGN $0x10
 15666  
 15667  loop_unroll:
 15668  	MOVSS (AX)(DI*4), X1
 15669  	MULSS X0, X1
 15670  	ADDSS (DX)(R8*4), X1
 15671  	MOVSS X1, (DX)(R8*4)
 15672  	ADDQ  CX, DI
 15673  	ADDQ  BX, R8
 15674  	MOVSS (AX)(DI*4), X1
 15675  	MULSS X0, X1
 15676  	ADDSS (DX)(R8*4), X1
 15677  	MOVSS X1, (DX)(R8*4)
 15678  	ADDQ  CX, DI
 15679  	ADDQ  BX, R8
 15680  	MOVSS (AX)(DI*4), X1
 15681  	MULSS X0, X1
 15682  	ADDSS (DX)(R8*4), X1
 15683  	MOVSS X1, (DX)(R8*4)
 15684  	ADDQ  CX, DI
 15685  	ADDQ  BX, R8
 15686  	MOVSS (AX)(DI*4), X1
 15687  	MULSS X0, X1
 15688  	ADDSS (DX)(R8*4), X1
 15689  	MOVSS X1, (DX)(R8*4)
 15690  	ADDQ  CX, DI
 15691  	ADDQ  BX, R8
 15692  	MOVSS (AX)(DI*4), X1
 15693  	MULSS X0, X1
 15694  	ADDSS (DX)(R8*4), X1
 15695  	MOVSS X1, (DX)(R8*4)
 15696  	ADDQ  CX, DI
 15697  	ADDQ  BX, R8
 15698  	MOVSS (AX)(DI*4), X1
 15699  	MULSS X0, X1
 15700  	ADDSS (DX)(R8*4), X1
 15701  	MOVSS X1, (DX)(R8*4)
 15702  	ADDQ  CX, DI
 15703  	ADDQ  BX, R8
 15704  	MOVSS (AX)(DI*4), X1
 15705  	MULSS X0, X1
 15706  	ADDSS (DX)(R8*4), X1
 15707  	MOVSS X1, (DX)(R8*4)
 15708  	ADDQ  CX, DI
 15709  	ADDQ  BX, R8
 15710  	MOVSS (AX)(DI*4), X1
 15711  	MULSS X0, X1
 15712  	ADDSS (DX)(R8*4), X1
 15713  	MOVSS X1, (DX)(R8*4)
 15714  	ADDQ  CX, DI
 15715  	ADDQ  BX, R8
 15716  	SUBQ  $0x08, SI
 15717  
 15718  check_limit_unroll:
 15719  	CMPQ SI, $0x08
 15720  	JHI  loop_unroll
 15721  	JMP  check_limit
 15722  
 15723  loop:
 15724  	MOVSS (AX)(DI*4), X1
 15725  	MULSS X0, X1
 15726  	ADDSS (DX)(R8*4), X1
 15727  	MOVSS X1, (DX)(R8*4)
 15728  	DECQ  SI
 15729  	ADDQ  CX, DI
 15730  	ADDQ  BX, R8
 15731  
 15732  check_limit:
 15733  	CMPQ SI, $0x00
 15734  	JHI  loop
 15735  	RET
 15736  
 15737  // func AmdAxpyUnsafeX_V1A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15738  // Requires: SSE
 15739  TEXT ·AmdAxpyUnsafeX_V1A16R8(SB), NOSPLIT, $0-48
 15740  	MOVSS alpha+0(FP), X0
 15741  	MOVQ  xs+8(FP), AX
 15742  	MOVQ  incx+16(FP), CX
 15743  	MOVQ  ys+24(FP), DX
 15744  	MOVQ  incy+32(FP), BX
 15745  	MOVQ  n+40(FP), SI
 15746  	XORQ  DI, DI
 15747  	XORQ  R8, R8
 15748  	JMP   check_limit_unroll
 15749  	PCALIGN $0x10
 15750  
 15751  loop_unroll:
 15752  	MOVSS (AX)(DI*4), X1
 15753  	MULSS X0, X1
 15754  	ADDSS (DX)(R8*4), X1
 15755  	MOVSS X1, (DX)(R8*4)
 15756  	ADDQ  CX, DI
 15757  	ADDQ  BX, R8
 15758  	MOVSS (AX)(DI*4), X1
 15759  	MULSS X0, X1
 15760  	ADDSS (DX)(R8*4), X1
 15761  	MOVSS X1, (DX)(R8*4)
 15762  	ADDQ  CX, DI
 15763  	ADDQ  BX, R8
 15764  	MOVSS (AX)(DI*4), X1
 15765  	MULSS X0, X1
 15766  	ADDSS (DX)(R8*4), X1
 15767  	MOVSS X1, (DX)(R8*4)
 15768  	ADDQ  CX, DI
 15769  	ADDQ  BX, R8
 15770  	MOVSS (AX)(DI*4), X1
 15771  	MULSS X0, X1
 15772  	ADDSS (DX)(R8*4), X1
 15773  	MOVSS X1, (DX)(R8*4)
 15774  	ADDQ  CX, DI
 15775  	ADDQ  BX, R8
 15776  	MOVSS (AX)(DI*4), X1
 15777  	MULSS X0, X1
 15778  	ADDSS (DX)(R8*4), X1
 15779  	MOVSS X1, (DX)(R8*4)
 15780  	ADDQ  CX, DI
 15781  	ADDQ  BX, R8
 15782  	MOVSS (AX)(DI*4), X1
 15783  	MULSS X0, X1
 15784  	ADDSS (DX)(R8*4), X1
 15785  	MOVSS X1, (DX)(R8*4)
 15786  	ADDQ  CX, DI
 15787  	ADDQ  BX, R8
 15788  	MOVSS (AX)(DI*4), X1
 15789  	MULSS X0, X1
 15790  	ADDSS (DX)(R8*4), X1
 15791  	MOVSS X1, (DX)(R8*4)
 15792  	ADDQ  CX, DI
 15793  	ADDQ  BX, R8
 15794  	MOVSS (AX)(DI*4), X1
 15795  	MULSS X0, X1
 15796  	ADDSS (DX)(R8*4), X1
 15797  	MOVSS X1, (DX)(R8*4)
 15798  	ADDQ  CX, DI
 15799  	ADDQ  BX, R8
 15800  	SUBQ  $0x08, SI
 15801  
 15802  check_limit_unroll:
 15803  	CMPQ SI, $0x08
 15804  	JHI  loop_unroll
 15805  	JMP  check_limit
 15806  
 15807  loop:
 15808  	MOVSS (AX)(DI*4), X1
 15809  	MULSS X0, X1
 15810  	ADDSS (DX)(R8*4), X1
 15811  	MOVSS X1, (DX)(R8*4)
 15812  	DECQ  SI
 15813  	ADDQ  CX, DI
 15814  	ADDQ  BX, R8
 15815  
 15816  check_limit:
 15817  	CMPQ SI, $0x00
 15818  	JHI  loop
 15819  	RET
 15820  
 15821  // func AmdAxpyUnsafeX_V2A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15822  // Requires: SSE
 15823  TEXT ·AmdAxpyUnsafeX_V2A16R8(SB), NOSPLIT, $0-48
 15824  	MOVSS alpha+0(FP), X0
 15825  	MOVQ  xs+8(FP), AX
 15826  	MOVQ  incx+16(FP), CX
 15827  	MOVQ  ys+24(FP), DX
 15828  	MOVQ  incy+32(FP), BX
 15829  	MOVQ  n+40(FP), SI
 15830  	XORQ  DI, DI
 15831  	XORQ  R8, R8
 15832  	JMP   check_limit_unroll
 15833  	PCALIGN $0x10
 15834  
 15835  loop_unroll:
 15836  	MOVSS (AX)(DI*4), X1
 15837  	MULSS X0, X1
 15838  	ADDSS (DX)(R8*4), X1
 15839  	MOVSS X1, (DX)(R8*4)
 15840  	ADDQ  CX, DI
 15841  	ADDQ  BX, R8
 15842  	MOVSS (AX)(DI*4), X1
 15843  	MULSS X0, X1
 15844  	ADDSS (DX)(R8*4), X1
 15845  	MOVSS X1, (DX)(R8*4)
 15846  	ADDQ  CX, DI
 15847  	ADDQ  BX, R8
 15848  	MOVSS (AX)(DI*4), X1
 15849  	MULSS X0, X1
 15850  	ADDSS (DX)(R8*4), X1
 15851  	MOVSS X1, (DX)(R8*4)
 15852  	ADDQ  CX, DI
 15853  	ADDQ  BX, R8
 15854  	MOVSS (AX)(DI*4), X1
 15855  	MULSS X0, X1
 15856  	ADDSS (DX)(R8*4), X1
 15857  	MOVSS X1, (DX)(R8*4)
 15858  	ADDQ  CX, DI
 15859  	ADDQ  BX, R8
 15860  	MOVSS (AX)(DI*4), X1
 15861  	MULSS X0, X1
 15862  	ADDSS (DX)(R8*4), X1
 15863  	MOVSS X1, (DX)(R8*4)
 15864  	ADDQ  CX, DI
 15865  	ADDQ  BX, R8
 15866  	MOVSS (AX)(DI*4), X1
 15867  	MULSS X0, X1
 15868  	ADDSS (DX)(R8*4), X1
 15869  	MOVSS X1, (DX)(R8*4)
 15870  	ADDQ  CX, DI
 15871  	ADDQ  BX, R8
 15872  	MOVSS (AX)(DI*4), X1
 15873  	MULSS X0, X1
 15874  	ADDSS (DX)(R8*4), X1
 15875  	MOVSS X1, (DX)(R8*4)
 15876  	ADDQ  CX, DI
 15877  	ADDQ  BX, R8
 15878  	MOVSS (AX)(DI*4), X1
 15879  	MULSS X0, X1
 15880  	ADDSS (DX)(R8*4), X1
 15881  	MOVSS X1, (DX)(R8*4)
 15882  	ADDQ  CX, DI
 15883  	ADDQ  BX, R8
 15884  	SUBQ  $0x08, SI
 15885  
 15886  check_limit_unroll:
 15887  	CMPQ SI, $0x08
 15888  	JHI  loop_unroll
 15889  	JMP  check_limit
 15890  
 15891  loop:
 15892  	MOVSS (AX)(DI*4), X1
 15893  	MULSS X0, X1
 15894  	ADDSS (DX)(R8*4), X1
 15895  	MOVSS X1, (DX)(R8*4)
 15896  	DECQ  SI
 15897  	ADDQ  CX, DI
 15898  	ADDQ  BX, R8
 15899  
 15900  check_limit:
 15901  	CMPQ SI, $0x00
 15902  	JHI  loop
 15903  	RET
 15904  
 15905  // func AmdAxpyUnsafeX_V3A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15906  // Requires: SSE
 15907  TEXT ·AmdAxpyUnsafeX_V3A16R8(SB), NOSPLIT, $0-48
 15908  	MOVSS alpha+0(FP), X0
 15909  	MOVQ  xs+8(FP), AX
 15910  	MOVQ  incx+16(FP), CX
 15911  	MOVQ  ys+24(FP), DX
 15912  	MOVQ  incy+32(FP), BX
 15913  	MOVQ  n+40(FP), SI
 15914  	XORQ  DI, DI
 15915  	XORQ  R8, R8
 15916  	JMP   check_limit_unroll
 15917  	PCALIGN $0x10
 15918  
 15919  loop_unroll:
 15920  	MOVSS (AX)(DI*4), X1
 15921  	MULSS X0, X1
 15922  	ADDSS (DX)(R8*4), X1
 15923  	MOVSS X1, (DX)(R8*4)
 15924  	ADDQ  CX, DI
 15925  	ADDQ  BX, R8
 15926  	MOVSS (AX)(DI*4), X1
 15927  	MULSS X0, X1
 15928  	ADDSS (DX)(R8*4), X1
 15929  	MOVSS X1, (DX)(R8*4)
 15930  	ADDQ  CX, DI
 15931  	ADDQ  BX, R8
 15932  	MOVSS (AX)(DI*4), X1
 15933  	MULSS X0, X1
 15934  	ADDSS (DX)(R8*4), X1
 15935  	MOVSS X1, (DX)(R8*4)
 15936  	ADDQ  CX, DI
 15937  	ADDQ  BX, R8
 15938  	MOVSS (AX)(DI*4), X1
 15939  	MULSS X0, X1
 15940  	ADDSS (DX)(R8*4), X1
 15941  	MOVSS X1, (DX)(R8*4)
 15942  	ADDQ  CX, DI
 15943  	ADDQ  BX, R8
 15944  	MOVSS (AX)(DI*4), X1
 15945  	MULSS X0, X1
 15946  	ADDSS (DX)(R8*4), X1
 15947  	MOVSS X1, (DX)(R8*4)
 15948  	ADDQ  CX, DI
 15949  	ADDQ  BX, R8
 15950  	MOVSS (AX)(DI*4), X1
 15951  	MULSS X0, X1
 15952  	ADDSS (DX)(R8*4), X1
 15953  	MOVSS X1, (DX)(R8*4)
 15954  	ADDQ  CX, DI
 15955  	ADDQ  BX, R8
 15956  	MOVSS (AX)(DI*4), X1
 15957  	MULSS X0, X1
 15958  	ADDSS (DX)(R8*4), X1
 15959  	MOVSS X1, (DX)(R8*4)
 15960  	ADDQ  CX, DI
 15961  	ADDQ  BX, R8
 15962  	MOVSS (AX)(DI*4), X1
 15963  	MULSS X0, X1
 15964  	ADDSS (DX)(R8*4), X1
 15965  	MOVSS X1, (DX)(R8*4)
 15966  	ADDQ  CX, DI
 15967  	ADDQ  BX, R8
 15968  	SUBQ  $0x08, SI
 15969  
 15970  check_limit_unroll:
 15971  	CMPQ SI, $0x08
 15972  	JHI  loop_unroll
 15973  	JMP  check_limit
 15974  
 15975  loop:
 15976  	MOVSS (AX)(DI*4), X1
 15977  	MULSS X0, X1
 15978  	ADDSS (DX)(R8*4), X1
 15979  	MOVSS X1, (DX)(R8*4)
 15980  	DECQ  SI
 15981  	ADDQ  CX, DI
 15982  	ADDQ  BX, R8
 15983  
 15984  check_limit:
 15985  	CMPQ SI, $0x00
 15986  	JHI  loop
 15987  	RET
 15988  
 15989  // func AmdAxpyUnsafeX_V4A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 15990  // Requires: SSE
 15991  TEXT ·AmdAxpyUnsafeX_V4A16R8(SB), NOSPLIT, $0-48
 15992  	MOVSS alpha+0(FP), X0
 15993  	MOVQ  xs+8(FP), AX
 15994  	MOVQ  incx+16(FP), CX
 15995  	MOVQ  ys+24(FP), DX
 15996  	MOVQ  incy+32(FP), BX
 15997  	MOVQ  n+40(FP), SI
 15998  	XORQ  DI, DI
 15999  	XORQ  R8, R8
 16000  	JMP   check_limit_unroll
 16001  	PCALIGN $0x10
 16002  
 16003  loop_unroll:
 16004  	MOVSS (AX)(DI*4), X1
 16005  	MULSS X0, X1
 16006  	ADDSS (DX)(R8*4), X1
 16007  	MOVSS X1, (DX)(R8*4)
 16008  	ADDQ  CX, DI
 16009  	ADDQ  BX, R8
 16010  	MOVSS (AX)(DI*4), X1
 16011  	MULSS X0, X1
 16012  	ADDSS (DX)(R8*4), X1
 16013  	MOVSS X1, (DX)(R8*4)
 16014  	ADDQ  CX, DI
 16015  	ADDQ  BX, R8
 16016  	MOVSS (AX)(DI*4), X1
 16017  	MULSS X0, X1
 16018  	ADDSS (DX)(R8*4), X1
 16019  	MOVSS X1, (DX)(R8*4)
 16020  	ADDQ  CX, DI
 16021  	ADDQ  BX, R8
 16022  	MOVSS (AX)(DI*4), X1
 16023  	MULSS X0, X1
 16024  	ADDSS (DX)(R8*4), X1
 16025  	MOVSS X1, (DX)(R8*4)
 16026  	ADDQ  CX, DI
 16027  	ADDQ  BX, R8
 16028  	MOVSS (AX)(DI*4), X1
 16029  	MULSS X0, X1
 16030  	ADDSS (DX)(R8*4), X1
 16031  	MOVSS X1, (DX)(R8*4)
 16032  	ADDQ  CX, DI
 16033  	ADDQ  BX, R8
 16034  	MOVSS (AX)(DI*4), X1
 16035  	MULSS X0, X1
 16036  	ADDSS (DX)(R8*4), X1
 16037  	MOVSS X1, (DX)(R8*4)
 16038  	ADDQ  CX, DI
 16039  	ADDQ  BX, R8
 16040  	MOVSS (AX)(DI*4), X1
 16041  	MULSS X0, X1
 16042  	ADDSS (DX)(R8*4), X1
 16043  	MOVSS X1, (DX)(R8*4)
 16044  	ADDQ  CX, DI
 16045  	ADDQ  BX, R8
 16046  	MOVSS (AX)(DI*4), X1
 16047  	MULSS X0, X1
 16048  	ADDSS (DX)(R8*4), X1
 16049  	MOVSS X1, (DX)(R8*4)
 16050  	ADDQ  CX, DI
 16051  	ADDQ  BX, R8
 16052  	SUBQ  $0x08, SI
 16053  
 16054  check_limit_unroll:
 16055  	CMPQ SI, $0x08
 16056  	JHI  loop_unroll
 16057  	JMP  check_limit
 16058  
 16059  loop:
 16060  	MOVSS (AX)(DI*4), X1
 16061  	MULSS X0, X1
 16062  	ADDSS (DX)(R8*4), X1
 16063  	MOVSS X1, (DX)(R8*4)
 16064  	DECQ  SI
 16065  	ADDQ  CX, DI
 16066  	ADDQ  BX, R8
 16067  
 16068  check_limit:
 16069  	CMPQ SI, $0x00
 16070  	JHI  loop
 16071  	RET
 16072  
 16073  // func AmdAxpyUnsafeX_V5A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16074  // Requires: SSE
 16075  TEXT ·AmdAxpyUnsafeX_V5A16R8(SB), NOSPLIT, $0-48
 16076  	MOVSS alpha+0(FP), X0
 16077  	MOVQ  xs+8(FP), AX
 16078  	MOVQ  incx+16(FP), CX
 16079  	MOVQ  ys+24(FP), DX
 16080  	MOVQ  incy+32(FP), BX
 16081  	MOVQ  n+40(FP), SI
 16082  	XORQ  DI, DI
 16083  	XORQ  R8, R8
 16084  	JMP   check_limit_unroll
 16085  	PCALIGN $0x10
 16086  
 16087  loop_unroll:
 16088  	MOVSS (AX)(DI*4), X1
 16089  	MULSS X0, X1
 16090  	ADDSS (DX)(R8*4), X1
 16091  	MOVSS X1, (DX)(R8*4)
 16092  	ADDQ  CX, DI
 16093  	ADDQ  BX, R8
 16094  	MOVSS (AX)(DI*4), X1
 16095  	MULSS X0, X1
 16096  	ADDSS (DX)(R8*4), X1
 16097  	MOVSS X1, (DX)(R8*4)
 16098  	ADDQ  CX, DI
 16099  	ADDQ  BX, R8
 16100  	MOVSS (AX)(DI*4), X1
 16101  	MULSS X0, X1
 16102  	ADDSS (DX)(R8*4), X1
 16103  	MOVSS X1, (DX)(R8*4)
 16104  	ADDQ  CX, DI
 16105  	ADDQ  BX, R8
 16106  	MOVSS (AX)(DI*4), X1
 16107  	MULSS X0, X1
 16108  	ADDSS (DX)(R8*4), X1
 16109  	MOVSS X1, (DX)(R8*4)
 16110  	ADDQ  CX, DI
 16111  	ADDQ  BX, R8
 16112  	MOVSS (AX)(DI*4), X1
 16113  	MULSS X0, X1
 16114  	ADDSS (DX)(R8*4), X1
 16115  	MOVSS X1, (DX)(R8*4)
 16116  	ADDQ  CX, DI
 16117  	ADDQ  BX, R8
 16118  	MOVSS (AX)(DI*4), X1
 16119  	MULSS X0, X1
 16120  	ADDSS (DX)(R8*4), X1
 16121  	MOVSS X1, (DX)(R8*4)
 16122  	ADDQ  CX, DI
 16123  	ADDQ  BX, R8
 16124  	MOVSS (AX)(DI*4), X1
 16125  	MULSS X0, X1
 16126  	ADDSS (DX)(R8*4), X1
 16127  	MOVSS X1, (DX)(R8*4)
 16128  	ADDQ  CX, DI
 16129  	ADDQ  BX, R8
 16130  	MOVSS (AX)(DI*4), X1
 16131  	MULSS X0, X1
 16132  	ADDSS (DX)(R8*4), X1
 16133  	MOVSS X1, (DX)(R8*4)
 16134  	ADDQ  CX, DI
 16135  	ADDQ  BX, R8
 16136  	SUBQ  $0x08, SI
 16137  
 16138  check_limit_unroll:
 16139  	CMPQ SI, $0x08
 16140  	JHI  loop_unroll
 16141  	JMP  check_limit
 16142  
 16143  loop:
 16144  	MOVSS (AX)(DI*4), X1
 16145  	MULSS X0, X1
 16146  	ADDSS (DX)(R8*4), X1
 16147  	MOVSS X1, (DX)(R8*4)
 16148  	DECQ  SI
 16149  	ADDQ  CX, DI
 16150  	ADDQ  BX, R8
 16151  
 16152  check_limit:
 16153  	CMPQ SI, $0x00
 16154  	JHI  loop
 16155  	RET
 16156  
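// The AmdAxpyUnsafeXInterleave_* kernels compute the same result but
// reschedule the unrolled body: four x elements are loaded into X1-X4,
// all four are multiplied by alpha, and only then are the four
// read-modify-write passes over y performed.  The unroll factor is 4
// (R4) and the unroll check uses JHS (n >= 4), so the scalar tail loop
// handles at most three remaining elements.  A rough sketch of the
// schedule, with element indices written out explicitly:
//
//	x0, x1, x2, x3 := xs[i], xs[i+incx], xs[i+2*incx], xs[i+3*incx]
//	x0, x1, x2, x3 = alpha*x0, alpha*x1, alpha*x2, alpha*x3
//	ys[j] += x0
//	ys[j+incy] += x1
//	ys[j+2*incy] += x2
//	ys[j+3*incy] += x3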
 16157  // func AmdAxpyUnsafeXInterleave_V0A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16158  // Requires: SSE
 16159  TEXT ·AmdAxpyUnsafeXInterleave_V0A0R4(SB), NOSPLIT, $0-48
 16160  	MOVSS alpha+0(FP), X0
 16161  	MOVQ  xs+8(FP), AX
 16162  	MOVQ  incx+16(FP), CX
 16163  	MOVQ  ys+24(FP), DX
 16164  	MOVQ  incy+32(FP), BX
 16165  	MOVQ  n+40(FP), SI
 16166  	XORQ  DI, DI
 16167  	XORQ  R8, R8
 16168  	JMP   check_limit_unroll
 16169  
 16170  loop_unroll:
 16171  	MOVSS (AX)(DI*4), X1
 16172  	ADDQ  CX, DI
 16173  	MOVSS (AX)(DI*4), X2
 16174  	ADDQ  CX, DI
 16175  	MOVSS (AX)(DI*4), X3
 16176  	ADDQ  CX, DI
 16177  	MOVSS (AX)(DI*4), X4
 16178  	ADDQ  CX, DI
 16179  	MULSS X0, X1
 16180  	MULSS X0, X2
 16181  	MULSS X0, X3
 16182  	MULSS X0, X4
 16183  	ADDSS (DX)(R8*4), X1
 16184  	MOVSS X1, (DX)(R8*4)
 16185  	ADDQ  BX, R8
 16186  	ADDSS (DX)(R8*4), X2
 16187  	MOVSS X2, (DX)(R8*4)
 16188  	ADDQ  BX, R8
 16189  	ADDSS (DX)(R8*4), X3
 16190  	MOVSS X3, (DX)(R8*4)
 16191  	ADDQ  BX, R8
 16192  	ADDSS (DX)(R8*4), X4
 16193  	MOVSS X4, (DX)(R8*4)
 16194  	ADDQ  BX, R8
 16195  	SUBQ  $0x04, SI
 16196  
 16197  check_limit_unroll:
 16198  	CMPQ SI, $0x04
 16199  	JHS  loop_unroll
 16200  	JMP  check_limit
 16201  
 16202  loop:
 16203  	MOVSS (AX)(DI*4), X1
 16204  	MULSS X0, X1
 16205  	ADDSS (DX)(R8*4), X1
 16206  	MOVSS X1, (DX)(R8*4)
 16207  	DECQ  SI
 16208  	ADDQ  CX, DI
 16209  	ADDQ  BX, R8
 16210  
 16211  check_limit:
 16212  	CMPQ SI, $0x00
 16213  	JHI  loop
 16214  	RET
 16215  
 16216  // func AmdAxpyUnsafeXInterleave_V1A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16217  // Requires: SSE
 16218  TEXT ·AmdAxpyUnsafeXInterleave_V1A0R4(SB), NOSPLIT, $0-48
 16219  	MOVSS alpha+0(FP), X0
 16220  	MOVQ  xs+8(FP), AX
 16221  	MOVQ  incx+16(FP), CX
 16222  	MOVQ  ys+24(FP), DX
 16223  	MOVQ  incy+32(FP), BX
 16224  	MOVQ  n+40(FP), SI
 16225  	XORQ  DI, DI
 16226  	XORQ  R8, R8
 16227  	JMP   check_limit_unroll
 16228  
 16229  loop_unroll:
 16230  	MOVSS (AX)(DI*4), X1
 16231  	ADDQ  CX, DI
 16232  	MOVSS (AX)(DI*4), X2
 16233  	ADDQ  CX, DI
 16234  	MOVSS (AX)(DI*4), X3
 16235  	ADDQ  CX, DI
 16236  	MOVSS (AX)(DI*4), X4
 16237  	ADDQ  CX, DI
 16238  	MULSS X0, X1
 16239  	MULSS X0, X2
 16240  	MULSS X0, X3
 16241  	MULSS X0, X4
 16242  	ADDSS (DX)(R8*4), X1
 16243  	MOVSS X1, (DX)(R8*4)
 16244  	ADDQ  BX, R8
 16245  	ADDSS (DX)(R8*4), X2
 16246  	MOVSS X2, (DX)(R8*4)
 16247  	ADDQ  BX, R8
 16248  	ADDSS (DX)(R8*4), X3
 16249  	MOVSS X3, (DX)(R8*4)
 16250  	ADDQ  BX, R8
 16251  	ADDSS (DX)(R8*4), X4
 16252  	MOVSS X4, (DX)(R8*4)
 16253  	ADDQ  BX, R8
 16254  	SUBQ  $0x04, SI
 16255  
 16256  check_limit_unroll:
 16257  	CMPQ SI, $0x04
 16258  	JHS  loop_unroll
 16259  	JMP  check_limit
 16260  
 16261  loop:
 16262  	MOVSS (AX)(DI*4), X1
 16263  	MULSS X0, X1
 16264  	ADDSS (DX)(R8*4), X1
 16265  	MOVSS X1, (DX)(R8*4)
 16266  	DECQ  SI
 16267  	ADDQ  CX, DI
 16268  	ADDQ  BX, R8
 16269  
 16270  check_limit:
 16271  	CMPQ SI, $0x00
 16272  	JHI  loop
 16273  	RET
 16274  
 16275  // func AmdAxpyUnsafeXInterleave_V2A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16276  // Requires: SSE
 16277  TEXT ·AmdAxpyUnsafeXInterleave_V2A0R4(SB), NOSPLIT, $0-48
 16278  	MOVSS alpha+0(FP), X0
 16279  	MOVQ  xs+8(FP), AX
 16280  	MOVQ  incx+16(FP), CX
 16281  	MOVQ  ys+24(FP), DX
 16282  	MOVQ  incy+32(FP), BX
 16283  	MOVQ  n+40(FP), SI
 16284  	XORQ  DI, DI
 16285  	XORQ  R8, R8
 16286  	JMP   check_limit_unroll
 16287  
 16288  loop_unroll:
 16289  	MOVSS (AX)(DI*4), X1
 16290  	ADDQ  CX, DI
 16291  	MOVSS (AX)(DI*4), X2
 16292  	ADDQ  CX, DI
 16293  	MOVSS (AX)(DI*4), X3
 16294  	ADDQ  CX, DI
 16295  	MOVSS (AX)(DI*4), X4
 16296  	ADDQ  CX, DI
 16297  	MULSS X0, X1
 16298  	MULSS X0, X2
 16299  	MULSS X0, X3
 16300  	MULSS X0, X4
 16301  	ADDSS (DX)(R8*4), X1
 16302  	MOVSS X1, (DX)(R8*4)
 16303  	ADDQ  BX, R8
 16304  	ADDSS (DX)(R8*4), X2
 16305  	MOVSS X2, (DX)(R8*4)
 16306  	ADDQ  BX, R8
 16307  	ADDSS (DX)(R8*4), X3
 16308  	MOVSS X3, (DX)(R8*4)
 16309  	ADDQ  BX, R8
 16310  	ADDSS (DX)(R8*4), X4
 16311  	MOVSS X4, (DX)(R8*4)
 16312  	ADDQ  BX, R8
 16313  	SUBQ  $0x04, SI
 16314  
 16315  check_limit_unroll:
 16316  	CMPQ SI, $0x04
 16317  	JHS  loop_unroll
 16318  	JMP  check_limit
 16319  
 16320  loop:
 16321  	MOVSS (AX)(DI*4), X1
 16322  	MULSS X0, X1
 16323  	ADDSS (DX)(R8*4), X1
 16324  	MOVSS X1, (DX)(R8*4)
 16325  	DECQ  SI
 16326  	ADDQ  CX, DI
 16327  	ADDQ  BX, R8
 16328  
 16329  check_limit:
 16330  	CMPQ SI, $0x00
 16331  	JHI  loop
 16332  	RET
 16333  
 16334  // func AmdAxpyUnsafeXInterleave_V3A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16335  // Requires: SSE
 16336  TEXT ·AmdAxpyUnsafeXInterleave_V3A0R4(SB), NOSPLIT, $0-48
 16337  	MOVSS alpha+0(FP), X0
 16338  	MOVQ  xs+8(FP), AX
 16339  	MOVQ  incx+16(FP), CX
 16340  	MOVQ  ys+24(FP), DX
 16341  	MOVQ  incy+32(FP), BX
 16342  	MOVQ  n+40(FP), SI
 16343  	XORQ  DI, DI
 16344  	XORQ  R8, R8
 16345  	JMP   check_limit_unroll
 16346  
 16347  loop_unroll:
 16348  	MOVSS (AX)(DI*4), X1
 16349  	ADDQ  CX, DI
 16350  	MOVSS (AX)(DI*4), X2
 16351  	ADDQ  CX, DI
 16352  	MOVSS (AX)(DI*4), X3
 16353  	ADDQ  CX, DI
 16354  	MOVSS (AX)(DI*4), X4
 16355  	ADDQ  CX, DI
 16356  	MULSS X0, X1
 16357  	MULSS X0, X2
 16358  	MULSS X0, X3
 16359  	MULSS X0, X4
 16360  	ADDSS (DX)(R8*4), X1
 16361  	MOVSS X1, (DX)(R8*4)
 16362  	ADDQ  BX, R8
 16363  	ADDSS (DX)(R8*4), X2
 16364  	MOVSS X2, (DX)(R8*4)
 16365  	ADDQ  BX, R8
 16366  	ADDSS (DX)(R8*4), X3
 16367  	MOVSS X3, (DX)(R8*4)
 16368  	ADDQ  BX, R8
 16369  	ADDSS (DX)(R8*4), X4
 16370  	MOVSS X4, (DX)(R8*4)
 16371  	ADDQ  BX, R8
 16372  	SUBQ  $0x04, SI
 16373  
 16374  check_limit_unroll:
 16375  	CMPQ SI, $0x04
 16376  	JHS  loop_unroll
 16377  	JMP  check_limit
 16378  
 16379  loop:
 16380  	MOVSS (AX)(DI*4), X1
 16381  	MULSS X0, X1
 16382  	ADDSS (DX)(R8*4), X1
 16383  	MOVSS X1, (DX)(R8*4)
 16384  	DECQ  SI
 16385  	ADDQ  CX, DI
 16386  	ADDQ  BX, R8
 16387  
 16388  check_limit:
 16389  	CMPQ SI, $0x00
 16390  	JHI  loop
 16391  	RET
 16392  
 16393  // func AmdAxpyUnsafeXInterleave_V4A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16394  // Requires: SSE
 16395  TEXT ·AmdAxpyUnsafeXInterleave_V4A0R4(SB), NOSPLIT, $0-48
 16396  	MOVSS alpha+0(FP), X0
 16397  	MOVQ  xs+8(FP), AX
 16398  	MOVQ  incx+16(FP), CX
 16399  	MOVQ  ys+24(FP), DX
 16400  	MOVQ  incy+32(FP), BX
 16401  	MOVQ  n+40(FP), SI
 16402  	XORQ  DI, DI
 16403  	XORQ  R8, R8
 16404  	JMP   check_limit_unroll
 16405  
 16406  loop_unroll:
 16407  	MOVSS (AX)(DI*4), X1
 16408  	ADDQ  CX, DI
 16409  	MOVSS (AX)(DI*4), X2
 16410  	ADDQ  CX, DI
 16411  	MOVSS (AX)(DI*4), X3
 16412  	ADDQ  CX, DI
 16413  	MOVSS (AX)(DI*4), X4
 16414  	ADDQ  CX, DI
 16415  	MULSS X0, X1
 16416  	MULSS X0, X2
 16417  	MULSS X0, X3
 16418  	MULSS X0, X4
 16419  	ADDSS (DX)(R8*4), X1
 16420  	MOVSS X1, (DX)(R8*4)
 16421  	ADDQ  BX, R8
 16422  	ADDSS (DX)(R8*4), X2
 16423  	MOVSS X2, (DX)(R8*4)
 16424  	ADDQ  BX, R8
 16425  	ADDSS (DX)(R8*4), X3
 16426  	MOVSS X3, (DX)(R8*4)
 16427  	ADDQ  BX, R8
 16428  	ADDSS (DX)(R8*4), X4
 16429  	MOVSS X4, (DX)(R8*4)
 16430  	ADDQ  BX, R8
 16431  	SUBQ  $0x04, SI
 16432  
 16433  check_limit_unroll:
 16434  	CMPQ SI, $0x04
 16435  	JHS  loop_unroll
 16436  	JMP  check_limit
 16437  
 16438  loop:
 16439  	MOVSS (AX)(DI*4), X1
 16440  	MULSS X0, X1
 16441  	ADDSS (DX)(R8*4), X1
 16442  	MOVSS X1, (DX)(R8*4)
 16443  	DECQ  SI
 16444  	ADDQ  CX, DI
 16445  	ADDQ  BX, R8
 16446  
 16447  check_limit:
 16448  	CMPQ SI, $0x00
 16449  	JHI  loop
 16450  	RET
 16451  
 16452  // func AmdAxpyUnsafeXInterleave_V5A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16453  // Requires: SSE
 16454  TEXT ·AmdAxpyUnsafeXInterleave_V5A0R4(SB), NOSPLIT, $0-48
 16455  	MOVSS alpha+0(FP), X0
 16456  	MOVQ  xs+8(FP), AX
 16457  	MOVQ  incx+16(FP), CX
 16458  	MOVQ  ys+24(FP), DX
 16459  	MOVQ  incy+32(FP), BX
 16460  	MOVQ  n+40(FP), SI
 16461  	XORQ  DI, DI
 16462  	XORQ  R8, R8
 16463  	JMP   check_limit_unroll
 16464  
 16465  loop_unroll:
 16466  	MOVSS (AX)(DI*4), X1
 16467  	ADDQ  CX, DI
 16468  	MOVSS (AX)(DI*4), X2
 16469  	ADDQ  CX, DI
 16470  	MOVSS (AX)(DI*4), X3
 16471  	ADDQ  CX, DI
 16472  	MOVSS (AX)(DI*4), X4
 16473  	ADDQ  CX, DI
 16474  	MULSS X0, X1
 16475  	MULSS X0, X2
 16476  	MULSS X0, X3
 16477  	MULSS X0, X4
 16478  	ADDSS (DX)(R8*4), X1
 16479  	MOVSS X1, (DX)(R8*4)
 16480  	ADDQ  BX, R8
 16481  	ADDSS (DX)(R8*4), X2
 16482  	MOVSS X2, (DX)(R8*4)
 16483  	ADDQ  BX, R8
 16484  	ADDSS (DX)(R8*4), X3
 16485  	MOVSS X3, (DX)(R8*4)
 16486  	ADDQ  BX, R8
 16487  	ADDSS (DX)(R8*4), X4
 16488  	MOVSS X4, (DX)(R8*4)
 16489  	ADDQ  BX, R8
 16490  	SUBQ  $0x04, SI
 16491  
 16492  check_limit_unroll:
 16493  	CMPQ SI, $0x04
 16494  	JHS  loop_unroll
 16495  	JMP  check_limit
 16496  
 16497  loop:
 16498  	MOVSS (AX)(DI*4), X1
 16499  	MULSS X0, X1
 16500  	ADDSS (DX)(R8*4), X1
 16501  	MOVSS X1, (DX)(R8*4)
 16502  	DECQ  SI
 16503  	ADDQ  CX, DI
 16504  	ADDQ  BX, R8
 16505  
 16506  check_limit:
 16507  	CMPQ SI, $0x00
 16508  	JHI  loop
 16509  	RET
 16510  
 16511  // func AmdAxpyUnsafeXInterleave_V0A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16512  // Requires: SSE
 16513  TEXT ·AmdAxpyUnsafeXInterleave_V0A8R4(SB), NOSPLIT, $0-48
 16514  	MOVSS alpha+0(FP), X0
 16515  	MOVQ  xs+8(FP), AX
 16516  	MOVQ  incx+16(FP), CX
 16517  	MOVQ  ys+24(FP), DX
 16518  	MOVQ  incy+32(FP), BX
 16519  	MOVQ  n+40(FP), SI
 16520  	XORQ  DI, DI
 16521  	XORQ  R8, R8
 16522  	JMP   check_limit_unroll
 16523  	PCALIGN $0x08
 16524  
 16525  loop_unroll:
 16526  	MOVSS (AX)(DI*4), X1
 16527  	ADDQ  CX, DI
 16528  	MOVSS (AX)(DI*4), X2
 16529  	ADDQ  CX, DI
 16530  	MOVSS (AX)(DI*4), X3
 16531  	ADDQ  CX, DI
 16532  	MOVSS (AX)(DI*4), X4
 16533  	ADDQ  CX, DI
 16534  	MULSS X0, X1
 16535  	MULSS X0, X2
 16536  	MULSS X0, X3
 16537  	MULSS X0, X4
 16538  	ADDSS (DX)(R8*4), X1
 16539  	MOVSS X1, (DX)(R8*4)
 16540  	ADDQ  BX, R8
 16541  	ADDSS (DX)(R8*4), X2
 16542  	MOVSS X2, (DX)(R8*4)
 16543  	ADDQ  BX, R8
 16544  	ADDSS (DX)(R8*4), X3
 16545  	MOVSS X3, (DX)(R8*4)
 16546  	ADDQ  BX, R8
 16547  	ADDSS (DX)(R8*4), X4
 16548  	MOVSS X4, (DX)(R8*4)
 16549  	ADDQ  BX, R8
 16550  	SUBQ  $0x04, SI
 16551  
 16552  check_limit_unroll:
 16553  	CMPQ SI, $0x04
 16554  	JHS  loop_unroll
 16555  	JMP  check_limit
 16556  
 16557  loop:
 16558  	MOVSS (AX)(DI*4), X1
 16559  	MULSS X0, X1
 16560  	ADDSS (DX)(R8*4), X1
 16561  	MOVSS X1, (DX)(R8*4)
 16562  	DECQ  SI
 16563  	ADDQ  CX, DI
 16564  	ADDQ  BX, R8
 16565  
 16566  check_limit:
 16567  	CMPQ SI, $0x00
 16568  	JHI  loop
 16569  	RET
 16570  
 16571  // func AmdAxpyUnsafeXInterleave_V1A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16572  // Requires: SSE
 16573  TEXT ·AmdAxpyUnsafeXInterleave_V1A8R4(SB), NOSPLIT, $0-48
 16574  	MOVSS alpha+0(FP), X0
 16575  	MOVQ  xs+8(FP), AX
 16576  	MOVQ  incx+16(FP), CX
 16577  	MOVQ  ys+24(FP), DX
 16578  	MOVQ  incy+32(FP), BX
 16579  	MOVQ  n+40(FP), SI
 16580  	XORQ  DI, DI
 16581  	XORQ  R8, R8
 16582  	JMP   check_limit_unroll
 16583  	PCALIGN $0x08
 16584  
 16585  loop_unroll:
 16586  	MOVSS (AX)(DI*4), X1
 16587  	ADDQ  CX, DI
 16588  	MOVSS (AX)(DI*4), X2
 16589  	ADDQ  CX, DI
 16590  	MOVSS (AX)(DI*4), X3
 16591  	ADDQ  CX, DI
 16592  	MOVSS (AX)(DI*4), X4
 16593  	ADDQ  CX, DI
 16594  	MULSS X0, X1
 16595  	MULSS X0, X2
 16596  	MULSS X0, X3
 16597  	MULSS X0, X4
 16598  	ADDSS (DX)(R8*4), X1
 16599  	MOVSS X1, (DX)(R8*4)
 16600  	ADDQ  BX, R8
 16601  	ADDSS (DX)(R8*4), X2
 16602  	MOVSS X2, (DX)(R8*4)
 16603  	ADDQ  BX, R8
 16604  	ADDSS (DX)(R8*4), X3
 16605  	MOVSS X3, (DX)(R8*4)
 16606  	ADDQ  BX, R8
 16607  	ADDSS (DX)(R8*4), X4
 16608  	MOVSS X4, (DX)(R8*4)
 16609  	ADDQ  BX, R8
 16610  	SUBQ  $0x04, SI
 16611  
 16612  check_limit_unroll:
 16613  	CMPQ SI, $0x04
 16614  	JHS  loop_unroll
 16615  	JMP  check_limit
 16616  
 16617  loop:
 16618  	MOVSS (AX)(DI*4), X1
 16619  	MULSS X0, X1
 16620  	ADDSS (DX)(R8*4), X1
 16621  	MOVSS X1, (DX)(R8*4)
 16622  	DECQ  SI
 16623  	ADDQ  CX, DI
 16624  	ADDQ  BX, R8
 16625  
 16626  check_limit:
 16627  	CMPQ SI, $0x00
 16628  	JHI  loop
 16629  	RET
 16630  
 16631  // func AmdAxpyUnsafeXInterleave_V2A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16632  // Requires: SSE
 16633  TEXT ·AmdAxpyUnsafeXInterleave_V2A8R4(SB), NOSPLIT, $0-48
 16634  	MOVSS alpha+0(FP), X0
 16635  	MOVQ  xs+8(FP), AX
 16636  	MOVQ  incx+16(FP), CX
 16637  	MOVQ  ys+24(FP), DX
 16638  	MOVQ  incy+32(FP), BX
 16639  	MOVQ  n+40(FP), SI
 16640  	XORQ  DI, DI
 16641  	XORQ  R8, R8
 16642  	JMP   check_limit_unroll
 16643  	PCALIGN $0x08
 16644  
 16645  loop_unroll:
 16646  	MOVSS (AX)(DI*4), X1
 16647  	ADDQ  CX, DI
 16648  	MOVSS (AX)(DI*4), X2
 16649  	ADDQ  CX, DI
 16650  	MOVSS (AX)(DI*4), X3
 16651  	ADDQ  CX, DI
 16652  	MOVSS (AX)(DI*4), X4
 16653  	ADDQ  CX, DI
 16654  	MULSS X0, X1
 16655  	MULSS X0, X2
 16656  	MULSS X0, X3
 16657  	MULSS X0, X4
 16658  	ADDSS (DX)(R8*4), X1
 16659  	MOVSS X1, (DX)(R8*4)
 16660  	ADDQ  BX, R8
 16661  	ADDSS (DX)(R8*4), X2
 16662  	MOVSS X2, (DX)(R8*4)
 16663  	ADDQ  BX, R8
 16664  	ADDSS (DX)(R8*4), X3
 16665  	MOVSS X3, (DX)(R8*4)
 16666  	ADDQ  BX, R8
 16667  	ADDSS (DX)(R8*4), X4
 16668  	MOVSS X4, (DX)(R8*4)
 16669  	ADDQ  BX, R8
 16670  	SUBQ  $0x04, SI
 16671  
 16672  check_limit_unroll:
 16673  	CMPQ SI, $0x04
 16674  	JHS  loop_unroll
 16675  	JMP  check_limit
 16676  
 16677  loop:
 16678  	MOVSS (AX)(DI*4), X1
 16679  	MULSS X0, X1
 16680  	ADDSS (DX)(R8*4), X1
 16681  	MOVSS X1, (DX)(R8*4)
 16682  	DECQ  SI
 16683  	ADDQ  CX, DI
 16684  	ADDQ  BX, R8
 16685  
 16686  check_limit:
 16687  	CMPQ SI, $0x00
 16688  	JHI  loop
 16689  	RET
 16690  
 16691  // func AmdAxpyUnsafeXInterleave_V3A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16692  // Requires: SSE
 16693  TEXT ·AmdAxpyUnsafeXInterleave_V3A8R4(SB), NOSPLIT, $0-48
 16694  	MOVSS alpha+0(FP), X0
 16695  	MOVQ  xs+8(FP), AX
 16696  	MOVQ  incx+16(FP), CX
 16697  	MOVQ  ys+24(FP), DX
 16698  	MOVQ  incy+32(FP), BX
 16699  	MOVQ  n+40(FP), SI
 16700  	XORQ  DI, DI
 16701  	XORQ  R8, R8
 16702  	JMP   check_limit_unroll
 16703  	PCALIGN $0x08
 16704  
 16705  loop_unroll:
 16706  	MOVSS (AX)(DI*4), X1
 16707  	ADDQ  CX, DI
 16708  	MOVSS (AX)(DI*4), X2
 16709  	ADDQ  CX, DI
 16710  	MOVSS (AX)(DI*4), X3
 16711  	ADDQ  CX, DI
 16712  	MOVSS (AX)(DI*4), X4
 16713  	ADDQ  CX, DI
 16714  	MULSS X0, X1
 16715  	MULSS X0, X2
 16716  	MULSS X0, X3
 16717  	MULSS X0, X4
 16718  	ADDSS (DX)(R8*4), X1
 16719  	MOVSS X1, (DX)(R8*4)
 16720  	ADDQ  BX, R8
 16721  	ADDSS (DX)(R8*4), X2
 16722  	MOVSS X2, (DX)(R8*4)
 16723  	ADDQ  BX, R8
 16724  	ADDSS (DX)(R8*4), X3
 16725  	MOVSS X3, (DX)(R8*4)
 16726  	ADDQ  BX, R8
 16727  	ADDSS (DX)(R8*4), X4
 16728  	MOVSS X4, (DX)(R8*4)
 16729  	ADDQ  BX, R8
 16730  	SUBQ  $0x04, SI
 16731  
 16732  check_limit_unroll:
 16733  	CMPQ SI, $0x04
 16734  	JHS  loop_unroll
 16735  	JMP  check_limit
 16736  
 16737  loop:
 16738  	MOVSS (AX)(DI*4), X1
 16739  	MULSS X0, X1
 16740  	ADDSS (DX)(R8*4), X1
 16741  	MOVSS X1, (DX)(R8*4)
 16742  	DECQ  SI
 16743  	ADDQ  CX, DI
 16744  	ADDQ  BX, R8
 16745  
 16746  check_limit:
 16747  	CMPQ SI, $0x00
 16748  	JHI  loop
 16749  	RET
 16750  
 16751  // func AmdAxpyUnsafeXInterleave_V4A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16752  // Requires: SSE
 16753  TEXT ·AmdAxpyUnsafeXInterleave_V4A8R4(SB), NOSPLIT, $0-48
 16754  	MOVSS alpha+0(FP), X0
 16755  	MOVQ  xs+8(FP), AX
 16756  	MOVQ  incx+16(FP), CX
 16757  	MOVQ  ys+24(FP), DX
 16758  	MOVQ  incy+32(FP), BX
 16759  	MOVQ  n+40(FP), SI
 16760  	XORQ  DI, DI
 16761  	XORQ  R8, R8
 16762  	JMP   check_limit_unroll
 16763  	PCALIGN $0x08
 16764  
 16765  loop_unroll:
 16766  	MOVSS (AX)(DI*4), X1
 16767  	ADDQ  CX, DI
 16768  	MOVSS (AX)(DI*4), X2
 16769  	ADDQ  CX, DI
 16770  	MOVSS (AX)(DI*4), X3
 16771  	ADDQ  CX, DI
 16772  	MOVSS (AX)(DI*4), X4
 16773  	ADDQ  CX, DI
 16774  	MULSS X0, X1
 16775  	MULSS X0, X2
 16776  	MULSS X0, X3
 16777  	MULSS X0, X4
 16778  	ADDSS (DX)(R8*4), X1
 16779  	MOVSS X1, (DX)(R8*4)
 16780  	ADDQ  BX, R8
 16781  	ADDSS (DX)(R8*4), X2
 16782  	MOVSS X2, (DX)(R8*4)
 16783  	ADDQ  BX, R8
 16784  	ADDSS (DX)(R8*4), X3
 16785  	MOVSS X3, (DX)(R8*4)
 16786  	ADDQ  BX, R8
 16787  	ADDSS (DX)(R8*4), X4
 16788  	MOVSS X4, (DX)(R8*4)
 16789  	ADDQ  BX, R8
 16790  	SUBQ  $0x04, SI
 16791  
 16792  check_limit_unroll:
 16793  	CMPQ SI, $0x04
 16794  	JHS  loop_unroll
 16795  	JMP  check_limit
 16796  
 16797  loop:
 16798  	MOVSS (AX)(DI*4), X1
 16799  	MULSS X0, X1
 16800  	ADDSS (DX)(R8*4), X1
 16801  	MOVSS X1, (DX)(R8*4)
 16802  	DECQ  SI
 16803  	ADDQ  CX, DI
 16804  	ADDQ  BX, R8
 16805  
 16806  check_limit:
 16807  	CMPQ SI, $0x00
 16808  	JHI  loop
 16809  	RET
 16810  
 16811  // func AmdAxpyUnsafeXInterleave_V5A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16812  // Requires: SSE
 16813  TEXT ·AmdAxpyUnsafeXInterleave_V5A8R4(SB), NOSPLIT, $0-48
 16814  	MOVSS alpha+0(FP), X0
 16815  	MOVQ  xs+8(FP), AX
 16816  	MOVQ  incx+16(FP), CX
 16817  	MOVQ  ys+24(FP), DX
 16818  	MOVQ  incy+32(FP), BX
 16819  	MOVQ  n+40(FP), SI
 16820  	XORQ  DI, DI
 16821  	XORQ  R8, R8
 16822  	JMP   check_limit_unroll
 16823  	PCALIGN $0x08
 16824  
 16825  loop_unroll:
 16826  	MOVSS (AX)(DI*4), X1
 16827  	ADDQ  CX, DI
 16828  	MOVSS (AX)(DI*4), X2
 16829  	ADDQ  CX, DI
 16830  	MOVSS (AX)(DI*4), X3
 16831  	ADDQ  CX, DI
 16832  	MOVSS (AX)(DI*4), X4
 16833  	ADDQ  CX, DI
 16834  	MULSS X0, X1
 16835  	MULSS X0, X2
 16836  	MULSS X0, X3
 16837  	MULSS X0, X4
 16838  	ADDSS (DX)(R8*4), X1
 16839  	MOVSS X1, (DX)(R8*4)
 16840  	ADDQ  BX, R8
 16841  	ADDSS (DX)(R8*4), X2
 16842  	MOVSS X2, (DX)(R8*4)
 16843  	ADDQ  BX, R8
 16844  	ADDSS (DX)(R8*4), X3
 16845  	MOVSS X3, (DX)(R8*4)
 16846  	ADDQ  BX, R8
 16847  	ADDSS (DX)(R8*4), X4
 16848  	MOVSS X4, (DX)(R8*4)
 16849  	ADDQ  BX, R8
 16850  	SUBQ  $0x04, SI
 16851  
 16852  check_limit_unroll:
 16853  	CMPQ SI, $0x04
 16854  	JHS  loop_unroll
 16855  	JMP  check_limit
 16856  
 16857  loop:
 16858  	MOVSS (AX)(DI*4), X1
 16859  	MULSS X0, X1
 16860  	ADDSS (DX)(R8*4), X1
 16861  	MOVSS X1, (DX)(R8*4)
 16862  	DECQ  SI
 16863  	ADDQ  CX, DI
 16864  	ADDQ  BX, R8
 16865  
 16866  check_limit:
 16867  	CMPQ SI, $0x00
 16868  	JHI  loop
 16869  	RET
 16870  
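// The A9, A10, and A11 Interleave variants that follow differ from the
// A8 group above only in the NOP padding emitted after PCALIGN $0x08
// (one, two, and three NOPs respectively), which appears intended to
// place loop_unroll at successive byte offsets past an 8-byte boundary
// so the effect of loop placement can be compared.  The loop bodies
// are unchanged.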
 16871  // func AmdAxpyUnsafeXInterleave_V0A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16872  // Requires: SSE
 16873  TEXT ·AmdAxpyUnsafeXInterleave_V0A9R4(SB), NOSPLIT, $0-48
 16874  	MOVSS alpha+0(FP), X0
 16875  	MOVQ  xs+8(FP), AX
 16876  	MOVQ  incx+16(FP), CX
 16877  	MOVQ  ys+24(FP), DX
 16878  	MOVQ  incy+32(FP), BX
 16879  	MOVQ  n+40(FP), SI
 16880  	XORQ  DI, DI
 16881  	XORQ  R8, R8
 16882  	JMP   check_limit_unroll
 16883  	PCALIGN $0x08
 16884  	NOP
 16885  
 16886  loop_unroll:
 16887  	MOVSS (AX)(DI*4), X1
 16888  	ADDQ  CX, DI
 16889  	MOVSS (AX)(DI*4), X2
 16890  	ADDQ  CX, DI
 16891  	MOVSS (AX)(DI*4), X3
 16892  	ADDQ  CX, DI
 16893  	MOVSS (AX)(DI*4), X4
 16894  	ADDQ  CX, DI
 16895  	MULSS X0, X1
 16896  	MULSS X0, X2
 16897  	MULSS X0, X3
 16898  	MULSS X0, X4
 16899  	ADDSS (DX)(R8*4), X1
 16900  	MOVSS X1, (DX)(R8*4)
 16901  	ADDQ  BX, R8
 16902  	ADDSS (DX)(R8*4), X2
 16903  	MOVSS X2, (DX)(R8*4)
 16904  	ADDQ  BX, R8
 16905  	ADDSS (DX)(R8*4), X3
 16906  	MOVSS X3, (DX)(R8*4)
 16907  	ADDQ  BX, R8
 16908  	ADDSS (DX)(R8*4), X4
 16909  	MOVSS X4, (DX)(R8*4)
 16910  	ADDQ  BX, R8
 16911  	SUBQ  $0x04, SI
 16912  
 16913  check_limit_unroll:
 16914  	CMPQ SI, $0x04
 16915  	JHS  loop_unroll
 16916  	JMP  check_limit
 16917  
 16918  loop:
 16919  	MOVSS (AX)(DI*4), X1
 16920  	MULSS X0, X1
 16921  	ADDSS (DX)(R8*4), X1
 16922  	MOVSS X1, (DX)(R8*4)
 16923  	DECQ  SI
 16924  	ADDQ  CX, DI
 16925  	ADDQ  BX, R8
 16926  
 16927  check_limit:
 16928  	CMPQ SI, $0x00
 16929  	JHI  loop
 16930  	RET
 16931  
 16932  // func AmdAxpyUnsafeXInterleave_V1A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16933  // Requires: SSE
 16934  TEXT ·AmdAxpyUnsafeXInterleave_V1A9R4(SB), NOSPLIT, $0-48
 16935  	MOVSS alpha+0(FP), X0
 16936  	MOVQ  xs+8(FP), AX
 16937  	MOVQ  incx+16(FP), CX
 16938  	MOVQ  ys+24(FP), DX
 16939  	MOVQ  incy+32(FP), BX
 16940  	MOVQ  n+40(FP), SI
 16941  	XORQ  DI, DI
 16942  	XORQ  R8, R8
 16943  	JMP   check_limit_unroll
 16944  	PCALIGN $0x08
 16945  	NOP
 16946  
 16947  loop_unroll:
 16948  	MOVSS (AX)(DI*4), X1
 16949  	ADDQ  CX, DI
 16950  	MOVSS (AX)(DI*4), X2
 16951  	ADDQ  CX, DI
 16952  	MOVSS (AX)(DI*4), X3
 16953  	ADDQ  CX, DI
 16954  	MOVSS (AX)(DI*4), X4
 16955  	ADDQ  CX, DI
 16956  	MULSS X0, X1
 16957  	MULSS X0, X2
 16958  	MULSS X0, X3
 16959  	MULSS X0, X4
 16960  	ADDSS (DX)(R8*4), X1
 16961  	MOVSS X1, (DX)(R8*4)
 16962  	ADDQ  BX, R8
 16963  	ADDSS (DX)(R8*4), X2
 16964  	MOVSS X2, (DX)(R8*4)
 16965  	ADDQ  BX, R8
 16966  	ADDSS (DX)(R8*4), X3
 16967  	MOVSS X3, (DX)(R8*4)
 16968  	ADDQ  BX, R8
 16969  	ADDSS (DX)(R8*4), X4
 16970  	MOVSS X4, (DX)(R8*4)
 16971  	ADDQ  BX, R8
 16972  	SUBQ  $0x04, SI
 16973  
 16974  check_limit_unroll:
 16975  	CMPQ SI, $0x04
 16976  	JHS  loop_unroll
 16977  	JMP  check_limit
 16978  
 16979  loop:
 16980  	MOVSS (AX)(DI*4), X1
 16981  	MULSS X0, X1
 16982  	ADDSS (DX)(R8*4), X1
 16983  	MOVSS X1, (DX)(R8*4)
 16984  	DECQ  SI
 16985  	ADDQ  CX, DI
 16986  	ADDQ  BX, R8
 16987  
 16988  check_limit:
 16989  	CMPQ SI, $0x00
 16990  	JHI  loop
 16991  	RET
 16992  
 16993  // func AmdAxpyUnsafeXInterleave_V2A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 16994  // Requires: SSE
 16995  TEXT ·AmdAxpyUnsafeXInterleave_V2A9R4(SB), NOSPLIT, $0-48
 16996  	MOVSS alpha+0(FP), X0
 16997  	MOVQ  xs+8(FP), AX
 16998  	MOVQ  incx+16(FP), CX
 16999  	MOVQ  ys+24(FP), DX
 17000  	MOVQ  incy+32(FP), BX
 17001  	MOVQ  n+40(FP), SI
 17002  	XORQ  DI, DI
 17003  	XORQ  R8, R8
 17004  	JMP   check_limit_unroll
 17005  	PCALIGN $0x08
 17006  	NOP
 17007  
 17008  loop_unroll:
 17009  	MOVSS (AX)(DI*4), X1
 17010  	ADDQ  CX, DI
 17011  	MOVSS (AX)(DI*4), X2
 17012  	ADDQ  CX, DI
 17013  	MOVSS (AX)(DI*4), X3
 17014  	ADDQ  CX, DI
 17015  	MOVSS (AX)(DI*4), X4
 17016  	ADDQ  CX, DI
 17017  	MULSS X0, X1
 17018  	MULSS X0, X2
 17019  	MULSS X0, X3
 17020  	MULSS X0, X4
 17021  	ADDSS (DX)(R8*4), X1
 17022  	MOVSS X1, (DX)(R8*4)
 17023  	ADDQ  BX, R8
 17024  	ADDSS (DX)(R8*4), X2
 17025  	MOVSS X2, (DX)(R8*4)
 17026  	ADDQ  BX, R8
 17027  	ADDSS (DX)(R8*4), X3
 17028  	MOVSS X3, (DX)(R8*4)
 17029  	ADDQ  BX, R8
 17030  	ADDSS (DX)(R8*4), X4
 17031  	MOVSS X4, (DX)(R8*4)
 17032  	ADDQ  BX, R8
 17033  	SUBQ  $0x04, SI
 17034  
 17035  check_limit_unroll:
 17036  	CMPQ SI, $0x04
 17037  	JHS  loop_unroll
 17038  	JMP  check_limit
 17039  
 17040  loop:
 17041  	MOVSS (AX)(DI*4), X1
 17042  	MULSS X0, X1
 17043  	ADDSS (DX)(R8*4), X1
 17044  	MOVSS X1, (DX)(R8*4)
 17045  	DECQ  SI
 17046  	ADDQ  CX, DI
 17047  	ADDQ  BX, R8
 17048  
 17049  check_limit:
 17050  	CMPQ SI, $0x00
 17051  	JHI  loop
 17052  	RET
 17053  
 17054  // func AmdAxpyUnsafeXInterleave_V3A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17055  // Requires: SSE
 17056  TEXT ·AmdAxpyUnsafeXInterleave_V3A9R4(SB), NOSPLIT, $0-48
 17057  	MOVSS alpha+0(FP), X0
 17058  	MOVQ  xs+8(FP), AX
 17059  	MOVQ  incx+16(FP), CX
 17060  	MOVQ  ys+24(FP), DX
 17061  	MOVQ  incy+32(FP), BX
 17062  	MOVQ  n+40(FP), SI
 17063  	XORQ  DI, DI
 17064  	XORQ  R8, R8
 17065  	JMP   check_limit_unroll
 17066  	PCALIGN $0x08
 17067  	NOP
 17068  
 17069  loop_unroll:
 17070  	MOVSS (AX)(DI*4), X1
 17071  	ADDQ  CX, DI
 17072  	MOVSS (AX)(DI*4), X2
 17073  	ADDQ  CX, DI
 17074  	MOVSS (AX)(DI*4), X3
 17075  	ADDQ  CX, DI
 17076  	MOVSS (AX)(DI*4), X4
 17077  	ADDQ  CX, DI
 17078  	MULSS X0, X1
 17079  	MULSS X0, X2
 17080  	MULSS X0, X3
 17081  	MULSS X0, X4
 17082  	ADDSS (DX)(R8*4), X1
 17083  	MOVSS X1, (DX)(R8*4)
 17084  	ADDQ  BX, R8
 17085  	ADDSS (DX)(R8*4), X2
 17086  	MOVSS X2, (DX)(R8*4)
 17087  	ADDQ  BX, R8
 17088  	ADDSS (DX)(R8*4), X3
 17089  	MOVSS X3, (DX)(R8*4)
 17090  	ADDQ  BX, R8
 17091  	ADDSS (DX)(R8*4), X4
 17092  	MOVSS X4, (DX)(R8*4)
 17093  	ADDQ  BX, R8
 17094  	SUBQ  $0x04, SI
 17095  
 17096  check_limit_unroll:
 17097  	CMPQ SI, $0x04
 17098  	JHS  loop_unroll
 17099  	JMP  check_limit
 17100  
 17101  loop:
 17102  	MOVSS (AX)(DI*4), X1
 17103  	MULSS X0, X1
 17104  	ADDSS (DX)(R8*4), X1
 17105  	MOVSS X1, (DX)(R8*4)
 17106  	DECQ  SI
 17107  	ADDQ  CX, DI
 17108  	ADDQ  BX, R8
 17109  
 17110  check_limit:
 17111  	CMPQ SI, $0x00
 17112  	JHI  loop
 17113  	RET
 17114  
 17115  // func AmdAxpyUnsafeXInterleave_V4A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17116  // Requires: SSE
 17117  TEXT ·AmdAxpyUnsafeXInterleave_V4A9R4(SB), NOSPLIT, $0-48
 17118  	MOVSS alpha+0(FP), X0
 17119  	MOVQ  xs+8(FP), AX
 17120  	MOVQ  incx+16(FP), CX
 17121  	MOVQ  ys+24(FP), DX
 17122  	MOVQ  incy+32(FP), BX
 17123  	MOVQ  n+40(FP), SI
 17124  	XORQ  DI, DI
 17125  	XORQ  R8, R8
 17126  	JMP   check_limit_unroll
 17127  	PCALIGN $0x08
 17128  	NOP
 17129  
 17130  loop_unroll:
 17131  	MOVSS (AX)(DI*4), X1
 17132  	ADDQ  CX, DI
 17133  	MOVSS (AX)(DI*4), X2
 17134  	ADDQ  CX, DI
 17135  	MOVSS (AX)(DI*4), X3
 17136  	ADDQ  CX, DI
 17137  	MOVSS (AX)(DI*4), X4
 17138  	ADDQ  CX, DI
 17139  	MULSS X0, X1
 17140  	MULSS X0, X2
 17141  	MULSS X0, X3
 17142  	MULSS X0, X4
 17143  	ADDSS (DX)(R8*4), X1
 17144  	MOVSS X1, (DX)(R8*4)
 17145  	ADDQ  BX, R8
 17146  	ADDSS (DX)(R8*4), X2
 17147  	MOVSS X2, (DX)(R8*4)
 17148  	ADDQ  BX, R8
 17149  	ADDSS (DX)(R8*4), X3
 17150  	MOVSS X3, (DX)(R8*4)
 17151  	ADDQ  BX, R8
 17152  	ADDSS (DX)(R8*4), X4
 17153  	MOVSS X4, (DX)(R8*4)
 17154  	ADDQ  BX, R8
 17155  	SUBQ  $0x04, SI
 17156  
 17157  check_limit_unroll:
 17158  	CMPQ SI, $0x04
 17159  	JHS  loop_unroll
 17160  	JMP  check_limit
 17161  
 17162  loop:
 17163  	MOVSS (AX)(DI*4), X1
 17164  	MULSS X0, X1
 17165  	ADDSS (DX)(R8*4), X1
 17166  	MOVSS X1, (DX)(R8*4)
 17167  	DECQ  SI
 17168  	ADDQ  CX, DI
 17169  	ADDQ  BX, R8
 17170  
 17171  check_limit:
 17172  	CMPQ SI, $0x00
 17173  	JHI  loop
 17174  	RET
 17175  
 17176  // func AmdAxpyUnsafeXInterleave_V5A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17177  // Requires: SSE
 17178  TEXT ·AmdAxpyUnsafeXInterleave_V5A9R4(SB), NOSPLIT, $0-48
 17179  	MOVSS alpha+0(FP), X0
 17180  	MOVQ  xs+8(FP), AX
 17181  	MOVQ  incx+16(FP), CX
 17182  	MOVQ  ys+24(FP), DX
 17183  	MOVQ  incy+32(FP), BX
 17184  	MOVQ  n+40(FP), SI
 17185  	XORQ  DI, DI
 17186  	XORQ  R8, R8
 17187  	JMP   check_limit_unroll
 17188  	PCALIGN $0x08
 17189  	NOP
 17190  
 17191  loop_unroll:
 17192  	MOVSS (AX)(DI*4), X1
 17193  	ADDQ  CX, DI
 17194  	MOVSS (AX)(DI*4), X2
 17195  	ADDQ  CX, DI
 17196  	MOVSS (AX)(DI*4), X3
 17197  	ADDQ  CX, DI
 17198  	MOVSS (AX)(DI*4), X4
 17199  	ADDQ  CX, DI
 17200  	MULSS X0, X1
 17201  	MULSS X0, X2
 17202  	MULSS X0, X3
 17203  	MULSS X0, X4
 17204  	ADDSS (DX)(R8*4), X1
 17205  	MOVSS X1, (DX)(R8*4)
 17206  	ADDQ  BX, R8
 17207  	ADDSS (DX)(R8*4), X2
 17208  	MOVSS X2, (DX)(R8*4)
 17209  	ADDQ  BX, R8
 17210  	ADDSS (DX)(R8*4), X3
 17211  	MOVSS X3, (DX)(R8*4)
 17212  	ADDQ  BX, R8
 17213  	ADDSS (DX)(R8*4), X4
 17214  	MOVSS X4, (DX)(R8*4)
 17215  	ADDQ  BX, R8
 17216  	SUBQ  $0x04, SI
 17217  
 17218  check_limit_unroll:
 17219  	CMPQ SI, $0x04
 17220  	JHS  loop_unroll
 17221  	JMP  check_limit
 17222  
 17223  loop:
 17224  	MOVSS (AX)(DI*4), X1
 17225  	MULSS X0, X1
 17226  	ADDSS (DX)(R8*4), X1
 17227  	MOVSS X1, (DX)(R8*4)
 17228  	DECQ  SI
 17229  	ADDQ  CX, DI
 17230  	ADDQ  BX, R8
 17231  
 17232  check_limit:
 17233  	CMPQ SI, $0x00
 17234  	JHI  loop
 17235  	RET
 17236  
 17237  // func AmdAxpyUnsafeXInterleave_V0A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17238  // Requires: SSE
 17239  TEXT ·AmdAxpyUnsafeXInterleave_V0A10R4(SB), NOSPLIT, $0-48
 17240  	MOVSS alpha+0(FP), X0
 17241  	MOVQ  xs+8(FP), AX
 17242  	MOVQ  incx+16(FP), CX
 17243  	MOVQ  ys+24(FP), DX
 17244  	MOVQ  incy+32(FP), BX
 17245  	MOVQ  n+40(FP), SI
 17246  	XORQ  DI, DI
 17247  	XORQ  R8, R8
 17248  	JMP   check_limit_unroll
 17249  	PCALIGN $0x08
 17250  	NOP
 17251  	NOP
 17252  
 17253  loop_unroll:
 17254  	MOVSS (AX)(DI*4), X1
 17255  	ADDQ  CX, DI
 17256  	MOVSS (AX)(DI*4), X2
 17257  	ADDQ  CX, DI
 17258  	MOVSS (AX)(DI*4), X3
 17259  	ADDQ  CX, DI
 17260  	MOVSS (AX)(DI*4), X4
 17261  	ADDQ  CX, DI
 17262  	MULSS X0, X1
 17263  	MULSS X0, X2
 17264  	MULSS X0, X3
 17265  	MULSS X0, X4
 17266  	ADDSS (DX)(R8*4), X1
 17267  	MOVSS X1, (DX)(R8*4)
 17268  	ADDQ  BX, R8
 17269  	ADDSS (DX)(R8*4), X2
 17270  	MOVSS X2, (DX)(R8*4)
 17271  	ADDQ  BX, R8
 17272  	ADDSS (DX)(R8*4), X3
 17273  	MOVSS X3, (DX)(R8*4)
 17274  	ADDQ  BX, R8
 17275  	ADDSS (DX)(R8*4), X4
 17276  	MOVSS X4, (DX)(R8*4)
 17277  	ADDQ  BX, R8
 17278  	SUBQ  $0x04, SI
 17279  
 17280  check_limit_unroll:
 17281  	CMPQ SI, $0x04
 17282  	JHS  loop_unroll
 17283  	JMP  check_limit
 17284  
 17285  loop:
 17286  	MOVSS (AX)(DI*4), X1
 17287  	MULSS X0, X1
 17288  	ADDSS (DX)(R8*4), X1
 17289  	MOVSS X1, (DX)(R8*4)
 17290  	DECQ  SI
 17291  	ADDQ  CX, DI
 17292  	ADDQ  BX, R8
 17293  
 17294  check_limit:
 17295  	CMPQ SI, $0x00
 17296  	JHI  loop
 17297  	RET
 17298  
 17299  // func AmdAxpyUnsafeXInterleave_V1A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17300  // Requires: SSE
 17301  TEXT ·AmdAxpyUnsafeXInterleave_V1A10R4(SB), NOSPLIT, $0-48
 17302  	MOVSS alpha+0(FP), X0
 17303  	MOVQ  xs+8(FP), AX
 17304  	MOVQ  incx+16(FP), CX
 17305  	MOVQ  ys+24(FP), DX
 17306  	MOVQ  incy+32(FP), BX
 17307  	MOVQ  n+40(FP), SI
 17308  	XORQ  DI, DI
 17309  	XORQ  R8, R8
 17310  	JMP   check_limit_unroll
 17311  	PCALIGN $0x08
 17312  	NOP
 17313  	NOP
 17314  
 17315  loop_unroll:
 17316  	MOVSS (AX)(DI*4), X1
 17317  	ADDQ  CX, DI
 17318  	MOVSS (AX)(DI*4), X2
 17319  	ADDQ  CX, DI
 17320  	MOVSS (AX)(DI*4), X3
 17321  	ADDQ  CX, DI
 17322  	MOVSS (AX)(DI*4), X4
 17323  	ADDQ  CX, DI
 17324  	MULSS X0, X1
 17325  	MULSS X0, X2
 17326  	MULSS X0, X3
 17327  	MULSS X0, X4
 17328  	ADDSS (DX)(R8*4), X1
 17329  	MOVSS X1, (DX)(R8*4)
 17330  	ADDQ  BX, R8
 17331  	ADDSS (DX)(R8*4), X2
 17332  	MOVSS X2, (DX)(R8*4)
 17333  	ADDQ  BX, R8
 17334  	ADDSS (DX)(R8*4), X3
 17335  	MOVSS X3, (DX)(R8*4)
 17336  	ADDQ  BX, R8
 17337  	ADDSS (DX)(R8*4), X4
 17338  	MOVSS X4, (DX)(R8*4)
 17339  	ADDQ  BX, R8
 17340  	SUBQ  $0x04, SI
 17341  
 17342  check_limit_unroll:
 17343  	CMPQ SI, $0x04
 17344  	JHS  loop_unroll
 17345  	JMP  check_limit
 17346  
 17347  loop:
 17348  	MOVSS (AX)(DI*4), X1
 17349  	MULSS X0, X1
 17350  	ADDSS (DX)(R8*4), X1
 17351  	MOVSS X1, (DX)(R8*4)
 17352  	DECQ  SI
 17353  	ADDQ  CX, DI
 17354  	ADDQ  BX, R8
 17355  
 17356  check_limit:
 17357  	CMPQ SI, $0x00
 17358  	JHI  loop
 17359  	RET
 17360  
 17361  // func AmdAxpyUnsafeXInterleave_V2A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17362  // Requires: SSE
 17363  TEXT ·AmdAxpyUnsafeXInterleave_V2A10R4(SB), NOSPLIT, $0-48
 17364  	MOVSS alpha+0(FP), X0
 17365  	MOVQ  xs+8(FP), AX
 17366  	MOVQ  incx+16(FP), CX
 17367  	MOVQ  ys+24(FP), DX
 17368  	MOVQ  incy+32(FP), BX
 17369  	MOVQ  n+40(FP), SI
 17370  	XORQ  DI, DI
 17371  	XORQ  R8, R8
 17372  	JMP   check_limit_unroll
 17373  	PCALIGN $0x08
 17374  	NOP
 17375  	NOP
 17376  
 17377  loop_unroll:
 17378  	MOVSS (AX)(DI*4), X1
 17379  	ADDQ  CX, DI
 17380  	MOVSS (AX)(DI*4), X2
 17381  	ADDQ  CX, DI
 17382  	MOVSS (AX)(DI*4), X3
 17383  	ADDQ  CX, DI
 17384  	MOVSS (AX)(DI*4), X4
 17385  	ADDQ  CX, DI
 17386  	MULSS X0, X1
 17387  	MULSS X0, X2
 17388  	MULSS X0, X3
 17389  	MULSS X0, X4
 17390  	ADDSS (DX)(R8*4), X1
 17391  	MOVSS X1, (DX)(R8*4)
 17392  	ADDQ  BX, R8
 17393  	ADDSS (DX)(R8*4), X2
 17394  	MOVSS X2, (DX)(R8*4)
 17395  	ADDQ  BX, R8
 17396  	ADDSS (DX)(R8*4), X3
 17397  	MOVSS X3, (DX)(R8*4)
 17398  	ADDQ  BX, R8
 17399  	ADDSS (DX)(R8*4), X4
 17400  	MOVSS X4, (DX)(R8*4)
 17401  	ADDQ  BX, R8
 17402  	SUBQ  $0x04, SI
 17403  
 17404  check_limit_unroll:
 17405  	CMPQ SI, $0x04
 17406  	JHS  loop_unroll
 17407  	JMP  check_limit
 17408  
 17409  loop:
 17410  	MOVSS (AX)(DI*4), X1
 17411  	MULSS X0, X1
 17412  	ADDSS (DX)(R8*4), X1
 17413  	MOVSS X1, (DX)(R8*4)
 17414  	DECQ  SI
 17415  	ADDQ  CX, DI
 17416  	ADDQ  BX, R8
 17417  
 17418  check_limit:
 17419  	CMPQ SI, $0x00
 17420  	JHI  loop
 17421  	RET
 17422  
 17423  // func AmdAxpyUnsafeXInterleave_V3A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17424  // Requires: SSE
 17425  TEXT ·AmdAxpyUnsafeXInterleave_V3A10R4(SB), NOSPLIT, $0-48
 17426  	MOVSS alpha+0(FP), X0
 17427  	MOVQ  xs+8(FP), AX
 17428  	MOVQ  incx+16(FP), CX
 17429  	MOVQ  ys+24(FP), DX
 17430  	MOVQ  incy+32(FP), BX
 17431  	MOVQ  n+40(FP), SI
 17432  	XORQ  DI, DI
 17433  	XORQ  R8, R8
 17434  	JMP   check_limit_unroll
 17435  	PCALIGN $0x08
 17436  	NOP
 17437  	NOP
 17438  
 17439  loop_unroll:
 17440  	MOVSS (AX)(DI*4), X1
 17441  	ADDQ  CX, DI
 17442  	MOVSS (AX)(DI*4), X2
 17443  	ADDQ  CX, DI
 17444  	MOVSS (AX)(DI*4), X3
 17445  	ADDQ  CX, DI
 17446  	MOVSS (AX)(DI*4), X4
 17447  	ADDQ  CX, DI
 17448  	MULSS X0, X1
 17449  	MULSS X0, X2
 17450  	MULSS X0, X3
 17451  	MULSS X0, X4
 17452  	ADDSS (DX)(R8*4), X1
 17453  	MOVSS X1, (DX)(R8*4)
 17454  	ADDQ  BX, R8
 17455  	ADDSS (DX)(R8*4), X2
 17456  	MOVSS X2, (DX)(R8*4)
 17457  	ADDQ  BX, R8
 17458  	ADDSS (DX)(R8*4), X3
 17459  	MOVSS X3, (DX)(R8*4)
 17460  	ADDQ  BX, R8
 17461  	ADDSS (DX)(R8*4), X4
 17462  	MOVSS X4, (DX)(R8*4)
 17463  	ADDQ  BX, R8
 17464  	SUBQ  $0x04, SI
 17465  
 17466  check_limit_unroll:
 17467  	CMPQ SI, $0x04
 17468  	JHS  loop_unroll
 17469  	JMP  check_limit
 17470  
 17471  loop:
 17472  	MOVSS (AX)(DI*4), X1
 17473  	MULSS X0, X1
 17474  	ADDSS (DX)(R8*4), X1
 17475  	MOVSS X1, (DX)(R8*4)
 17476  	DECQ  SI
 17477  	ADDQ  CX, DI
 17478  	ADDQ  BX, R8
 17479  
 17480  check_limit:
 17481  	CMPQ SI, $0x00
 17482  	JHI  loop
 17483  	RET
 17484  
 17485  // func AmdAxpyUnsafeXInterleave_V4A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17486  // Requires: SSE
 17487  TEXT ·AmdAxpyUnsafeXInterleave_V4A10R4(SB), NOSPLIT, $0-48
 17488  	MOVSS alpha+0(FP), X0
 17489  	MOVQ  xs+8(FP), AX
 17490  	MOVQ  incx+16(FP), CX
 17491  	MOVQ  ys+24(FP), DX
 17492  	MOVQ  incy+32(FP), BX
 17493  	MOVQ  n+40(FP), SI
 17494  	XORQ  DI, DI
 17495  	XORQ  R8, R8
 17496  	JMP   check_limit_unroll
 17497  	PCALIGN $0x08
 17498  	NOP
 17499  	NOP
 17500  
 17501  loop_unroll:
 17502  	MOVSS (AX)(DI*4), X1
 17503  	ADDQ  CX, DI
 17504  	MOVSS (AX)(DI*4), X2
 17505  	ADDQ  CX, DI
 17506  	MOVSS (AX)(DI*4), X3
 17507  	ADDQ  CX, DI
 17508  	MOVSS (AX)(DI*4), X4
 17509  	ADDQ  CX, DI
 17510  	MULSS X0, X1
 17511  	MULSS X0, X2
 17512  	MULSS X0, X3
 17513  	MULSS X0, X4
 17514  	ADDSS (DX)(R8*4), X1
 17515  	MOVSS X1, (DX)(R8*4)
 17516  	ADDQ  BX, R8
 17517  	ADDSS (DX)(R8*4), X2
 17518  	MOVSS X2, (DX)(R8*4)
 17519  	ADDQ  BX, R8
 17520  	ADDSS (DX)(R8*4), X3
 17521  	MOVSS X3, (DX)(R8*4)
 17522  	ADDQ  BX, R8
 17523  	ADDSS (DX)(R8*4), X4
 17524  	MOVSS X4, (DX)(R8*4)
 17525  	ADDQ  BX, R8
 17526  	SUBQ  $0x04, SI
 17527  
 17528  check_limit_unroll:
 17529  	CMPQ SI, $0x04
 17530  	JHS  loop_unroll
 17531  	JMP  check_limit
 17532  
 17533  loop:
 17534  	MOVSS (AX)(DI*4), X1
 17535  	MULSS X0, X1
 17536  	ADDSS (DX)(R8*4), X1
 17537  	MOVSS X1, (DX)(R8*4)
 17538  	DECQ  SI
 17539  	ADDQ  CX, DI
 17540  	ADDQ  BX, R8
 17541  
 17542  check_limit:
 17543  	CMPQ SI, $0x00
 17544  	JHI  loop
 17545  	RET
 17546  
 17547  // func AmdAxpyUnsafeXInterleave_V5A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17548  // Requires: SSE
 17549  TEXT ·AmdAxpyUnsafeXInterleave_V5A10R4(SB), NOSPLIT, $0-48
 17550  	MOVSS alpha+0(FP), X0
 17551  	MOVQ  xs+8(FP), AX
 17552  	MOVQ  incx+16(FP), CX
 17553  	MOVQ  ys+24(FP), DX
 17554  	MOVQ  incy+32(FP), BX
 17555  	MOVQ  n+40(FP), SI
 17556  	XORQ  DI, DI
 17557  	XORQ  R8, R8
 17558  	JMP   check_limit_unroll
 17559  	PCALIGN $0x08
 17560  	NOP
 17561  	NOP
 17562  
 17563  loop_unroll:
 17564  	MOVSS (AX)(DI*4), X1
 17565  	ADDQ  CX, DI
 17566  	MOVSS (AX)(DI*4), X2
 17567  	ADDQ  CX, DI
 17568  	MOVSS (AX)(DI*4), X3
 17569  	ADDQ  CX, DI
 17570  	MOVSS (AX)(DI*4), X4
 17571  	ADDQ  CX, DI
 17572  	MULSS X0, X1
 17573  	MULSS X0, X2
 17574  	MULSS X0, X3
 17575  	MULSS X0, X4
 17576  	ADDSS (DX)(R8*4), X1
 17577  	MOVSS X1, (DX)(R8*4)
 17578  	ADDQ  BX, R8
 17579  	ADDSS (DX)(R8*4), X2
 17580  	MOVSS X2, (DX)(R8*4)
 17581  	ADDQ  BX, R8
 17582  	ADDSS (DX)(R8*4), X3
 17583  	MOVSS X3, (DX)(R8*4)
 17584  	ADDQ  BX, R8
 17585  	ADDSS (DX)(R8*4), X4
 17586  	MOVSS X4, (DX)(R8*4)
 17587  	ADDQ  BX, R8
 17588  	SUBQ  $0x04, SI
 17589  
 17590  check_limit_unroll:
 17591  	CMPQ SI, $0x04
 17592  	JHS  loop_unroll
 17593  	JMP  check_limit
 17594  
 17595  loop:
 17596  	MOVSS (AX)(DI*4), X1
 17597  	MULSS X0, X1
 17598  	ADDSS (DX)(R8*4), X1
 17599  	MOVSS X1, (DX)(R8*4)
 17600  	DECQ  SI
 17601  	ADDQ  CX, DI
 17602  	ADDQ  BX, R8
 17603  
 17604  check_limit:
 17605  	CMPQ SI, $0x00
 17606  	JHI  loop
 17607  	RET
 17608  
 17609  // func AmdAxpyUnsafeXInterleave_V0A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17610  // Requires: SSE
 17611  TEXT ·AmdAxpyUnsafeXInterleave_V0A11R4(SB), NOSPLIT, $0-48
 17612  	MOVSS alpha+0(FP), X0
 17613  	MOVQ  xs+8(FP), AX
 17614  	MOVQ  incx+16(FP), CX
 17615  	MOVQ  ys+24(FP), DX
 17616  	MOVQ  incy+32(FP), BX
 17617  	MOVQ  n+40(FP), SI
 17618  	XORQ  DI, DI
 17619  	XORQ  R8, R8
 17620  	JMP   check_limit_unroll
 17621  	PCALIGN $0x08
 17622  	NOP
 17623  	NOP
 17624  	NOP
 17625  
 17626  loop_unroll:
 17627  	MOVSS (AX)(DI*4), X1
 17628  	ADDQ  CX, DI
 17629  	MOVSS (AX)(DI*4), X2
 17630  	ADDQ  CX, DI
 17631  	MOVSS (AX)(DI*4), X3
 17632  	ADDQ  CX, DI
 17633  	MOVSS (AX)(DI*4), X4
 17634  	ADDQ  CX, DI
 17635  	MULSS X0, X1
 17636  	MULSS X0, X2
 17637  	MULSS X0, X3
 17638  	MULSS X0, X4
 17639  	ADDSS (DX)(R8*4), X1
 17640  	MOVSS X1, (DX)(R8*4)
 17641  	ADDQ  BX, R8
 17642  	ADDSS (DX)(R8*4), X2
 17643  	MOVSS X2, (DX)(R8*4)
 17644  	ADDQ  BX, R8
 17645  	ADDSS (DX)(R8*4), X3
 17646  	MOVSS X3, (DX)(R8*4)
 17647  	ADDQ  BX, R8
 17648  	ADDSS (DX)(R8*4), X4
 17649  	MOVSS X4, (DX)(R8*4)
 17650  	ADDQ  BX, R8
 17651  	SUBQ  $0x04, SI
 17652  
 17653  check_limit_unroll:
 17654  	CMPQ SI, $0x04
 17655  	JHS  loop_unroll
 17656  	JMP  check_limit
 17657  
 17658  loop:
 17659  	MOVSS (AX)(DI*4), X1
 17660  	MULSS X0, X1
 17661  	ADDSS (DX)(R8*4), X1
 17662  	MOVSS X1, (DX)(R8*4)
 17663  	DECQ  SI
 17664  	ADDQ  CX, DI
 17665  	ADDQ  BX, R8
 17666  
 17667  check_limit:
 17668  	CMPQ SI, $0x00
 17669  	JHI  loop
 17670  	RET
 17671  
 17672  // func AmdAxpyUnsafeXInterleave_V1A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17673  // Requires: SSE
 17674  TEXT ·AmdAxpyUnsafeXInterleave_V1A11R4(SB), NOSPLIT, $0-48
 17675  	MOVSS alpha+0(FP), X0
 17676  	MOVQ  xs+8(FP), AX
 17677  	MOVQ  incx+16(FP), CX
 17678  	MOVQ  ys+24(FP), DX
 17679  	MOVQ  incy+32(FP), BX
 17680  	MOVQ  n+40(FP), SI
 17681  	XORQ  DI, DI
 17682  	XORQ  R8, R8
 17683  	JMP   check_limit_unroll
 17684  	PCALIGN $0x08
 17685  	NOP
 17686  	NOP
 17687  	NOP
 17688  
 17689  loop_unroll:
 17690  	MOVSS (AX)(DI*4), X1
 17691  	ADDQ  CX, DI
 17692  	MOVSS (AX)(DI*4), X2
 17693  	ADDQ  CX, DI
 17694  	MOVSS (AX)(DI*4), X3
 17695  	ADDQ  CX, DI
 17696  	MOVSS (AX)(DI*4), X4
 17697  	ADDQ  CX, DI
 17698  	MULSS X0, X1
 17699  	MULSS X0, X2
 17700  	MULSS X0, X3
 17701  	MULSS X0, X4
 17702  	ADDSS (DX)(R8*4), X1
 17703  	MOVSS X1, (DX)(R8*4)
 17704  	ADDQ  BX, R8
 17705  	ADDSS (DX)(R8*4), X2
 17706  	MOVSS X2, (DX)(R8*4)
 17707  	ADDQ  BX, R8
 17708  	ADDSS (DX)(R8*4), X3
 17709  	MOVSS X3, (DX)(R8*4)
 17710  	ADDQ  BX, R8
 17711  	ADDSS (DX)(R8*4), X4
 17712  	MOVSS X4, (DX)(R8*4)
 17713  	ADDQ  BX, R8
 17714  	SUBQ  $0x04, SI
 17715  
 17716  check_limit_unroll:
 17717  	CMPQ SI, $0x04
 17718  	JHS  loop_unroll
 17719  	JMP  check_limit
 17720  
 17721  loop:
 17722  	MOVSS (AX)(DI*4), X1
 17723  	MULSS X0, X1
 17724  	ADDSS (DX)(R8*4), X1
 17725  	MOVSS X1, (DX)(R8*4)
 17726  	DECQ  SI
 17727  	ADDQ  CX, DI
 17728  	ADDQ  BX, R8
 17729  
 17730  check_limit:
 17731  	CMPQ SI, $0x00
 17732  	JHI  loop
 17733  	RET
 17734  
 17735  // func AmdAxpyUnsafeXInterleave_V2A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17736  // Requires: SSE
 17737  TEXT ·AmdAxpyUnsafeXInterleave_V2A11R4(SB), NOSPLIT, $0-48
 17738  	MOVSS alpha+0(FP), X0
 17739  	MOVQ  xs+8(FP), AX
 17740  	MOVQ  incx+16(FP), CX
 17741  	MOVQ  ys+24(FP), DX
 17742  	MOVQ  incy+32(FP), BX
 17743  	MOVQ  n+40(FP), SI
 17744  	XORQ  DI, DI
 17745  	XORQ  R8, R8
 17746  	JMP   check_limit_unroll
 17747  	PCALIGN $0x08
 17748  	NOP
 17749  	NOP
 17750  	NOP
 17751  
 17752  loop_unroll:
 17753  	MOVSS (AX)(DI*4), X1
 17754  	ADDQ  CX, DI
 17755  	MOVSS (AX)(DI*4), X2
 17756  	ADDQ  CX, DI
 17757  	MOVSS (AX)(DI*4), X3
 17758  	ADDQ  CX, DI
 17759  	MOVSS (AX)(DI*4), X4
 17760  	ADDQ  CX, DI
 17761  	MULSS X0, X1
 17762  	MULSS X0, X2
 17763  	MULSS X0, X3
 17764  	MULSS X0, X4
 17765  	ADDSS (DX)(R8*4), X1
 17766  	MOVSS X1, (DX)(R8*4)
 17767  	ADDQ  BX, R8
 17768  	ADDSS (DX)(R8*4), X2
 17769  	MOVSS X2, (DX)(R8*4)
 17770  	ADDQ  BX, R8
 17771  	ADDSS (DX)(R8*4), X3
 17772  	MOVSS X3, (DX)(R8*4)
 17773  	ADDQ  BX, R8
 17774  	ADDSS (DX)(R8*4), X4
 17775  	MOVSS X4, (DX)(R8*4)
 17776  	ADDQ  BX, R8
 17777  	SUBQ  $0x04, SI
 17778  
 17779  check_limit_unroll:
 17780  	CMPQ SI, $0x04
 17781  	JHS  loop_unroll
 17782  	JMP  check_limit
 17783  
 17784  loop:
 17785  	MOVSS (AX)(DI*4), X1
 17786  	MULSS X0, X1
 17787  	ADDSS (DX)(R8*4), X1
 17788  	MOVSS X1, (DX)(R8*4)
 17789  	DECQ  SI
 17790  	ADDQ  CX, DI
 17791  	ADDQ  BX, R8
 17792  
 17793  check_limit:
 17794  	CMPQ SI, $0x00
 17795  	JHI  loop
 17796  	RET
 17797  
 17798  // func AmdAxpyUnsafeXInterleave_V3A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17799  // Requires: SSE
 17800  TEXT ·AmdAxpyUnsafeXInterleave_V3A11R4(SB), NOSPLIT, $0-48
 17801  	MOVSS alpha+0(FP), X0
 17802  	MOVQ  xs+8(FP), AX
 17803  	MOVQ  incx+16(FP), CX
 17804  	MOVQ  ys+24(FP), DX
 17805  	MOVQ  incy+32(FP), BX
 17806  	MOVQ  n+40(FP), SI
 17807  	XORQ  DI, DI
 17808  	XORQ  R8, R8
 17809  	JMP   check_limit_unroll
 17810  	PCALIGN $0x08
 17811  	NOP
 17812  	NOP
 17813  	NOP
 17814  
 17815  loop_unroll:
 17816  	MOVSS (AX)(DI*4), X1
 17817  	ADDQ  CX, DI
 17818  	MOVSS (AX)(DI*4), X2
 17819  	ADDQ  CX, DI
 17820  	MOVSS (AX)(DI*4), X3
 17821  	ADDQ  CX, DI
 17822  	MOVSS (AX)(DI*4), X4
 17823  	ADDQ  CX, DI
 17824  	MULSS X0, X1
 17825  	MULSS X0, X2
 17826  	MULSS X0, X3
 17827  	MULSS X0, X4
 17828  	ADDSS (DX)(R8*4), X1
 17829  	MOVSS X1, (DX)(R8*4)
 17830  	ADDQ  BX, R8
 17831  	ADDSS (DX)(R8*4), X2
 17832  	MOVSS X2, (DX)(R8*4)
 17833  	ADDQ  BX, R8
 17834  	ADDSS (DX)(R8*4), X3
 17835  	MOVSS X3, (DX)(R8*4)
 17836  	ADDQ  BX, R8
 17837  	ADDSS (DX)(R8*4), X4
 17838  	MOVSS X4, (DX)(R8*4)
 17839  	ADDQ  BX, R8
 17840  	SUBQ  $0x04, SI
 17841  
 17842  check_limit_unroll:
 17843  	CMPQ SI, $0x04
 17844  	JHS  loop_unroll
 17845  	JMP  check_limit
 17846  
 17847  loop:
 17848  	MOVSS (AX)(DI*4), X1
 17849  	MULSS X0, X1
 17850  	ADDSS (DX)(R8*4), X1
 17851  	MOVSS X1, (DX)(R8*4)
 17852  	DECQ  SI
 17853  	ADDQ  CX, DI
 17854  	ADDQ  BX, R8
 17855  
 17856  check_limit:
 17857  	CMPQ SI, $0x00
 17858  	JHI  loop
 17859  	RET
 17860  
 17861  // func AmdAxpyUnsafeXInterleave_V4A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17862  // Requires: SSE
 17863  TEXT ·AmdAxpyUnsafeXInterleave_V4A11R4(SB), NOSPLIT, $0-48
 17864  	MOVSS alpha+0(FP), X0
 17865  	MOVQ  xs+8(FP), AX
 17866  	MOVQ  incx+16(FP), CX
 17867  	MOVQ  ys+24(FP), DX
 17868  	MOVQ  incy+32(FP), BX
 17869  	MOVQ  n+40(FP), SI
 17870  	XORQ  DI, DI
 17871  	XORQ  R8, R8
 17872  	JMP   check_limit_unroll
 17873  	PCALIGN $0x08
 17874  	NOP
 17875  	NOP
 17876  	NOP
 17877  
 17878  loop_unroll:
 17879  	MOVSS (AX)(DI*4), X1
 17880  	ADDQ  CX, DI
 17881  	MOVSS (AX)(DI*4), X2
 17882  	ADDQ  CX, DI
 17883  	MOVSS (AX)(DI*4), X3
 17884  	ADDQ  CX, DI
 17885  	MOVSS (AX)(DI*4), X4
 17886  	ADDQ  CX, DI
 17887  	MULSS X0, X1
 17888  	MULSS X0, X2
 17889  	MULSS X0, X3
 17890  	MULSS X0, X4
 17891  	ADDSS (DX)(R8*4), X1
 17892  	MOVSS X1, (DX)(R8*4)
 17893  	ADDQ  BX, R8
 17894  	ADDSS (DX)(R8*4), X2
 17895  	MOVSS X2, (DX)(R8*4)
 17896  	ADDQ  BX, R8
 17897  	ADDSS (DX)(R8*4), X3
 17898  	MOVSS X3, (DX)(R8*4)
 17899  	ADDQ  BX, R8
 17900  	ADDSS (DX)(R8*4), X4
 17901  	MOVSS X4, (DX)(R8*4)
 17902  	ADDQ  BX, R8
 17903  	SUBQ  $0x04, SI
 17904  
 17905  check_limit_unroll:
 17906  	CMPQ SI, $0x04
 17907  	JHS  loop_unroll
 17908  	JMP  check_limit
 17909  
 17910  loop:
 17911  	MOVSS (AX)(DI*4), X1
 17912  	MULSS X0, X1
 17913  	ADDSS (DX)(R8*4), X1
 17914  	MOVSS X1, (DX)(R8*4)
 17915  	DECQ  SI
 17916  	ADDQ  CX, DI
 17917  	ADDQ  BX, R8
 17918  
 17919  check_limit:
 17920  	CMPQ SI, $0x00
 17921  	JHI  loop
 17922  	RET
 17923  
 17924  // func AmdAxpyUnsafeXInterleave_V5A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17925  // Requires: SSE
 17926  TEXT ·AmdAxpyUnsafeXInterleave_V5A11R4(SB), NOSPLIT, $0-48
 17927  	MOVSS alpha+0(FP), X0
 17928  	MOVQ  xs+8(FP), AX
 17929  	MOVQ  incx+16(FP), CX
 17930  	MOVQ  ys+24(FP), DX
 17931  	MOVQ  incy+32(FP), BX
 17932  	MOVQ  n+40(FP), SI
 17933  	XORQ  DI, DI
 17934  	XORQ  R8, R8
 17935  	JMP   check_limit_unroll
 17936  	PCALIGN $0x08
 17937  	NOP
 17938  	NOP
 17939  	NOP
 17940  
 17941  loop_unroll:
 17942  	MOVSS (AX)(DI*4), X1
 17943  	ADDQ  CX, DI
 17944  	MOVSS (AX)(DI*4), X2
 17945  	ADDQ  CX, DI
 17946  	MOVSS (AX)(DI*4), X3
 17947  	ADDQ  CX, DI
 17948  	MOVSS (AX)(DI*4), X4
 17949  	ADDQ  CX, DI
 17950  	MULSS X0, X1
 17951  	MULSS X0, X2
 17952  	MULSS X0, X3
 17953  	MULSS X0, X4
 17954  	ADDSS (DX)(R8*4), X1
 17955  	MOVSS X1, (DX)(R8*4)
 17956  	ADDQ  BX, R8
 17957  	ADDSS (DX)(R8*4), X2
 17958  	MOVSS X2, (DX)(R8*4)
 17959  	ADDQ  BX, R8
 17960  	ADDSS (DX)(R8*4), X3
 17961  	MOVSS X3, (DX)(R8*4)
 17962  	ADDQ  BX, R8
 17963  	ADDSS (DX)(R8*4), X4
 17964  	MOVSS X4, (DX)(R8*4)
 17965  	ADDQ  BX, R8
 17966  	SUBQ  $0x04, SI
 17967  
 17968  check_limit_unroll:
 17969  	CMPQ SI, $0x04
 17970  	JHS  loop_unroll
 17971  	JMP  check_limit
 17972  
 17973  loop:
 17974  	MOVSS (AX)(DI*4), X1
 17975  	MULSS X0, X1
 17976  	ADDSS (DX)(R8*4), X1
 17977  	MOVSS X1, (DX)(R8*4)
 17978  	DECQ  SI
 17979  	ADDQ  CX, DI
 17980  	ADDQ  BX, R8
 17981  
 17982  check_limit:
 17983  	CMPQ SI, $0x00
 17984  	JHI  loop
 17985  	RET
 17986  
 17987  // func AmdAxpyUnsafeXInterleave_V0A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 17988  // Requires: SSE
 17989  TEXT ·AmdAxpyUnsafeXInterleave_V0A12R4(SB), NOSPLIT, $0-48
 17990  	MOVSS alpha+0(FP), X0
 17991  	MOVQ  xs+8(FP), AX
 17992  	MOVQ  incx+16(FP), CX
 17993  	MOVQ  ys+24(FP), DX
 17994  	MOVQ  incy+32(FP), BX
 17995  	MOVQ  n+40(FP), SI
 17996  	XORQ  DI, DI
 17997  	XORQ  R8, R8
 17998  	JMP   check_limit_unroll
 17999  	PCALIGN $0x08
 18000  	NOP
 18001  	NOP
 18002  	NOP
 18003  	NOP
 18004  
 18005  loop_unroll:
 18006  	MOVSS (AX)(DI*4), X1
 18007  	ADDQ  CX, DI
 18008  	MOVSS (AX)(DI*4), X2
 18009  	ADDQ  CX, DI
 18010  	MOVSS (AX)(DI*4), X3
 18011  	ADDQ  CX, DI
 18012  	MOVSS (AX)(DI*4), X4
 18013  	ADDQ  CX, DI
 18014  	MULSS X0, X1
 18015  	MULSS X0, X2
 18016  	MULSS X0, X3
 18017  	MULSS X0, X4
 18018  	ADDSS (DX)(R8*4), X1
 18019  	MOVSS X1, (DX)(R8*4)
 18020  	ADDQ  BX, R8
 18021  	ADDSS (DX)(R8*4), X2
 18022  	MOVSS X2, (DX)(R8*4)
 18023  	ADDQ  BX, R8
 18024  	ADDSS (DX)(R8*4), X3
 18025  	MOVSS X3, (DX)(R8*4)
 18026  	ADDQ  BX, R8
 18027  	ADDSS (DX)(R8*4), X4
 18028  	MOVSS X4, (DX)(R8*4)
 18029  	ADDQ  BX, R8
 18030  	SUBQ  $0x04, SI
 18031  
 18032  check_limit_unroll:
 18033  	CMPQ SI, $0x04
 18034  	JHS  loop_unroll
 18035  	JMP  check_limit
 18036  
 18037  loop:
 18038  	MOVSS (AX)(DI*4), X1
 18039  	MULSS X0, X1
 18040  	ADDSS (DX)(R8*4), X1
 18041  	MOVSS X1, (DX)(R8*4)
 18042  	DECQ  SI
 18043  	ADDQ  CX, DI
 18044  	ADDQ  BX, R8
 18045  
 18046  check_limit:
 18047  	CMPQ SI, $0x00
 18048  	JHI  loop
 18049  	RET
 18050  
 18051  // func AmdAxpyUnsafeXInterleave_V1A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18052  // Requires: SSE
 18053  TEXT ·AmdAxpyUnsafeXInterleave_V1A12R4(SB), NOSPLIT, $0-48
 18054  	MOVSS alpha+0(FP), X0
 18055  	MOVQ  xs+8(FP), AX
 18056  	MOVQ  incx+16(FP), CX
 18057  	MOVQ  ys+24(FP), DX
 18058  	MOVQ  incy+32(FP), BX
 18059  	MOVQ  n+40(FP), SI
 18060  	XORQ  DI, DI
 18061  	XORQ  R8, R8
 18062  	JMP   check_limit_unroll
 18063  	PCALIGN $0x08
 18064  	NOP
 18065  	NOP
 18066  	NOP
 18067  	NOP
 18068  
 18069  loop_unroll:
 18070  	MOVSS (AX)(DI*4), X1
 18071  	ADDQ  CX, DI
 18072  	MOVSS (AX)(DI*4), X2
 18073  	ADDQ  CX, DI
 18074  	MOVSS (AX)(DI*4), X3
 18075  	ADDQ  CX, DI
 18076  	MOVSS (AX)(DI*4), X4
 18077  	ADDQ  CX, DI
 18078  	MULSS X0, X1
 18079  	MULSS X0, X2
 18080  	MULSS X0, X3
 18081  	MULSS X0, X4
 18082  	ADDSS (DX)(R8*4), X1
 18083  	MOVSS X1, (DX)(R8*4)
 18084  	ADDQ  BX, R8
 18085  	ADDSS (DX)(R8*4), X2
 18086  	MOVSS X2, (DX)(R8*4)
 18087  	ADDQ  BX, R8
 18088  	ADDSS (DX)(R8*4), X3
 18089  	MOVSS X3, (DX)(R8*4)
 18090  	ADDQ  BX, R8
 18091  	ADDSS (DX)(R8*4), X4
 18092  	MOVSS X4, (DX)(R8*4)
 18093  	ADDQ  BX, R8
 18094  	SUBQ  $0x04, SI
 18095  
 18096  check_limit_unroll:
 18097  	CMPQ SI, $0x04
 18098  	JHS  loop_unroll
 18099  	JMP  check_limit
 18100  
 18101  loop:
 18102  	MOVSS (AX)(DI*4), X1
 18103  	MULSS X0, X1
 18104  	ADDSS (DX)(R8*4), X1
 18105  	MOVSS X1, (DX)(R8*4)
 18106  	DECQ  SI
 18107  	ADDQ  CX, DI
 18108  	ADDQ  BX, R8
 18109  
 18110  check_limit:
 18111  	CMPQ SI, $0x00
 18112  	JHI  loop
 18113  	RET
 18114  
 18115  // func AmdAxpyUnsafeXInterleave_V2A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18116  // Requires: SSE
 18117  TEXT ·AmdAxpyUnsafeXInterleave_V2A12R4(SB), NOSPLIT, $0-48
 18118  	MOVSS alpha+0(FP), X0
 18119  	MOVQ  xs+8(FP), AX
 18120  	MOVQ  incx+16(FP), CX
 18121  	MOVQ  ys+24(FP), DX
 18122  	MOVQ  incy+32(FP), BX
 18123  	MOVQ  n+40(FP), SI
 18124  	XORQ  DI, DI
 18125  	XORQ  R8, R8
 18126  	JMP   check_limit_unroll
 18127  	PCALIGN $0x08
 18128  	NOP
 18129  	NOP
 18130  	NOP
 18131  	NOP
 18132  
 18133  loop_unroll:
 18134  	MOVSS (AX)(DI*4), X1
 18135  	ADDQ  CX, DI
 18136  	MOVSS (AX)(DI*4), X2
 18137  	ADDQ  CX, DI
 18138  	MOVSS (AX)(DI*4), X3
 18139  	ADDQ  CX, DI
 18140  	MOVSS (AX)(DI*4), X4
 18141  	ADDQ  CX, DI
 18142  	MULSS X0, X1
 18143  	MULSS X0, X2
 18144  	MULSS X0, X3
 18145  	MULSS X0, X4
 18146  	ADDSS (DX)(R8*4), X1
 18147  	MOVSS X1, (DX)(R8*4)
 18148  	ADDQ  BX, R8
 18149  	ADDSS (DX)(R8*4), X2
 18150  	MOVSS X2, (DX)(R8*4)
 18151  	ADDQ  BX, R8
 18152  	ADDSS (DX)(R8*4), X3
 18153  	MOVSS X3, (DX)(R8*4)
 18154  	ADDQ  BX, R8
 18155  	ADDSS (DX)(R8*4), X4
 18156  	MOVSS X4, (DX)(R8*4)
 18157  	ADDQ  BX, R8
 18158  	SUBQ  $0x04, SI
 18159  
 18160  check_limit_unroll:
 18161  	CMPQ SI, $0x04
 18162  	JHS  loop_unroll
 18163  	JMP  check_limit
 18164  
 18165  loop:
 18166  	MOVSS (AX)(DI*4), X1
 18167  	MULSS X0, X1
 18168  	ADDSS (DX)(R8*4), X1
 18169  	MOVSS X1, (DX)(R8*4)
 18170  	DECQ  SI
 18171  	ADDQ  CX, DI
 18172  	ADDQ  BX, R8
 18173  
 18174  check_limit:
 18175  	CMPQ SI, $0x00
 18176  	JHI  loop
 18177  	RET
 18178  
 18179  // func AmdAxpyUnsafeXInterleave_V3A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18180  // Requires: SSE
 18181  TEXT ·AmdAxpyUnsafeXInterleave_V3A12R4(SB), NOSPLIT, $0-48
 18182  	MOVSS alpha+0(FP), X0
 18183  	MOVQ  xs+8(FP), AX
 18184  	MOVQ  incx+16(FP), CX
 18185  	MOVQ  ys+24(FP), DX
 18186  	MOVQ  incy+32(FP), BX
 18187  	MOVQ  n+40(FP), SI
 18188  	XORQ  DI, DI
 18189  	XORQ  R8, R8
 18190  	JMP   check_limit_unroll
 18191  	PCALIGN $0x08
 18192  	NOP
 18193  	NOP
 18194  	NOP
 18195  	NOP
 18196  
 18197  loop_unroll:
 18198  	MOVSS (AX)(DI*4), X1
 18199  	ADDQ  CX, DI
 18200  	MOVSS (AX)(DI*4), X2
 18201  	ADDQ  CX, DI
 18202  	MOVSS (AX)(DI*4), X3
 18203  	ADDQ  CX, DI
 18204  	MOVSS (AX)(DI*4), X4
 18205  	ADDQ  CX, DI
 18206  	MULSS X0, X1
 18207  	MULSS X0, X2
 18208  	MULSS X0, X3
 18209  	MULSS X0, X4
 18210  	ADDSS (DX)(R8*4), X1
 18211  	MOVSS X1, (DX)(R8*4)
 18212  	ADDQ  BX, R8
 18213  	ADDSS (DX)(R8*4), X2
 18214  	MOVSS X2, (DX)(R8*4)
 18215  	ADDQ  BX, R8
 18216  	ADDSS (DX)(R8*4), X3
 18217  	MOVSS X3, (DX)(R8*4)
 18218  	ADDQ  BX, R8
 18219  	ADDSS (DX)(R8*4), X4
 18220  	MOVSS X4, (DX)(R8*4)
 18221  	ADDQ  BX, R8
 18222  	SUBQ  $0x04, SI
 18223  
 18224  check_limit_unroll:
 18225  	CMPQ SI, $0x04
 18226  	JHS  loop_unroll
 18227  	JMP  check_limit
 18228  
 18229  loop:
 18230  	MOVSS (AX)(DI*4), X1
 18231  	MULSS X0, X1
 18232  	ADDSS (DX)(R8*4), X1
 18233  	MOVSS X1, (DX)(R8*4)
 18234  	DECQ  SI
 18235  	ADDQ  CX, DI
 18236  	ADDQ  BX, R8
 18237  
 18238  check_limit:
 18239  	CMPQ SI, $0x00
 18240  	JHI  loop
 18241  	RET
 18242  
 18243  // func AmdAxpyUnsafeXInterleave_V4A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18244  // Requires: SSE
 18245  TEXT ·AmdAxpyUnsafeXInterleave_V4A12R4(SB), NOSPLIT, $0-48
 18246  	MOVSS alpha+0(FP), X0
 18247  	MOVQ  xs+8(FP), AX
 18248  	MOVQ  incx+16(FP), CX
 18249  	MOVQ  ys+24(FP), DX
 18250  	MOVQ  incy+32(FP), BX
 18251  	MOVQ  n+40(FP), SI
 18252  	XORQ  DI, DI
 18253  	XORQ  R8, R8
 18254  	JMP   check_limit_unroll
 18255  	PCALIGN $0x08
 18256  	NOP
 18257  	NOP
 18258  	NOP
 18259  	NOP
 18260  
 18261  loop_unroll:
 18262  	MOVSS (AX)(DI*4), X1
 18263  	ADDQ  CX, DI
 18264  	MOVSS (AX)(DI*4), X2
 18265  	ADDQ  CX, DI
 18266  	MOVSS (AX)(DI*4), X3
 18267  	ADDQ  CX, DI
 18268  	MOVSS (AX)(DI*4), X4
 18269  	ADDQ  CX, DI
 18270  	MULSS X0, X1
 18271  	MULSS X0, X2
 18272  	MULSS X0, X3
 18273  	MULSS X0, X4
 18274  	ADDSS (DX)(R8*4), X1
 18275  	MOVSS X1, (DX)(R8*4)
 18276  	ADDQ  BX, R8
 18277  	ADDSS (DX)(R8*4), X2
 18278  	MOVSS X2, (DX)(R8*4)
 18279  	ADDQ  BX, R8
 18280  	ADDSS (DX)(R8*4), X3
 18281  	MOVSS X3, (DX)(R8*4)
 18282  	ADDQ  BX, R8
 18283  	ADDSS (DX)(R8*4), X4
 18284  	MOVSS X4, (DX)(R8*4)
 18285  	ADDQ  BX, R8
 18286  	SUBQ  $0x04, SI
 18287  
 18288  check_limit_unroll:
 18289  	CMPQ SI, $0x04
 18290  	JHS  loop_unroll
 18291  	JMP  check_limit
 18292  
 18293  loop:
 18294  	MOVSS (AX)(DI*4), X1
 18295  	MULSS X0, X1
 18296  	ADDSS (DX)(R8*4), X1
 18297  	MOVSS X1, (DX)(R8*4)
 18298  	DECQ  SI
 18299  	ADDQ  CX, DI
 18300  	ADDQ  BX, R8
 18301  
 18302  check_limit:
 18303  	CMPQ SI, $0x00
 18304  	JHI  loop
 18305  	RET
 18306  
 18307  // func AmdAxpyUnsafeXInterleave_V5A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18308  // Requires: SSE
 18309  TEXT ·AmdAxpyUnsafeXInterleave_V5A12R4(SB), NOSPLIT, $0-48
 18310  	MOVSS alpha+0(FP), X0
 18311  	MOVQ  xs+8(FP), AX
 18312  	MOVQ  incx+16(FP), CX
 18313  	MOVQ  ys+24(FP), DX
 18314  	MOVQ  incy+32(FP), BX
 18315  	MOVQ  n+40(FP), SI
 18316  	XORQ  DI, DI
 18317  	XORQ  R8, R8
 18318  	JMP   check_limit_unroll
 18319  	PCALIGN $0x08
 18320  	NOP
 18321  	NOP
 18322  	NOP
 18323  	NOP
 18324  
 18325  loop_unroll:
 18326  	MOVSS (AX)(DI*4), X1
 18327  	ADDQ  CX, DI
 18328  	MOVSS (AX)(DI*4), X2
 18329  	ADDQ  CX, DI
 18330  	MOVSS (AX)(DI*4), X3
 18331  	ADDQ  CX, DI
 18332  	MOVSS (AX)(DI*4), X4
 18333  	ADDQ  CX, DI
 18334  	MULSS X0, X1
 18335  	MULSS X0, X2
 18336  	MULSS X0, X3
 18337  	MULSS X0, X4
 18338  	ADDSS (DX)(R8*4), X1
 18339  	MOVSS X1, (DX)(R8*4)
 18340  	ADDQ  BX, R8
 18341  	ADDSS (DX)(R8*4), X2
 18342  	MOVSS X2, (DX)(R8*4)
 18343  	ADDQ  BX, R8
 18344  	ADDSS (DX)(R8*4), X3
 18345  	MOVSS X3, (DX)(R8*4)
 18346  	ADDQ  BX, R8
 18347  	ADDSS (DX)(R8*4), X4
 18348  	MOVSS X4, (DX)(R8*4)
 18349  	ADDQ  BX, R8
 18350  	SUBQ  $0x04, SI
 18351  
 18352  check_limit_unroll:
 18353  	CMPQ SI, $0x04
 18354  	JHS  loop_unroll
 18355  	JMP  check_limit
 18356  
 18357  loop:
 18358  	MOVSS (AX)(DI*4), X1
 18359  	MULSS X0, X1
 18360  	ADDSS (DX)(R8*4), X1
 18361  	MOVSS X1, (DX)(R8*4)
 18362  	DECQ  SI
 18363  	ADDQ  CX, DI
 18364  	ADDQ  BX, R8
 18365  
 18366  check_limit:
 18367  	CMPQ SI, $0x00
 18368  	JHI  loop
 18369  	RET
 18370  
 18371  // func AmdAxpyUnsafeXInterleave_V0A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18372  // Requires: SSE
 18373  TEXT ·AmdAxpyUnsafeXInterleave_V0A13R4(SB), NOSPLIT, $0-48
 18374  	MOVSS alpha+0(FP), X0
 18375  	MOVQ  xs+8(FP), AX
 18376  	MOVQ  incx+16(FP), CX
 18377  	MOVQ  ys+24(FP), DX
 18378  	MOVQ  incy+32(FP), BX
 18379  	MOVQ  n+40(FP), SI
 18380  	XORQ  DI, DI
 18381  	XORQ  R8, R8
 18382  	JMP   check_limit_unroll
 18383  	PCALIGN $0x08
 18384  	NOP
 18385  	NOP
 18386  	NOP
 18387  	NOP
 18388  	NOP
 18389  
 18390  loop_unroll:
 18391  	MOVSS (AX)(DI*4), X1
 18392  	ADDQ  CX, DI
 18393  	MOVSS (AX)(DI*4), X2
 18394  	ADDQ  CX, DI
 18395  	MOVSS (AX)(DI*4), X3
 18396  	ADDQ  CX, DI
 18397  	MOVSS (AX)(DI*4), X4
 18398  	ADDQ  CX, DI
 18399  	MULSS X0, X1
 18400  	MULSS X0, X2
 18401  	MULSS X0, X3
 18402  	MULSS X0, X4
 18403  	ADDSS (DX)(R8*4), X1
 18404  	MOVSS X1, (DX)(R8*4)
 18405  	ADDQ  BX, R8
 18406  	ADDSS (DX)(R8*4), X2
 18407  	MOVSS X2, (DX)(R8*4)
 18408  	ADDQ  BX, R8
 18409  	ADDSS (DX)(R8*4), X3
 18410  	MOVSS X3, (DX)(R8*4)
 18411  	ADDQ  BX, R8
 18412  	ADDSS (DX)(R8*4), X4
 18413  	MOVSS X4, (DX)(R8*4)
 18414  	ADDQ  BX, R8
 18415  	SUBQ  $0x04, SI
 18416  
 18417  check_limit_unroll:
 18418  	CMPQ SI, $0x04
 18419  	JHS  loop_unroll
 18420  	JMP  check_limit
 18421  
 18422  loop:
 18423  	MOVSS (AX)(DI*4), X1
 18424  	MULSS X0, X1
 18425  	ADDSS (DX)(R8*4), X1
 18426  	MOVSS X1, (DX)(R8*4)
 18427  	DECQ  SI
 18428  	ADDQ  CX, DI
 18429  	ADDQ  BX, R8
 18430  
 18431  check_limit:
 18432  	CMPQ SI, $0x00
 18433  	JHI  loop
 18434  	RET
 18435  
 18436  // func AmdAxpyUnsafeXInterleave_V1A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18437  // Requires: SSE
 18438  TEXT ·AmdAxpyUnsafeXInterleave_V1A13R4(SB), NOSPLIT, $0-48
 18439  	MOVSS alpha+0(FP), X0
 18440  	MOVQ  xs+8(FP), AX
 18441  	MOVQ  incx+16(FP), CX
 18442  	MOVQ  ys+24(FP), DX
 18443  	MOVQ  incy+32(FP), BX
 18444  	MOVQ  n+40(FP), SI
 18445  	XORQ  DI, DI
 18446  	XORQ  R8, R8
 18447  	JMP   check_limit_unroll
 18448  	PCALIGN $0x08
 18449  	NOP
 18450  	NOP
 18451  	NOP
 18452  	NOP
 18453  	NOP
 18454  
 18455  loop_unroll:
 18456  	MOVSS (AX)(DI*4), X1
 18457  	ADDQ  CX, DI
 18458  	MOVSS (AX)(DI*4), X2
 18459  	ADDQ  CX, DI
 18460  	MOVSS (AX)(DI*4), X3
 18461  	ADDQ  CX, DI
 18462  	MOVSS (AX)(DI*4), X4
 18463  	ADDQ  CX, DI
 18464  	MULSS X0, X1
 18465  	MULSS X0, X2
 18466  	MULSS X0, X3
 18467  	MULSS X0, X4
 18468  	ADDSS (DX)(R8*4), X1
 18469  	MOVSS X1, (DX)(R8*4)
 18470  	ADDQ  BX, R8
 18471  	ADDSS (DX)(R8*4), X2
 18472  	MOVSS X2, (DX)(R8*4)
 18473  	ADDQ  BX, R8
 18474  	ADDSS (DX)(R8*4), X3
 18475  	MOVSS X3, (DX)(R8*4)
 18476  	ADDQ  BX, R8
 18477  	ADDSS (DX)(R8*4), X4
 18478  	MOVSS X4, (DX)(R8*4)
 18479  	ADDQ  BX, R8
 18480  	SUBQ  $0x04, SI
 18481  
 18482  check_limit_unroll:
 18483  	CMPQ SI, $0x04
 18484  	JHS  loop_unroll
 18485  	JMP  check_limit
 18486  
 18487  loop:
 18488  	MOVSS (AX)(DI*4), X1
 18489  	MULSS X0, X1
 18490  	ADDSS (DX)(R8*4), X1
 18491  	MOVSS X1, (DX)(R8*4)
 18492  	DECQ  SI
 18493  	ADDQ  CX, DI
 18494  	ADDQ  BX, R8
 18495  
 18496  check_limit:
 18497  	CMPQ SI, $0x00
 18498  	JHI  loop
 18499  	RET
 18500  
 18501  // func AmdAxpyUnsafeXInterleave_V2A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18502  // Requires: SSE
 18503  TEXT ·AmdAxpyUnsafeXInterleave_V2A13R4(SB), NOSPLIT, $0-48
 18504  	MOVSS alpha+0(FP), X0
 18505  	MOVQ  xs+8(FP), AX
 18506  	MOVQ  incx+16(FP), CX
 18507  	MOVQ  ys+24(FP), DX
 18508  	MOVQ  incy+32(FP), BX
 18509  	MOVQ  n+40(FP), SI
 18510  	XORQ  DI, DI
 18511  	XORQ  R8, R8
 18512  	JMP   check_limit_unroll
 18513  	PCALIGN $0x08
 18514  	NOP
 18515  	NOP
 18516  	NOP
 18517  	NOP
 18518  	NOP
 18519  
 18520  loop_unroll:
 18521  	MOVSS (AX)(DI*4), X1
 18522  	ADDQ  CX, DI
 18523  	MOVSS (AX)(DI*4), X2
 18524  	ADDQ  CX, DI
 18525  	MOVSS (AX)(DI*4), X3
 18526  	ADDQ  CX, DI
 18527  	MOVSS (AX)(DI*4), X4
 18528  	ADDQ  CX, DI
 18529  	MULSS X0, X1
 18530  	MULSS X0, X2
 18531  	MULSS X0, X3
 18532  	MULSS X0, X4
 18533  	ADDSS (DX)(R8*4), X1
 18534  	MOVSS X1, (DX)(R8*4)
 18535  	ADDQ  BX, R8
 18536  	ADDSS (DX)(R8*4), X2
 18537  	MOVSS X2, (DX)(R8*4)
 18538  	ADDQ  BX, R8
 18539  	ADDSS (DX)(R8*4), X3
 18540  	MOVSS X3, (DX)(R8*4)
 18541  	ADDQ  BX, R8
 18542  	ADDSS (DX)(R8*4), X4
 18543  	MOVSS X4, (DX)(R8*4)
 18544  	ADDQ  BX, R8
 18545  	SUBQ  $0x04, SI
 18546  
 18547  check_limit_unroll:
 18548  	CMPQ SI, $0x04
 18549  	JHS  loop_unroll
 18550  	JMP  check_limit
 18551  
 18552  loop:
 18553  	MOVSS (AX)(DI*4), X1
 18554  	MULSS X0, X1
 18555  	ADDSS (DX)(R8*4), X1
 18556  	MOVSS X1, (DX)(R8*4)
 18557  	DECQ  SI
 18558  	ADDQ  CX, DI
 18559  	ADDQ  BX, R8
 18560  
 18561  check_limit:
 18562  	CMPQ SI, $0x00
 18563  	JHI  loop
 18564  	RET
 18565  
 18566  // func AmdAxpyUnsafeXInterleave_V3A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18567  // Requires: SSE
 18568  TEXT ·AmdAxpyUnsafeXInterleave_V3A13R4(SB), NOSPLIT, $0-48
 18569  	MOVSS alpha+0(FP), X0
 18570  	MOVQ  xs+8(FP), AX
 18571  	MOVQ  incx+16(FP), CX
 18572  	MOVQ  ys+24(FP), DX
 18573  	MOVQ  incy+32(FP), BX
 18574  	MOVQ  n+40(FP), SI
 18575  	XORQ  DI, DI
 18576  	XORQ  R8, R8
 18577  	JMP   check_limit_unroll
 18578  	PCALIGN $0x08
 18579  	NOP
 18580  	NOP
 18581  	NOP
 18582  	NOP
 18583  	NOP
 18584  
 18585  loop_unroll:
 18586  	MOVSS (AX)(DI*4), X1
 18587  	ADDQ  CX, DI
 18588  	MOVSS (AX)(DI*4), X2
 18589  	ADDQ  CX, DI
 18590  	MOVSS (AX)(DI*4), X3
 18591  	ADDQ  CX, DI
 18592  	MOVSS (AX)(DI*4), X4
 18593  	ADDQ  CX, DI
 18594  	MULSS X0, X1
 18595  	MULSS X0, X2
 18596  	MULSS X0, X3
 18597  	MULSS X0, X4
 18598  	ADDSS (DX)(R8*4), X1
 18599  	MOVSS X1, (DX)(R8*4)
 18600  	ADDQ  BX, R8
 18601  	ADDSS (DX)(R8*4), X2
 18602  	MOVSS X2, (DX)(R8*4)
 18603  	ADDQ  BX, R8
 18604  	ADDSS (DX)(R8*4), X3
 18605  	MOVSS X3, (DX)(R8*4)
 18606  	ADDQ  BX, R8
 18607  	ADDSS (DX)(R8*4), X4
 18608  	MOVSS X4, (DX)(R8*4)
 18609  	ADDQ  BX, R8
 18610  	SUBQ  $0x04, SI
 18611  
 18612  check_limit_unroll:
 18613  	CMPQ SI, $0x04
 18614  	JHS  loop_unroll
 18615  	JMP  check_limit
 18616  
 18617  loop:
 18618  	MOVSS (AX)(DI*4), X1
 18619  	MULSS X0, X1
 18620  	ADDSS (DX)(R8*4), X1
 18621  	MOVSS X1, (DX)(R8*4)
 18622  	DECQ  SI
 18623  	ADDQ  CX, DI
 18624  	ADDQ  BX, R8
 18625  
 18626  check_limit:
 18627  	CMPQ SI, $0x00
 18628  	JHI  loop
 18629  	RET
 18630  
 18631  // func AmdAxpyUnsafeXInterleave_V4A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18632  // Requires: SSE
 18633  TEXT ·AmdAxpyUnsafeXInterleave_V4A13R4(SB), NOSPLIT, $0-48
 18634  	MOVSS alpha+0(FP), X0
 18635  	MOVQ  xs+8(FP), AX
 18636  	MOVQ  incx+16(FP), CX
 18637  	MOVQ  ys+24(FP), DX
 18638  	MOVQ  incy+32(FP), BX
 18639  	MOVQ  n+40(FP), SI
 18640  	XORQ  DI, DI
 18641  	XORQ  R8, R8
 18642  	JMP   check_limit_unroll
 18643  	PCALIGN $0x08
 18644  	NOP
 18645  	NOP
 18646  	NOP
 18647  	NOP
 18648  	NOP
 18649  
 18650  loop_unroll:
 18651  	MOVSS (AX)(DI*4), X1
 18652  	ADDQ  CX, DI
 18653  	MOVSS (AX)(DI*4), X2
 18654  	ADDQ  CX, DI
 18655  	MOVSS (AX)(DI*4), X3
 18656  	ADDQ  CX, DI
 18657  	MOVSS (AX)(DI*4), X4
 18658  	ADDQ  CX, DI
 18659  	MULSS X0, X1
 18660  	MULSS X0, X2
 18661  	MULSS X0, X3
 18662  	MULSS X0, X4
 18663  	ADDSS (DX)(R8*4), X1
 18664  	MOVSS X1, (DX)(R8*4)
 18665  	ADDQ  BX, R8
 18666  	ADDSS (DX)(R8*4), X2
 18667  	MOVSS X2, (DX)(R8*4)
 18668  	ADDQ  BX, R8
 18669  	ADDSS (DX)(R8*4), X3
 18670  	MOVSS X3, (DX)(R8*4)
 18671  	ADDQ  BX, R8
 18672  	ADDSS (DX)(R8*4), X4
 18673  	MOVSS X4, (DX)(R8*4)
 18674  	ADDQ  BX, R8
 18675  	SUBQ  $0x04, SI
 18676  
 18677  check_limit_unroll:
 18678  	CMPQ SI, $0x04
 18679  	JHS  loop_unroll
 18680  	JMP  check_limit
 18681  
 18682  loop:
 18683  	MOVSS (AX)(DI*4), X1
 18684  	MULSS X0, X1
 18685  	ADDSS (DX)(R8*4), X1
 18686  	MOVSS X1, (DX)(R8*4)
 18687  	DECQ  SI
 18688  	ADDQ  CX, DI
 18689  	ADDQ  BX, R8
 18690  
 18691  check_limit:
 18692  	CMPQ SI, $0x00
 18693  	JHI  loop
 18694  	RET
 18695  
 18696  // func AmdAxpyUnsafeXInterleave_V5A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18697  // Requires: SSE
 18698  TEXT ·AmdAxpyUnsafeXInterleave_V5A13R4(SB), NOSPLIT, $0-48
 18699  	MOVSS alpha+0(FP), X0
 18700  	MOVQ  xs+8(FP), AX
 18701  	MOVQ  incx+16(FP), CX
 18702  	MOVQ  ys+24(FP), DX
 18703  	MOVQ  incy+32(FP), BX
 18704  	MOVQ  n+40(FP), SI
 18705  	XORQ  DI, DI
 18706  	XORQ  R8, R8
 18707  	JMP   check_limit_unroll
 18708  	PCALIGN $0x08
 18709  	NOP
 18710  	NOP
 18711  	NOP
 18712  	NOP
 18713  	NOP
 18714  
 18715  loop_unroll:
 18716  	MOVSS (AX)(DI*4), X1
 18717  	ADDQ  CX, DI
 18718  	MOVSS (AX)(DI*4), X2
 18719  	ADDQ  CX, DI
 18720  	MOVSS (AX)(DI*4), X3
 18721  	ADDQ  CX, DI
 18722  	MOVSS (AX)(DI*4), X4
 18723  	ADDQ  CX, DI
 18724  	MULSS X0, X1
 18725  	MULSS X0, X2
 18726  	MULSS X0, X3
 18727  	MULSS X0, X4
 18728  	ADDSS (DX)(R8*4), X1
 18729  	MOVSS X1, (DX)(R8*4)
 18730  	ADDQ  BX, R8
 18731  	ADDSS (DX)(R8*4), X2
 18732  	MOVSS X2, (DX)(R8*4)
 18733  	ADDQ  BX, R8
 18734  	ADDSS (DX)(R8*4), X3
 18735  	MOVSS X3, (DX)(R8*4)
 18736  	ADDQ  BX, R8
 18737  	ADDSS (DX)(R8*4), X4
 18738  	MOVSS X4, (DX)(R8*4)
 18739  	ADDQ  BX, R8
 18740  	SUBQ  $0x04, SI
 18741  
 18742  check_limit_unroll:
 18743  	CMPQ SI, $0x04
 18744  	JHS  loop_unroll
 18745  	JMP  check_limit
 18746  
 18747  loop:
 18748  	MOVSS (AX)(DI*4), X1
 18749  	MULSS X0, X1
 18750  	ADDSS (DX)(R8*4), X1
 18751  	MOVSS X1, (DX)(R8*4)
 18752  	DECQ  SI
 18753  	ADDQ  CX, DI
 18754  	ADDQ  BX, R8
 18755  
 18756  check_limit:
 18757  	CMPQ SI, $0x00
 18758  	JHI  loop
 18759  	RET
 18760  
 18761  // func AmdAxpyUnsafeXInterleave_V0A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18762  // Requires: SSE
 18763  TEXT ·AmdAxpyUnsafeXInterleave_V0A14R4(SB), NOSPLIT, $0-48
 18764  	MOVSS alpha+0(FP), X0
 18765  	MOVQ  xs+8(FP), AX
 18766  	MOVQ  incx+16(FP), CX
 18767  	MOVQ  ys+24(FP), DX
 18768  	MOVQ  incy+32(FP), BX
 18769  	MOVQ  n+40(FP), SI
 18770  	XORQ  DI, DI
 18771  	XORQ  R8, R8
 18772  	JMP   check_limit_unroll
 18773  	PCALIGN $0x08
 18774  	NOP
 18775  	NOP
 18776  	NOP
 18777  	NOP
 18778  	NOP
 18779  	NOP
 18780  
 18781  loop_unroll:
 18782  	MOVSS (AX)(DI*4), X1
 18783  	ADDQ  CX, DI
 18784  	MOVSS (AX)(DI*4), X2
 18785  	ADDQ  CX, DI
 18786  	MOVSS (AX)(DI*4), X3
 18787  	ADDQ  CX, DI
 18788  	MOVSS (AX)(DI*4), X4
 18789  	ADDQ  CX, DI
 18790  	MULSS X0, X1
 18791  	MULSS X0, X2
 18792  	MULSS X0, X3
 18793  	MULSS X0, X4
 18794  	ADDSS (DX)(R8*4), X1
 18795  	MOVSS X1, (DX)(R8*4)
 18796  	ADDQ  BX, R8
 18797  	ADDSS (DX)(R8*4), X2
 18798  	MOVSS X2, (DX)(R8*4)
 18799  	ADDQ  BX, R8
 18800  	ADDSS (DX)(R8*4), X3
 18801  	MOVSS X3, (DX)(R8*4)
 18802  	ADDQ  BX, R8
 18803  	ADDSS (DX)(R8*4), X4
 18804  	MOVSS X4, (DX)(R8*4)
 18805  	ADDQ  BX, R8
 18806  	SUBQ  $0x04, SI
 18807  
 18808  check_limit_unroll:
 18809  	CMPQ SI, $0x04
 18810  	JHS  loop_unroll
 18811  	JMP  check_limit
 18812  
 18813  loop:
 18814  	MOVSS (AX)(DI*4), X1
 18815  	MULSS X0, X1
 18816  	ADDSS (DX)(R8*4), X1
 18817  	MOVSS X1, (DX)(R8*4)
 18818  	DECQ  SI
 18819  	ADDQ  CX, DI
 18820  	ADDQ  BX, R8
 18821  
 18822  check_limit:
 18823  	CMPQ SI, $0x00
 18824  	JHI  loop
 18825  	RET
 18826  
 18827  // func AmdAxpyUnsafeXInterleave_V1A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18828  // Requires: SSE
 18829  TEXT ·AmdAxpyUnsafeXInterleave_V1A14R4(SB), NOSPLIT, $0-48
 18830  	MOVSS alpha+0(FP), X0
 18831  	MOVQ  xs+8(FP), AX
 18832  	MOVQ  incx+16(FP), CX
 18833  	MOVQ  ys+24(FP), DX
 18834  	MOVQ  incy+32(FP), BX
 18835  	MOVQ  n+40(FP), SI
 18836  	XORQ  DI, DI
 18837  	XORQ  R8, R8
 18838  	JMP   check_limit_unroll
 18839  	PCALIGN $0x08
 18840  	NOP
 18841  	NOP
 18842  	NOP
 18843  	NOP
 18844  	NOP
 18845  	NOP
 18846  
 18847  loop_unroll:
 18848  	MOVSS (AX)(DI*4), X1
 18849  	ADDQ  CX, DI
 18850  	MOVSS (AX)(DI*4), X2
 18851  	ADDQ  CX, DI
 18852  	MOVSS (AX)(DI*4), X3
 18853  	ADDQ  CX, DI
 18854  	MOVSS (AX)(DI*4), X4
 18855  	ADDQ  CX, DI
 18856  	MULSS X0, X1
 18857  	MULSS X0, X2
 18858  	MULSS X0, X3
 18859  	MULSS X0, X4
 18860  	ADDSS (DX)(R8*4), X1
 18861  	MOVSS X1, (DX)(R8*4)
 18862  	ADDQ  BX, R8
 18863  	ADDSS (DX)(R8*4), X2
 18864  	MOVSS X2, (DX)(R8*4)
 18865  	ADDQ  BX, R8
 18866  	ADDSS (DX)(R8*4), X3
 18867  	MOVSS X3, (DX)(R8*4)
 18868  	ADDQ  BX, R8
 18869  	ADDSS (DX)(R8*4), X4
 18870  	MOVSS X4, (DX)(R8*4)
 18871  	ADDQ  BX, R8
 18872  	SUBQ  $0x04, SI
 18873  
 18874  check_limit_unroll:
 18875  	CMPQ SI, $0x04
 18876  	JHS  loop_unroll
 18877  	JMP  check_limit
 18878  
 18879  loop:
 18880  	MOVSS (AX)(DI*4), X1
 18881  	MULSS X0, X1
 18882  	ADDSS (DX)(R8*4), X1
 18883  	MOVSS X1, (DX)(R8*4)
 18884  	DECQ  SI
 18885  	ADDQ  CX, DI
 18886  	ADDQ  BX, R8
 18887  
 18888  check_limit:
 18889  	CMPQ SI, $0x00
 18890  	JHI  loop
 18891  	RET
 18892  
 18893  // func AmdAxpyUnsafeXInterleave_V2A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18894  // Requires: SSE
 18895  TEXT ·AmdAxpyUnsafeXInterleave_V2A14R4(SB), NOSPLIT, $0-48
 18896  	MOVSS alpha+0(FP), X0
 18897  	MOVQ  xs+8(FP), AX
 18898  	MOVQ  incx+16(FP), CX
 18899  	MOVQ  ys+24(FP), DX
 18900  	MOVQ  incy+32(FP), BX
 18901  	MOVQ  n+40(FP), SI
 18902  	XORQ  DI, DI
 18903  	XORQ  R8, R8
 18904  	JMP   check_limit_unroll
 18905  	PCALIGN $0x08
 18906  	NOP
 18907  	NOP
 18908  	NOP
 18909  	NOP
 18910  	NOP
 18911  	NOP
 18912  
 18913  loop_unroll:
 18914  	MOVSS (AX)(DI*4), X1
 18915  	ADDQ  CX, DI
 18916  	MOVSS (AX)(DI*4), X2
 18917  	ADDQ  CX, DI
 18918  	MOVSS (AX)(DI*4), X3
 18919  	ADDQ  CX, DI
 18920  	MOVSS (AX)(DI*4), X4
 18921  	ADDQ  CX, DI
 18922  	MULSS X0, X1
 18923  	MULSS X0, X2
 18924  	MULSS X0, X3
 18925  	MULSS X0, X4
 18926  	ADDSS (DX)(R8*4), X1
 18927  	MOVSS X1, (DX)(R8*4)
 18928  	ADDQ  BX, R8
 18929  	ADDSS (DX)(R8*4), X2
 18930  	MOVSS X2, (DX)(R8*4)
 18931  	ADDQ  BX, R8
 18932  	ADDSS (DX)(R8*4), X3
 18933  	MOVSS X3, (DX)(R8*4)
 18934  	ADDQ  BX, R8
 18935  	ADDSS (DX)(R8*4), X4
 18936  	MOVSS X4, (DX)(R8*4)
 18937  	ADDQ  BX, R8
 18938  	SUBQ  $0x04, SI
 18939  
 18940  check_limit_unroll:
 18941  	CMPQ SI, $0x04
 18942  	JHS  loop_unroll
 18943  	JMP  check_limit
 18944  
 18945  loop:
 18946  	MOVSS (AX)(DI*4), X1
 18947  	MULSS X0, X1
 18948  	ADDSS (DX)(R8*4), X1
 18949  	MOVSS X1, (DX)(R8*4)
 18950  	DECQ  SI
 18951  	ADDQ  CX, DI
 18952  	ADDQ  BX, R8
 18953  
 18954  check_limit:
 18955  	CMPQ SI, $0x00
 18956  	JHI  loop
 18957  	RET
 18958  
 18959  // func AmdAxpyUnsafeXInterleave_V3A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 18960  // Requires: SSE
 18961  TEXT ·AmdAxpyUnsafeXInterleave_V3A14R4(SB), NOSPLIT, $0-48
 18962  	MOVSS alpha+0(FP), X0
 18963  	MOVQ  xs+8(FP), AX
 18964  	MOVQ  incx+16(FP), CX
 18965  	MOVQ  ys+24(FP), DX
 18966  	MOVQ  incy+32(FP), BX
 18967  	MOVQ  n+40(FP), SI
 18968  	XORQ  DI, DI
 18969  	XORQ  R8, R8
 18970  	JMP   check_limit_unroll
 18971  	PCALIGN $0x08
 18972  	NOP
 18973  	NOP
 18974  	NOP
 18975  	NOP
 18976  	NOP
 18977  	NOP
 18978  
 18979  loop_unroll:
 18980  	MOVSS (AX)(DI*4), X1
 18981  	ADDQ  CX, DI
 18982  	MOVSS (AX)(DI*4), X2
 18983  	ADDQ  CX, DI
 18984  	MOVSS (AX)(DI*4), X3
 18985  	ADDQ  CX, DI
 18986  	MOVSS (AX)(DI*4), X4
 18987  	ADDQ  CX, DI
 18988  	MULSS X0, X1
 18989  	MULSS X0, X2
 18990  	MULSS X0, X3
 18991  	MULSS X0, X4
 18992  	ADDSS (DX)(R8*4), X1
 18993  	MOVSS X1, (DX)(R8*4)
 18994  	ADDQ  BX, R8
 18995  	ADDSS (DX)(R8*4), X2
 18996  	MOVSS X2, (DX)(R8*4)
 18997  	ADDQ  BX, R8
 18998  	ADDSS (DX)(R8*4), X3
 18999  	MOVSS X3, (DX)(R8*4)
 19000  	ADDQ  BX, R8
 19001  	ADDSS (DX)(R8*4), X4
 19002  	MOVSS X4, (DX)(R8*4)
 19003  	ADDQ  BX, R8
 19004  	SUBQ  $0x04, SI
 19005  
 19006  check_limit_unroll:
 19007  	CMPQ SI, $0x04
 19008  	JHS  loop_unroll
 19009  	JMP  check_limit
 19010  
 19011  loop:
 19012  	MOVSS (AX)(DI*4), X1
 19013  	MULSS X0, X1
 19014  	ADDSS (DX)(R8*4), X1
 19015  	MOVSS X1, (DX)(R8*4)
 19016  	DECQ  SI
 19017  	ADDQ  CX, DI
 19018  	ADDQ  BX, R8
 19019  
 19020  check_limit:
 19021  	CMPQ SI, $0x00
 19022  	JHI  loop
 19023  	RET
 19024  
 19025  // func AmdAxpyUnsafeXInterleave_V4A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19026  // Requires: SSE
 19027  TEXT ·AmdAxpyUnsafeXInterleave_V4A14R4(SB), NOSPLIT, $0-48
 19028  	MOVSS alpha+0(FP), X0
 19029  	MOVQ  xs+8(FP), AX
 19030  	MOVQ  incx+16(FP), CX
 19031  	MOVQ  ys+24(FP), DX
 19032  	MOVQ  incy+32(FP), BX
 19033  	MOVQ  n+40(FP), SI
 19034  	XORQ  DI, DI
 19035  	XORQ  R8, R8
 19036  	JMP   check_limit_unroll
 19037  	PCALIGN $0x08
 19038  	NOP
 19039  	NOP
 19040  	NOP
 19041  	NOP
 19042  	NOP
 19043  	NOP
 19044  
 19045  loop_unroll:
 19046  	MOVSS (AX)(DI*4), X1
 19047  	ADDQ  CX, DI
 19048  	MOVSS (AX)(DI*4), X2
 19049  	ADDQ  CX, DI
 19050  	MOVSS (AX)(DI*4), X3
 19051  	ADDQ  CX, DI
 19052  	MOVSS (AX)(DI*4), X4
 19053  	ADDQ  CX, DI
 19054  	MULSS X0, X1
 19055  	MULSS X0, X2
 19056  	MULSS X0, X3
 19057  	MULSS X0, X4
 19058  	ADDSS (DX)(R8*4), X1
 19059  	MOVSS X1, (DX)(R8*4)
 19060  	ADDQ  BX, R8
 19061  	ADDSS (DX)(R8*4), X2
 19062  	MOVSS X2, (DX)(R8*4)
 19063  	ADDQ  BX, R8
 19064  	ADDSS (DX)(R8*4), X3
 19065  	MOVSS X3, (DX)(R8*4)
 19066  	ADDQ  BX, R8
 19067  	ADDSS (DX)(R8*4), X4
 19068  	MOVSS X4, (DX)(R8*4)
 19069  	ADDQ  BX, R8
 19070  	SUBQ  $0x04, SI
 19071  
 19072  check_limit_unroll:
 19073  	CMPQ SI, $0x04
 19074  	JHS  loop_unroll
 19075  	JMP  check_limit
 19076  
 19077  loop:
 19078  	MOVSS (AX)(DI*4), X1
 19079  	MULSS X0, X1
 19080  	ADDSS (DX)(R8*4), X1
 19081  	MOVSS X1, (DX)(R8*4)
 19082  	DECQ  SI
 19083  	ADDQ  CX, DI
 19084  	ADDQ  BX, R8
 19085  
 19086  check_limit:
 19087  	CMPQ SI, $0x00
 19088  	JHI  loop
 19089  	RET
 19090  
 19091  // func AmdAxpyUnsafeXInterleave_V5A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19092  // Requires: SSE
 19093  TEXT ·AmdAxpyUnsafeXInterleave_V5A14R4(SB), NOSPLIT, $0-48
 19094  	MOVSS alpha+0(FP), X0
 19095  	MOVQ  xs+8(FP), AX
 19096  	MOVQ  incx+16(FP), CX
 19097  	MOVQ  ys+24(FP), DX
 19098  	MOVQ  incy+32(FP), BX
 19099  	MOVQ  n+40(FP), SI
 19100  	XORQ  DI, DI
 19101  	XORQ  R8, R8
 19102  	JMP   check_limit_unroll
 19103  	PCALIGN $0x08
 19104  	NOP
 19105  	NOP
 19106  	NOP
 19107  	NOP
 19108  	NOP
 19109  	NOP
 19110  
 19111  loop_unroll:
 19112  	MOVSS (AX)(DI*4), X1
 19113  	ADDQ  CX, DI
 19114  	MOVSS (AX)(DI*4), X2
 19115  	ADDQ  CX, DI
 19116  	MOVSS (AX)(DI*4), X3
 19117  	ADDQ  CX, DI
 19118  	MOVSS (AX)(DI*4), X4
 19119  	ADDQ  CX, DI
 19120  	MULSS X0, X1
 19121  	MULSS X0, X2
 19122  	MULSS X0, X3
 19123  	MULSS X0, X4
 19124  	ADDSS (DX)(R8*4), X1
 19125  	MOVSS X1, (DX)(R8*4)
 19126  	ADDQ  BX, R8
 19127  	ADDSS (DX)(R8*4), X2
 19128  	MOVSS X2, (DX)(R8*4)
 19129  	ADDQ  BX, R8
 19130  	ADDSS (DX)(R8*4), X3
 19131  	MOVSS X3, (DX)(R8*4)
 19132  	ADDQ  BX, R8
 19133  	ADDSS (DX)(R8*4), X4
 19134  	MOVSS X4, (DX)(R8*4)
 19135  	ADDQ  BX, R8
 19136  	SUBQ  $0x04, SI
 19137  
 19138  check_limit_unroll:
 19139  	CMPQ SI, $0x04
 19140  	JHS  loop_unroll
 19141  	JMP  check_limit
 19142  
 19143  loop:
 19144  	MOVSS (AX)(DI*4), X1
 19145  	MULSS X0, X1
 19146  	ADDSS (DX)(R8*4), X1
 19147  	MOVSS X1, (DX)(R8*4)
 19148  	DECQ  SI
 19149  	ADDQ  CX, DI
 19150  	ADDQ  BX, R8
 19151  
 19152  check_limit:
 19153  	CMPQ SI, $0x00
 19154  	JHI  loop
 19155  	RET
 19156  
 19157  // func AmdAxpyUnsafeXInterleave_V0A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19158  // Requires: SSE
 19159  TEXT ·AmdAxpyUnsafeXInterleave_V0A15R4(SB), NOSPLIT, $0-48
 19160  	MOVSS alpha+0(FP), X0
 19161  	MOVQ  xs+8(FP), AX
 19162  	MOVQ  incx+16(FP), CX
 19163  	MOVQ  ys+24(FP), DX
 19164  	MOVQ  incy+32(FP), BX
 19165  	MOVQ  n+40(FP), SI
 19166  	XORQ  DI, DI
 19167  	XORQ  R8, R8
 19168  	JMP   check_limit_unroll
 19169  	PCALIGN $0x08
 19170  	NOP
 19171  	NOP
 19172  	NOP
 19173  	NOP
 19174  	NOP
 19175  	NOP
 19176  	NOP
 19177  
 19178  loop_unroll:
 19179  	MOVSS (AX)(DI*4), X1
 19180  	ADDQ  CX, DI
 19181  	MOVSS (AX)(DI*4), X2
 19182  	ADDQ  CX, DI
 19183  	MOVSS (AX)(DI*4), X3
 19184  	ADDQ  CX, DI
 19185  	MOVSS (AX)(DI*4), X4
 19186  	ADDQ  CX, DI
 19187  	MULSS X0, X1
 19188  	MULSS X0, X2
 19189  	MULSS X0, X3
 19190  	MULSS X0, X4
 19191  	ADDSS (DX)(R8*4), X1
 19192  	MOVSS X1, (DX)(R8*4)
 19193  	ADDQ  BX, R8
 19194  	ADDSS (DX)(R8*4), X2
 19195  	MOVSS X2, (DX)(R8*4)
 19196  	ADDQ  BX, R8
 19197  	ADDSS (DX)(R8*4), X3
 19198  	MOVSS X3, (DX)(R8*4)
 19199  	ADDQ  BX, R8
 19200  	ADDSS (DX)(R8*4), X4
 19201  	MOVSS X4, (DX)(R8*4)
 19202  	ADDQ  BX, R8
 19203  	SUBQ  $0x04, SI
 19204  
 19205  check_limit_unroll:
 19206  	CMPQ SI, $0x04
 19207  	JHS  loop_unroll
 19208  	JMP  check_limit
 19209  
 19210  loop:
 19211  	MOVSS (AX)(DI*4), X1
 19212  	MULSS X0, X1
 19213  	ADDSS (DX)(R8*4), X1
 19214  	MOVSS X1, (DX)(R8*4)
 19215  	DECQ  SI
 19216  	ADDQ  CX, DI
 19217  	ADDQ  BX, R8
 19218  
 19219  check_limit:
 19220  	CMPQ SI, $0x00
 19221  	JHI  loop
 19222  	RET
 19223  
 19224  // func AmdAxpyUnsafeXInterleave_V1A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19225  // Requires: SSE
 19226  TEXT ·AmdAxpyUnsafeXInterleave_V1A15R4(SB), NOSPLIT, $0-48
 19227  	MOVSS alpha+0(FP), X0
 19228  	MOVQ  xs+8(FP), AX
 19229  	MOVQ  incx+16(FP), CX
 19230  	MOVQ  ys+24(FP), DX
 19231  	MOVQ  incy+32(FP), BX
 19232  	MOVQ  n+40(FP), SI
 19233  	XORQ  DI, DI
 19234  	XORQ  R8, R8
 19235  	JMP   check_limit_unroll
 19236  	PCALIGN $0x08
 19237  	NOP
 19238  	NOP
 19239  	NOP
 19240  	NOP
 19241  	NOP
 19242  	NOP
 19243  	NOP
 19244  
 19245  loop_unroll:
 19246  	MOVSS (AX)(DI*4), X1
 19247  	ADDQ  CX, DI
 19248  	MOVSS (AX)(DI*4), X2
 19249  	ADDQ  CX, DI
 19250  	MOVSS (AX)(DI*4), X3
 19251  	ADDQ  CX, DI
 19252  	MOVSS (AX)(DI*4), X4
 19253  	ADDQ  CX, DI
 19254  	MULSS X0, X1
 19255  	MULSS X0, X2
 19256  	MULSS X0, X3
 19257  	MULSS X0, X4
 19258  	ADDSS (DX)(R8*4), X1
 19259  	MOVSS X1, (DX)(R8*4)
 19260  	ADDQ  BX, R8
 19261  	ADDSS (DX)(R8*4), X2
 19262  	MOVSS X2, (DX)(R8*4)
 19263  	ADDQ  BX, R8
 19264  	ADDSS (DX)(R8*4), X3
 19265  	MOVSS X3, (DX)(R8*4)
 19266  	ADDQ  BX, R8
 19267  	ADDSS (DX)(R8*4), X4
 19268  	MOVSS X4, (DX)(R8*4)
 19269  	ADDQ  BX, R8
 19270  	SUBQ  $0x04, SI
 19271  
 19272  check_limit_unroll:
 19273  	CMPQ SI, $0x04
 19274  	JHS  loop_unroll
 19275  	JMP  check_limit
 19276  
 19277  loop:
 19278  	MOVSS (AX)(DI*4), X1
 19279  	MULSS X0, X1
 19280  	ADDSS (DX)(R8*4), X1
 19281  	MOVSS X1, (DX)(R8*4)
 19282  	DECQ  SI
 19283  	ADDQ  CX, DI
 19284  	ADDQ  BX, R8
 19285  
 19286  check_limit:
 19287  	CMPQ SI, $0x00
 19288  	JHI  loop
 19289  	RET
 19290  
 19291  // func AmdAxpyUnsafeXInterleave_V2A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19292  // Requires: SSE
 19293  TEXT ·AmdAxpyUnsafeXInterleave_V2A15R4(SB), NOSPLIT, $0-48
 19294  	MOVSS alpha+0(FP), X0
 19295  	MOVQ  xs+8(FP), AX
 19296  	MOVQ  incx+16(FP), CX
 19297  	MOVQ  ys+24(FP), DX
 19298  	MOVQ  incy+32(FP), BX
 19299  	MOVQ  n+40(FP), SI
 19300  	XORQ  DI, DI
 19301  	XORQ  R8, R8
 19302  	JMP   check_limit_unroll
 19303  	PCALIGN $0x08
 19304  	NOP
 19305  	NOP
 19306  	NOP
 19307  	NOP
 19308  	NOP
 19309  	NOP
 19310  	NOP
 19311  
 19312  loop_unroll:
 19313  	MOVSS (AX)(DI*4), X1
 19314  	ADDQ  CX, DI
 19315  	MOVSS (AX)(DI*4), X2
 19316  	ADDQ  CX, DI
 19317  	MOVSS (AX)(DI*4), X3
 19318  	ADDQ  CX, DI
 19319  	MOVSS (AX)(DI*4), X4
 19320  	ADDQ  CX, DI
 19321  	MULSS X0, X1
 19322  	MULSS X0, X2
 19323  	MULSS X0, X3
 19324  	MULSS X0, X4
 19325  	ADDSS (DX)(R8*4), X1
 19326  	MOVSS X1, (DX)(R8*4)
 19327  	ADDQ  BX, R8
 19328  	ADDSS (DX)(R8*4), X2
 19329  	MOVSS X2, (DX)(R8*4)
 19330  	ADDQ  BX, R8
 19331  	ADDSS (DX)(R8*4), X3
 19332  	MOVSS X3, (DX)(R8*4)
 19333  	ADDQ  BX, R8
 19334  	ADDSS (DX)(R8*4), X4
 19335  	MOVSS X4, (DX)(R8*4)
 19336  	ADDQ  BX, R8
 19337  	SUBQ  $0x04, SI
 19338  
 19339  check_limit_unroll:
 19340  	CMPQ SI, $0x04
 19341  	JHS  loop_unroll
 19342  	JMP  check_limit
 19343  
 19344  loop:
 19345  	MOVSS (AX)(DI*4), X1
 19346  	MULSS X0, X1
 19347  	ADDSS (DX)(R8*4), X1
 19348  	MOVSS X1, (DX)(R8*4)
 19349  	DECQ  SI
 19350  	ADDQ  CX, DI
 19351  	ADDQ  BX, R8
 19352  
 19353  check_limit:
 19354  	CMPQ SI, $0x00
 19355  	JHI  loop
 19356  	RET
 19357  
 19358  // func AmdAxpyUnsafeXInterleave_V3A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19359  // Requires: SSE
 19360  TEXT ·AmdAxpyUnsafeXInterleave_V3A15R4(SB), NOSPLIT, $0-48
 19361  	MOVSS alpha+0(FP), X0
 19362  	MOVQ  xs+8(FP), AX
 19363  	MOVQ  incx+16(FP), CX
 19364  	MOVQ  ys+24(FP), DX
 19365  	MOVQ  incy+32(FP), BX
 19366  	MOVQ  n+40(FP), SI
 19367  	XORQ  DI, DI
 19368  	XORQ  R8, R8
 19369  	JMP   check_limit_unroll
 19370  	PCALIGN $0x08
 19371  	NOP
 19372  	NOP
 19373  	NOP
 19374  	NOP
 19375  	NOP
 19376  	NOP
 19377  	NOP
 19378  
 19379  loop_unroll:
 19380  	MOVSS (AX)(DI*4), X1
 19381  	ADDQ  CX, DI
 19382  	MOVSS (AX)(DI*4), X2
 19383  	ADDQ  CX, DI
 19384  	MOVSS (AX)(DI*4), X3
 19385  	ADDQ  CX, DI
 19386  	MOVSS (AX)(DI*4), X4
 19387  	ADDQ  CX, DI
 19388  	MULSS X0, X1
 19389  	MULSS X0, X2
 19390  	MULSS X0, X3
 19391  	MULSS X0, X4
 19392  	ADDSS (DX)(R8*4), X1
 19393  	MOVSS X1, (DX)(R8*4)
 19394  	ADDQ  BX, R8
 19395  	ADDSS (DX)(R8*4), X2
 19396  	MOVSS X2, (DX)(R8*4)
 19397  	ADDQ  BX, R8
 19398  	ADDSS (DX)(R8*4), X3
 19399  	MOVSS X3, (DX)(R8*4)
 19400  	ADDQ  BX, R8
 19401  	ADDSS (DX)(R8*4), X4
 19402  	MOVSS X4, (DX)(R8*4)
 19403  	ADDQ  BX, R8
 19404  	SUBQ  $0x04, SI
 19405  
 19406  check_limit_unroll:
 19407  	CMPQ SI, $0x04
 19408  	JHS  loop_unroll
 19409  	JMP  check_limit
 19410  
 19411  loop:
 19412  	MOVSS (AX)(DI*4), X1
 19413  	MULSS X0, X1
 19414  	ADDSS (DX)(R8*4), X1
 19415  	MOVSS X1, (DX)(R8*4)
 19416  	DECQ  SI
 19417  	ADDQ  CX, DI
 19418  	ADDQ  BX, R8
 19419  
 19420  check_limit:
 19421  	CMPQ SI, $0x00
 19422  	JHI  loop
 19423  	RET
 19424  
 19425  // func AmdAxpyUnsafeXInterleave_V4A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19426  // Requires: SSE
 19427  TEXT ·AmdAxpyUnsafeXInterleave_V4A15R4(SB), NOSPLIT, $0-48
 19428  	MOVSS alpha+0(FP), X0
 19429  	MOVQ  xs+8(FP), AX
 19430  	MOVQ  incx+16(FP), CX
 19431  	MOVQ  ys+24(FP), DX
 19432  	MOVQ  incy+32(FP), BX
 19433  	MOVQ  n+40(FP), SI
 19434  	XORQ  DI, DI
 19435  	XORQ  R8, R8
 19436  	JMP   check_limit_unroll
 19437  	PCALIGN $0x08
 19438  	NOP
 19439  	NOP
 19440  	NOP
 19441  	NOP
 19442  	NOP
 19443  	NOP
 19444  	NOP
 19445  
 19446  loop_unroll:
 19447  	MOVSS (AX)(DI*4), X1
 19448  	ADDQ  CX, DI
 19449  	MOVSS (AX)(DI*4), X2
 19450  	ADDQ  CX, DI
 19451  	MOVSS (AX)(DI*4), X3
 19452  	ADDQ  CX, DI
 19453  	MOVSS (AX)(DI*4), X4
 19454  	ADDQ  CX, DI
 19455  	MULSS X0, X1
 19456  	MULSS X0, X2
 19457  	MULSS X0, X3
 19458  	MULSS X0, X4
 19459  	ADDSS (DX)(R8*4), X1
 19460  	MOVSS X1, (DX)(R8*4)
 19461  	ADDQ  BX, R8
 19462  	ADDSS (DX)(R8*4), X2
 19463  	MOVSS X2, (DX)(R8*4)
 19464  	ADDQ  BX, R8
 19465  	ADDSS (DX)(R8*4), X3
 19466  	MOVSS X3, (DX)(R8*4)
 19467  	ADDQ  BX, R8
 19468  	ADDSS (DX)(R8*4), X4
 19469  	MOVSS X4, (DX)(R8*4)
 19470  	ADDQ  BX, R8
 19471  	SUBQ  $0x04, SI
 19472  
 19473  check_limit_unroll:
 19474  	CMPQ SI, $0x04
 19475  	JHS  loop_unroll
 19476  	JMP  check_limit
 19477  
 19478  loop:
 19479  	MOVSS (AX)(DI*4), X1
 19480  	MULSS X0, X1
 19481  	ADDSS (DX)(R8*4), X1
 19482  	MOVSS X1, (DX)(R8*4)
 19483  	DECQ  SI
 19484  	ADDQ  CX, DI
 19485  	ADDQ  BX, R8
 19486  
 19487  check_limit:
 19488  	CMPQ SI, $0x00
 19489  	JHI  loop
 19490  	RET
 19491  
 19492  // func AmdAxpyUnsafeXInterleave_V5A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19493  // Requires: SSE
 19494  TEXT ·AmdAxpyUnsafeXInterleave_V5A15R4(SB), NOSPLIT, $0-48
 19495  	MOVSS alpha+0(FP), X0
 19496  	MOVQ  xs+8(FP), AX
 19497  	MOVQ  incx+16(FP), CX
 19498  	MOVQ  ys+24(FP), DX
 19499  	MOVQ  incy+32(FP), BX
 19500  	MOVQ  n+40(FP), SI
 19501  	XORQ  DI, DI
 19502  	XORQ  R8, R8
 19503  	JMP   check_limit_unroll
 19504  	PCALIGN $0x08
 19505  	NOP
 19506  	NOP
 19507  	NOP
 19508  	NOP
 19509  	NOP
 19510  	NOP
 19511  	NOP
 19512  
 19513  loop_unroll:
 19514  	MOVSS (AX)(DI*4), X1
 19515  	ADDQ  CX, DI
 19516  	MOVSS (AX)(DI*4), X2
 19517  	ADDQ  CX, DI
 19518  	MOVSS (AX)(DI*4), X3
 19519  	ADDQ  CX, DI
 19520  	MOVSS (AX)(DI*4), X4
 19521  	ADDQ  CX, DI
 19522  	MULSS X0, X1
 19523  	MULSS X0, X2
 19524  	MULSS X0, X3
 19525  	MULSS X0, X4
 19526  	ADDSS (DX)(R8*4), X1
 19527  	MOVSS X1, (DX)(R8*4)
 19528  	ADDQ  BX, R8
 19529  	ADDSS (DX)(R8*4), X2
 19530  	MOVSS X2, (DX)(R8*4)
 19531  	ADDQ  BX, R8
 19532  	ADDSS (DX)(R8*4), X3
 19533  	MOVSS X3, (DX)(R8*4)
 19534  	ADDQ  BX, R8
 19535  	ADDSS (DX)(R8*4), X4
 19536  	MOVSS X4, (DX)(R8*4)
 19537  	ADDQ  BX, R8
 19538  	SUBQ  $0x04, SI
 19539  
 19540  check_limit_unroll:
 19541  	CMPQ SI, $0x04
 19542  	JHS  loop_unroll
 19543  	JMP  check_limit
 19544  
 19545  loop:
 19546  	MOVSS (AX)(DI*4), X1
 19547  	MULSS X0, X1
 19548  	ADDSS (DX)(R8*4), X1
 19549  	MOVSS X1, (DX)(R8*4)
 19550  	DECQ  SI
 19551  	ADDQ  CX, DI
 19552  	ADDQ  BX, R8
 19553  
 19554  check_limit:
 19555  	CMPQ SI, $0x00
 19556  	JHI  loop
 19557  	RET
 19558  
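// Note: the A16 variants below request loop alignment with a single PCALIGN $0x10
// directive instead of the PCALIGN $0x08 plus NOP padding used by the A10-A15 variants.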
 19559  // func AmdAxpyUnsafeXInterleave_V0A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19560  // Requires: SSE
 19561  TEXT ·AmdAxpyUnsafeXInterleave_V0A16R4(SB), NOSPLIT, $0-48
 19562  	MOVSS alpha+0(FP), X0
 19563  	MOVQ  xs+8(FP), AX
 19564  	MOVQ  incx+16(FP), CX
 19565  	MOVQ  ys+24(FP), DX
 19566  	MOVQ  incy+32(FP), BX
 19567  	MOVQ  n+40(FP), SI
 19568  	XORQ  DI, DI
 19569  	XORQ  R8, R8
 19570  	JMP   check_limit_unroll
 19571  	PCALIGN $0x10
 19572  
 19573  loop_unroll:
 19574  	MOVSS (AX)(DI*4), X1
 19575  	ADDQ  CX, DI
 19576  	MOVSS (AX)(DI*4), X2
 19577  	ADDQ  CX, DI
 19578  	MOVSS (AX)(DI*4), X3
 19579  	ADDQ  CX, DI
 19580  	MOVSS (AX)(DI*4), X4
 19581  	ADDQ  CX, DI
 19582  	MULSS X0, X1
 19583  	MULSS X0, X2
 19584  	MULSS X0, X3
 19585  	MULSS X0, X4
 19586  	ADDSS (DX)(R8*4), X1
 19587  	MOVSS X1, (DX)(R8*4)
 19588  	ADDQ  BX, R8
 19589  	ADDSS (DX)(R8*4), X2
 19590  	MOVSS X2, (DX)(R8*4)
 19591  	ADDQ  BX, R8
 19592  	ADDSS (DX)(R8*4), X3
 19593  	MOVSS X3, (DX)(R8*4)
 19594  	ADDQ  BX, R8
 19595  	ADDSS (DX)(R8*4), X4
 19596  	MOVSS X4, (DX)(R8*4)
 19597  	ADDQ  BX, R8
 19598  	SUBQ  $0x04, SI
 19599  
 19600  check_limit_unroll:
 19601  	CMPQ SI, $0x04
 19602  	JHS  loop_unroll
 19603  	JMP  check_limit
 19604  
 19605  loop:
 19606  	MOVSS (AX)(DI*4), X1
 19607  	MULSS X0, X1
 19608  	ADDSS (DX)(R8*4), X1
 19609  	MOVSS X1, (DX)(R8*4)
 19610  	DECQ  SI
 19611  	ADDQ  CX, DI
 19612  	ADDQ  BX, R8
 19613  
 19614  check_limit:
 19615  	CMPQ SI, $0x00
 19616  	JHI  loop
 19617  	RET
 19618  
 19619  // func AmdAxpyUnsafeXInterleave_V1A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19620  // Requires: SSE
 19621  TEXT ·AmdAxpyUnsafeXInterleave_V1A16R4(SB), NOSPLIT, $0-48
 19622  	MOVSS alpha+0(FP), X0
 19623  	MOVQ  xs+8(FP), AX
 19624  	MOVQ  incx+16(FP), CX
 19625  	MOVQ  ys+24(FP), DX
 19626  	MOVQ  incy+32(FP), BX
 19627  	MOVQ  n+40(FP), SI
 19628  	XORQ  DI, DI
 19629  	XORQ  R8, R8
 19630  	JMP   check_limit_unroll
 19631  	PCALIGN $0x10
 19632  
 19633  loop_unroll:
 19634  	MOVSS (AX)(DI*4), X1
 19635  	ADDQ  CX, DI
 19636  	MOVSS (AX)(DI*4), X2
 19637  	ADDQ  CX, DI
 19638  	MOVSS (AX)(DI*4), X3
 19639  	ADDQ  CX, DI
 19640  	MOVSS (AX)(DI*4), X4
 19641  	ADDQ  CX, DI
 19642  	MULSS X0, X1
 19643  	MULSS X0, X2
 19644  	MULSS X0, X3
 19645  	MULSS X0, X4
 19646  	ADDSS (DX)(R8*4), X1
 19647  	MOVSS X1, (DX)(R8*4)
 19648  	ADDQ  BX, R8
 19649  	ADDSS (DX)(R8*4), X2
 19650  	MOVSS X2, (DX)(R8*4)
 19651  	ADDQ  BX, R8
 19652  	ADDSS (DX)(R8*4), X3
 19653  	MOVSS X3, (DX)(R8*4)
 19654  	ADDQ  BX, R8
 19655  	ADDSS (DX)(R8*4), X4
 19656  	MOVSS X4, (DX)(R8*4)
 19657  	ADDQ  BX, R8
 19658  	SUBQ  $0x04, SI
 19659  
 19660  check_limit_unroll:
 19661  	CMPQ SI, $0x04
 19662  	JHS  loop_unroll
 19663  	JMP  check_limit
 19664  
 19665  loop:
 19666  	MOVSS (AX)(DI*4), X1
 19667  	MULSS X0, X1
 19668  	ADDSS (DX)(R8*4), X1
 19669  	MOVSS X1, (DX)(R8*4)
 19670  	DECQ  SI
 19671  	ADDQ  CX, DI
 19672  	ADDQ  BX, R8
 19673  
 19674  check_limit:
 19675  	CMPQ SI, $0x00
 19676  	JHI  loop
 19677  	RET
 19678  
 19679  // func AmdAxpyUnsafeXInterleave_V2A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19680  // Requires: SSE
 19681  TEXT ·AmdAxpyUnsafeXInterleave_V2A16R4(SB), NOSPLIT, $0-48
 19682  	MOVSS alpha+0(FP), X0
 19683  	MOVQ  xs+8(FP), AX
 19684  	MOVQ  incx+16(FP), CX
 19685  	MOVQ  ys+24(FP), DX
 19686  	MOVQ  incy+32(FP), BX
 19687  	MOVQ  n+40(FP), SI
 19688  	XORQ  DI, DI
 19689  	XORQ  R8, R8
 19690  	JMP   check_limit_unroll
 19691  	PCALIGN $0x10
 19692  
 19693  loop_unroll:
 19694  	MOVSS (AX)(DI*4), X1
 19695  	ADDQ  CX, DI
 19696  	MOVSS (AX)(DI*4), X2
 19697  	ADDQ  CX, DI
 19698  	MOVSS (AX)(DI*4), X3
 19699  	ADDQ  CX, DI
 19700  	MOVSS (AX)(DI*4), X4
 19701  	ADDQ  CX, DI
 19702  	MULSS X0, X1
 19703  	MULSS X0, X2
 19704  	MULSS X0, X3
 19705  	MULSS X0, X4
 19706  	ADDSS (DX)(R8*4), X1
 19707  	MOVSS X1, (DX)(R8*4)
 19708  	ADDQ  BX, R8
 19709  	ADDSS (DX)(R8*4), X2
 19710  	MOVSS X2, (DX)(R8*4)
 19711  	ADDQ  BX, R8
 19712  	ADDSS (DX)(R8*4), X3
 19713  	MOVSS X3, (DX)(R8*4)
 19714  	ADDQ  BX, R8
 19715  	ADDSS (DX)(R8*4), X4
 19716  	MOVSS X4, (DX)(R8*4)
 19717  	ADDQ  BX, R8
 19718  	SUBQ  $0x04, SI
 19719  
 19720  check_limit_unroll:
 19721  	CMPQ SI, $0x04
 19722  	JHS  loop_unroll
 19723  	JMP  check_limit
 19724  
 19725  loop:
 19726  	MOVSS (AX)(DI*4), X1
 19727  	MULSS X0, X1
 19728  	ADDSS (DX)(R8*4), X1
 19729  	MOVSS X1, (DX)(R8*4)
 19730  	DECQ  SI
 19731  	ADDQ  CX, DI
 19732  	ADDQ  BX, R8
 19733  
 19734  check_limit:
 19735  	CMPQ SI, $0x00
 19736  	JHI  loop
 19737  	RET
 19738  
 19739  // func AmdAxpyUnsafeXInterleave_V3A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19740  // Requires: SSE
 19741  TEXT ·AmdAxpyUnsafeXInterleave_V3A16R4(SB), NOSPLIT, $0-48
 19742  	MOVSS alpha+0(FP), X0
 19743  	MOVQ  xs+8(FP), AX
 19744  	MOVQ  incx+16(FP), CX
 19745  	MOVQ  ys+24(FP), DX
 19746  	MOVQ  incy+32(FP), BX
 19747  	MOVQ  n+40(FP), SI
 19748  	XORQ  DI, DI
 19749  	XORQ  R8, R8
 19750  	JMP   check_limit_unroll
 19751  	PCALIGN $0x10
 19752  
 19753  loop_unroll:
 19754  	MOVSS (AX)(DI*4), X1
 19755  	ADDQ  CX, DI
 19756  	MOVSS (AX)(DI*4), X2
 19757  	ADDQ  CX, DI
 19758  	MOVSS (AX)(DI*4), X3
 19759  	ADDQ  CX, DI
 19760  	MOVSS (AX)(DI*4), X4
 19761  	ADDQ  CX, DI
 19762  	MULSS X0, X1
 19763  	MULSS X0, X2
 19764  	MULSS X0, X3
 19765  	MULSS X0, X4
 19766  	ADDSS (DX)(R8*4), X1
 19767  	MOVSS X1, (DX)(R8*4)
 19768  	ADDQ  BX, R8
 19769  	ADDSS (DX)(R8*4), X2
 19770  	MOVSS X2, (DX)(R8*4)
 19771  	ADDQ  BX, R8
 19772  	ADDSS (DX)(R8*4), X3
 19773  	MOVSS X3, (DX)(R8*4)
 19774  	ADDQ  BX, R8
 19775  	ADDSS (DX)(R8*4), X4
 19776  	MOVSS X4, (DX)(R8*4)
 19777  	ADDQ  BX, R8
 19778  	SUBQ  $0x04, SI
 19779  
 19780  check_limit_unroll:
 19781  	CMPQ SI, $0x04
 19782  	JHS  loop_unroll
 19783  	JMP  check_limit
 19784  
 19785  loop:
 19786  	MOVSS (AX)(DI*4), X1
 19787  	MULSS X0, X1
 19788  	ADDSS (DX)(R8*4), X1
 19789  	MOVSS X1, (DX)(R8*4)
 19790  	DECQ  SI
 19791  	ADDQ  CX, DI
 19792  	ADDQ  BX, R8
 19793  
 19794  check_limit:
 19795  	CMPQ SI, $0x00
 19796  	JHI  loop
 19797  	RET
 19798  
 19799  // func AmdAxpyUnsafeXInterleave_V4A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19800  // Requires: SSE
 19801  TEXT ·AmdAxpyUnsafeXInterleave_V4A16R4(SB), NOSPLIT, $0-48
 19802  	MOVSS alpha+0(FP), X0
 19803  	MOVQ  xs+8(FP), AX
 19804  	MOVQ  incx+16(FP), CX
 19805  	MOVQ  ys+24(FP), DX
 19806  	MOVQ  incy+32(FP), BX
 19807  	MOVQ  n+40(FP), SI
 19808  	XORQ  DI, DI
 19809  	XORQ  R8, R8
 19810  	JMP   check_limit_unroll
 19811  	PCALIGN $0x10
 19812  
 19813  loop_unroll:
 19814  	MOVSS (AX)(DI*4), X1
 19815  	ADDQ  CX, DI
 19816  	MOVSS (AX)(DI*4), X2
 19817  	ADDQ  CX, DI
 19818  	MOVSS (AX)(DI*4), X3
 19819  	ADDQ  CX, DI
 19820  	MOVSS (AX)(DI*4), X4
 19821  	ADDQ  CX, DI
 19822  	MULSS X0, X1
 19823  	MULSS X0, X2
 19824  	MULSS X0, X3
 19825  	MULSS X0, X4
 19826  	ADDSS (DX)(R8*4), X1
 19827  	MOVSS X1, (DX)(R8*4)
 19828  	ADDQ  BX, R8
 19829  	ADDSS (DX)(R8*4), X2
 19830  	MOVSS X2, (DX)(R8*4)
 19831  	ADDQ  BX, R8
 19832  	ADDSS (DX)(R8*4), X3
 19833  	MOVSS X3, (DX)(R8*4)
 19834  	ADDQ  BX, R8
 19835  	ADDSS (DX)(R8*4), X4
 19836  	MOVSS X4, (DX)(R8*4)
 19837  	ADDQ  BX, R8
 19838  	SUBQ  $0x04, SI
 19839  
 19840  check_limit_unroll:
 19841  	CMPQ SI, $0x04
 19842  	JHS  loop_unroll
 19843  	JMP  check_limit
 19844  
 19845  loop:
 19846  	MOVSS (AX)(DI*4), X1
 19847  	MULSS X0, X1
 19848  	ADDSS (DX)(R8*4), X1
 19849  	MOVSS X1, (DX)(R8*4)
 19850  	DECQ  SI
 19851  	ADDQ  CX, DI
 19852  	ADDQ  BX, R8
 19853  
 19854  check_limit:
 19855  	CMPQ SI, $0x00
 19856  	JHI  loop
 19857  	RET
 19858  
 19859  // func AmdAxpyUnsafeXInterleave_V5A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19860  // Requires: SSE
 19861  TEXT ·AmdAxpyUnsafeXInterleave_V5A16R4(SB), NOSPLIT, $0-48
 19862  	MOVSS alpha+0(FP), X0
 19863  	MOVQ  xs+8(FP), AX
 19864  	MOVQ  incx+16(FP), CX
 19865  	MOVQ  ys+24(FP), DX
 19866  	MOVQ  incy+32(FP), BX
 19867  	MOVQ  n+40(FP), SI
 19868  	XORQ  DI, DI
 19869  	XORQ  R8, R8
 19870  	JMP   check_limit_unroll
 19871  	PCALIGN $0x10
 19872  
 19873  loop_unroll:
 19874  	MOVSS (AX)(DI*4), X1
 19875  	ADDQ  CX, DI
 19876  	MOVSS (AX)(DI*4), X2
 19877  	ADDQ  CX, DI
 19878  	MOVSS (AX)(DI*4), X3
 19879  	ADDQ  CX, DI
 19880  	MOVSS (AX)(DI*4), X4
 19881  	ADDQ  CX, DI
 19882  	MULSS X0, X1
 19883  	MULSS X0, X2
 19884  	MULSS X0, X3
 19885  	MULSS X0, X4
 19886  	ADDSS (DX)(R8*4), X1
 19887  	MOVSS X1, (DX)(R8*4)
 19888  	ADDQ  BX, R8
 19889  	ADDSS (DX)(R8*4), X2
 19890  	MOVSS X2, (DX)(R8*4)
 19891  	ADDQ  BX, R8
 19892  	ADDSS (DX)(R8*4), X3
 19893  	MOVSS X3, (DX)(R8*4)
 19894  	ADDQ  BX, R8
 19895  	ADDSS (DX)(R8*4), X4
 19896  	MOVSS X4, (DX)(R8*4)
 19897  	ADDQ  BX, R8
 19898  	SUBQ  $0x04, SI
 19899  
 19900  check_limit_unroll:
 19901  	CMPQ SI, $0x04
 19902  	JHS  loop_unroll
 19903  	JMP  check_limit
 19904  
 19905  loop:
 19906  	MOVSS (AX)(DI*4), X1
 19907  	MULSS X0, X1
 19908  	ADDSS (DX)(R8*4), X1
 19909  	MOVSS X1, (DX)(R8*4)
 19910  	DECQ  SI
 19911  	ADDQ  CX, DI
 19912  	ADDQ  BX, R8
 19913  
 19914  check_limit:
 19915  	CMPQ SI, $0x00
 19916  	JHI  loop
 19917  	RET
 19918  
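// Note: from here on the R8 variants unroll by eight: eight x elements are
// loaded into X1-X8 and scaled before any of the eight y updates are issued.
// The VnAmRk suffix appears to encode a variant copy number (V), the
// loop-entry alignment/padding in bytes (A), and the unroll factor (R); the
// V0-V5 bodies look identical and presumably exist to sample code-placement
// effects.
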
 19919  // func AmdAxpyUnsafeXInterleave_V0A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 19920  // Requires: SSE
 19921  TEXT ·AmdAxpyUnsafeXInterleave_V0A0R8(SB), NOSPLIT, $0-48
 19922  	MOVSS alpha+0(FP), X0
 19923  	MOVQ  xs+8(FP), AX
 19924  	MOVQ  incx+16(FP), CX
 19925  	MOVQ  ys+24(FP), DX
 19926  	MOVQ  incy+32(FP), BX
 19927  	MOVQ  n+40(FP), SI
 19928  	XORQ  DI, DI
 19929  	XORQ  R8, R8
 19930  	JMP   check_limit_unroll
 19931  
 19932  loop_unroll:
 19933  	MOVSS (AX)(DI*4), X1
 19934  	ADDQ  CX, DI
 19935  	MOVSS (AX)(DI*4), X2
 19936  	ADDQ  CX, DI
 19937  	MOVSS (AX)(DI*4), X3
 19938  	ADDQ  CX, DI
 19939  	MOVSS (AX)(DI*4), X4
 19940  	ADDQ  CX, DI
 19941  	MOVSS (AX)(DI*4), X5
 19942  	ADDQ  CX, DI
 19943  	MOVSS (AX)(DI*4), X6
 19944  	ADDQ  CX, DI
 19945  	MOVSS (AX)(DI*4), X7
 19946  	ADDQ  CX, DI
 19947  	MOVSS (AX)(DI*4), X8
 19948  	ADDQ  CX, DI
 19949  	MULSS X0, X1
 19950  	MULSS X0, X2
 19951  	MULSS X0, X3
 19952  	MULSS X0, X4
 19953  	MULSS X0, X5
 19954  	MULSS X0, X6
 19955  	MULSS X0, X7
 19956  	MULSS X0, X8
 19957  	ADDSS (DX)(R8*4), X1
 19958  	MOVSS X1, (DX)(R8*4)
 19959  	ADDQ  BX, R8
 19960  	ADDSS (DX)(R8*4), X2
 19961  	MOVSS X2, (DX)(R8*4)
 19962  	ADDQ  BX, R8
 19963  	ADDSS (DX)(R8*4), X3
 19964  	MOVSS X3, (DX)(R8*4)
 19965  	ADDQ  BX, R8
 19966  	ADDSS (DX)(R8*4), X4
 19967  	MOVSS X4, (DX)(R8*4)
 19968  	ADDQ  BX, R8
 19969  	ADDSS (DX)(R8*4), X5
 19970  	MOVSS X5, (DX)(R8*4)
 19971  	ADDQ  BX, R8
 19972  	ADDSS (DX)(R8*4), X6
 19973  	MOVSS X6, (DX)(R8*4)
 19974  	ADDQ  BX, R8
 19975  	ADDSS (DX)(R8*4), X7
 19976  	MOVSS X7, (DX)(R8*4)
 19977  	ADDQ  BX, R8
 19978  	ADDSS (DX)(R8*4), X8
 19979  	MOVSS X8, (DX)(R8*4)
 19980  	ADDQ  BX, R8
 19981  	SUBQ  $0x08, SI
 19982  
 19983  check_limit_unroll:
 19984  	CMPQ SI, $0x08
 19985  	JHS  loop_unroll
 19986  	JMP  check_limit
 19987  
 19988  loop:
 19989  	MOVSS (AX)(DI*4), X1
 19990  	MULSS X0, X1
 19991  	ADDSS (DX)(R8*4), X1
 19992  	MOVSS X1, (DX)(R8*4)
 19993  	DECQ  SI
 19994  	ADDQ  CX, DI
 19995  	ADDQ  BX, R8
 19996  
 19997  check_limit:
 19998  	CMPQ SI, $0x00
 19999  	JHI  loop
 20000  	RET
 20001  
 20002  // func AmdAxpyUnsafeXInterleave_V1A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20003  // Requires: SSE
 20004  TEXT ·AmdAxpyUnsafeXInterleave_V1A0R8(SB), NOSPLIT, $0-48
 20005  	MOVSS alpha+0(FP), X0
 20006  	MOVQ  xs+8(FP), AX
 20007  	MOVQ  incx+16(FP), CX
 20008  	MOVQ  ys+24(FP), DX
 20009  	MOVQ  incy+32(FP), BX
 20010  	MOVQ  n+40(FP), SI
 20011  	XORQ  DI, DI
 20012  	XORQ  R8, R8
 20013  	JMP   check_limit_unroll
 20014  
 20015  loop_unroll:
 20016  	MOVSS (AX)(DI*4), X1
 20017  	ADDQ  CX, DI
 20018  	MOVSS (AX)(DI*4), X2
 20019  	ADDQ  CX, DI
 20020  	MOVSS (AX)(DI*4), X3
 20021  	ADDQ  CX, DI
 20022  	MOVSS (AX)(DI*4), X4
 20023  	ADDQ  CX, DI
 20024  	MOVSS (AX)(DI*4), X5
 20025  	ADDQ  CX, DI
 20026  	MOVSS (AX)(DI*4), X6
 20027  	ADDQ  CX, DI
 20028  	MOVSS (AX)(DI*4), X7
 20029  	ADDQ  CX, DI
 20030  	MOVSS (AX)(DI*4), X8
 20031  	ADDQ  CX, DI
 20032  	MULSS X0, X1
 20033  	MULSS X0, X2
 20034  	MULSS X0, X3
 20035  	MULSS X0, X4
 20036  	MULSS X0, X5
 20037  	MULSS X0, X6
 20038  	MULSS X0, X7
 20039  	MULSS X0, X8
 20040  	ADDSS (DX)(R8*4), X1
 20041  	MOVSS X1, (DX)(R8*4)
 20042  	ADDQ  BX, R8
 20043  	ADDSS (DX)(R8*4), X2
 20044  	MOVSS X2, (DX)(R8*4)
 20045  	ADDQ  BX, R8
 20046  	ADDSS (DX)(R8*4), X3
 20047  	MOVSS X3, (DX)(R8*4)
 20048  	ADDQ  BX, R8
 20049  	ADDSS (DX)(R8*4), X4
 20050  	MOVSS X4, (DX)(R8*4)
 20051  	ADDQ  BX, R8
 20052  	ADDSS (DX)(R8*4), X5
 20053  	MOVSS X5, (DX)(R8*4)
 20054  	ADDQ  BX, R8
 20055  	ADDSS (DX)(R8*4), X6
 20056  	MOVSS X6, (DX)(R8*4)
 20057  	ADDQ  BX, R8
 20058  	ADDSS (DX)(R8*4), X7
 20059  	MOVSS X7, (DX)(R8*4)
 20060  	ADDQ  BX, R8
 20061  	ADDSS (DX)(R8*4), X8
 20062  	MOVSS X8, (DX)(R8*4)
 20063  	ADDQ  BX, R8
 20064  	SUBQ  $0x08, SI
 20065  
 20066  check_limit_unroll:
 20067  	CMPQ SI, $0x08
 20068  	JHS  loop_unroll
 20069  	JMP  check_limit
 20070  
 20071  loop:
 20072  	MOVSS (AX)(DI*4), X1
 20073  	MULSS X0, X1
 20074  	ADDSS (DX)(R8*4), X1
 20075  	MOVSS X1, (DX)(R8*4)
 20076  	DECQ  SI
 20077  	ADDQ  CX, DI
 20078  	ADDQ  BX, R8
 20079  
 20080  check_limit:
 20081  	CMPQ SI, $0x00
 20082  	JHI  loop
 20083  	RET
 20084  
 20085  // func AmdAxpyUnsafeXInterleave_V2A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20086  // Requires: SSE
 20087  TEXT ·AmdAxpyUnsafeXInterleave_V2A0R8(SB), NOSPLIT, $0-48
 20088  	MOVSS alpha+0(FP), X0
 20089  	MOVQ  xs+8(FP), AX
 20090  	MOVQ  incx+16(FP), CX
 20091  	MOVQ  ys+24(FP), DX
 20092  	MOVQ  incy+32(FP), BX
 20093  	MOVQ  n+40(FP), SI
 20094  	XORQ  DI, DI
 20095  	XORQ  R8, R8
 20096  	JMP   check_limit_unroll
 20097  
 20098  loop_unroll:
 20099  	MOVSS (AX)(DI*4), X1
 20100  	ADDQ  CX, DI
 20101  	MOVSS (AX)(DI*4), X2
 20102  	ADDQ  CX, DI
 20103  	MOVSS (AX)(DI*4), X3
 20104  	ADDQ  CX, DI
 20105  	MOVSS (AX)(DI*4), X4
 20106  	ADDQ  CX, DI
 20107  	MOVSS (AX)(DI*4), X5
 20108  	ADDQ  CX, DI
 20109  	MOVSS (AX)(DI*4), X6
 20110  	ADDQ  CX, DI
 20111  	MOVSS (AX)(DI*4), X7
 20112  	ADDQ  CX, DI
 20113  	MOVSS (AX)(DI*4), X8
 20114  	ADDQ  CX, DI
 20115  	MULSS X0, X1
 20116  	MULSS X0, X2
 20117  	MULSS X0, X3
 20118  	MULSS X0, X4
 20119  	MULSS X0, X5
 20120  	MULSS X0, X6
 20121  	MULSS X0, X7
 20122  	MULSS X0, X8
 20123  	ADDSS (DX)(R8*4), X1
 20124  	MOVSS X1, (DX)(R8*4)
 20125  	ADDQ  BX, R8
 20126  	ADDSS (DX)(R8*4), X2
 20127  	MOVSS X2, (DX)(R8*4)
 20128  	ADDQ  BX, R8
 20129  	ADDSS (DX)(R8*4), X3
 20130  	MOVSS X3, (DX)(R8*4)
 20131  	ADDQ  BX, R8
 20132  	ADDSS (DX)(R8*4), X4
 20133  	MOVSS X4, (DX)(R8*4)
 20134  	ADDQ  BX, R8
 20135  	ADDSS (DX)(R8*4), X5
 20136  	MOVSS X5, (DX)(R8*4)
 20137  	ADDQ  BX, R8
 20138  	ADDSS (DX)(R8*4), X6
 20139  	MOVSS X6, (DX)(R8*4)
 20140  	ADDQ  BX, R8
 20141  	ADDSS (DX)(R8*4), X7
 20142  	MOVSS X7, (DX)(R8*4)
 20143  	ADDQ  BX, R8
 20144  	ADDSS (DX)(R8*4), X8
 20145  	MOVSS X8, (DX)(R8*4)
 20146  	ADDQ  BX, R8
 20147  	SUBQ  $0x08, SI
 20148  
 20149  check_limit_unroll:
 20150  	CMPQ SI, $0x08
 20151  	JHS  loop_unroll
 20152  	JMP  check_limit
 20153  
 20154  loop:
 20155  	MOVSS (AX)(DI*4), X1
 20156  	MULSS X0, X1
 20157  	ADDSS (DX)(R8*4), X1
 20158  	MOVSS X1, (DX)(R8*4)
 20159  	DECQ  SI
 20160  	ADDQ  CX, DI
 20161  	ADDQ  BX, R8
 20162  
 20163  check_limit:
 20164  	CMPQ SI, $0x00
 20165  	JHI  loop
 20166  	RET
 20167  
 20168  // func AmdAxpyUnsafeXInterleave_V3A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20169  // Requires: SSE
 20170  TEXT ·AmdAxpyUnsafeXInterleave_V3A0R8(SB), NOSPLIT, $0-48
 20171  	MOVSS alpha+0(FP), X0
 20172  	MOVQ  xs+8(FP), AX
 20173  	MOVQ  incx+16(FP), CX
 20174  	MOVQ  ys+24(FP), DX
 20175  	MOVQ  incy+32(FP), BX
 20176  	MOVQ  n+40(FP), SI
 20177  	XORQ  DI, DI
 20178  	XORQ  R8, R8
 20179  	JMP   check_limit_unroll
 20180  
 20181  loop_unroll:
 20182  	MOVSS (AX)(DI*4), X1
 20183  	ADDQ  CX, DI
 20184  	MOVSS (AX)(DI*4), X2
 20185  	ADDQ  CX, DI
 20186  	MOVSS (AX)(DI*4), X3
 20187  	ADDQ  CX, DI
 20188  	MOVSS (AX)(DI*4), X4
 20189  	ADDQ  CX, DI
 20190  	MOVSS (AX)(DI*4), X5
 20191  	ADDQ  CX, DI
 20192  	MOVSS (AX)(DI*4), X6
 20193  	ADDQ  CX, DI
 20194  	MOVSS (AX)(DI*4), X7
 20195  	ADDQ  CX, DI
 20196  	MOVSS (AX)(DI*4), X8
 20197  	ADDQ  CX, DI
 20198  	MULSS X0, X1
 20199  	MULSS X0, X2
 20200  	MULSS X0, X3
 20201  	MULSS X0, X4
 20202  	MULSS X0, X5
 20203  	MULSS X0, X6
 20204  	MULSS X0, X7
 20205  	MULSS X0, X8
 20206  	ADDSS (DX)(R8*4), X1
 20207  	MOVSS X1, (DX)(R8*4)
 20208  	ADDQ  BX, R8
 20209  	ADDSS (DX)(R8*4), X2
 20210  	MOVSS X2, (DX)(R8*4)
 20211  	ADDQ  BX, R8
 20212  	ADDSS (DX)(R8*4), X3
 20213  	MOVSS X3, (DX)(R8*4)
 20214  	ADDQ  BX, R8
 20215  	ADDSS (DX)(R8*4), X4
 20216  	MOVSS X4, (DX)(R8*4)
 20217  	ADDQ  BX, R8
 20218  	ADDSS (DX)(R8*4), X5
 20219  	MOVSS X5, (DX)(R8*4)
 20220  	ADDQ  BX, R8
 20221  	ADDSS (DX)(R8*4), X6
 20222  	MOVSS X6, (DX)(R8*4)
 20223  	ADDQ  BX, R8
 20224  	ADDSS (DX)(R8*4), X7
 20225  	MOVSS X7, (DX)(R8*4)
 20226  	ADDQ  BX, R8
 20227  	ADDSS (DX)(R8*4), X8
 20228  	MOVSS X8, (DX)(R8*4)
 20229  	ADDQ  BX, R8
 20230  	SUBQ  $0x08, SI
 20231  
 20232  check_limit_unroll:
 20233  	CMPQ SI, $0x08
 20234  	JHS  loop_unroll
 20235  	JMP  check_limit
 20236  
 20237  loop:
 20238  	MOVSS (AX)(DI*4), X1
 20239  	MULSS X0, X1
 20240  	ADDSS (DX)(R8*4), X1
 20241  	MOVSS X1, (DX)(R8*4)
 20242  	DECQ  SI
 20243  	ADDQ  CX, DI
 20244  	ADDQ  BX, R8
 20245  
 20246  check_limit:
 20247  	CMPQ SI, $0x00
 20248  	JHI  loop
 20249  	RET
 20250  
 20251  // func AmdAxpyUnsafeXInterleave_V4A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20252  // Requires: SSE
 20253  TEXT ·AmdAxpyUnsafeXInterleave_V4A0R8(SB), NOSPLIT, $0-48
 20254  	MOVSS alpha+0(FP), X0
 20255  	MOVQ  xs+8(FP), AX
 20256  	MOVQ  incx+16(FP), CX
 20257  	MOVQ  ys+24(FP), DX
 20258  	MOVQ  incy+32(FP), BX
 20259  	MOVQ  n+40(FP), SI
 20260  	XORQ  DI, DI
 20261  	XORQ  R8, R8
 20262  	JMP   check_limit_unroll
 20263  
 20264  loop_unroll:
 20265  	MOVSS (AX)(DI*4), X1
 20266  	ADDQ  CX, DI
 20267  	MOVSS (AX)(DI*4), X2
 20268  	ADDQ  CX, DI
 20269  	MOVSS (AX)(DI*4), X3
 20270  	ADDQ  CX, DI
 20271  	MOVSS (AX)(DI*4), X4
 20272  	ADDQ  CX, DI
 20273  	MOVSS (AX)(DI*4), X5
 20274  	ADDQ  CX, DI
 20275  	MOVSS (AX)(DI*4), X6
 20276  	ADDQ  CX, DI
 20277  	MOVSS (AX)(DI*4), X7
 20278  	ADDQ  CX, DI
 20279  	MOVSS (AX)(DI*4), X8
 20280  	ADDQ  CX, DI
 20281  	MULSS X0, X1
 20282  	MULSS X0, X2
 20283  	MULSS X0, X3
 20284  	MULSS X0, X4
 20285  	MULSS X0, X5
 20286  	MULSS X0, X6
 20287  	MULSS X0, X7
 20288  	MULSS X0, X8
 20289  	ADDSS (DX)(R8*4), X1
 20290  	MOVSS X1, (DX)(R8*4)
 20291  	ADDQ  BX, R8
 20292  	ADDSS (DX)(R8*4), X2
 20293  	MOVSS X2, (DX)(R8*4)
 20294  	ADDQ  BX, R8
 20295  	ADDSS (DX)(R8*4), X3
 20296  	MOVSS X3, (DX)(R8*4)
 20297  	ADDQ  BX, R8
 20298  	ADDSS (DX)(R8*4), X4
 20299  	MOVSS X4, (DX)(R8*4)
 20300  	ADDQ  BX, R8
 20301  	ADDSS (DX)(R8*4), X5
 20302  	MOVSS X5, (DX)(R8*4)
 20303  	ADDQ  BX, R8
 20304  	ADDSS (DX)(R8*4), X6
 20305  	MOVSS X6, (DX)(R8*4)
 20306  	ADDQ  BX, R8
 20307  	ADDSS (DX)(R8*4), X7
 20308  	MOVSS X7, (DX)(R8*4)
 20309  	ADDQ  BX, R8
 20310  	ADDSS (DX)(R8*4), X8
 20311  	MOVSS X8, (DX)(R8*4)
 20312  	ADDQ  BX, R8
 20313  	SUBQ  $0x08, SI
 20314  
 20315  check_limit_unroll:
 20316  	CMPQ SI, $0x08
 20317  	JHS  loop_unroll
 20318  	JMP  check_limit
 20319  
 20320  loop:
 20321  	MOVSS (AX)(DI*4), X1
 20322  	MULSS X0, X1
 20323  	ADDSS (DX)(R8*4), X1
 20324  	MOVSS X1, (DX)(R8*4)
 20325  	DECQ  SI
 20326  	ADDQ  CX, DI
 20327  	ADDQ  BX, R8
 20328  
 20329  check_limit:
 20330  	CMPQ SI, $0x00
 20331  	JHI  loop
 20332  	RET
 20333  
 20334  // func AmdAxpyUnsafeXInterleave_V5A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20335  // Requires: SSE
 20336  TEXT ·AmdAxpyUnsafeXInterleave_V5A0R8(SB), NOSPLIT, $0-48
 20337  	MOVSS alpha+0(FP), X0
 20338  	MOVQ  xs+8(FP), AX
 20339  	MOVQ  incx+16(FP), CX
 20340  	MOVQ  ys+24(FP), DX
 20341  	MOVQ  incy+32(FP), BX
 20342  	MOVQ  n+40(FP), SI
 20343  	XORQ  DI, DI
 20344  	XORQ  R8, R8
 20345  	JMP   check_limit_unroll
 20346  
 20347  loop_unroll:
 20348  	MOVSS (AX)(DI*4), X1
 20349  	ADDQ  CX, DI
 20350  	MOVSS (AX)(DI*4), X2
 20351  	ADDQ  CX, DI
 20352  	MOVSS (AX)(DI*4), X3
 20353  	ADDQ  CX, DI
 20354  	MOVSS (AX)(DI*4), X4
 20355  	ADDQ  CX, DI
 20356  	MOVSS (AX)(DI*4), X5
 20357  	ADDQ  CX, DI
 20358  	MOVSS (AX)(DI*4), X6
 20359  	ADDQ  CX, DI
 20360  	MOVSS (AX)(DI*4), X7
 20361  	ADDQ  CX, DI
 20362  	MOVSS (AX)(DI*4), X8
 20363  	ADDQ  CX, DI
 20364  	MULSS X0, X1
 20365  	MULSS X0, X2
 20366  	MULSS X0, X3
 20367  	MULSS X0, X4
 20368  	MULSS X0, X5
 20369  	MULSS X0, X6
 20370  	MULSS X0, X7
 20371  	MULSS X0, X8
 20372  	ADDSS (DX)(R8*4), X1
 20373  	MOVSS X1, (DX)(R8*4)
 20374  	ADDQ  BX, R8
 20375  	ADDSS (DX)(R8*4), X2
 20376  	MOVSS X2, (DX)(R8*4)
 20377  	ADDQ  BX, R8
 20378  	ADDSS (DX)(R8*4), X3
 20379  	MOVSS X3, (DX)(R8*4)
 20380  	ADDQ  BX, R8
 20381  	ADDSS (DX)(R8*4), X4
 20382  	MOVSS X4, (DX)(R8*4)
 20383  	ADDQ  BX, R8
 20384  	ADDSS (DX)(R8*4), X5
 20385  	MOVSS X5, (DX)(R8*4)
 20386  	ADDQ  BX, R8
 20387  	ADDSS (DX)(R8*4), X6
 20388  	MOVSS X6, (DX)(R8*4)
 20389  	ADDQ  BX, R8
 20390  	ADDSS (DX)(R8*4), X7
 20391  	MOVSS X7, (DX)(R8*4)
 20392  	ADDQ  BX, R8
 20393  	ADDSS (DX)(R8*4), X8
 20394  	MOVSS X8, (DX)(R8*4)
 20395  	ADDQ  BX, R8
 20396  	SUBQ  $0x08, SI
 20397  
 20398  check_limit_unroll:
 20399  	CMPQ SI, $0x08
 20400  	JHS  loop_unroll
 20401  	JMP  check_limit
 20402  
 20403  loop:
 20404  	MOVSS (AX)(DI*4), X1
 20405  	MULSS X0, X1
 20406  	ADDSS (DX)(R8*4), X1
 20407  	MOVSS X1, (DX)(R8*4)
 20408  	DECQ  SI
 20409  	ADDQ  CX, DI
 20410  	ADDQ  BX, R8
 20411  
 20412  check_limit:
 20413  	CMPQ SI, $0x00
 20414  	JHI  loop
 20415  	RET
 20416  
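// Note: the A0R8 variants above place no alignment directive before
// loop_unroll; the A8 variants that follow align the unrolled loop entry
// with PCALIGN $0x08.
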
 20417  // func AmdAxpyUnsafeXInterleave_V0A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20418  // Requires: SSE
 20419  TEXT ·AmdAxpyUnsafeXInterleave_V0A8R8(SB), NOSPLIT, $0-48
 20420  	MOVSS alpha+0(FP), X0
 20421  	MOVQ  xs+8(FP), AX
 20422  	MOVQ  incx+16(FP), CX
 20423  	MOVQ  ys+24(FP), DX
 20424  	MOVQ  incy+32(FP), BX
 20425  	MOVQ  n+40(FP), SI
 20426  	XORQ  DI, DI
 20427  	XORQ  R8, R8
 20428  	JMP   check_limit_unroll
 20429  	PCALIGN $0x08
 20430  
 20431  loop_unroll:
 20432  	MOVSS (AX)(DI*4), X1
 20433  	ADDQ  CX, DI
 20434  	MOVSS (AX)(DI*4), X2
 20435  	ADDQ  CX, DI
 20436  	MOVSS (AX)(DI*4), X3
 20437  	ADDQ  CX, DI
 20438  	MOVSS (AX)(DI*4), X4
 20439  	ADDQ  CX, DI
 20440  	MOVSS (AX)(DI*4), X5
 20441  	ADDQ  CX, DI
 20442  	MOVSS (AX)(DI*4), X6
 20443  	ADDQ  CX, DI
 20444  	MOVSS (AX)(DI*4), X7
 20445  	ADDQ  CX, DI
 20446  	MOVSS (AX)(DI*4), X8
 20447  	ADDQ  CX, DI
 20448  	MULSS X0, X1
 20449  	MULSS X0, X2
 20450  	MULSS X0, X3
 20451  	MULSS X0, X4
 20452  	MULSS X0, X5
 20453  	MULSS X0, X6
 20454  	MULSS X0, X7
 20455  	MULSS X0, X8
 20456  	ADDSS (DX)(R8*4), X1
 20457  	MOVSS X1, (DX)(R8*4)
 20458  	ADDQ  BX, R8
 20459  	ADDSS (DX)(R8*4), X2
 20460  	MOVSS X2, (DX)(R8*4)
 20461  	ADDQ  BX, R8
 20462  	ADDSS (DX)(R8*4), X3
 20463  	MOVSS X3, (DX)(R8*4)
 20464  	ADDQ  BX, R8
 20465  	ADDSS (DX)(R8*4), X4
 20466  	MOVSS X4, (DX)(R8*4)
 20467  	ADDQ  BX, R8
 20468  	ADDSS (DX)(R8*4), X5
 20469  	MOVSS X5, (DX)(R8*4)
 20470  	ADDQ  BX, R8
 20471  	ADDSS (DX)(R8*4), X6
 20472  	MOVSS X6, (DX)(R8*4)
 20473  	ADDQ  BX, R8
 20474  	ADDSS (DX)(R8*4), X7
 20475  	MOVSS X7, (DX)(R8*4)
 20476  	ADDQ  BX, R8
 20477  	ADDSS (DX)(R8*4), X8
 20478  	MOVSS X8, (DX)(R8*4)
 20479  	ADDQ  BX, R8
 20480  	SUBQ  $0x08, SI
 20481  
 20482  check_limit_unroll:
 20483  	CMPQ SI, $0x08
 20484  	JHS  loop_unroll
 20485  	JMP  check_limit
 20486  
 20487  loop:
 20488  	MOVSS (AX)(DI*4), X1
 20489  	MULSS X0, X1
 20490  	ADDSS (DX)(R8*4), X1
 20491  	MOVSS X1, (DX)(R8*4)
 20492  	DECQ  SI
 20493  	ADDQ  CX, DI
 20494  	ADDQ  BX, R8
 20495  
 20496  check_limit:
 20497  	CMPQ SI, $0x00
 20498  	JHI  loop
 20499  	RET
 20500  
 20501  // func AmdAxpyUnsafeXInterleave_V1A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20502  // Requires: SSE
 20503  TEXT ·AmdAxpyUnsafeXInterleave_V1A8R8(SB), NOSPLIT, $0-48
 20504  	MOVSS alpha+0(FP), X0
 20505  	MOVQ  xs+8(FP), AX
 20506  	MOVQ  incx+16(FP), CX
 20507  	MOVQ  ys+24(FP), DX
 20508  	MOVQ  incy+32(FP), BX
 20509  	MOVQ  n+40(FP), SI
 20510  	XORQ  DI, DI
 20511  	XORQ  R8, R8
 20512  	JMP   check_limit_unroll
 20513  	PCALIGN $0x08
 20514  
 20515  loop_unroll:
 20516  	MOVSS (AX)(DI*4), X1
 20517  	ADDQ  CX, DI
 20518  	MOVSS (AX)(DI*4), X2
 20519  	ADDQ  CX, DI
 20520  	MOVSS (AX)(DI*4), X3
 20521  	ADDQ  CX, DI
 20522  	MOVSS (AX)(DI*4), X4
 20523  	ADDQ  CX, DI
 20524  	MOVSS (AX)(DI*4), X5
 20525  	ADDQ  CX, DI
 20526  	MOVSS (AX)(DI*4), X6
 20527  	ADDQ  CX, DI
 20528  	MOVSS (AX)(DI*4), X7
 20529  	ADDQ  CX, DI
 20530  	MOVSS (AX)(DI*4), X8
 20531  	ADDQ  CX, DI
 20532  	MULSS X0, X1
 20533  	MULSS X0, X2
 20534  	MULSS X0, X3
 20535  	MULSS X0, X4
 20536  	MULSS X0, X5
 20537  	MULSS X0, X6
 20538  	MULSS X0, X7
 20539  	MULSS X0, X8
 20540  	ADDSS (DX)(R8*4), X1
 20541  	MOVSS X1, (DX)(R8*4)
 20542  	ADDQ  BX, R8
 20543  	ADDSS (DX)(R8*4), X2
 20544  	MOVSS X2, (DX)(R8*4)
 20545  	ADDQ  BX, R8
 20546  	ADDSS (DX)(R8*4), X3
 20547  	MOVSS X3, (DX)(R8*4)
 20548  	ADDQ  BX, R8
 20549  	ADDSS (DX)(R8*4), X4
 20550  	MOVSS X4, (DX)(R8*4)
 20551  	ADDQ  BX, R8
 20552  	ADDSS (DX)(R8*4), X5
 20553  	MOVSS X5, (DX)(R8*4)
 20554  	ADDQ  BX, R8
 20555  	ADDSS (DX)(R8*4), X6
 20556  	MOVSS X6, (DX)(R8*4)
 20557  	ADDQ  BX, R8
 20558  	ADDSS (DX)(R8*4), X7
 20559  	MOVSS X7, (DX)(R8*4)
 20560  	ADDQ  BX, R8
 20561  	ADDSS (DX)(R8*4), X8
 20562  	MOVSS X8, (DX)(R8*4)
 20563  	ADDQ  BX, R8
 20564  	SUBQ  $0x08, SI
 20565  
 20566  check_limit_unroll:
 20567  	CMPQ SI, $0x08
 20568  	JHS  loop_unroll
 20569  	JMP  check_limit
 20570  
 20571  loop:
 20572  	MOVSS (AX)(DI*4), X1
 20573  	MULSS X0, X1
 20574  	ADDSS (DX)(R8*4), X1
 20575  	MOVSS X1, (DX)(R8*4)
 20576  	DECQ  SI
 20577  	ADDQ  CX, DI
 20578  	ADDQ  BX, R8
 20579  
 20580  check_limit:
 20581  	CMPQ SI, $0x00
 20582  	JHI  loop
 20583  	RET
 20584  
 20585  // func AmdAxpyUnsafeXInterleave_V2A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20586  // Requires: SSE
 20587  TEXT ·AmdAxpyUnsafeXInterleave_V2A8R8(SB), NOSPLIT, $0-48
 20588  	MOVSS alpha+0(FP), X0
 20589  	MOVQ  xs+8(FP), AX
 20590  	MOVQ  incx+16(FP), CX
 20591  	MOVQ  ys+24(FP), DX
 20592  	MOVQ  incy+32(FP), BX
 20593  	MOVQ  n+40(FP), SI
 20594  	XORQ  DI, DI
 20595  	XORQ  R8, R8
 20596  	JMP   check_limit_unroll
 20597  	PCALIGN $0x08
 20598  
 20599  loop_unroll:
 20600  	MOVSS (AX)(DI*4), X1
 20601  	ADDQ  CX, DI
 20602  	MOVSS (AX)(DI*4), X2
 20603  	ADDQ  CX, DI
 20604  	MOVSS (AX)(DI*4), X3
 20605  	ADDQ  CX, DI
 20606  	MOVSS (AX)(DI*4), X4
 20607  	ADDQ  CX, DI
 20608  	MOVSS (AX)(DI*4), X5
 20609  	ADDQ  CX, DI
 20610  	MOVSS (AX)(DI*4), X6
 20611  	ADDQ  CX, DI
 20612  	MOVSS (AX)(DI*4), X7
 20613  	ADDQ  CX, DI
 20614  	MOVSS (AX)(DI*4), X8
 20615  	ADDQ  CX, DI
 20616  	MULSS X0, X1
 20617  	MULSS X0, X2
 20618  	MULSS X0, X3
 20619  	MULSS X0, X4
 20620  	MULSS X0, X5
 20621  	MULSS X0, X6
 20622  	MULSS X0, X7
 20623  	MULSS X0, X8
 20624  	ADDSS (DX)(R8*4), X1
 20625  	MOVSS X1, (DX)(R8*4)
 20626  	ADDQ  BX, R8
 20627  	ADDSS (DX)(R8*4), X2
 20628  	MOVSS X2, (DX)(R8*4)
 20629  	ADDQ  BX, R8
 20630  	ADDSS (DX)(R8*4), X3
 20631  	MOVSS X3, (DX)(R8*4)
 20632  	ADDQ  BX, R8
 20633  	ADDSS (DX)(R8*4), X4
 20634  	MOVSS X4, (DX)(R8*4)
 20635  	ADDQ  BX, R8
 20636  	ADDSS (DX)(R8*4), X5
 20637  	MOVSS X5, (DX)(R8*4)
 20638  	ADDQ  BX, R8
 20639  	ADDSS (DX)(R8*4), X6
 20640  	MOVSS X6, (DX)(R8*4)
 20641  	ADDQ  BX, R8
 20642  	ADDSS (DX)(R8*4), X7
 20643  	MOVSS X7, (DX)(R8*4)
 20644  	ADDQ  BX, R8
 20645  	ADDSS (DX)(R8*4), X8
 20646  	MOVSS X8, (DX)(R8*4)
 20647  	ADDQ  BX, R8
 20648  	SUBQ  $0x08, SI
 20649  
 20650  check_limit_unroll:
 20651  	CMPQ SI, $0x08
 20652  	JHS  loop_unroll
 20653  	JMP  check_limit
 20654  
 20655  loop:
 20656  	MOVSS (AX)(DI*4), X1
 20657  	MULSS X0, X1
 20658  	ADDSS (DX)(R8*4), X1
 20659  	MOVSS X1, (DX)(R8*4)
 20660  	DECQ  SI
 20661  	ADDQ  CX, DI
 20662  	ADDQ  BX, R8
 20663  
 20664  check_limit:
 20665  	CMPQ SI, $0x00
 20666  	JHI  loop
 20667  	RET
 20668  
 20669  // func AmdAxpyUnsafeXInterleave_V3A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20670  // Requires: SSE
 20671  TEXT ·AmdAxpyUnsafeXInterleave_V3A8R8(SB), NOSPLIT, $0-48
 20672  	MOVSS alpha+0(FP), X0
 20673  	MOVQ  xs+8(FP), AX
 20674  	MOVQ  incx+16(FP), CX
 20675  	MOVQ  ys+24(FP), DX
 20676  	MOVQ  incy+32(FP), BX
 20677  	MOVQ  n+40(FP), SI
 20678  	XORQ  DI, DI
 20679  	XORQ  R8, R8
 20680  	JMP   check_limit_unroll
 20681  	PCALIGN $0x08
 20682  
 20683  loop_unroll:
 20684  	MOVSS (AX)(DI*4), X1
 20685  	ADDQ  CX, DI
 20686  	MOVSS (AX)(DI*4), X2
 20687  	ADDQ  CX, DI
 20688  	MOVSS (AX)(DI*4), X3
 20689  	ADDQ  CX, DI
 20690  	MOVSS (AX)(DI*4), X4
 20691  	ADDQ  CX, DI
 20692  	MOVSS (AX)(DI*4), X5
 20693  	ADDQ  CX, DI
 20694  	MOVSS (AX)(DI*4), X6
 20695  	ADDQ  CX, DI
 20696  	MOVSS (AX)(DI*4), X7
 20697  	ADDQ  CX, DI
 20698  	MOVSS (AX)(DI*4), X8
 20699  	ADDQ  CX, DI
 20700  	MULSS X0, X1
 20701  	MULSS X0, X2
 20702  	MULSS X0, X3
 20703  	MULSS X0, X4
 20704  	MULSS X0, X5
 20705  	MULSS X0, X6
 20706  	MULSS X0, X7
 20707  	MULSS X0, X8
 20708  	ADDSS (DX)(R8*4), X1
 20709  	MOVSS X1, (DX)(R8*4)
 20710  	ADDQ  BX, R8
 20711  	ADDSS (DX)(R8*4), X2
 20712  	MOVSS X2, (DX)(R8*4)
 20713  	ADDQ  BX, R8
 20714  	ADDSS (DX)(R8*4), X3
 20715  	MOVSS X3, (DX)(R8*4)
 20716  	ADDQ  BX, R8
 20717  	ADDSS (DX)(R8*4), X4
 20718  	MOVSS X4, (DX)(R8*4)
 20719  	ADDQ  BX, R8
 20720  	ADDSS (DX)(R8*4), X5
 20721  	MOVSS X5, (DX)(R8*4)
 20722  	ADDQ  BX, R8
 20723  	ADDSS (DX)(R8*4), X6
 20724  	MOVSS X6, (DX)(R8*4)
 20725  	ADDQ  BX, R8
 20726  	ADDSS (DX)(R8*4), X7
 20727  	MOVSS X7, (DX)(R8*4)
 20728  	ADDQ  BX, R8
 20729  	ADDSS (DX)(R8*4), X8
 20730  	MOVSS X8, (DX)(R8*4)
 20731  	ADDQ  BX, R8
 20732  	SUBQ  $0x08, SI
 20733  
 20734  check_limit_unroll:
 20735  	CMPQ SI, $0x08
 20736  	JHS  loop_unroll
 20737  	JMP  check_limit
 20738  
 20739  loop:
 20740  	MOVSS (AX)(DI*4), X1
 20741  	MULSS X0, X1
 20742  	ADDSS (DX)(R8*4), X1
 20743  	MOVSS X1, (DX)(R8*4)
 20744  	DECQ  SI
 20745  	ADDQ  CX, DI
 20746  	ADDQ  BX, R8
 20747  
 20748  check_limit:
 20749  	CMPQ SI, $0x00
 20750  	JHI  loop
 20751  	RET
 20752  
 20753  // func AmdAxpyUnsafeXInterleave_V4A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20754  // Requires: SSE
 20755  TEXT ·AmdAxpyUnsafeXInterleave_V4A8R8(SB), NOSPLIT, $0-48
 20756  	MOVSS alpha+0(FP), X0
 20757  	MOVQ  xs+8(FP), AX
 20758  	MOVQ  incx+16(FP), CX
 20759  	MOVQ  ys+24(FP), DX
 20760  	MOVQ  incy+32(FP), BX
 20761  	MOVQ  n+40(FP), SI
 20762  	XORQ  DI, DI
 20763  	XORQ  R8, R8
 20764  	JMP   check_limit_unroll
 20765  	PCALIGN $0x08
 20766  
 20767  loop_unroll:
 20768  	MOVSS (AX)(DI*4), X1
 20769  	ADDQ  CX, DI
 20770  	MOVSS (AX)(DI*4), X2
 20771  	ADDQ  CX, DI
 20772  	MOVSS (AX)(DI*4), X3
 20773  	ADDQ  CX, DI
 20774  	MOVSS (AX)(DI*4), X4
 20775  	ADDQ  CX, DI
 20776  	MOVSS (AX)(DI*4), X5
 20777  	ADDQ  CX, DI
 20778  	MOVSS (AX)(DI*4), X6
 20779  	ADDQ  CX, DI
 20780  	MOVSS (AX)(DI*4), X7
 20781  	ADDQ  CX, DI
 20782  	MOVSS (AX)(DI*4), X8
 20783  	ADDQ  CX, DI
 20784  	MULSS X0, X1
 20785  	MULSS X0, X2
 20786  	MULSS X0, X3
 20787  	MULSS X0, X4
 20788  	MULSS X0, X5
 20789  	MULSS X0, X6
 20790  	MULSS X0, X7
 20791  	MULSS X0, X8
 20792  	ADDSS (DX)(R8*4), X1
 20793  	MOVSS X1, (DX)(R8*4)
 20794  	ADDQ  BX, R8
 20795  	ADDSS (DX)(R8*4), X2
 20796  	MOVSS X2, (DX)(R8*4)
 20797  	ADDQ  BX, R8
 20798  	ADDSS (DX)(R8*4), X3
 20799  	MOVSS X3, (DX)(R8*4)
 20800  	ADDQ  BX, R8
 20801  	ADDSS (DX)(R8*4), X4
 20802  	MOVSS X4, (DX)(R8*4)
 20803  	ADDQ  BX, R8
 20804  	ADDSS (DX)(R8*4), X5
 20805  	MOVSS X5, (DX)(R8*4)
 20806  	ADDQ  BX, R8
 20807  	ADDSS (DX)(R8*4), X6
 20808  	MOVSS X6, (DX)(R8*4)
 20809  	ADDQ  BX, R8
 20810  	ADDSS (DX)(R8*4), X7
 20811  	MOVSS X7, (DX)(R8*4)
 20812  	ADDQ  BX, R8
 20813  	ADDSS (DX)(R8*4), X8
 20814  	MOVSS X8, (DX)(R8*4)
 20815  	ADDQ  BX, R8
 20816  	SUBQ  $0x08, SI
 20817  
 20818  check_limit_unroll:
 20819  	CMPQ SI, $0x08
 20820  	JHS  loop_unroll
 20821  	JMP  check_limit
 20822  
 20823  loop:
 20824  	MOVSS (AX)(DI*4), X1
 20825  	MULSS X0, X1
 20826  	ADDSS (DX)(R8*4), X1
 20827  	MOVSS X1, (DX)(R8*4)
 20828  	DECQ  SI
 20829  	ADDQ  CX, DI
 20830  	ADDQ  BX, R8
 20831  
 20832  check_limit:
 20833  	CMPQ SI, $0x00
 20834  	JHI  loop
 20835  	RET
 20836  
 20837  // func AmdAxpyUnsafeXInterleave_V5A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20838  // Requires: SSE
 20839  TEXT ·AmdAxpyUnsafeXInterleave_V5A8R8(SB), NOSPLIT, $0-48
 20840  	MOVSS alpha+0(FP), X0
 20841  	MOVQ  xs+8(FP), AX
 20842  	MOVQ  incx+16(FP), CX
 20843  	MOVQ  ys+24(FP), DX
 20844  	MOVQ  incy+32(FP), BX
 20845  	MOVQ  n+40(FP), SI
 20846  	XORQ  DI, DI
 20847  	XORQ  R8, R8
 20848  	JMP   check_limit_unroll
 20849  	PCALIGN $0x08
 20850  
 20851  loop_unroll:
 20852  	MOVSS (AX)(DI*4), X1
 20853  	ADDQ  CX, DI
 20854  	MOVSS (AX)(DI*4), X2
 20855  	ADDQ  CX, DI
 20856  	MOVSS (AX)(DI*4), X3
 20857  	ADDQ  CX, DI
 20858  	MOVSS (AX)(DI*4), X4
 20859  	ADDQ  CX, DI
 20860  	MOVSS (AX)(DI*4), X5
 20861  	ADDQ  CX, DI
 20862  	MOVSS (AX)(DI*4), X6
 20863  	ADDQ  CX, DI
 20864  	MOVSS (AX)(DI*4), X7
 20865  	ADDQ  CX, DI
 20866  	MOVSS (AX)(DI*4), X8
 20867  	ADDQ  CX, DI
 20868  	MULSS X0, X1
 20869  	MULSS X0, X2
 20870  	MULSS X0, X3
 20871  	MULSS X0, X4
 20872  	MULSS X0, X5
 20873  	MULSS X0, X6
 20874  	MULSS X0, X7
 20875  	MULSS X0, X8
 20876  	ADDSS (DX)(R8*4), X1
 20877  	MOVSS X1, (DX)(R8*4)
 20878  	ADDQ  BX, R8
 20879  	ADDSS (DX)(R8*4), X2
 20880  	MOVSS X2, (DX)(R8*4)
 20881  	ADDQ  BX, R8
 20882  	ADDSS (DX)(R8*4), X3
 20883  	MOVSS X3, (DX)(R8*4)
 20884  	ADDQ  BX, R8
 20885  	ADDSS (DX)(R8*4), X4
 20886  	MOVSS X4, (DX)(R8*4)
 20887  	ADDQ  BX, R8
 20888  	ADDSS (DX)(R8*4), X5
 20889  	MOVSS X5, (DX)(R8*4)
 20890  	ADDQ  BX, R8
 20891  	ADDSS (DX)(R8*4), X6
 20892  	MOVSS X6, (DX)(R8*4)
 20893  	ADDQ  BX, R8
 20894  	ADDSS (DX)(R8*4), X7
 20895  	MOVSS X7, (DX)(R8*4)
 20896  	ADDQ  BX, R8
 20897  	ADDSS (DX)(R8*4), X8
 20898  	MOVSS X8, (DX)(R8*4)
 20899  	ADDQ  BX, R8
 20900  	SUBQ  $0x08, SI
 20901  
 20902  check_limit_unroll:
 20903  	CMPQ SI, $0x08
 20904  	JHS  loop_unroll
 20905  	JMP  check_limit
 20906  
 20907  loop:
 20908  	MOVSS (AX)(DI*4), X1
 20909  	MULSS X0, X1
 20910  	ADDSS (DX)(R8*4), X1
 20911  	MOVSS X1, (DX)(R8*4)
 20912  	DECQ  SI
 20913  	ADDQ  CX, DI
 20914  	ADDQ  BX, R8
 20915  
 20916  check_limit:
 20917  	CMPQ SI, $0x00
 20918  	JHI  loop
 20919  	RET
 20920  
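// Note: the A9, A10, and A11 variants below follow the PCALIGN $0x08
// directive with one, two, or three NOPs respectively, shifting the unrolled
// loop entry to successive byte offsets past the 8-byte boundary.
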
 20921  // func AmdAxpyUnsafeXInterleave_V0A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 20922  // Requires: SSE
 20923  TEXT ·AmdAxpyUnsafeXInterleave_V0A9R8(SB), NOSPLIT, $0-48
 20924  	MOVSS alpha+0(FP), X0
 20925  	MOVQ  xs+8(FP), AX
 20926  	MOVQ  incx+16(FP), CX
 20927  	MOVQ  ys+24(FP), DX
 20928  	MOVQ  incy+32(FP), BX
 20929  	MOVQ  n+40(FP), SI
 20930  	XORQ  DI, DI
 20931  	XORQ  R8, R8
 20932  	JMP   check_limit_unroll
 20933  	PCALIGN $0x08
 20934  	NOP
 20935  
 20936  loop_unroll:
 20937  	MOVSS (AX)(DI*4), X1
 20938  	ADDQ  CX, DI
 20939  	MOVSS (AX)(DI*4), X2
 20940  	ADDQ  CX, DI
 20941  	MOVSS (AX)(DI*4), X3
 20942  	ADDQ  CX, DI
 20943  	MOVSS (AX)(DI*4), X4
 20944  	ADDQ  CX, DI
 20945  	MOVSS (AX)(DI*4), X5
 20946  	ADDQ  CX, DI
 20947  	MOVSS (AX)(DI*4), X6
 20948  	ADDQ  CX, DI
 20949  	MOVSS (AX)(DI*4), X7
 20950  	ADDQ  CX, DI
 20951  	MOVSS (AX)(DI*4), X8
 20952  	ADDQ  CX, DI
 20953  	MULSS X0, X1
 20954  	MULSS X0, X2
 20955  	MULSS X0, X3
 20956  	MULSS X0, X4
 20957  	MULSS X0, X5
 20958  	MULSS X0, X6
 20959  	MULSS X0, X7
 20960  	MULSS X0, X8
 20961  	ADDSS (DX)(R8*4), X1
 20962  	MOVSS X1, (DX)(R8*4)
 20963  	ADDQ  BX, R8
 20964  	ADDSS (DX)(R8*4), X2
 20965  	MOVSS X2, (DX)(R8*4)
 20966  	ADDQ  BX, R8
 20967  	ADDSS (DX)(R8*4), X3
 20968  	MOVSS X3, (DX)(R8*4)
 20969  	ADDQ  BX, R8
 20970  	ADDSS (DX)(R8*4), X4
 20971  	MOVSS X4, (DX)(R8*4)
 20972  	ADDQ  BX, R8
 20973  	ADDSS (DX)(R8*4), X5
 20974  	MOVSS X5, (DX)(R8*4)
 20975  	ADDQ  BX, R8
 20976  	ADDSS (DX)(R8*4), X6
 20977  	MOVSS X6, (DX)(R8*4)
 20978  	ADDQ  BX, R8
 20979  	ADDSS (DX)(R8*4), X7
 20980  	MOVSS X7, (DX)(R8*4)
 20981  	ADDQ  BX, R8
 20982  	ADDSS (DX)(R8*4), X8
 20983  	MOVSS X8, (DX)(R8*4)
 20984  	ADDQ  BX, R8
 20985  	SUBQ  $0x08, SI
 20986  
 20987  check_limit_unroll:
 20988  	CMPQ SI, $0x08
 20989  	JHS  loop_unroll
 20990  	JMP  check_limit
 20991  
 20992  loop:
 20993  	MOVSS (AX)(DI*4), X1
 20994  	MULSS X0, X1
 20995  	ADDSS (DX)(R8*4), X1
 20996  	MOVSS X1, (DX)(R8*4)
 20997  	DECQ  SI
 20998  	ADDQ  CX, DI
 20999  	ADDQ  BX, R8
 21000  
 21001  check_limit:
 21002  	CMPQ SI, $0x00
 21003  	JHI  loop
 21004  	RET
 21005  
 21006  // func AmdAxpyUnsafeXInterleave_V1A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21007  // Requires: SSE
 21008  TEXT ·AmdAxpyUnsafeXInterleave_V1A9R8(SB), NOSPLIT, $0-48
 21009  	MOVSS alpha+0(FP), X0
 21010  	MOVQ  xs+8(FP), AX
 21011  	MOVQ  incx+16(FP), CX
 21012  	MOVQ  ys+24(FP), DX
 21013  	MOVQ  incy+32(FP), BX
 21014  	MOVQ  n+40(FP), SI
 21015  	XORQ  DI, DI
 21016  	XORQ  R8, R8
 21017  	JMP   check_limit_unroll
 21018  	PCALIGN $0x08
 21019  	NOP
 21020  
 21021  loop_unroll:
 21022  	MOVSS (AX)(DI*4), X1
 21023  	ADDQ  CX, DI
 21024  	MOVSS (AX)(DI*4), X2
 21025  	ADDQ  CX, DI
 21026  	MOVSS (AX)(DI*4), X3
 21027  	ADDQ  CX, DI
 21028  	MOVSS (AX)(DI*4), X4
 21029  	ADDQ  CX, DI
 21030  	MOVSS (AX)(DI*4), X5
 21031  	ADDQ  CX, DI
 21032  	MOVSS (AX)(DI*4), X6
 21033  	ADDQ  CX, DI
 21034  	MOVSS (AX)(DI*4), X7
 21035  	ADDQ  CX, DI
 21036  	MOVSS (AX)(DI*4), X8
 21037  	ADDQ  CX, DI
 21038  	MULSS X0, X1
 21039  	MULSS X0, X2
 21040  	MULSS X0, X3
 21041  	MULSS X0, X4
 21042  	MULSS X0, X5
 21043  	MULSS X0, X6
 21044  	MULSS X0, X7
 21045  	MULSS X0, X8
 21046  	ADDSS (DX)(R8*4), X1
 21047  	MOVSS X1, (DX)(R8*4)
 21048  	ADDQ  BX, R8
 21049  	ADDSS (DX)(R8*4), X2
 21050  	MOVSS X2, (DX)(R8*4)
 21051  	ADDQ  BX, R8
 21052  	ADDSS (DX)(R8*4), X3
 21053  	MOVSS X3, (DX)(R8*4)
 21054  	ADDQ  BX, R8
 21055  	ADDSS (DX)(R8*4), X4
 21056  	MOVSS X4, (DX)(R8*4)
 21057  	ADDQ  BX, R8
 21058  	ADDSS (DX)(R8*4), X5
 21059  	MOVSS X5, (DX)(R8*4)
 21060  	ADDQ  BX, R8
 21061  	ADDSS (DX)(R8*4), X6
 21062  	MOVSS X6, (DX)(R8*4)
 21063  	ADDQ  BX, R8
 21064  	ADDSS (DX)(R8*4), X7
 21065  	MOVSS X7, (DX)(R8*4)
 21066  	ADDQ  BX, R8
 21067  	ADDSS (DX)(R8*4), X8
 21068  	MOVSS X8, (DX)(R8*4)
 21069  	ADDQ  BX, R8
 21070  	SUBQ  $0x08, SI
 21071  
 21072  check_limit_unroll:
 21073  	CMPQ SI, $0x08
 21074  	JHS  loop_unroll
 21075  	JMP  check_limit
 21076  
 21077  loop:
 21078  	MOVSS (AX)(DI*4), X1
 21079  	MULSS X0, X1
 21080  	ADDSS (DX)(R8*4), X1
 21081  	MOVSS X1, (DX)(R8*4)
 21082  	DECQ  SI
 21083  	ADDQ  CX, DI
 21084  	ADDQ  BX, R8
 21085  
 21086  check_limit:
 21087  	CMPQ SI, $0x00
 21088  	JHI  loop
 21089  	RET
 21090  
 21091  // func AmdAxpyUnsafeXInterleave_V2A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21092  // Requires: SSE
 21093  TEXT ·AmdAxpyUnsafeXInterleave_V2A9R8(SB), NOSPLIT, $0-48
 21094  	MOVSS alpha+0(FP), X0
 21095  	MOVQ  xs+8(FP), AX
 21096  	MOVQ  incx+16(FP), CX
 21097  	MOVQ  ys+24(FP), DX
 21098  	MOVQ  incy+32(FP), BX
 21099  	MOVQ  n+40(FP), SI
 21100  	XORQ  DI, DI
 21101  	XORQ  R8, R8
 21102  	JMP   check_limit_unroll
 21103  	PCALIGN $0x08
 21104  	NOP
 21105  
 21106  loop_unroll:
 21107  	MOVSS (AX)(DI*4), X1
 21108  	ADDQ  CX, DI
 21109  	MOVSS (AX)(DI*4), X2
 21110  	ADDQ  CX, DI
 21111  	MOVSS (AX)(DI*4), X3
 21112  	ADDQ  CX, DI
 21113  	MOVSS (AX)(DI*4), X4
 21114  	ADDQ  CX, DI
 21115  	MOVSS (AX)(DI*4), X5
 21116  	ADDQ  CX, DI
 21117  	MOVSS (AX)(DI*4), X6
 21118  	ADDQ  CX, DI
 21119  	MOVSS (AX)(DI*4), X7
 21120  	ADDQ  CX, DI
 21121  	MOVSS (AX)(DI*4), X8
 21122  	ADDQ  CX, DI
 21123  	MULSS X0, X1
 21124  	MULSS X0, X2
 21125  	MULSS X0, X3
 21126  	MULSS X0, X4
 21127  	MULSS X0, X5
 21128  	MULSS X0, X6
 21129  	MULSS X0, X7
 21130  	MULSS X0, X8
 21131  	ADDSS (DX)(R8*4), X1
 21132  	MOVSS X1, (DX)(R8*4)
 21133  	ADDQ  BX, R8
 21134  	ADDSS (DX)(R8*4), X2
 21135  	MOVSS X2, (DX)(R8*4)
 21136  	ADDQ  BX, R8
 21137  	ADDSS (DX)(R8*4), X3
 21138  	MOVSS X3, (DX)(R8*4)
 21139  	ADDQ  BX, R8
 21140  	ADDSS (DX)(R8*4), X4
 21141  	MOVSS X4, (DX)(R8*4)
 21142  	ADDQ  BX, R8
 21143  	ADDSS (DX)(R8*4), X5
 21144  	MOVSS X5, (DX)(R8*4)
 21145  	ADDQ  BX, R8
 21146  	ADDSS (DX)(R8*4), X6
 21147  	MOVSS X6, (DX)(R8*4)
 21148  	ADDQ  BX, R8
 21149  	ADDSS (DX)(R8*4), X7
 21150  	MOVSS X7, (DX)(R8*4)
 21151  	ADDQ  BX, R8
 21152  	ADDSS (DX)(R8*4), X8
 21153  	MOVSS X8, (DX)(R8*4)
 21154  	ADDQ  BX, R8
 21155  	SUBQ  $0x08, SI
 21156  
 21157  check_limit_unroll:
 21158  	CMPQ SI, $0x08
 21159  	JHS  loop_unroll
 21160  	JMP  check_limit
 21161  
 21162  loop:
 21163  	MOVSS (AX)(DI*4), X1
 21164  	MULSS X0, X1
 21165  	ADDSS (DX)(R8*4), X1
 21166  	MOVSS X1, (DX)(R8*4)
 21167  	DECQ  SI
 21168  	ADDQ  CX, DI
 21169  	ADDQ  BX, R8
 21170  
 21171  check_limit:
 21172  	CMPQ SI, $0x00
 21173  	JHI  loop
 21174  	RET
 21175  
 21176  // func AmdAxpyUnsafeXInterleave_V3A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21177  // Requires: SSE
 21178  TEXT ·AmdAxpyUnsafeXInterleave_V3A9R8(SB), NOSPLIT, $0-48
 21179  	MOVSS alpha+0(FP), X0
 21180  	MOVQ  xs+8(FP), AX
 21181  	MOVQ  incx+16(FP), CX
 21182  	MOVQ  ys+24(FP), DX
 21183  	MOVQ  incy+32(FP), BX
 21184  	MOVQ  n+40(FP), SI
 21185  	XORQ  DI, DI
 21186  	XORQ  R8, R8
 21187  	JMP   check_limit_unroll
 21188  	PCALIGN $0x08
 21189  	NOP
 21190  
 21191  loop_unroll:
 21192  	MOVSS (AX)(DI*4), X1
 21193  	ADDQ  CX, DI
 21194  	MOVSS (AX)(DI*4), X2
 21195  	ADDQ  CX, DI
 21196  	MOVSS (AX)(DI*4), X3
 21197  	ADDQ  CX, DI
 21198  	MOVSS (AX)(DI*4), X4
 21199  	ADDQ  CX, DI
 21200  	MOVSS (AX)(DI*4), X5
 21201  	ADDQ  CX, DI
 21202  	MOVSS (AX)(DI*4), X6
 21203  	ADDQ  CX, DI
 21204  	MOVSS (AX)(DI*4), X7
 21205  	ADDQ  CX, DI
 21206  	MOVSS (AX)(DI*4), X8
 21207  	ADDQ  CX, DI
 21208  	MULSS X0, X1
 21209  	MULSS X0, X2
 21210  	MULSS X0, X3
 21211  	MULSS X0, X4
 21212  	MULSS X0, X5
 21213  	MULSS X0, X6
 21214  	MULSS X0, X7
 21215  	MULSS X0, X8
 21216  	ADDSS (DX)(R8*4), X1
 21217  	MOVSS X1, (DX)(R8*4)
 21218  	ADDQ  BX, R8
 21219  	ADDSS (DX)(R8*4), X2
 21220  	MOVSS X2, (DX)(R8*4)
 21221  	ADDQ  BX, R8
 21222  	ADDSS (DX)(R8*4), X3
 21223  	MOVSS X3, (DX)(R8*4)
 21224  	ADDQ  BX, R8
 21225  	ADDSS (DX)(R8*4), X4
 21226  	MOVSS X4, (DX)(R8*4)
 21227  	ADDQ  BX, R8
 21228  	ADDSS (DX)(R8*4), X5
 21229  	MOVSS X5, (DX)(R8*4)
 21230  	ADDQ  BX, R8
 21231  	ADDSS (DX)(R8*4), X6
 21232  	MOVSS X6, (DX)(R8*4)
 21233  	ADDQ  BX, R8
 21234  	ADDSS (DX)(R8*4), X7
 21235  	MOVSS X7, (DX)(R8*4)
 21236  	ADDQ  BX, R8
 21237  	ADDSS (DX)(R8*4), X8
 21238  	MOVSS X8, (DX)(R8*4)
 21239  	ADDQ  BX, R8
 21240  	SUBQ  $0x08, SI
 21241  
 21242  check_limit_unroll:
 21243  	CMPQ SI, $0x08
 21244  	JHS  loop_unroll
 21245  	JMP  check_limit
 21246  
 21247  loop:
 21248  	MOVSS (AX)(DI*4), X1
 21249  	MULSS X0, X1
 21250  	ADDSS (DX)(R8*4), X1
 21251  	MOVSS X1, (DX)(R8*4)
 21252  	DECQ  SI
 21253  	ADDQ  CX, DI
 21254  	ADDQ  BX, R8
 21255  
 21256  check_limit:
 21257  	CMPQ SI, $0x00
 21258  	JHI  loop
 21259  	RET
 21260  
 21261  // func AmdAxpyUnsafeXInterleave_V4A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21262  // Requires: SSE
 21263  TEXT ·AmdAxpyUnsafeXInterleave_V4A9R8(SB), NOSPLIT, $0-48
 21264  	MOVSS alpha+0(FP), X0
 21265  	MOVQ  xs+8(FP), AX
 21266  	MOVQ  incx+16(FP), CX
 21267  	MOVQ  ys+24(FP), DX
 21268  	MOVQ  incy+32(FP), BX
 21269  	MOVQ  n+40(FP), SI
 21270  	XORQ  DI, DI
 21271  	XORQ  R8, R8
 21272  	JMP   check_limit_unroll
 21273  	PCALIGN $0x08
 21274  	NOP
 21275  
 21276  loop_unroll:
 21277  	MOVSS (AX)(DI*4), X1
 21278  	ADDQ  CX, DI
 21279  	MOVSS (AX)(DI*4), X2
 21280  	ADDQ  CX, DI
 21281  	MOVSS (AX)(DI*4), X3
 21282  	ADDQ  CX, DI
 21283  	MOVSS (AX)(DI*4), X4
 21284  	ADDQ  CX, DI
 21285  	MOVSS (AX)(DI*4), X5
 21286  	ADDQ  CX, DI
 21287  	MOVSS (AX)(DI*4), X6
 21288  	ADDQ  CX, DI
 21289  	MOVSS (AX)(DI*4), X7
 21290  	ADDQ  CX, DI
 21291  	MOVSS (AX)(DI*4), X8
 21292  	ADDQ  CX, DI
 21293  	MULSS X0, X1
 21294  	MULSS X0, X2
 21295  	MULSS X0, X3
 21296  	MULSS X0, X4
 21297  	MULSS X0, X5
 21298  	MULSS X0, X6
 21299  	MULSS X0, X7
 21300  	MULSS X0, X8
 21301  	ADDSS (DX)(R8*4), X1
 21302  	MOVSS X1, (DX)(R8*4)
 21303  	ADDQ  BX, R8
 21304  	ADDSS (DX)(R8*4), X2
 21305  	MOVSS X2, (DX)(R8*4)
 21306  	ADDQ  BX, R8
 21307  	ADDSS (DX)(R8*4), X3
 21308  	MOVSS X3, (DX)(R8*4)
 21309  	ADDQ  BX, R8
 21310  	ADDSS (DX)(R8*4), X4
 21311  	MOVSS X4, (DX)(R8*4)
 21312  	ADDQ  BX, R8
 21313  	ADDSS (DX)(R8*4), X5
 21314  	MOVSS X5, (DX)(R8*4)
 21315  	ADDQ  BX, R8
 21316  	ADDSS (DX)(R8*4), X6
 21317  	MOVSS X6, (DX)(R8*4)
 21318  	ADDQ  BX, R8
 21319  	ADDSS (DX)(R8*4), X7
 21320  	MOVSS X7, (DX)(R8*4)
 21321  	ADDQ  BX, R8
 21322  	ADDSS (DX)(R8*4), X8
 21323  	MOVSS X8, (DX)(R8*4)
 21324  	ADDQ  BX, R8
 21325  	SUBQ  $0x08, SI
 21326  
 21327  check_limit_unroll:
 21328  	CMPQ SI, $0x08
 21329  	JHS  loop_unroll
 21330  	JMP  check_limit
 21331  
 21332  loop:
 21333  	MOVSS (AX)(DI*4), X1
 21334  	MULSS X0, X1
 21335  	ADDSS (DX)(R8*4), X1
 21336  	MOVSS X1, (DX)(R8*4)
 21337  	DECQ  SI
 21338  	ADDQ  CX, DI
 21339  	ADDQ  BX, R8
 21340  
 21341  check_limit:
 21342  	CMPQ SI, $0x00
 21343  	JHI  loop
 21344  	RET
 21345  
 21346  // func AmdAxpyUnsafeXInterleave_V5A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21347  // Requires: SSE
 21348  TEXT ·AmdAxpyUnsafeXInterleave_V5A9R8(SB), NOSPLIT, $0-48
 21349  	MOVSS alpha+0(FP), X0
 21350  	MOVQ  xs+8(FP), AX
 21351  	MOVQ  incx+16(FP), CX
 21352  	MOVQ  ys+24(FP), DX
 21353  	MOVQ  incy+32(FP), BX
 21354  	MOVQ  n+40(FP), SI
 21355  	XORQ  DI, DI
 21356  	XORQ  R8, R8
 21357  	JMP   check_limit_unroll
 21358  	PCALIGN $0x08
 21359  	NOP
 21360  
 21361  loop_unroll:
 21362  	MOVSS (AX)(DI*4), X1
 21363  	ADDQ  CX, DI
 21364  	MOVSS (AX)(DI*4), X2
 21365  	ADDQ  CX, DI
 21366  	MOVSS (AX)(DI*4), X3
 21367  	ADDQ  CX, DI
 21368  	MOVSS (AX)(DI*4), X4
 21369  	ADDQ  CX, DI
 21370  	MOVSS (AX)(DI*4), X5
 21371  	ADDQ  CX, DI
 21372  	MOVSS (AX)(DI*4), X6
 21373  	ADDQ  CX, DI
 21374  	MOVSS (AX)(DI*4), X7
 21375  	ADDQ  CX, DI
 21376  	MOVSS (AX)(DI*4), X8
 21377  	ADDQ  CX, DI
 21378  	MULSS X0, X1
 21379  	MULSS X0, X2
 21380  	MULSS X0, X3
 21381  	MULSS X0, X4
 21382  	MULSS X0, X5
 21383  	MULSS X0, X6
 21384  	MULSS X0, X7
 21385  	MULSS X0, X8
 21386  	ADDSS (DX)(R8*4), X1
 21387  	MOVSS X1, (DX)(R8*4)
 21388  	ADDQ  BX, R8
 21389  	ADDSS (DX)(R8*4), X2
 21390  	MOVSS X2, (DX)(R8*4)
 21391  	ADDQ  BX, R8
 21392  	ADDSS (DX)(R8*4), X3
 21393  	MOVSS X3, (DX)(R8*4)
 21394  	ADDQ  BX, R8
 21395  	ADDSS (DX)(R8*4), X4
 21396  	MOVSS X4, (DX)(R8*4)
 21397  	ADDQ  BX, R8
 21398  	ADDSS (DX)(R8*4), X5
 21399  	MOVSS X5, (DX)(R8*4)
 21400  	ADDQ  BX, R8
 21401  	ADDSS (DX)(R8*4), X6
 21402  	MOVSS X6, (DX)(R8*4)
 21403  	ADDQ  BX, R8
 21404  	ADDSS (DX)(R8*4), X7
 21405  	MOVSS X7, (DX)(R8*4)
 21406  	ADDQ  BX, R8
 21407  	ADDSS (DX)(R8*4), X8
 21408  	MOVSS X8, (DX)(R8*4)
 21409  	ADDQ  BX, R8
 21410  	SUBQ  $0x08, SI
 21411  
 21412  check_limit_unroll:
 21413  	CMPQ SI, $0x08
 21414  	JHS  loop_unroll
 21415  	JMP  check_limit
 21416  
 21417  loop:
 21418  	MOVSS (AX)(DI*4), X1
 21419  	MULSS X0, X1
 21420  	ADDSS (DX)(R8*4), X1
 21421  	MOVSS X1, (DX)(R8*4)
 21422  	DECQ  SI
 21423  	ADDQ  CX, DI
 21424  	ADDQ  BX, R8
 21425  
 21426  check_limit:
 21427  	CMPQ SI, $0x00
 21428  	JHI  loop
 21429  	RET
 21430  
 21431  // func AmdAxpyUnsafeXInterleave_V0A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21432  // Requires: SSE
 21433  TEXT ·AmdAxpyUnsafeXInterleave_V0A10R8(SB), NOSPLIT, $0-48
 21434  	MOVSS alpha+0(FP), X0
 21435  	MOVQ  xs+8(FP), AX
 21436  	MOVQ  incx+16(FP), CX
 21437  	MOVQ  ys+24(FP), DX
 21438  	MOVQ  incy+32(FP), BX
 21439  	MOVQ  n+40(FP), SI
 21440  	XORQ  DI, DI
 21441  	XORQ  R8, R8
 21442  	JMP   check_limit_unroll
 21443  	PCALIGN $0x08
 21444  	NOP
 21445  	NOP
 21446  
 21447  loop_unroll:
 21448  	MOVSS (AX)(DI*4), X1
 21449  	ADDQ  CX, DI
 21450  	MOVSS (AX)(DI*4), X2
 21451  	ADDQ  CX, DI
 21452  	MOVSS (AX)(DI*4), X3
 21453  	ADDQ  CX, DI
 21454  	MOVSS (AX)(DI*4), X4
 21455  	ADDQ  CX, DI
 21456  	MOVSS (AX)(DI*4), X5
 21457  	ADDQ  CX, DI
 21458  	MOVSS (AX)(DI*4), X6
 21459  	ADDQ  CX, DI
 21460  	MOVSS (AX)(DI*4), X7
 21461  	ADDQ  CX, DI
 21462  	MOVSS (AX)(DI*4), X8
 21463  	ADDQ  CX, DI
 21464  	MULSS X0, X1
 21465  	MULSS X0, X2
 21466  	MULSS X0, X3
 21467  	MULSS X0, X4
 21468  	MULSS X0, X5
 21469  	MULSS X0, X6
 21470  	MULSS X0, X7
 21471  	MULSS X0, X8
 21472  	ADDSS (DX)(R8*4), X1
 21473  	MOVSS X1, (DX)(R8*4)
 21474  	ADDQ  BX, R8
 21475  	ADDSS (DX)(R8*4), X2
 21476  	MOVSS X2, (DX)(R8*4)
 21477  	ADDQ  BX, R8
 21478  	ADDSS (DX)(R8*4), X3
 21479  	MOVSS X3, (DX)(R8*4)
 21480  	ADDQ  BX, R8
 21481  	ADDSS (DX)(R8*4), X4
 21482  	MOVSS X4, (DX)(R8*4)
 21483  	ADDQ  BX, R8
 21484  	ADDSS (DX)(R8*4), X5
 21485  	MOVSS X5, (DX)(R8*4)
 21486  	ADDQ  BX, R8
 21487  	ADDSS (DX)(R8*4), X6
 21488  	MOVSS X6, (DX)(R8*4)
 21489  	ADDQ  BX, R8
 21490  	ADDSS (DX)(R8*4), X7
 21491  	MOVSS X7, (DX)(R8*4)
 21492  	ADDQ  BX, R8
 21493  	ADDSS (DX)(R8*4), X8
 21494  	MOVSS X8, (DX)(R8*4)
 21495  	ADDQ  BX, R8
 21496  	SUBQ  $0x08, SI
 21497  
 21498  check_limit_unroll:
 21499  	CMPQ SI, $0x08
 21500  	JHS  loop_unroll
 21501  	JMP  check_limit
 21502  
 21503  loop:
 21504  	MOVSS (AX)(DI*4), X1
 21505  	MULSS X0, X1
 21506  	ADDSS (DX)(R8*4), X1
 21507  	MOVSS X1, (DX)(R8*4)
 21508  	DECQ  SI
 21509  	ADDQ  CX, DI
 21510  	ADDQ  BX, R8
 21511  
 21512  check_limit:
 21513  	CMPQ SI, $0x00
 21514  	JHI  loop
 21515  	RET
 21516  
 21517  // func AmdAxpyUnsafeXInterleave_V1A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21518  // Requires: SSE
 21519  TEXT ·AmdAxpyUnsafeXInterleave_V1A10R8(SB), NOSPLIT, $0-48
 21520  	MOVSS alpha+0(FP), X0
 21521  	MOVQ  xs+8(FP), AX
 21522  	MOVQ  incx+16(FP), CX
 21523  	MOVQ  ys+24(FP), DX
 21524  	MOVQ  incy+32(FP), BX
 21525  	MOVQ  n+40(FP), SI
 21526  	XORQ  DI, DI
 21527  	XORQ  R8, R8
 21528  	JMP   check_limit_unroll
 21529  	PCALIGN $0x08
 21530  	NOP
 21531  	NOP
 21532  
 21533  loop_unroll:
 21534  	MOVSS (AX)(DI*4), X1
 21535  	ADDQ  CX, DI
 21536  	MOVSS (AX)(DI*4), X2
 21537  	ADDQ  CX, DI
 21538  	MOVSS (AX)(DI*4), X3
 21539  	ADDQ  CX, DI
 21540  	MOVSS (AX)(DI*4), X4
 21541  	ADDQ  CX, DI
 21542  	MOVSS (AX)(DI*4), X5
 21543  	ADDQ  CX, DI
 21544  	MOVSS (AX)(DI*4), X6
 21545  	ADDQ  CX, DI
 21546  	MOVSS (AX)(DI*4), X7
 21547  	ADDQ  CX, DI
 21548  	MOVSS (AX)(DI*4), X8
 21549  	ADDQ  CX, DI
 21550  	MULSS X0, X1
 21551  	MULSS X0, X2
 21552  	MULSS X0, X3
 21553  	MULSS X0, X4
 21554  	MULSS X0, X5
 21555  	MULSS X0, X6
 21556  	MULSS X0, X7
 21557  	MULSS X0, X8
 21558  	ADDSS (DX)(R8*4), X1
 21559  	MOVSS X1, (DX)(R8*4)
 21560  	ADDQ  BX, R8
 21561  	ADDSS (DX)(R8*4), X2
 21562  	MOVSS X2, (DX)(R8*4)
 21563  	ADDQ  BX, R8
 21564  	ADDSS (DX)(R8*4), X3
 21565  	MOVSS X3, (DX)(R8*4)
 21566  	ADDQ  BX, R8
 21567  	ADDSS (DX)(R8*4), X4
 21568  	MOVSS X4, (DX)(R8*4)
 21569  	ADDQ  BX, R8
 21570  	ADDSS (DX)(R8*4), X5
 21571  	MOVSS X5, (DX)(R8*4)
 21572  	ADDQ  BX, R8
 21573  	ADDSS (DX)(R8*4), X6
 21574  	MOVSS X6, (DX)(R8*4)
 21575  	ADDQ  BX, R8
 21576  	ADDSS (DX)(R8*4), X7
 21577  	MOVSS X7, (DX)(R8*4)
 21578  	ADDQ  BX, R8
 21579  	ADDSS (DX)(R8*4), X8
 21580  	MOVSS X8, (DX)(R8*4)
 21581  	ADDQ  BX, R8
 21582  	SUBQ  $0x08, SI
 21583  
 21584  check_limit_unroll:
 21585  	CMPQ SI, $0x08
 21586  	JHS  loop_unroll
 21587  	JMP  check_limit
 21588  
 21589  loop:
 21590  	MOVSS (AX)(DI*4), X1
 21591  	MULSS X0, X1
 21592  	ADDSS (DX)(R8*4), X1
 21593  	MOVSS X1, (DX)(R8*4)
 21594  	DECQ  SI
 21595  	ADDQ  CX, DI
 21596  	ADDQ  BX, R8
 21597  
 21598  check_limit:
 21599  	CMPQ SI, $0x00
 21600  	JHI  loop
 21601  	RET
 21602  
 21603  // func AmdAxpyUnsafeXInterleave_V2A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21604  // Requires: SSE
 21605  TEXT ·AmdAxpyUnsafeXInterleave_V2A10R8(SB), NOSPLIT, $0-48
 21606  	MOVSS alpha+0(FP), X0
 21607  	MOVQ  xs+8(FP), AX
 21608  	MOVQ  incx+16(FP), CX
 21609  	MOVQ  ys+24(FP), DX
 21610  	MOVQ  incy+32(FP), BX
 21611  	MOVQ  n+40(FP), SI
 21612  	XORQ  DI, DI
 21613  	XORQ  R8, R8
 21614  	JMP   check_limit_unroll
 21615  	PCALIGN $0x08
 21616  	NOP
 21617  	NOP
 21618  
 21619  loop_unroll:
 21620  	MOVSS (AX)(DI*4), X1
 21621  	ADDQ  CX, DI
 21622  	MOVSS (AX)(DI*4), X2
 21623  	ADDQ  CX, DI
 21624  	MOVSS (AX)(DI*4), X3
 21625  	ADDQ  CX, DI
 21626  	MOVSS (AX)(DI*4), X4
 21627  	ADDQ  CX, DI
 21628  	MOVSS (AX)(DI*4), X5
 21629  	ADDQ  CX, DI
 21630  	MOVSS (AX)(DI*4), X6
 21631  	ADDQ  CX, DI
 21632  	MOVSS (AX)(DI*4), X7
 21633  	ADDQ  CX, DI
 21634  	MOVSS (AX)(DI*4), X8
 21635  	ADDQ  CX, DI
 21636  	MULSS X0, X1
 21637  	MULSS X0, X2
 21638  	MULSS X0, X3
 21639  	MULSS X0, X4
 21640  	MULSS X0, X5
 21641  	MULSS X0, X6
 21642  	MULSS X0, X7
 21643  	MULSS X0, X8
 21644  	ADDSS (DX)(R8*4), X1
 21645  	MOVSS X1, (DX)(R8*4)
 21646  	ADDQ  BX, R8
 21647  	ADDSS (DX)(R8*4), X2
 21648  	MOVSS X2, (DX)(R8*4)
 21649  	ADDQ  BX, R8
 21650  	ADDSS (DX)(R8*4), X3
 21651  	MOVSS X3, (DX)(R8*4)
 21652  	ADDQ  BX, R8
 21653  	ADDSS (DX)(R8*4), X4
 21654  	MOVSS X4, (DX)(R8*4)
 21655  	ADDQ  BX, R8
 21656  	ADDSS (DX)(R8*4), X5
 21657  	MOVSS X5, (DX)(R8*4)
 21658  	ADDQ  BX, R8
 21659  	ADDSS (DX)(R8*4), X6
 21660  	MOVSS X6, (DX)(R8*4)
 21661  	ADDQ  BX, R8
 21662  	ADDSS (DX)(R8*4), X7
 21663  	MOVSS X7, (DX)(R8*4)
 21664  	ADDQ  BX, R8
 21665  	ADDSS (DX)(R8*4), X8
 21666  	MOVSS X8, (DX)(R8*4)
 21667  	ADDQ  BX, R8
 21668  	SUBQ  $0x08, SI
 21669  
 21670  check_limit_unroll:
 21671  	CMPQ SI, $0x08
 21672  	JHS  loop_unroll
 21673  	JMP  check_limit
 21674  
 21675  loop:
 21676  	MOVSS (AX)(DI*4), X1
 21677  	MULSS X0, X1
 21678  	ADDSS (DX)(R8*4), X1
 21679  	MOVSS X1, (DX)(R8*4)
 21680  	DECQ  SI
 21681  	ADDQ  CX, DI
 21682  	ADDQ  BX, R8
 21683  
 21684  check_limit:
 21685  	CMPQ SI, $0x00
 21686  	JHI  loop
 21687  	RET
 21688  
 21689  // func AmdAxpyUnsafeXInterleave_V3A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21690  // Requires: SSE
 21691  TEXT ·AmdAxpyUnsafeXInterleave_V3A10R8(SB), NOSPLIT, $0-48
 21692  	MOVSS alpha+0(FP), X0
 21693  	MOVQ  xs+8(FP), AX
 21694  	MOVQ  incx+16(FP), CX
 21695  	MOVQ  ys+24(FP), DX
 21696  	MOVQ  incy+32(FP), BX
 21697  	MOVQ  n+40(FP), SI
 21698  	XORQ  DI, DI
 21699  	XORQ  R8, R8
 21700  	JMP   check_limit_unroll
 21701  	PCALIGN $0x08
 21702  	NOP
 21703  	NOP
 21704  
 21705  loop_unroll:
 21706  	MOVSS (AX)(DI*4), X1
 21707  	ADDQ  CX, DI
 21708  	MOVSS (AX)(DI*4), X2
 21709  	ADDQ  CX, DI
 21710  	MOVSS (AX)(DI*4), X3
 21711  	ADDQ  CX, DI
 21712  	MOVSS (AX)(DI*4), X4
 21713  	ADDQ  CX, DI
 21714  	MOVSS (AX)(DI*4), X5
 21715  	ADDQ  CX, DI
 21716  	MOVSS (AX)(DI*4), X6
 21717  	ADDQ  CX, DI
 21718  	MOVSS (AX)(DI*4), X7
 21719  	ADDQ  CX, DI
 21720  	MOVSS (AX)(DI*4), X8
 21721  	ADDQ  CX, DI
 21722  	MULSS X0, X1
 21723  	MULSS X0, X2
 21724  	MULSS X0, X3
 21725  	MULSS X0, X4
 21726  	MULSS X0, X5
 21727  	MULSS X0, X6
 21728  	MULSS X0, X7
 21729  	MULSS X0, X8
 21730  	ADDSS (DX)(R8*4), X1
 21731  	MOVSS X1, (DX)(R8*4)
 21732  	ADDQ  BX, R8
 21733  	ADDSS (DX)(R8*4), X2
 21734  	MOVSS X2, (DX)(R8*4)
 21735  	ADDQ  BX, R8
 21736  	ADDSS (DX)(R8*4), X3
 21737  	MOVSS X3, (DX)(R8*4)
 21738  	ADDQ  BX, R8
 21739  	ADDSS (DX)(R8*4), X4
 21740  	MOVSS X4, (DX)(R8*4)
 21741  	ADDQ  BX, R8
 21742  	ADDSS (DX)(R8*4), X5
 21743  	MOVSS X5, (DX)(R8*4)
 21744  	ADDQ  BX, R8
 21745  	ADDSS (DX)(R8*4), X6
 21746  	MOVSS X6, (DX)(R8*4)
 21747  	ADDQ  BX, R8
 21748  	ADDSS (DX)(R8*4), X7
 21749  	MOVSS X7, (DX)(R8*4)
 21750  	ADDQ  BX, R8
 21751  	ADDSS (DX)(R8*4), X8
 21752  	MOVSS X8, (DX)(R8*4)
 21753  	ADDQ  BX, R8
 21754  	SUBQ  $0x08, SI
 21755  
 21756  check_limit_unroll:
 21757  	CMPQ SI, $0x08
 21758  	JHS  loop_unroll
 21759  	JMP  check_limit
 21760  
 21761  loop:
 21762  	MOVSS (AX)(DI*4), X1
 21763  	MULSS X0, X1
 21764  	ADDSS (DX)(R8*4), X1
 21765  	MOVSS X1, (DX)(R8*4)
 21766  	DECQ  SI
 21767  	ADDQ  CX, DI
 21768  	ADDQ  BX, R8
 21769  
 21770  check_limit:
 21771  	CMPQ SI, $0x00
 21772  	JHI  loop
 21773  	RET
 21774  
 21775  // func AmdAxpyUnsafeXInterleave_V4A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21776  // Requires: SSE
 21777  TEXT ·AmdAxpyUnsafeXInterleave_V4A10R8(SB), NOSPLIT, $0-48
 21778  	MOVSS alpha+0(FP), X0
 21779  	MOVQ  xs+8(FP), AX
 21780  	MOVQ  incx+16(FP), CX
 21781  	MOVQ  ys+24(FP), DX
 21782  	MOVQ  incy+32(FP), BX
 21783  	MOVQ  n+40(FP), SI
 21784  	XORQ  DI, DI
 21785  	XORQ  R8, R8
 21786  	JMP   check_limit_unroll
 21787  	PCALIGN $0x08
 21788  	NOP
 21789  	NOP
 21790  
 21791  loop_unroll:
 21792  	MOVSS (AX)(DI*4), X1
 21793  	ADDQ  CX, DI
 21794  	MOVSS (AX)(DI*4), X2
 21795  	ADDQ  CX, DI
 21796  	MOVSS (AX)(DI*4), X3
 21797  	ADDQ  CX, DI
 21798  	MOVSS (AX)(DI*4), X4
 21799  	ADDQ  CX, DI
 21800  	MOVSS (AX)(DI*4), X5
 21801  	ADDQ  CX, DI
 21802  	MOVSS (AX)(DI*4), X6
 21803  	ADDQ  CX, DI
 21804  	MOVSS (AX)(DI*4), X7
 21805  	ADDQ  CX, DI
 21806  	MOVSS (AX)(DI*4), X8
 21807  	ADDQ  CX, DI
 21808  	MULSS X0, X1
 21809  	MULSS X0, X2
 21810  	MULSS X0, X3
 21811  	MULSS X0, X4
 21812  	MULSS X0, X5
 21813  	MULSS X0, X6
 21814  	MULSS X0, X7
 21815  	MULSS X0, X8
 21816  	ADDSS (DX)(R8*4), X1
 21817  	MOVSS X1, (DX)(R8*4)
 21818  	ADDQ  BX, R8
 21819  	ADDSS (DX)(R8*4), X2
 21820  	MOVSS X2, (DX)(R8*4)
 21821  	ADDQ  BX, R8
 21822  	ADDSS (DX)(R8*4), X3
 21823  	MOVSS X3, (DX)(R8*4)
 21824  	ADDQ  BX, R8
 21825  	ADDSS (DX)(R8*4), X4
 21826  	MOVSS X4, (DX)(R8*4)
 21827  	ADDQ  BX, R8
 21828  	ADDSS (DX)(R8*4), X5
 21829  	MOVSS X5, (DX)(R8*4)
 21830  	ADDQ  BX, R8
 21831  	ADDSS (DX)(R8*4), X6
 21832  	MOVSS X6, (DX)(R8*4)
 21833  	ADDQ  BX, R8
 21834  	ADDSS (DX)(R8*4), X7
 21835  	MOVSS X7, (DX)(R8*4)
 21836  	ADDQ  BX, R8
 21837  	ADDSS (DX)(R8*4), X8
 21838  	MOVSS X8, (DX)(R8*4)
 21839  	ADDQ  BX, R8
 21840  	SUBQ  $0x08, SI
 21841  
 21842  check_limit_unroll:
 21843  	CMPQ SI, $0x08
 21844  	JHS  loop_unroll
 21845  	JMP  check_limit
 21846  
 21847  loop:
 21848  	MOVSS (AX)(DI*4), X1
 21849  	MULSS X0, X1
 21850  	ADDSS (DX)(R8*4), X1
 21851  	MOVSS X1, (DX)(R8*4)
 21852  	DECQ  SI
 21853  	ADDQ  CX, DI
 21854  	ADDQ  BX, R8
 21855  
 21856  check_limit:
 21857  	CMPQ SI, $0x00
 21858  	JHI  loop
 21859  	RET
 21860  
 21861  // func AmdAxpyUnsafeXInterleave_V5A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21862  // Requires: SSE
 21863  TEXT ·AmdAxpyUnsafeXInterleave_V5A10R8(SB), NOSPLIT, $0-48
 21864  	MOVSS alpha+0(FP), X0
 21865  	MOVQ  xs+8(FP), AX
 21866  	MOVQ  incx+16(FP), CX
 21867  	MOVQ  ys+24(FP), DX
 21868  	MOVQ  incy+32(FP), BX
 21869  	MOVQ  n+40(FP), SI
 21870  	XORQ  DI, DI
 21871  	XORQ  R8, R8
 21872  	JMP   check_limit_unroll
 21873  	PCALIGN $0x08
 21874  	NOP
 21875  	NOP
 21876  
 21877  loop_unroll:
 21878  	MOVSS (AX)(DI*4), X1
 21879  	ADDQ  CX, DI
 21880  	MOVSS (AX)(DI*4), X2
 21881  	ADDQ  CX, DI
 21882  	MOVSS (AX)(DI*4), X3
 21883  	ADDQ  CX, DI
 21884  	MOVSS (AX)(DI*4), X4
 21885  	ADDQ  CX, DI
 21886  	MOVSS (AX)(DI*4), X5
 21887  	ADDQ  CX, DI
 21888  	MOVSS (AX)(DI*4), X6
 21889  	ADDQ  CX, DI
 21890  	MOVSS (AX)(DI*4), X7
 21891  	ADDQ  CX, DI
 21892  	MOVSS (AX)(DI*4), X8
 21893  	ADDQ  CX, DI
 21894  	MULSS X0, X1
 21895  	MULSS X0, X2
 21896  	MULSS X0, X3
 21897  	MULSS X0, X4
 21898  	MULSS X0, X5
 21899  	MULSS X0, X6
 21900  	MULSS X0, X7
 21901  	MULSS X0, X8
 21902  	ADDSS (DX)(R8*4), X1
 21903  	MOVSS X1, (DX)(R8*4)
 21904  	ADDQ  BX, R8
 21905  	ADDSS (DX)(R8*4), X2
 21906  	MOVSS X2, (DX)(R8*4)
 21907  	ADDQ  BX, R8
 21908  	ADDSS (DX)(R8*4), X3
 21909  	MOVSS X3, (DX)(R8*4)
 21910  	ADDQ  BX, R8
 21911  	ADDSS (DX)(R8*4), X4
 21912  	MOVSS X4, (DX)(R8*4)
 21913  	ADDQ  BX, R8
 21914  	ADDSS (DX)(R8*4), X5
 21915  	MOVSS X5, (DX)(R8*4)
 21916  	ADDQ  BX, R8
 21917  	ADDSS (DX)(R8*4), X6
 21918  	MOVSS X6, (DX)(R8*4)
 21919  	ADDQ  BX, R8
 21920  	ADDSS (DX)(R8*4), X7
 21921  	MOVSS X7, (DX)(R8*4)
 21922  	ADDQ  BX, R8
 21923  	ADDSS (DX)(R8*4), X8
 21924  	MOVSS X8, (DX)(R8*4)
 21925  	ADDQ  BX, R8
 21926  	SUBQ  $0x08, SI
 21927  
 21928  check_limit_unroll:
 21929  	CMPQ SI, $0x08
 21930  	JHS  loop_unroll
 21931  	JMP  check_limit
 21932  
 21933  loop:
 21934  	MOVSS (AX)(DI*4), X1
 21935  	MULSS X0, X1
 21936  	ADDSS (DX)(R8*4), X1
 21937  	MOVSS X1, (DX)(R8*4)
 21938  	DECQ  SI
 21939  	ADDQ  CX, DI
 21940  	ADDQ  BX, R8
 21941  
 21942  check_limit:
 21943  	CMPQ SI, $0x00
 21944  	JHI  loop
 21945  	RET
 21946  
 21947  // func AmdAxpyUnsafeXInterleave_V0A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 21948  // Requires: SSE
 21949  TEXT ·AmdAxpyUnsafeXInterleave_V0A11R8(SB), NOSPLIT, $0-48
 21950  	MOVSS alpha+0(FP), X0
 21951  	MOVQ  xs+8(FP), AX
 21952  	MOVQ  incx+16(FP), CX
 21953  	MOVQ  ys+24(FP), DX
 21954  	MOVQ  incy+32(FP), BX
 21955  	MOVQ  n+40(FP), SI
 21956  	XORQ  DI, DI
 21957  	XORQ  R8, R8
 21958  	JMP   check_limit_unroll
 21959  	PCALIGN $0x08
 21960  	NOP
 21961  	NOP
 21962  	NOP
 21963  
 21964  loop_unroll:
 21965  	MOVSS (AX)(DI*4), X1
 21966  	ADDQ  CX, DI
 21967  	MOVSS (AX)(DI*4), X2
 21968  	ADDQ  CX, DI
 21969  	MOVSS (AX)(DI*4), X3
 21970  	ADDQ  CX, DI
 21971  	MOVSS (AX)(DI*4), X4
 21972  	ADDQ  CX, DI
 21973  	MOVSS (AX)(DI*4), X5
 21974  	ADDQ  CX, DI
 21975  	MOVSS (AX)(DI*4), X6
 21976  	ADDQ  CX, DI
 21977  	MOVSS (AX)(DI*4), X7
 21978  	ADDQ  CX, DI
 21979  	MOVSS (AX)(DI*4), X8
 21980  	ADDQ  CX, DI
 21981  	MULSS X0, X1
 21982  	MULSS X0, X2
 21983  	MULSS X0, X3
 21984  	MULSS X0, X4
 21985  	MULSS X0, X5
 21986  	MULSS X0, X6
 21987  	MULSS X0, X7
 21988  	MULSS X0, X8
 21989  	ADDSS (DX)(R8*4), X1
 21990  	MOVSS X1, (DX)(R8*4)
 21991  	ADDQ  BX, R8
 21992  	ADDSS (DX)(R8*4), X2
 21993  	MOVSS X2, (DX)(R8*4)
 21994  	ADDQ  BX, R8
 21995  	ADDSS (DX)(R8*4), X3
 21996  	MOVSS X3, (DX)(R8*4)
 21997  	ADDQ  BX, R8
 21998  	ADDSS (DX)(R8*4), X4
 21999  	MOVSS X4, (DX)(R8*4)
 22000  	ADDQ  BX, R8
 22001  	ADDSS (DX)(R8*4), X5
 22002  	MOVSS X5, (DX)(R8*4)
 22003  	ADDQ  BX, R8
 22004  	ADDSS (DX)(R8*4), X6
 22005  	MOVSS X6, (DX)(R8*4)
 22006  	ADDQ  BX, R8
 22007  	ADDSS (DX)(R8*4), X7
 22008  	MOVSS X7, (DX)(R8*4)
 22009  	ADDQ  BX, R8
 22010  	ADDSS (DX)(R8*4), X8
 22011  	MOVSS X8, (DX)(R8*4)
 22012  	ADDQ  BX, R8
 22013  	SUBQ  $0x08, SI
 22014  
 22015  check_limit_unroll:
 22016  	CMPQ SI, $0x08
 22017  	JHS  loop_unroll
 22018  	JMP  check_limit
 22019  
 22020  loop:
 22021  	MOVSS (AX)(DI*4), X1
 22022  	MULSS X0, X1
 22023  	ADDSS (DX)(R8*4), X1
 22024  	MOVSS X1, (DX)(R8*4)
 22025  	DECQ  SI
 22026  	ADDQ  CX, DI
 22027  	ADDQ  BX, R8
 22028  
 22029  check_limit:
 22030  	CMPQ SI, $0x00
 22031  	JHI  loop
 22032  	RET
 22033  
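// Example call into one of these kernels from Go, assuming the matching
// generated stub declaration is available in the same package (hypothetical
// slice contents; both strides set to 1):
//
//	xs := []float32{1, 2, 3, 4}
//	ys := []float32{10, 20, 30, 40}
//	AmdAxpyUnsafeXInterleave_V0A11R8(0.5, &xs[0], 1, &ys[0], 1, uintptr(len(xs)))
//	// ys is now {10.5, 21, 31.5, 42}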
 22034  // func AmdAxpyUnsafeXInterleave_V1A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22035  // Requires: SSE
 22036  TEXT ·AmdAxpyUnsafeXInterleave_V1A11R8(SB), NOSPLIT, $0-48
 22037  	MOVSS alpha+0(FP), X0
 22038  	MOVQ  xs+8(FP), AX
 22039  	MOVQ  incx+16(FP), CX
 22040  	MOVQ  ys+24(FP), DX
 22041  	MOVQ  incy+32(FP), BX
 22042  	MOVQ  n+40(FP), SI
 22043  	XORQ  DI, DI
 22044  	XORQ  R8, R8
 22045  	JMP   check_limit_unroll
 22046  	PCALIGN $0x08
 22047  	NOP
 22048  	NOP
 22049  	NOP
 22050  
 22051  loop_unroll:
 22052  	MOVSS (AX)(DI*4), X1
 22053  	ADDQ  CX, DI
 22054  	MOVSS (AX)(DI*4), X2
 22055  	ADDQ  CX, DI
 22056  	MOVSS (AX)(DI*4), X3
 22057  	ADDQ  CX, DI
 22058  	MOVSS (AX)(DI*4), X4
 22059  	ADDQ  CX, DI
 22060  	MOVSS (AX)(DI*4), X5
 22061  	ADDQ  CX, DI
 22062  	MOVSS (AX)(DI*4), X6
 22063  	ADDQ  CX, DI
 22064  	MOVSS (AX)(DI*4), X7
 22065  	ADDQ  CX, DI
 22066  	MOVSS (AX)(DI*4), X8
 22067  	ADDQ  CX, DI
 22068  	MULSS X0, X1
 22069  	MULSS X0, X2
 22070  	MULSS X0, X3
 22071  	MULSS X0, X4
 22072  	MULSS X0, X5
 22073  	MULSS X0, X6
 22074  	MULSS X0, X7
 22075  	MULSS X0, X8
 22076  	ADDSS (DX)(R8*4), X1
 22077  	MOVSS X1, (DX)(R8*4)
 22078  	ADDQ  BX, R8
 22079  	ADDSS (DX)(R8*4), X2
 22080  	MOVSS X2, (DX)(R8*4)
 22081  	ADDQ  BX, R8
 22082  	ADDSS (DX)(R8*4), X3
 22083  	MOVSS X3, (DX)(R8*4)
 22084  	ADDQ  BX, R8
 22085  	ADDSS (DX)(R8*4), X4
 22086  	MOVSS X4, (DX)(R8*4)
 22087  	ADDQ  BX, R8
 22088  	ADDSS (DX)(R8*4), X5
 22089  	MOVSS X5, (DX)(R8*4)
 22090  	ADDQ  BX, R8
 22091  	ADDSS (DX)(R8*4), X6
 22092  	MOVSS X6, (DX)(R8*4)
 22093  	ADDQ  BX, R8
 22094  	ADDSS (DX)(R8*4), X7
 22095  	MOVSS X7, (DX)(R8*4)
 22096  	ADDQ  BX, R8
 22097  	ADDSS (DX)(R8*4), X8
 22098  	MOVSS X8, (DX)(R8*4)
 22099  	ADDQ  BX, R8
 22100  	SUBQ  $0x08, SI
 22101  
 22102  check_limit_unroll:
 22103  	CMPQ SI, $0x08
 22104  	JHS  loop_unroll
 22105  	JMP  check_limit
 22106  
 22107  loop:
 22108  	MOVSS (AX)(DI*4), X1
 22109  	MULSS X0, X1
 22110  	ADDSS (DX)(R8*4), X1
 22111  	MOVSS X1, (DX)(R8*4)
 22112  	DECQ  SI
 22113  	ADDQ  CX, DI
 22114  	ADDQ  BX, R8
 22115  
 22116  check_limit:
 22117  	CMPQ SI, $0x00
 22118  	JHI  loop
 22119  	RET
 22120  
 22121  // func AmdAxpyUnsafeXInterleave_V2A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22122  // Requires: SSE
 22123  TEXT ·AmdAxpyUnsafeXInterleave_V2A11R8(SB), NOSPLIT, $0-48
 22124  	MOVSS alpha+0(FP), X0
 22125  	MOVQ  xs+8(FP), AX
 22126  	MOVQ  incx+16(FP), CX
 22127  	MOVQ  ys+24(FP), DX
 22128  	MOVQ  incy+32(FP), BX
 22129  	MOVQ  n+40(FP), SI
 22130  	XORQ  DI, DI
 22131  	XORQ  R8, R8
 22132  	JMP   check_limit_unroll
 22133  	PCALIGN $0x08
 22134  	NOP
 22135  	NOP
 22136  	NOP
 22137  
 22138  loop_unroll:
 22139  	MOVSS (AX)(DI*4), X1
 22140  	ADDQ  CX, DI
 22141  	MOVSS (AX)(DI*4), X2
 22142  	ADDQ  CX, DI
 22143  	MOVSS (AX)(DI*4), X3
 22144  	ADDQ  CX, DI
 22145  	MOVSS (AX)(DI*4), X4
 22146  	ADDQ  CX, DI
 22147  	MOVSS (AX)(DI*4), X5
 22148  	ADDQ  CX, DI
 22149  	MOVSS (AX)(DI*4), X6
 22150  	ADDQ  CX, DI
 22151  	MOVSS (AX)(DI*4), X7
 22152  	ADDQ  CX, DI
 22153  	MOVSS (AX)(DI*4), X8
 22154  	ADDQ  CX, DI
 22155  	MULSS X0, X1
 22156  	MULSS X0, X2
 22157  	MULSS X0, X3
 22158  	MULSS X0, X4
 22159  	MULSS X0, X5
 22160  	MULSS X0, X6
 22161  	MULSS X0, X7
 22162  	MULSS X0, X8
 22163  	ADDSS (DX)(R8*4), X1
 22164  	MOVSS X1, (DX)(R8*4)
 22165  	ADDQ  BX, R8
 22166  	ADDSS (DX)(R8*4), X2
 22167  	MOVSS X2, (DX)(R8*4)
 22168  	ADDQ  BX, R8
 22169  	ADDSS (DX)(R8*4), X3
 22170  	MOVSS X3, (DX)(R8*4)
 22171  	ADDQ  BX, R8
 22172  	ADDSS (DX)(R8*4), X4
 22173  	MOVSS X4, (DX)(R8*4)
 22174  	ADDQ  BX, R8
 22175  	ADDSS (DX)(R8*4), X5
 22176  	MOVSS X5, (DX)(R8*4)
 22177  	ADDQ  BX, R8
 22178  	ADDSS (DX)(R8*4), X6
 22179  	MOVSS X6, (DX)(R8*4)
 22180  	ADDQ  BX, R8
 22181  	ADDSS (DX)(R8*4), X7
 22182  	MOVSS X7, (DX)(R8*4)
 22183  	ADDQ  BX, R8
 22184  	ADDSS (DX)(R8*4), X8
 22185  	MOVSS X8, (DX)(R8*4)
 22186  	ADDQ  BX, R8
 22187  	SUBQ  $0x08, SI
 22188  
 22189  check_limit_unroll:
 22190  	CMPQ SI, $0x08
 22191  	JHS  loop_unroll
 22192  	JMP  check_limit
 22193  
 22194  loop:
 22195  	MOVSS (AX)(DI*4), X1
 22196  	MULSS X0, X1
 22197  	ADDSS (DX)(R8*4), X1
 22198  	MOVSS X1, (DX)(R8*4)
 22199  	DECQ  SI
 22200  	ADDQ  CX, DI
 22201  	ADDQ  BX, R8
 22202  
 22203  check_limit:
 22204  	CMPQ SI, $0x00
 22205  	JHI  loop
 22206  	RET
 22207  
 22208  // func AmdAxpyUnsafeXInterleave_V3A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22209  // Requires: SSE
 22210  TEXT ·AmdAxpyUnsafeXInterleave_V3A11R8(SB), NOSPLIT, $0-48
 22211  	MOVSS alpha+0(FP), X0
 22212  	MOVQ  xs+8(FP), AX
 22213  	MOVQ  incx+16(FP), CX
 22214  	MOVQ  ys+24(FP), DX
 22215  	MOVQ  incy+32(FP), BX
 22216  	MOVQ  n+40(FP), SI
 22217  	XORQ  DI, DI
 22218  	XORQ  R8, R8
 22219  	JMP   check_limit_unroll
 22220  	PCALIGN $0x08
 22221  	NOP
 22222  	NOP
 22223  	NOP
 22224  
 22225  loop_unroll:
 22226  	MOVSS (AX)(DI*4), X1
 22227  	ADDQ  CX, DI
 22228  	MOVSS (AX)(DI*4), X2
 22229  	ADDQ  CX, DI
 22230  	MOVSS (AX)(DI*4), X3
 22231  	ADDQ  CX, DI
 22232  	MOVSS (AX)(DI*4), X4
 22233  	ADDQ  CX, DI
 22234  	MOVSS (AX)(DI*4), X5
 22235  	ADDQ  CX, DI
 22236  	MOVSS (AX)(DI*4), X6
 22237  	ADDQ  CX, DI
 22238  	MOVSS (AX)(DI*4), X7
 22239  	ADDQ  CX, DI
 22240  	MOVSS (AX)(DI*4), X8
 22241  	ADDQ  CX, DI
 22242  	MULSS X0, X1
 22243  	MULSS X0, X2
 22244  	MULSS X0, X3
 22245  	MULSS X0, X4
 22246  	MULSS X0, X5
 22247  	MULSS X0, X6
 22248  	MULSS X0, X7
 22249  	MULSS X0, X8
 22250  	ADDSS (DX)(R8*4), X1
 22251  	MOVSS X1, (DX)(R8*4)
 22252  	ADDQ  BX, R8
 22253  	ADDSS (DX)(R8*4), X2
 22254  	MOVSS X2, (DX)(R8*4)
 22255  	ADDQ  BX, R8
 22256  	ADDSS (DX)(R8*4), X3
 22257  	MOVSS X3, (DX)(R8*4)
 22258  	ADDQ  BX, R8
 22259  	ADDSS (DX)(R8*4), X4
 22260  	MOVSS X4, (DX)(R8*4)
 22261  	ADDQ  BX, R8
 22262  	ADDSS (DX)(R8*4), X5
 22263  	MOVSS X5, (DX)(R8*4)
 22264  	ADDQ  BX, R8
 22265  	ADDSS (DX)(R8*4), X6
 22266  	MOVSS X6, (DX)(R8*4)
 22267  	ADDQ  BX, R8
 22268  	ADDSS (DX)(R8*4), X7
 22269  	MOVSS X7, (DX)(R8*4)
 22270  	ADDQ  BX, R8
 22271  	ADDSS (DX)(R8*4), X8
 22272  	MOVSS X8, (DX)(R8*4)
 22273  	ADDQ  BX, R8
 22274  	SUBQ  $0x08, SI
 22275  
 22276  check_limit_unroll:
 22277  	CMPQ SI, $0x08
 22278  	JHS  loop_unroll
 22279  	JMP  check_limit
 22280  
 22281  loop:
 22282  	MOVSS (AX)(DI*4), X1
 22283  	MULSS X0, X1
 22284  	ADDSS (DX)(R8*4), X1
 22285  	MOVSS X1, (DX)(R8*4)
 22286  	DECQ  SI
 22287  	ADDQ  CX, DI
 22288  	ADDQ  BX, R8
 22289  
 22290  check_limit:
 22291  	CMPQ SI, $0x00
 22292  	JHI  loop
 22293  	RET
 22294  
 22295  // func AmdAxpyUnsafeXInterleave_V4A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22296  // Requires: SSE
 22297  TEXT ·AmdAxpyUnsafeXInterleave_V4A11R8(SB), NOSPLIT, $0-48
 22298  	MOVSS alpha+0(FP), X0
 22299  	MOVQ  xs+8(FP), AX
 22300  	MOVQ  incx+16(FP), CX
 22301  	MOVQ  ys+24(FP), DX
 22302  	MOVQ  incy+32(FP), BX
 22303  	MOVQ  n+40(FP), SI
 22304  	XORQ  DI, DI
 22305  	XORQ  R8, R8
 22306  	JMP   check_limit_unroll
 22307  	PCALIGN $0x08
 22308  	NOP
 22309  	NOP
 22310  	NOP
 22311  
 22312  loop_unroll:
 22313  	MOVSS (AX)(DI*4), X1
 22314  	ADDQ  CX, DI
 22315  	MOVSS (AX)(DI*4), X2
 22316  	ADDQ  CX, DI
 22317  	MOVSS (AX)(DI*4), X3
 22318  	ADDQ  CX, DI
 22319  	MOVSS (AX)(DI*4), X4
 22320  	ADDQ  CX, DI
 22321  	MOVSS (AX)(DI*4), X5
 22322  	ADDQ  CX, DI
 22323  	MOVSS (AX)(DI*4), X6
 22324  	ADDQ  CX, DI
 22325  	MOVSS (AX)(DI*4), X7
 22326  	ADDQ  CX, DI
 22327  	MOVSS (AX)(DI*4), X8
 22328  	ADDQ  CX, DI
 22329  	MULSS X0, X1
 22330  	MULSS X0, X2
 22331  	MULSS X0, X3
 22332  	MULSS X0, X4
 22333  	MULSS X0, X5
 22334  	MULSS X0, X6
 22335  	MULSS X0, X7
 22336  	MULSS X0, X8
 22337  	ADDSS (DX)(R8*4), X1
 22338  	MOVSS X1, (DX)(R8*4)
 22339  	ADDQ  BX, R8
 22340  	ADDSS (DX)(R8*4), X2
 22341  	MOVSS X2, (DX)(R8*4)
 22342  	ADDQ  BX, R8
 22343  	ADDSS (DX)(R8*4), X3
 22344  	MOVSS X3, (DX)(R8*4)
 22345  	ADDQ  BX, R8
 22346  	ADDSS (DX)(R8*4), X4
 22347  	MOVSS X4, (DX)(R8*4)
 22348  	ADDQ  BX, R8
 22349  	ADDSS (DX)(R8*4), X5
 22350  	MOVSS X5, (DX)(R8*4)
 22351  	ADDQ  BX, R8
 22352  	ADDSS (DX)(R8*4), X6
 22353  	MOVSS X6, (DX)(R8*4)
 22354  	ADDQ  BX, R8
 22355  	ADDSS (DX)(R8*4), X7
 22356  	MOVSS X7, (DX)(R8*4)
 22357  	ADDQ  BX, R8
 22358  	ADDSS (DX)(R8*4), X8
 22359  	MOVSS X8, (DX)(R8*4)
 22360  	ADDQ  BX, R8
 22361  	SUBQ  $0x08, SI
 22362  
 22363  check_limit_unroll:
 22364  	CMPQ SI, $0x08
 22365  	JHS  loop_unroll
 22366  	JMP  check_limit
 22367  
 22368  loop:
 22369  	MOVSS (AX)(DI*4), X1
 22370  	MULSS X0, X1
 22371  	ADDSS (DX)(R8*4), X1
 22372  	MOVSS X1, (DX)(R8*4)
 22373  	DECQ  SI
 22374  	ADDQ  CX, DI
 22375  	ADDQ  BX, R8
 22376  
 22377  check_limit:
 22378  	CMPQ SI, $0x00
 22379  	JHI  loop
 22380  	RET
 22381  
 22382  // func AmdAxpyUnsafeXInterleave_V5A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22383  // Requires: SSE
 22384  TEXT ·AmdAxpyUnsafeXInterleave_V5A11R8(SB), NOSPLIT, $0-48
 22385  	MOVSS alpha+0(FP), X0
 22386  	MOVQ  xs+8(FP), AX
 22387  	MOVQ  incx+16(FP), CX
 22388  	MOVQ  ys+24(FP), DX
 22389  	MOVQ  incy+32(FP), BX
 22390  	MOVQ  n+40(FP), SI
 22391  	XORQ  DI, DI
 22392  	XORQ  R8, R8
 22393  	JMP   check_limit_unroll
 22394  	PCALIGN $0x08
 22395  	NOP
 22396  	NOP
 22397  	NOP
 22398  
 22399  loop_unroll:
 22400  	MOVSS (AX)(DI*4), X1
 22401  	ADDQ  CX, DI
 22402  	MOVSS (AX)(DI*4), X2
 22403  	ADDQ  CX, DI
 22404  	MOVSS (AX)(DI*4), X3
 22405  	ADDQ  CX, DI
 22406  	MOVSS (AX)(DI*4), X4
 22407  	ADDQ  CX, DI
 22408  	MOVSS (AX)(DI*4), X5
 22409  	ADDQ  CX, DI
 22410  	MOVSS (AX)(DI*4), X6
 22411  	ADDQ  CX, DI
 22412  	MOVSS (AX)(DI*4), X7
 22413  	ADDQ  CX, DI
 22414  	MOVSS (AX)(DI*4), X8
 22415  	ADDQ  CX, DI
 22416  	MULSS X0, X1
 22417  	MULSS X0, X2
 22418  	MULSS X0, X3
 22419  	MULSS X0, X4
 22420  	MULSS X0, X5
 22421  	MULSS X0, X6
 22422  	MULSS X0, X7
 22423  	MULSS X0, X8
 22424  	ADDSS (DX)(R8*4), X1
 22425  	MOVSS X1, (DX)(R8*4)
 22426  	ADDQ  BX, R8
 22427  	ADDSS (DX)(R8*4), X2
 22428  	MOVSS X2, (DX)(R8*4)
 22429  	ADDQ  BX, R8
 22430  	ADDSS (DX)(R8*4), X3
 22431  	MOVSS X3, (DX)(R8*4)
 22432  	ADDQ  BX, R8
 22433  	ADDSS (DX)(R8*4), X4
 22434  	MOVSS X4, (DX)(R8*4)
 22435  	ADDQ  BX, R8
 22436  	ADDSS (DX)(R8*4), X5
 22437  	MOVSS X5, (DX)(R8*4)
 22438  	ADDQ  BX, R8
 22439  	ADDSS (DX)(R8*4), X6
 22440  	MOVSS X6, (DX)(R8*4)
 22441  	ADDQ  BX, R8
 22442  	ADDSS (DX)(R8*4), X7
 22443  	MOVSS X7, (DX)(R8*4)
 22444  	ADDQ  BX, R8
 22445  	ADDSS (DX)(R8*4), X8
 22446  	MOVSS X8, (DX)(R8*4)
 22447  	ADDQ  BX, R8
 22448  	SUBQ  $0x08, SI
 22449  
 22450  check_limit_unroll:
 22451  	CMPQ SI, $0x08
 22452  	JHS  loop_unroll
 22453  	JMP  check_limit
 22454  
 22455  loop:
 22456  	MOVSS (AX)(DI*4), X1
 22457  	MULSS X0, X1
 22458  	ADDSS (DX)(R8*4), X1
 22459  	MOVSS X1, (DX)(R8*4)
 22460  	DECQ  SI
 22461  	ADDQ  CX, DI
 22462  	ADDQ  BX, R8
 22463  
 22464  check_limit:
 22465  	CMPQ SI, $0x00
 22466  	JHI  loop
 22467  	RET
 22468  
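// Note on the variant suffixes as they appear in this file: within each
// AxxR8 family the function bodies are identical and only the padding after
// PCALIGN differs (three NOPs for the A11 functions above, four for the A12
// functions that follow, five for A13, six for A14, seven for A15), so the A
// value appears to select the byte offset of loop_unroll relative to the
// 8-byte alignment boundary. The V0..V5 copies within a family are
// byte-for-byte identical, presumably so benchmarks can average out code
// placement effects. R8 matches the 8-element unroll (X1..X8, SUBQ $0x08, SI).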
 22469  // func AmdAxpyUnsafeXInterleave_V0A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22470  // Requires: SSE
 22471  TEXT ·AmdAxpyUnsafeXInterleave_V0A12R8(SB), NOSPLIT, $0-48
 22472  	MOVSS alpha+0(FP), X0
 22473  	MOVQ  xs+8(FP), AX
 22474  	MOVQ  incx+16(FP), CX
 22475  	MOVQ  ys+24(FP), DX
 22476  	MOVQ  incy+32(FP), BX
 22477  	MOVQ  n+40(FP), SI
 22478  	XORQ  DI, DI
 22479  	XORQ  R8, R8
 22480  	JMP   check_limit_unroll
 22481  	PCALIGN $0x08
 22482  	NOP
 22483  	NOP
 22484  	NOP
 22485  	NOP
 22486  
 22487  loop_unroll:
 22488  	MOVSS (AX)(DI*4), X1
 22489  	ADDQ  CX, DI
 22490  	MOVSS (AX)(DI*4), X2
 22491  	ADDQ  CX, DI
 22492  	MOVSS (AX)(DI*4), X3
 22493  	ADDQ  CX, DI
 22494  	MOVSS (AX)(DI*4), X4
 22495  	ADDQ  CX, DI
 22496  	MOVSS (AX)(DI*4), X5
 22497  	ADDQ  CX, DI
 22498  	MOVSS (AX)(DI*4), X6
 22499  	ADDQ  CX, DI
 22500  	MOVSS (AX)(DI*4), X7
 22501  	ADDQ  CX, DI
 22502  	MOVSS (AX)(DI*4), X8
 22503  	ADDQ  CX, DI
 22504  	MULSS X0, X1
 22505  	MULSS X0, X2
 22506  	MULSS X0, X3
 22507  	MULSS X0, X4
 22508  	MULSS X0, X5
 22509  	MULSS X0, X6
 22510  	MULSS X0, X7
 22511  	MULSS X0, X8
 22512  	ADDSS (DX)(R8*4), X1
 22513  	MOVSS X1, (DX)(R8*4)
 22514  	ADDQ  BX, R8
 22515  	ADDSS (DX)(R8*4), X2
 22516  	MOVSS X2, (DX)(R8*4)
 22517  	ADDQ  BX, R8
 22518  	ADDSS (DX)(R8*4), X3
 22519  	MOVSS X3, (DX)(R8*4)
 22520  	ADDQ  BX, R8
 22521  	ADDSS (DX)(R8*4), X4
 22522  	MOVSS X4, (DX)(R8*4)
 22523  	ADDQ  BX, R8
 22524  	ADDSS (DX)(R8*4), X5
 22525  	MOVSS X5, (DX)(R8*4)
 22526  	ADDQ  BX, R8
 22527  	ADDSS (DX)(R8*4), X6
 22528  	MOVSS X6, (DX)(R8*4)
 22529  	ADDQ  BX, R8
 22530  	ADDSS (DX)(R8*4), X7
 22531  	MOVSS X7, (DX)(R8*4)
 22532  	ADDQ  BX, R8
 22533  	ADDSS (DX)(R8*4), X8
 22534  	MOVSS X8, (DX)(R8*4)
 22535  	ADDQ  BX, R8
 22536  	SUBQ  $0x08, SI
 22537  
 22538  check_limit_unroll:
 22539  	CMPQ SI, $0x08
 22540  	JHS  loop_unroll
 22541  	JMP  check_limit
 22542  
 22543  loop:
 22544  	MOVSS (AX)(DI*4), X1
 22545  	MULSS X0, X1
 22546  	ADDSS (DX)(R8*4), X1
 22547  	MOVSS X1, (DX)(R8*4)
 22548  	DECQ  SI
 22549  	ADDQ  CX, DI
 22550  	ADDQ  BX, R8
 22551  
 22552  check_limit:
 22553  	CMPQ SI, $0x00
 22554  	JHI  loop
 22555  	RET
 22556  
 22557  // func AmdAxpyUnsafeXInterleave_V1A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22558  // Requires: SSE
 22559  TEXT ·AmdAxpyUnsafeXInterleave_V1A12R8(SB), NOSPLIT, $0-48
 22560  	MOVSS alpha+0(FP), X0
 22561  	MOVQ  xs+8(FP), AX
 22562  	MOVQ  incx+16(FP), CX
 22563  	MOVQ  ys+24(FP), DX
 22564  	MOVQ  incy+32(FP), BX
 22565  	MOVQ  n+40(FP), SI
 22566  	XORQ  DI, DI
 22567  	XORQ  R8, R8
 22568  	JMP   check_limit_unroll
 22569  	PCALIGN $0x08
 22570  	NOP
 22571  	NOP
 22572  	NOP
 22573  	NOP
 22574  
 22575  loop_unroll:
 22576  	MOVSS (AX)(DI*4), X1
 22577  	ADDQ  CX, DI
 22578  	MOVSS (AX)(DI*4), X2
 22579  	ADDQ  CX, DI
 22580  	MOVSS (AX)(DI*4), X3
 22581  	ADDQ  CX, DI
 22582  	MOVSS (AX)(DI*4), X4
 22583  	ADDQ  CX, DI
 22584  	MOVSS (AX)(DI*4), X5
 22585  	ADDQ  CX, DI
 22586  	MOVSS (AX)(DI*4), X6
 22587  	ADDQ  CX, DI
 22588  	MOVSS (AX)(DI*4), X7
 22589  	ADDQ  CX, DI
 22590  	MOVSS (AX)(DI*4), X8
 22591  	ADDQ  CX, DI
 22592  	MULSS X0, X1
 22593  	MULSS X0, X2
 22594  	MULSS X0, X3
 22595  	MULSS X0, X4
 22596  	MULSS X0, X5
 22597  	MULSS X0, X6
 22598  	MULSS X0, X7
 22599  	MULSS X0, X8
 22600  	ADDSS (DX)(R8*4), X1
 22601  	MOVSS X1, (DX)(R8*4)
 22602  	ADDQ  BX, R8
 22603  	ADDSS (DX)(R8*4), X2
 22604  	MOVSS X2, (DX)(R8*4)
 22605  	ADDQ  BX, R8
 22606  	ADDSS (DX)(R8*4), X3
 22607  	MOVSS X3, (DX)(R8*4)
 22608  	ADDQ  BX, R8
 22609  	ADDSS (DX)(R8*4), X4
 22610  	MOVSS X4, (DX)(R8*4)
 22611  	ADDQ  BX, R8
 22612  	ADDSS (DX)(R8*4), X5
 22613  	MOVSS X5, (DX)(R8*4)
 22614  	ADDQ  BX, R8
 22615  	ADDSS (DX)(R8*4), X6
 22616  	MOVSS X6, (DX)(R8*4)
 22617  	ADDQ  BX, R8
 22618  	ADDSS (DX)(R8*4), X7
 22619  	MOVSS X7, (DX)(R8*4)
 22620  	ADDQ  BX, R8
 22621  	ADDSS (DX)(R8*4), X8
 22622  	MOVSS X8, (DX)(R8*4)
 22623  	ADDQ  BX, R8
 22624  	SUBQ  $0x08, SI
 22625  
 22626  check_limit_unroll:
 22627  	CMPQ SI, $0x08
 22628  	JHS  loop_unroll
 22629  	JMP  check_limit
 22630  
 22631  loop:
 22632  	MOVSS (AX)(DI*4), X1
 22633  	MULSS X0, X1
 22634  	ADDSS (DX)(R8*4), X1
 22635  	MOVSS X1, (DX)(R8*4)
 22636  	DECQ  SI
 22637  	ADDQ  CX, DI
 22638  	ADDQ  BX, R8
 22639  
 22640  check_limit:
 22641  	CMPQ SI, $0x00
 22642  	JHI  loop
 22643  	RET
 22644  
 22645  // func AmdAxpyUnsafeXInterleave_V2A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22646  // Requires: SSE
 22647  TEXT ·AmdAxpyUnsafeXInterleave_V2A12R8(SB), NOSPLIT, $0-48
 22648  	MOVSS alpha+0(FP), X0
 22649  	MOVQ  xs+8(FP), AX
 22650  	MOVQ  incx+16(FP), CX
 22651  	MOVQ  ys+24(FP), DX
 22652  	MOVQ  incy+32(FP), BX
 22653  	MOVQ  n+40(FP), SI
 22654  	XORQ  DI, DI
 22655  	XORQ  R8, R8
 22656  	JMP   check_limit_unroll
 22657  	PCALIGN $0x08
 22658  	NOP
 22659  	NOP
 22660  	NOP
 22661  	NOP
 22662  
 22663  loop_unroll:
 22664  	MOVSS (AX)(DI*4), X1
 22665  	ADDQ  CX, DI
 22666  	MOVSS (AX)(DI*4), X2
 22667  	ADDQ  CX, DI
 22668  	MOVSS (AX)(DI*4), X3
 22669  	ADDQ  CX, DI
 22670  	MOVSS (AX)(DI*4), X4
 22671  	ADDQ  CX, DI
 22672  	MOVSS (AX)(DI*4), X5
 22673  	ADDQ  CX, DI
 22674  	MOVSS (AX)(DI*4), X6
 22675  	ADDQ  CX, DI
 22676  	MOVSS (AX)(DI*4), X7
 22677  	ADDQ  CX, DI
 22678  	MOVSS (AX)(DI*4), X8
 22679  	ADDQ  CX, DI
 22680  	MULSS X0, X1
 22681  	MULSS X0, X2
 22682  	MULSS X0, X3
 22683  	MULSS X0, X4
 22684  	MULSS X0, X5
 22685  	MULSS X0, X6
 22686  	MULSS X0, X7
 22687  	MULSS X0, X8
 22688  	ADDSS (DX)(R8*4), X1
 22689  	MOVSS X1, (DX)(R8*4)
 22690  	ADDQ  BX, R8
 22691  	ADDSS (DX)(R8*4), X2
 22692  	MOVSS X2, (DX)(R8*4)
 22693  	ADDQ  BX, R8
 22694  	ADDSS (DX)(R8*4), X3
 22695  	MOVSS X3, (DX)(R8*4)
 22696  	ADDQ  BX, R8
 22697  	ADDSS (DX)(R8*4), X4
 22698  	MOVSS X4, (DX)(R8*4)
 22699  	ADDQ  BX, R8
 22700  	ADDSS (DX)(R8*4), X5
 22701  	MOVSS X5, (DX)(R8*4)
 22702  	ADDQ  BX, R8
 22703  	ADDSS (DX)(R8*4), X6
 22704  	MOVSS X6, (DX)(R8*4)
 22705  	ADDQ  BX, R8
 22706  	ADDSS (DX)(R8*4), X7
 22707  	MOVSS X7, (DX)(R8*4)
 22708  	ADDQ  BX, R8
 22709  	ADDSS (DX)(R8*4), X8
 22710  	MOVSS X8, (DX)(R8*4)
 22711  	ADDQ  BX, R8
 22712  	SUBQ  $0x08, SI
 22713  
 22714  check_limit_unroll:
 22715  	CMPQ SI, $0x08
 22716  	JHS  loop_unroll
 22717  	JMP  check_limit
 22718  
 22719  loop:
 22720  	MOVSS (AX)(DI*4), X1
 22721  	MULSS X0, X1
 22722  	ADDSS (DX)(R8*4), X1
 22723  	MOVSS X1, (DX)(R8*4)
 22724  	DECQ  SI
 22725  	ADDQ  CX, DI
 22726  	ADDQ  BX, R8
 22727  
 22728  check_limit:
 22729  	CMPQ SI, $0x00
 22730  	JHI  loop
 22731  	RET
 22732  
 22733  // func AmdAxpyUnsafeXInterleave_V3A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22734  // Requires: SSE
 22735  TEXT ·AmdAxpyUnsafeXInterleave_V3A12R8(SB), NOSPLIT, $0-48
 22736  	MOVSS alpha+0(FP), X0
 22737  	MOVQ  xs+8(FP), AX
 22738  	MOVQ  incx+16(FP), CX
 22739  	MOVQ  ys+24(FP), DX
 22740  	MOVQ  incy+32(FP), BX
 22741  	MOVQ  n+40(FP), SI
 22742  	XORQ  DI, DI
 22743  	XORQ  R8, R8
 22744  	JMP   check_limit_unroll
 22745  	PCALIGN $0x08
 22746  	NOP
 22747  	NOP
 22748  	NOP
 22749  	NOP
 22750  
 22751  loop_unroll:
 22752  	MOVSS (AX)(DI*4), X1
 22753  	ADDQ  CX, DI
 22754  	MOVSS (AX)(DI*4), X2
 22755  	ADDQ  CX, DI
 22756  	MOVSS (AX)(DI*4), X3
 22757  	ADDQ  CX, DI
 22758  	MOVSS (AX)(DI*4), X4
 22759  	ADDQ  CX, DI
 22760  	MOVSS (AX)(DI*4), X5
 22761  	ADDQ  CX, DI
 22762  	MOVSS (AX)(DI*4), X6
 22763  	ADDQ  CX, DI
 22764  	MOVSS (AX)(DI*4), X7
 22765  	ADDQ  CX, DI
 22766  	MOVSS (AX)(DI*4), X8
 22767  	ADDQ  CX, DI
 22768  	MULSS X0, X1
 22769  	MULSS X0, X2
 22770  	MULSS X0, X3
 22771  	MULSS X0, X4
 22772  	MULSS X0, X5
 22773  	MULSS X0, X6
 22774  	MULSS X0, X7
 22775  	MULSS X0, X8
 22776  	ADDSS (DX)(R8*4), X1
 22777  	MOVSS X1, (DX)(R8*4)
 22778  	ADDQ  BX, R8
 22779  	ADDSS (DX)(R8*4), X2
 22780  	MOVSS X2, (DX)(R8*4)
 22781  	ADDQ  BX, R8
 22782  	ADDSS (DX)(R8*4), X3
 22783  	MOVSS X3, (DX)(R8*4)
 22784  	ADDQ  BX, R8
 22785  	ADDSS (DX)(R8*4), X4
 22786  	MOVSS X4, (DX)(R8*4)
 22787  	ADDQ  BX, R8
 22788  	ADDSS (DX)(R8*4), X5
 22789  	MOVSS X5, (DX)(R8*4)
 22790  	ADDQ  BX, R8
 22791  	ADDSS (DX)(R8*4), X6
 22792  	MOVSS X6, (DX)(R8*4)
 22793  	ADDQ  BX, R8
 22794  	ADDSS (DX)(R8*4), X7
 22795  	MOVSS X7, (DX)(R8*4)
 22796  	ADDQ  BX, R8
 22797  	ADDSS (DX)(R8*4), X8
 22798  	MOVSS X8, (DX)(R8*4)
 22799  	ADDQ  BX, R8
 22800  	SUBQ  $0x08, SI
 22801  
 22802  check_limit_unroll:
 22803  	CMPQ SI, $0x08
 22804  	JHS  loop_unroll
 22805  	JMP  check_limit
 22806  
 22807  loop:
 22808  	MOVSS (AX)(DI*4), X1
 22809  	MULSS X0, X1
 22810  	ADDSS (DX)(R8*4), X1
 22811  	MOVSS X1, (DX)(R8*4)
 22812  	DECQ  SI
 22813  	ADDQ  CX, DI
 22814  	ADDQ  BX, R8
 22815  
 22816  check_limit:
 22817  	CMPQ SI, $0x00
 22818  	JHI  loop
 22819  	RET
 22820  
 22821  // func AmdAxpyUnsafeXInterleave_V4A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22822  // Requires: SSE
 22823  TEXT ·AmdAxpyUnsafeXInterleave_V4A12R8(SB), NOSPLIT, $0-48
 22824  	MOVSS alpha+0(FP), X0
 22825  	MOVQ  xs+8(FP), AX
 22826  	MOVQ  incx+16(FP), CX
 22827  	MOVQ  ys+24(FP), DX
 22828  	MOVQ  incy+32(FP), BX
 22829  	MOVQ  n+40(FP), SI
 22830  	XORQ  DI, DI
 22831  	XORQ  R8, R8
 22832  	JMP   check_limit_unroll
 22833  	PCALIGN $0x08
 22834  	NOP
 22835  	NOP
 22836  	NOP
 22837  	NOP
 22838  
 22839  loop_unroll:
 22840  	MOVSS (AX)(DI*4), X1
 22841  	ADDQ  CX, DI
 22842  	MOVSS (AX)(DI*4), X2
 22843  	ADDQ  CX, DI
 22844  	MOVSS (AX)(DI*4), X3
 22845  	ADDQ  CX, DI
 22846  	MOVSS (AX)(DI*4), X4
 22847  	ADDQ  CX, DI
 22848  	MOVSS (AX)(DI*4), X5
 22849  	ADDQ  CX, DI
 22850  	MOVSS (AX)(DI*4), X6
 22851  	ADDQ  CX, DI
 22852  	MOVSS (AX)(DI*4), X7
 22853  	ADDQ  CX, DI
 22854  	MOVSS (AX)(DI*4), X8
 22855  	ADDQ  CX, DI
 22856  	MULSS X0, X1
 22857  	MULSS X0, X2
 22858  	MULSS X0, X3
 22859  	MULSS X0, X4
 22860  	MULSS X0, X5
 22861  	MULSS X0, X6
 22862  	MULSS X0, X7
 22863  	MULSS X0, X8
 22864  	ADDSS (DX)(R8*4), X1
 22865  	MOVSS X1, (DX)(R8*4)
 22866  	ADDQ  BX, R8
 22867  	ADDSS (DX)(R8*4), X2
 22868  	MOVSS X2, (DX)(R8*4)
 22869  	ADDQ  BX, R8
 22870  	ADDSS (DX)(R8*4), X3
 22871  	MOVSS X3, (DX)(R8*4)
 22872  	ADDQ  BX, R8
 22873  	ADDSS (DX)(R8*4), X4
 22874  	MOVSS X4, (DX)(R8*4)
 22875  	ADDQ  BX, R8
 22876  	ADDSS (DX)(R8*4), X5
 22877  	MOVSS X5, (DX)(R8*4)
 22878  	ADDQ  BX, R8
 22879  	ADDSS (DX)(R8*4), X6
 22880  	MOVSS X6, (DX)(R8*4)
 22881  	ADDQ  BX, R8
 22882  	ADDSS (DX)(R8*4), X7
 22883  	MOVSS X7, (DX)(R8*4)
 22884  	ADDQ  BX, R8
 22885  	ADDSS (DX)(R8*4), X8
 22886  	MOVSS X8, (DX)(R8*4)
 22887  	ADDQ  BX, R8
 22888  	SUBQ  $0x08, SI
 22889  
 22890  check_limit_unroll:
 22891  	CMPQ SI, $0x08
 22892  	JHS  loop_unroll
 22893  	JMP  check_limit
 22894  
 22895  loop:
 22896  	MOVSS (AX)(DI*4), X1
 22897  	MULSS X0, X1
 22898  	ADDSS (DX)(R8*4), X1
 22899  	MOVSS X1, (DX)(R8*4)
 22900  	DECQ  SI
 22901  	ADDQ  CX, DI
 22902  	ADDQ  BX, R8
 22903  
 22904  check_limit:
 22905  	CMPQ SI, $0x00
 22906  	JHI  loop
 22907  	RET
 22908  
 22909  // func AmdAxpyUnsafeXInterleave_V5A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22910  // Requires: SSE
 22911  TEXT ·AmdAxpyUnsafeXInterleave_V5A12R8(SB), NOSPLIT, $0-48
 22912  	MOVSS alpha+0(FP), X0
 22913  	MOVQ  xs+8(FP), AX
 22914  	MOVQ  incx+16(FP), CX
 22915  	MOVQ  ys+24(FP), DX
 22916  	MOVQ  incy+32(FP), BX
 22917  	MOVQ  n+40(FP), SI
 22918  	XORQ  DI, DI
 22919  	XORQ  R8, R8
 22920  	JMP   check_limit_unroll
 22921  	PCALIGN $0x08
 22922  	NOP
 22923  	NOP
 22924  	NOP
 22925  	NOP
 22926  
 22927  loop_unroll:
 22928  	MOVSS (AX)(DI*4), X1
 22929  	ADDQ  CX, DI
 22930  	MOVSS (AX)(DI*4), X2
 22931  	ADDQ  CX, DI
 22932  	MOVSS (AX)(DI*4), X3
 22933  	ADDQ  CX, DI
 22934  	MOVSS (AX)(DI*4), X4
 22935  	ADDQ  CX, DI
 22936  	MOVSS (AX)(DI*4), X5
 22937  	ADDQ  CX, DI
 22938  	MOVSS (AX)(DI*4), X6
 22939  	ADDQ  CX, DI
 22940  	MOVSS (AX)(DI*4), X7
 22941  	ADDQ  CX, DI
 22942  	MOVSS (AX)(DI*4), X8
 22943  	ADDQ  CX, DI
 22944  	MULSS X0, X1
 22945  	MULSS X0, X2
 22946  	MULSS X0, X3
 22947  	MULSS X0, X4
 22948  	MULSS X0, X5
 22949  	MULSS X0, X6
 22950  	MULSS X0, X7
 22951  	MULSS X0, X8
 22952  	ADDSS (DX)(R8*4), X1
 22953  	MOVSS X1, (DX)(R8*4)
 22954  	ADDQ  BX, R8
 22955  	ADDSS (DX)(R8*4), X2
 22956  	MOVSS X2, (DX)(R8*4)
 22957  	ADDQ  BX, R8
 22958  	ADDSS (DX)(R8*4), X3
 22959  	MOVSS X3, (DX)(R8*4)
 22960  	ADDQ  BX, R8
 22961  	ADDSS (DX)(R8*4), X4
 22962  	MOVSS X4, (DX)(R8*4)
 22963  	ADDQ  BX, R8
 22964  	ADDSS (DX)(R8*4), X5
 22965  	MOVSS X5, (DX)(R8*4)
 22966  	ADDQ  BX, R8
 22967  	ADDSS (DX)(R8*4), X6
 22968  	MOVSS X6, (DX)(R8*4)
 22969  	ADDQ  BX, R8
 22970  	ADDSS (DX)(R8*4), X7
 22971  	MOVSS X7, (DX)(R8*4)
 22972  	ADDQ  BX, R8
 22973  	ADDSS (DX)(R8*4), X8
 22974  	MOVSS X8, (DX)(R8*4)
 22975  	ADDQ  BX, R8
 22976  	SUBQ  $0x08, SI
 22977  
 22978  check_limit_unroll:
 22979  	CMPQ SI, $0x08
 22980  	JHS  loop_unroll
 22981  	JMP  check_limit
 22982  
 22983  loop:
 22984  	MOVSS (AX)(DI*4), X1
 22985  	MULSS X0, X1
 22986  	ADDSS (DX)(R8*4), X1
 22987  	MOVSS X1, (DX)(R8*4)
 22988  	DECQ  SI
 22989  	ADDQ  CX, DI
 22990  	ADDQ  BX, R8
 22991  
 22992  check_limit:
 22993  	CMPQ SI, $0x00
 22994  	JHI  loop
 22995  	RET
 22996  
 22997  // func AmdAxpyUnsafeXInterleave_V0A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 22998  // Requires: SSE
 22999  TEXT ·AmdAxpyUnsafeXInterleave_V0A13R8(SB), NOSPLIT, $0-48
 23000  	MOVSS alpha+0(FP), X0
 23001  	MOVQ  xs+8(FP), AX
 23002  	MOVQ  incx+16(FP), CX
 23003  	MOVQ  ys+24(FP), DX
 23004  	MOVQ  incy+32(FP), BX
 23005  	MOVQ  n+40(FP), SI
 23006  	XORQ  DI, DI
 23007  	XORQ  R8, R8
 23008  	JMP   check_limit_unroll
 23009  	PCALIGN $0x08
 23010  	NOP
 23011  	NOP
 23012  	NOP
 23013  	NOP
 23014  	NOP
 23015  
 23016  loop_unroll:
 23017  	MOVSS (AX)(DI*4), X1
 23018  	ADDQ  CX, DI
 23019  	MOVSS (AX)(DI*4), X2
 23020  	ADDQ  CX, DI
 23021  	MOVSS (AX)(DI*4), X3
 23022  	ADDQ  CX, DI
 23023  	MOVSS (AX)(DI*4), X4
 23024  	ADDQ  CX, DI
 23025  	MOVSS (AX)(DI*4), X5
 23026  	ADDQ  CX, DI
 23027  	MOVSS (AX)(DI*4), X6
 23028  	ADDQ  CX, DI
 23029  	MOVSS (AX)(DI*4), X7
 23030  	ADDQ  CX, DI
 23031  	MOVSS (AX)(DI*4), X8
 23032  	ADDQ  CX, DI
 23033  	MULSS X0, X1
 23034  	MULSS X0, X2
 23035  	MULSS X0, X3
 23036  	MULSS X0, X4
 23037  	MULSS X0, X5
 23038  	MULSS X0, X6
 23039  	MULSS X0, X7
 23040  	MULSS X0, X8
 23041  	ADDSS (DX)(R8*4), X1
 23042  	MOVSS X1, (DX)(R8*4)
 23043  	ADDQ  BX, R8
 23044  	ADDSS (DX)(R8*4), X2
 23045  	MOVSS X2, (DX)(R8*4)
 23046  	ADDQ  BX, R8
 23047  	ADDSS (DX)(R8*4), X3
 23048  	MOVSS X3, (DX)(R8*4)
 23049  	ADDQ  BX, R8
 23050  	ADDSS (DX)(R8*4), X4
 23051  	MOVSS X4, (DX)(R8*4)
 23052  	ADDQ  BX, R8
 23053  	ADDSS (DX)(R8*4), X5
 23054  	MOVSS X5, (DX)(R8*4)
 23055  	ADDQ  BX, R8
 23056  	ADDSS (DX)(R8*4), X6
 23057  	MOVSS X6, (DX)(R8*4)
 23058  	ADDQ  BX, R8
 23059  	ADDSS (DX)(R8*4), X7
 23060  	MOVSS X7, (DX)(R8*4)
 23061  	ADDQ  BX, R8
 23062  	ADDSS (DX)(R8*4), X8
 23063  	MOVSS X8, (DX)(R8*4)
 23064  	ADDQ  BX, R8
 23065  	SUBQ  $0x08, SI
 23066  
 23067  check_limit_unroll:
 23068  	CMPQ SI, $0x08
 23069  	JHS  loop_unroll
 23070  	JMP  check_limit
 23071  
 23072  loop:
 23073  	MOVSS (AX)(DI*4), X1
 23074  	MULSS X0, X1
 23075  	ADDSS (DX)(R8*4), X1
 23076  	MOVSS X1, (DX)(R8*4)
 23077  	DECQ  SI
 23078  	ADDQ  CX, DI
 23079  	ADDQ  BX, R8
 23080  
 23081  check_limit:
 23082  	CMPQ SI, $0x00
 23083  	JHI  loop
 23084  	RET
 23085  
 23086  // func AmdAxpyUnsafeXInterleave_V1A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23087  // Requires: SSE
 23088  TEXT ·AmdAxpyUnsafeXInterleave_V1A13R8(SB), NOSPLIT, $0-48
 23089  	MOVSS alpha+0(FP), X0
 23090  	MOVQ  xs+8(FP), AX
 23091  	MOVQ  incx+16(FP), CX
 23092  	MOVQ  ys+24(FP), DX
 23093  	MOVQ  incy+32(FP), BX
 23094  	MOVQ  n+40(FP), SI
 23095  	XORQ  DI, DI
 23096  	XORQ  R8, R8
 23097  	JMP   check_limit_unroll
 23098  	PCALIGN $0x08
 23099  	NOP
 23100  	NOP
 23101  	NOP
 23102  	NOP
 23103  	NOP
 23104  
 23105  loop_unroll:
 23106  	MOVSS (AX)(DI*4), X1
 23107  	ADDQ  CX, DI
 23108  	MOVSS (AX)(DI*4), X2
 23109  	ADDQ  CX, DI
 23110  	MOVSS (AX)(DI*4), X3
 23111  	ADDQ  CX, DI
 23112  	MOVSS (AX)(DI*4), X4
 23113  	ADDQ  CX, DI
 23114  	MOVSS (AX)(DI*4), X5
 23115  	ADDQ  CX, DI
 23116  	MOVSS (AX)(DI*4), X6
 23117  	ADDQ  CX, DI
 23118  	MOVSS (AX)(DI*4), X7
 23119  	ADDQ  CX, DI
 23120  	MOVSS (AX)(DI*4), X8
 23121  	ADDQ  CX, DI
 23122  	MULSS X0, X1
 23123  	MULSS X0, X2
 23124  	MULSS X0, X3
 23125  	MULSS X0, X4
 23126  	MULSS X0, X5
 23127  	MULSS X0, X6
 23128  	MULSS X0, X7
 23129  	MULSS X0, X8
 23130  	ADDSS (DX)(R8*4), X1
 23131  	MOVSS X1, (DX)(R8*4)
 23132  	ADDQ  BX, R8
 23133  	ADDSS (DX)(R8*4), X2
 23134  	MOVSS X2, (DX)(R8*4)
 23135  	ADDQ  BX, R8
 23136  	ADDSS (DX)(R8*4), X3
 23137  	MOVSS X3, (DX)(R8*4)
 23138  	ADDQ  BX, R8
 23139  	ADDSS (DX)(R8*4), X4
 23140  	MOVSS X4, (DX)(R8*4)
 23141  	ADDQ  BX, R8
 23142  	ADDSS (DX)(R8*4), X5
 23143  	MOVSS X5, (DX)(R8*4)
 23144  	ADDQ  BX, R8
 23145  	ADDSS (DX)(R8*4), X6
 23146  	MOVSS X6, (DX)(R8*4)
 23147  	ADDQ  BX, R8
 23148  	ADDSS (DX)(R8*4), X7
 23149  	MOVSS X7, (DX)(R8*4)
 23150  	ADDQ  BX, R8
 23151  	ADDSS (DX)(R8*4), X8
 23152  	MOVSS X8, (DX)(R8*4)
 23153  	ADDQ  BX, R8
 23154  	SUBQ  $0x08, SI
 23155  
 23156  check_limit_unroll:
 23157  	CMPQ SI, $0x08
 23158  	JHS  loop_unroll
 23159  	JMP  check_limit
 23160  
 23161  loop:
 23162  	MOVSS (AX)(DI*4), X1
 23163  	MULSS X0, X1
 23164  	ADDSS (DX)(R8*4), X1
 23165  	MOVSS X1, (DX)(R8*4)
 23166  	DECQ  SI
 23167  	ADDQ  CX, DI
 23168  	ADDQ  BX, R8
 23169  
 23170  check_limit:
 23171  	CMPQ SI, $0x00
 23172  	JHI  loop
 23173  	RET
 23174  
 23175  // func AmdAxpyUnsafeXInterleave_V2A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23176  // Requires: SSE
 23177  TEXT ·AmdAxpyUnsafeXInterleave_V2A13R8(SB), NOSPLIT, $0-48
 23178  	MOVSS alpha+0(FP), X0
 23179  	MOVQ  xs+8(FP), AX
 23180  	MOVQ  incx+16(FP), CX
 23181  	MOVQ  ys+24(FP), DX
 23182  	MOVQ  incy+32(FP), BX
 23183  	MOVQ  n+40(FP), SI
 23184  	XORQ  DI, DI
 23185  	XORQ  R8, R8
 23186  	JMP   check_limit_unroll
 23187  	PCALIGN $0x08
 23188  	NOP
 23189  	NOP
 23190  	NOP
 23191  	NOP
 23192  	NOP
 23193  
 23194  loop_unroll:
 23195  	MOVSS (AX)(DI*4), X1
 23196  	ADDQ  CX, DI
 23197  	MOVSS (AX)(DI*4), X2
 23198  	ADDQ  CX, DI
 23199  	MOVSS (AX)(DI*4), X3
 23200  	ADDQ  CX, DI
 23201  	MOVSS (AX)(DI*4), X4
 23202  	ADDQ  CX, DI
 23203  	MOVSS (AX)(DI*4), X5
 23204  	ADDQ  CX, DI
 23205  	MOVSS (AX)(DI*4), X6
 23206  	ADDQ  CX, DI
 23207  	MOVSS (AX)(DI*4), X7
 23208  	ADDQ  CX, DI
 23209  	MOVSS (AX)(DI*4), X8
 23210  	ADDQ  CX, DI
 23211  	MULSS X0, X1
 23212  	MULSS X0, X2
 23213  	MULSS X0, X3
 23214  	MULSS X0, X4
 23215  	MULSS X0, X5
 23216  	MULSS X0, X6
 23217  	MULSS X0, X7
 23218  	MULSS X0, X8
 23219  	ADDSS (DX)(R8*4), X1
 23220  	MOVSS X1, (DX)(R8*4)
 23221  	ADDQ  BX, R8
 23222  	ADDSS (DX)(R8*4), X2
 23223  	MOVSS X2, (DX)(R8*4)
 23224  	ADDQ  BX, R8
 23225  	ADDSS (DX)(R8*4), X3
 23226  	MOVSS X3, (DX)(R8*4)
 23227  	ADDQ  BX, R8
 23228  	ADDSS (DX)(R8*4), X4
 23229  	MOVSS X4, (DX)(R8*4)
 23230  	ADDQ  BX, R8
 23231  	ADDSS (DX)(R8*4), X5
 23232  	MOVSS X5, (DX)(R8*4)
 23233  	ADDQ  BX, R8
 23234  	ADDSS (DX)(R8*4), X6
 23235  	MOVSS X6, (DX)(R8*4)
 23236  	ADDQ  BX, R8
 23237  	ADDSS (DX)(R8*4), X7
 23238  	MOVSS X7, (DX)(R8*4)
 23239  	ADDQ  BX, R8
 23240  	ADDSS (DX)(R8*4), X8
 23241  	MOVSS X8, (DX)(R8*4)
 23242  	ADDQ  BX, R8
 23243  	SUBQ  $0x08, SI
 23244  
 23245  check_limit_unroll:
 23246  	CMPQ SI, $0x08
 23247  	JHS  loop_unroll
 23248  	JMP  check_limit
 23249  
 23250  loop:
 23251  	MOVSS (AX)(DI*4), X1
 23252  	MULSS X0, X1
 23253  	ADDSS (DX)(R8*4), X1
 23254  	MOVSS X1, (DX)(R8*4)
 23255  	DECQ  SI
 23256  	ADDQ  CX, DI
 23257  	ADDQ  BX, R8
 23258  
 23259  check_limit:
 23260  	CMPQ SI, $0x00
 23261  	JHI  loop
 23262  	RET
 23263  
 23264  // func AmdAxpyUnsafeXInterleave_V3A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23265  // Requires: SSE
 23266  TEXT ·AmdAxpyUnsafeXInterleave_V3A13R8(SB), NOSPLIT, $0-48
 23267  	MOVSS alpha+0(FP), X0
 23268  	MOVQ  xs+8(FP), AX
 23269  	MOVQ  incx+16(FP), CX
 23270  	MOVQ  ys+24(FP), DX
 23271  	MOVQ  incy+32(FP), BX
 23272  	MOVQ  n+40(FP), SI
 23273  	XORQ  DI, DI
 23274  	XORQ  R8, R8
 23275  	JMP   check_limit_unroll
 23276  	PCALIGN $0x08
 23277  	NOP
 23278  	NOP
 23279  	NOP
 23280  	NOP
 23281  	NOP
 23282  
 23283  loop_unroll:
 23284  	MOVSS (AX)(DI*4), X1
 23285  	ADDQ  CX, DI
 23286  	MOVSS (AX)(DI*4), X2
 23287  	ADDQ  CX, DI
 23288  	MOVSS (AX)(DI*4), X3
 23289  	ADDQ  CX, DI
 23290  	MOVSS (AX)(DI*4), X4
 23291  	ADDQ  CX, DI
 23292  	MOVSS (AX)(DI*4), X5
 23293  	ADDQ  CX, DI
 23294  	MOVSS (AX)(DI*4), X6
 23295  	ADDQ  CX, DI
 23296  	MOVSS (AX)(DI*4), X7
 23297  	ADDQ  CX, DI
 23298  	MOVSS (AX)(DI*4), X8
 23299  	ADDQ  CX, DI
 23300  	MULSS X0, X1
 23301  	MULSS X0, X2
 23302  	MULSS X0, X3
 23303  	MULSS X0, X4
 23304  	MULSS X0, X5
 23305  	MULSS X0, X6
 23306  	MULSS X0, X7
 23307  	MULSS X0, X8
 23308  	ADDSS (DX)(R8*4), X1
 23309  	MOVSS X1, (DX)(R8*4)
 23310  	ADDQ  BX, R8
 23311  	ADDSS (DX)(R8*4), X2
 23312  	MOVSS X2, (DX)(R8*4)
 23313  	ADDQ  BX, R8
 23314  	ADDSS (DX)(R8*4), X3
 23315  	MOVSS X3, (DX)(R8*4)
 23316  	ADDQ  BX, R8
 23317  	ADDSS (DX)(R8*4), X4
 23318  	MOVSS X4, (DX)(R8*4)
 23319  	ADDQ  BX, R8
 23320  	ADDSS (DX)(R8*4), X5
 23321  	MOVSS X5, (DX)(R8*4)
 23322  	ADDQ  BX, R8
 23323  	ADDSS (DX)(R8*4), X6
 23324  	MOVSS X6, (DX)(R8*4)
 23325  	ADDQ  BX, R8
 23326  	ADDSS (DX)(R8*4), X7
 23327  	MOVSS X7, (DX)(R8*4)
 23328  	ADDQ  BX, R8
 23329  	ADDSS (DX)(R8*4), X8
 23330  	MOVSS X8, (DX)(R8*4)
 23331  	ADDQ  BX, R8
 23332  	SUBQ  $0x08, SI
 23333  
 23334  check_limit_unroll:
 23335  	CMPQ SI, $0x08
 23336  	JHS  loop_unroll
 23337  	JMP  check_limit
 23338  
 23339  loop:
 23340  	MOVSS (AX)(DI*4), X1
 23341  	MULSS X0, X1
 23342  	ADDSS (DX)(R8*4), X1
 23343  	MOVSS X1, (DX)(R8*4)
 23344  	DECQ  SI
 23345  	ADDQ  CX, DI
 23346  	ADDQ  BX, R8
 23347  
 23348  check_limit:
 23349  	CMPQ SI, $0x00
 23350  	JHI  loop
 23351  	RET
 23352  
 23353  // func AmdAxpyUnsafeXInterleave_V4A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23354  // Requires: SSE
 23355  TEXT ·AmdAxpyUnsafeXInterleave_V4A13R8(SB), NOSPLIT, $0-48
 23356  	MOVSS alpha+0(FP), X0
 23357  	MOVQ  xs+8(FP), AX
 23358  	MOVQ  incx+16(FP), CX
 23359  	MOVQ  ys+24(FP), DX
 23360  	MOVQ  incy+32(FP), BX
 23361  	MOVQ  n+40(FP), SI
 23362  	XORQ  DI, DI
 23363  	XORQ  R8, R8
 23364  	JMP   check_limit_unroll
 23365  	PCALIGN $0x08
 23366  	NOP
 23367  	NOP
 23368  	NOP
 23369  	NOP
 23370  	NOP
 23371  
 23372  loop_unroll:
 23373  	MOVSS (AX)(DI*4), X1
 23374  	ADDQ  CX, DI
 23375  	MOVSS (AX)(DI*4), X2
 23376  	ADDQ  CX, DI
 23377  	MOVSS (AX)(DI*4), X3
 23378  	ADDQ  CX, DI
 23379  	MOVSS (AX)(DI*4), X4
 23380  	ADDQ  CX, DI
 23381  	MOVSS (AX)(DI*4), X5
 23382  	ADDQ  CX, DI
 23383  	MOVSS (AX)(DI*4), X6
 23384  	ADDQ  CX, DI
 23385  	MOVSS (AX)(DI*4), X7
 23386  	ADDQ  CX, DI
 23387  	MOVSS (AX)(DI*4), X8
 23388  	ADDQ  CX, DI
 23389  	MULSS X0, X1
 23390  	MULSS X0, X2
 23391  	MULSS X0, X3
 23392  	MULSS X0, X4
 23393  	MULSS X0, X5
 23394  	MULSS X0, X6
 23395  	MULSS X0, X7
 23396  	MULSS X0, X8
 23397  	ADDSS (DX)(R8*4), X1
 23398  	MOVSS X1, (DX)(R8*4)
 23399  	ADDQ  BX, R8
 23400  	ADDSS (DX)(R8*4), X2
 23401  	MOVSS X2, (DX)(R8*4)
 23402  	ADDQ  BX, R8
 23403  	ADDSS (DX)(R8*4), X3
 23404  	MOVSS X3, (DX)(R8*4)
 23405  	ADDQ  BX, R8
 23406  	ADDSS (DX)(R8*4), X4
 23407  	MOVSS X4, (DX)(R8*4)
 23408  	ADDQ  BX, R8
 23409  	ADDSS (DX)(R8*4), X5
 23410  	MOVSS X5, (DX)(R8*4)
 23411  	ADDQ  BX, R8
 23412  	ADDSS (DX)(R8*4), X6
 23413  	MOVSS X6, (DX)(R8*4)
 23414  	ADDQ  BX, R8
 23415  	ADDSS (DX)(R8*4), X7
 23416  	MOVSS X7, (DX)(R8*4)
 23417  	ADDQ  BX, R8
 23418  	ADDSS (DX)(R8*4), X8
 23419  	MOVSS X8, (DX)(R8*4)
 23420  	ADDQ  BX, R8
 23421  	SUBQ  $0x08, SI
 23422  
 23423  check_limit_unroll:
 23424  	CMPQ SI, $0x08
 23425  	JHS  loop_unroll
 23426  	JMP  check_limit
 23427  
 23428  loop:
 23429  	MOVSS (AX)(DI*4), X1
 23430  	MULSS X0, X1
 23431  	ADDSS (DX)(R8*4), X1
 23432  	MOVSS X1, (DX)(R8*4)
 23433  	DECQ  SI
 23434  	ADDQ  CX, DI
 23435  	ADDQ  BX, R8
 23436  
 23437  check_limit:
 23438  	CMPQ SI, $0x00
 23439  	JHI  loop
 23440  	RET
 23441  
 23442  // func AmdAxpyUnsafeXInterleave_V5A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23443  // Requires: SSE
 23444  TEXT ·AmdAxpyUnsafeXInterleave_V5A13R8(SB), NOSPLIT, $0-48
 23445  	MOVSS alpha+0(FP), X0
 23446  	MOVQ  xs+8(FP), AX
 23447  	MOVQ  incx+16(FP), CX
 23448  	MOVQ  ys+24(FP), DX
 23449  	MOVQ  incy+32(FP), BX
 23450  	MOVQ  n+40(FP), SI
 23451  	XORQ  DI, DI
 23452  	XORQ  R8, R8
 23453  	JMP   check_limit_unroll
 23454  	PCALIGN $0x08
 23455  	NOP
 23456  	NOP
 23457  	NOP
 23458  	NOP
 23459  	NOP
 23460  
 23461  loop_unroll:
 23462  	MOVSS (AX)(DI*4), X1
 23463  	ADDQ  CX, DI
 23464  	MOVSS (AX)(DI*4), X2
 23465  	ADDQ  CX, DI
 23466  	MOVSS (AX)(DI*4), X3
 23467  	ADDQ  CX, DI
 23468  	MOVSS (AX)(DI*4), X4
 23469  	ADDQ  CX, DI
 23470  	MOVSS (AX)(DI*4), X5
 23471  	ADDQ  CX, DI
 23472  	MOVSS (AX)(DI*4), X6
 23473  	ADDQ  CX, DI
 23474  	MOVSS (AX)(DI*4), X7
 23475  	ADDQ  CX, DI
 23476  	MOVSS (AX)(DI*4), X8
 23477  	ADDQ  CX, DI
 23478  	MULSS X0, X1
 23479  	MULSS X0, X2
 23480  	MULSS X0, X3
 23481  	MULSS X0, X4
 23482  	MULSS X0, X5
 23483  	MULSS X0, X6
 23484  	MULSS X0, X7
 23485  	MULSS X0, X8
 23486  	ADDSS (DX)(R8*4), X1
 23487  	MOVSS X1, (DX)(R8*4)
 23488  	ADDQ  BX, R8
 23489  	ADDSS (DX)(R8*4), X2
 23490  	MOVSS X2, (DX)(R8*4)
 23491  	ADDQ  BX, R8
 23492  	ADDSS (DX)(R8*4), X3
 23493  	MOVSS X3, (DX)(R8*4)
 23494  	ADDQ  BX, R8
 23495  	ADDSS (DX)(R8*4), X4
 23496  	MOVSS X4, (DX)(R8*4)
 23497  	ADDQ  BX, R8
 23498  	ADDSS (DX)(R8*4), X5
 23499  	MOVSS X5, (DX)(R8*4)
 23500  	ADDQ  BX, R8
 23501  	ADDSS (DX)(R8*4), X6
 23502  	MOVSS X6, (DX)(R8*4)
 23503  	ADDQ  BX, R8
 23504  	ADDSS (DX)(R8*4), X7
 23505  	MOVSS X7, (DX)(R8*4)
 23506  	ADDQ  BX, R8
 23507  	ADDSS (DX)(R8*4), X8
 23508  	MOVSS X8, (DX)(R8*4)
 23509  	ADDQ  BX, R8
 23510  	SUBQ  $0x08, SI
 23511  
 23512  check_limit_unroll:
 23513  	CMPQ SI, $0x08
 23514  	JHS  loop_unroll
 23515  	JMP  check_limit
 23516  
 23517  loop:
 23518  	MOVSS (AX)(DI*4), X1
 23519  	MULSS X0, X1
 23520  	ADDSS (DX)(R8*4), X1
 23521  	MOVSS X1, (DX)(R8*4)
 23522  	DECQ  SI
 23523  	ADDQ  CX, DI
 23524  	ADDQ  BX, R8
 23525  
 23526  check_limit:
 23527  	CMPQ SI, $0x00
 23528  	JHI  loop
 23529  	RET
 23530  
 23531  // func AmdAxpyUnsafeXInterleave_V0A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23532  // Requires: SSE
 23533  TEXT ·AmdAxpyUnsafeXInterleave_V0A14R8(SB), NOSPLIT, $0-48
 23534  	MOVSS alpha+0(FP), X0
 23535  	MOVQ  xs+8(FP), AX
 23536  	MOVQ  incx+16(FP), CX
 23537  	MOVQ  ys+24(FP), DX
 23538  	MOVQ  incy+32(FP), BX
 23539  	MOVQ  n+40(FP), SI
 23540  	XORQ  DI, DI
 23541  	XORQ  R8, R8
 23542  	JMP   check_limit_unroll
 23543  	PCALIGN $0x08
 23544  	NOP
 23545  	NOP
 23546  	NOP
 23547  	NOP
 23548  	NOP
 23549  	NOP
 23550  
 23551  loop_unroll:
 23552  	MOVSS (AX)(DI*4), X1
 23553  	ADDQ  CX, DI
 23554  	MOVSS (AX)(DI*4), X2
 23555  	ADDQ  CX, DI
 23556  	MOVSS (AX)(DI*4), X3
 23557  	ADDQ  CX, DI
 23558  	MOVSS (AX)(DI*4), X4
 23559  	ADDQ  CX, DI
 23560  	MOVSS (AX)(DI*4), X5
 23561  	ADDQ  CX, DI
 23562  	MOVSS (AX)(DI*4), X6
 23563  	ADDQ  CX, DI
 23564  	MOVSS (AX)(DI*4), X7
 23565  	ADDQ  CX, DI
 23566  	MOVSS (AX)(DI*4), X8
 23567  	ADDQ  CX, DI
 23568  	MULSS X0, X1
 23569  	MULSS X0, X2
 23570  	MULSS X0, X3
 23571  	MULSS X0, X4
 23572  	MULSS X0, X5
 23573  	MULSS X0, X6
 23574  	MULSS X0, X7
 23575  	MULSS X0, X8
 23576  	ADDSS (DX)(R8*4), X1
 23577  	MOVSS X1, (DX)(R8*4)
 23578  	ADDQ  BX, R8
 23579  	ADDSS (DX)(R8*4), X2
 23580  	MOVSS X2, (DX)(R8*4)
 23581  	ADDQ  BX, R8
 23582  	ADDSS (DX)(R8*4), X3
 23583  	MOVSS X3, (DX)(R8*4)
 23584  	ADDQ  BX, R8
 23585  	ADDSS (DX)(R8*4), X4
 23586  	MOVSS X4, (DX)(R8*4)
 23587  	ADDQ  BX, R8
 23588  	ADDSS (DX)(R8*4), X5
 23589  	MOVSS X5, (DX)(R8*4)
 23590  	ADDQ  BX, R8
 23591  	ADDSS (DX)(R8*4), X6
 23592  	MOVSS X6, (DX)(R8*4)
 23593  	ADDQ  BX, R8
 23594  	ADDSS (DX)(R8*4), X7
 23595  	MOVSS X7, (DX)(R8*4)
 23596  	ADDQ  BX, R8
 23597  	ADDSS (DX)(R8*4), X8
 23598  	MOVSS X8, (DX)(R8*4)
 23599  	ADDQ  BX, R8
 23600  	SUBQ  $0x08, SI
 23601  
 23602  check_limit_unroll:
 23603  	CMPQ SI, $0x08
 23604  	JHS  loop_unroll
 23605  	JMP  check_limit
 23606  
 23607  loop:
 23608  	MOVSS (AX)(DI*4), X1
 23609  	MULSS X0, X1
 23610  	ADDSS (DX)(R8*4), X1
 23611  	MOVSS X1, (DX)(R8*4)
 23612  	DECQ  SI
 23613  	ADDQ  CX, DI
 23614  	ADDQ  BX, R8
 23615  
 23616  check_limit:
 23617  	CMPQ SI, $0x00
 23618  	JHI  loop
 23619  	RET
 23620  
 23621  // func AmdAxpyUnsafeXInterleave_V1A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23622  // Requires: SSE
 23623  TEXT ·AmdAxpyUnsafeXInterleave_V1A14R8(SB), NOSPLIT, $0-48
 23624  	MOVSS alpha+0(FP), X0
 23625  	MOVQ  xs+8(FP), AX
 23626  	MOVQ  incx+16(FP), CX
 23627  	MOVQ  ys+24(FP), DX
 23628  	MOVQ  incy+32(FP), BX
 23629  	MOVQ  n+40(FP), SI
 23630  	XORQ  DI, DI
 23631  	XORQ  R8, R8
 23632  	JMP   check_limit_unroll
 23633  	PCALIGN $0x08
 23634  	NOP
 23635  	NOP
 23636  	NOP
 23637  	NOP
 23638  	NOP
 23639  	NOP
 23640  
 23641  loop_unroll:
 23642  	MOVSS (AX)(DI*4), X1
 23643  	ADDQ  CX, DI
 23644  	MOVSS (AX)(DI*4), X2
 23645  	ADDQ  CX, DI
 23646  	MOVSS (AX)(DI*4), X3
 23647  	ADDQ  CX, DI
 23648  	MOVSS (AX)(DI*4), X4
 23649  	ADDQ  CX, DI
 23650  	MOVSS (AX)(DI*4), X5
 23651  	ADDQ  CX, DI
 23652  	MOVSS (AX)(DI*4), X6
 23653  	ADDQ  CX, DI
 23654  	MOVSS (AX)(DI*4), X7
 23655  	ADDQ  CX, DI
 23656  	MOVSS (AX)(DI*4), X8
 23657  	ADDQ  CX, DI
 23658  	MULSS X0, X1
 23659  	MULSS X0, X2
 23660  	MULSS X0, X3
 23661  	MULSS X0, X4
 23662  	MULSS X0, X5
 23663  	MULSS X0, X6
 23664  	MULSS X0, X7
 23665  	MULSS X0, X8
 23666  	ADDSS (DX)(R8*4), X1
 23667  	MOVSS X1, (DX)(R8*4)
 23668  	ADDQ  BX, R8
 23669  	ADDSS (DX)(R8*4), X2
 23670  	MOVSS X2, (DX)(R8*4)
 23671  	ADDQ  BX, R8
 23672  	ADDSS (DX)(R8*4), X3
 23673  	MOVSS X3, (DX)(R8*4)
 23674  	ADDQ  BX, R8
 23675  	ADDSS (DX)(R8*4), X4
 23676  	MOVSS X4, (DX)(R8*4)
 23677  	ADDQ  BX, R8
 23678  	ADDSS (DX)(R8*4), X5
 23679  	MOVSS X5, (DX)(R8*4)
 23680  	ADDQ  BX, R8
 23681  	ADDSS (DX)(R8*4), X6
 23682  	MOVSS X6, (DX)(R8*4)
 23683  	ADDQ  BX, R8
 23684  	ADDSS (DX)(R8*4), X7
 23685  	MOVSS X7, (DX)(R8*4)
 23686  	ADDQ  BX, R8
 23687  	ADDSS (DX)(R8*4), X8
 23688  	MOVSS X8, (DX)(R8*4)
 23689  	ADDQ  BX, R8
 23690  	SUBQ  $0x08, SI
 23691  
 23692  check_limit_unroll:
 23693  	CMPQ SI, $0x08
 23694  	JHS  loop_unroll
 23695  	JMP  check_limit
 23696  
 23697  loop:
 23698  	MOVSS (AX)(DI*4), X1
 23699  	MULSS X0, X1
 23700  	ADDSS (DX)(R8*4), X1
 23701  	MOVSS X1, (DX)(R8*4)
 23702  	DECQ  SI
 23703  	ADDQ  CX, DI
 23704  	ADDQ  BX, R8
 23705  
 23706  check_limit:
 23707  	CMPQ SI, $0x00
 23708  	JHI  loop
 23709  	RET
 23710  
 23711  // func AmdAxpyUnsafeXInterleave_V2A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23712  // Requires: SSE
 23713  TEXT ·AmdAxpyUnsafeXInterleave_V2A14R8(SB), NOSPLIT, $0-48
 23714  	MOVSS alpha+0(FP), X0
 23715  	MOVQ  xs+8(FP), AX
 23716  	MOVQ  incx+16(FP), CX
 23717  	MOVQ  ys+24(FP), DX
 23718  	MOVQ  incy+32(FP), BX
 23719  	MOVQ  n+40(FP), SI
 23720  	XORQ  DI, DI
 23721  	XORQ  R8, R8
 23722  	JMP   check_limit_unroll
 23723  	PCALIGN $0x08
 23724  	NOP
 23725  	NOP
 23726  	NOP
 23727  	NOP
 23728  	NOP
 23729  	NOP
 23730  
 23731  loop_unroll:
 23732  	MOVSS (AX)(DI*4), X1
 23733  	ADDQ  CX, DI
 23734  	MOVSS (AX)(DI*4), X2
 23735  	ADDQ  CX, DI
 23736  	MOVSS (AX)(DI*4), X3
 23737  	ADDQ  CX, DI
 23738  	MOVSS (AX)(DI*4), X4
 23739  	ADDQ  CX, DI
 23740  	MOVSS (AX)(DI*4), X5
 23741  	ADDQ  CX, DI
 23742  	MOVSS (AX)(DI*4), X6
 23743  	ADDQ  CX, DI
 23744  	MOVSS (AX)(DI*4), X7
 23745  	ADDQ  CX, DI
 23746  	MOVSS (AX)(DI*4), X8
 23747  	ADDQ  CX, DI
 23748  	MULSS X0, X1
 23749  	MULSS X0, X2
 23750  	MULSS X0, X3
 23751  	MULSS X0, X4
 23752  	MULSS X0, X5
 23753  	MULSS X0, X6
 23754  	MULSS X0, X7
 23755  	MULSS X0, X8
 23756  	ADDSS (DX)(R8*4), X1
 23757  	MOVSS X1, (DX)(R8*4)
 23758  	ADDQ  BX, R8
 23759  	ADDSS (DX)(R8*4), X2
 23760  	MOVSS X2, (DX)(R8*4)
 23761  	ADDQ  BX, R8
 23762  	ADDSS (DX)(R8*4), X3
 23763  	MOVSS X3, (DX)(R8*4)
 23764  	ADDQ  BX, R8
 23765  	ADDSS (DX)(R8*4), X4
 23766  	MOVSS X4, (DX)(R8*4)
 23767  	ADDQ  BX, R8
 23768  	ADDSS (DX)(R8*4), X5
 23769  	MOVSS X5, (DX)(R8*4)
 23770  	ADDQ  BX, R8
 23771  	ADDSS (DX)(R8*4), X6
 23772  	MOVSS X6, (DX)(R8*4)
 23773  	ADDQ  BX, R8
 23774  	ADDSS (DX)(R8*4), X7
 23775  	MOVSS X7, (DX)(R8*4)
 23776  	ADDQ  BX, R8
 23777  	ADDSS (DX)(R8*4), X8
 23778  	MOVSS X8, (DX)(R8*4)
 23779  	ADDQ  BX, R8
 23780  	SUBQ  $0x08, SI
 23781  
 23782  check_limit_unroll:
 23783  	CMPQ SI, $0x08
 23784  	JHS  loop_unroll
 23785  	JMP  check_limit
 23786  
 23787  loop:
 23788  	MOVSS (AX)(DI*4), X1
 23789  	MULSS X0, X1
 23790  	ADDSS (DX)(R8*4), X1
 23791  	MOVSS X1, (DX)(R8*4)
 23792  	DECQ  SI
 23793  	ADDQ  CX, DI
 23794  	ADDQ  BX, R8
 23795  
 23796  check_limit:
 23797  	CMPQ SI, $0x00
 23798  	JHI  loop
 23799  	RET
 23800  
 23801  // func AmdAxpyUnsafeXInterleave_V3A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23802  // Requires: SSE
 23803  TEXT ·AmdAxpyUnsafeXInterleave_V3A14R8(SB), NOSPLIT, $0-48
 23804  	MOVSS alpha+0(FP), X0
 23805  	MOVQ  xs+8(FP), AX
 23806  	MOVQ  incx+16(FP), CX
 23807  	MOVQ  ys+24(FP), DX
 23808  	MOVQ  incy+32(FP), BX
 23809  	MOVQ  n+40(FP), SI
 23810  	XORQ  DI, DI
 23811  	XORQ  R8, R8
 23812  	JMP   check_limit_unroll
 23813  	PCALIGN $0x08
 23814  	NOP
 23815  	NOP
 23816  	NOP
 23817  	NOP
 23818  	NOP
 23819  	NOP
 23820  
 23821  loop_unroll:
 23822  	MOVSS (AX)(DI*4), X1
 23823  	ADDQ  CX, DI
 23824  	MOVSS (AX)(DI*4), X2
 23825  	ADDQ  CX, DI
 23826  	MOVSS (AX)(DI*4), X3
 23827  	ADDQ  CX, DI
 23828  	MOVSS (AX)(DI*4), X4
 23829  	ADDQ  CX, DI
 23830  	MOVSS (AX)(DI*4), X5
 23831  	ADDQ  CX, DI
 23832  	MOVSS (AX)(DI*4), X6
 23833  	ADDQ  CX, DI
 23834  	MOVSS (AX)(DI*4), X7
 23835  	ADDQ  CX, DI
 23836  	MOVSS (AX)(DI*4), X8
 23837  	ADDQ  CX, DI
 23838  	MULSS X0, X1
 23839  	MULSS X0, X2
 23840  	MULSS X0, X3
 23841  	MULSS X0, X4
 23842  	MULSS X0, X5
 23843  	MULSS X0, X6
 23844  	MULSS X0, X7
 23845  	MULSS X0, X8
 23846  	ADDSS (DX)(R8*4), X1
 23847  	MOVSS X1, (DX)(R8*4)
 23848  	ADDQ  BX, R8
 23849  	ADDSS (DX)(R8*4), X2
 23850  	MOVSS X2, (DX)(R8*4)
 23851  	ADDQ  BX, R8
 23852  	ADDSS (DX)(R8*4), X3
 23853  	MOVSS X3, (DX)(R8*4)
 23854  	ADDQ  BX, R8
 23855  	ADDSS (DX)(R8*4), X4
 23856  	MOVSS X4, (DX)(R8*4)
 23857  	ADDQ  BX, R8
 23858  	ADDSS (DX)(R8*4), X5
 23859  	MOVSS X5, (DX)(R8*4)
 23860  	ADDQ  BX, R8
 23861  	ADDSS (DX)(R8*4), X6
 23862  	MOVSS X6, (DX)(R8*4)
 23863  	ADDQ  BX, R8
 23864  	ADDSS (DX)(R8*4), X7
 23865  	MOVSS X7, (DX)(R8*4)
 23866  	ADDQ  BX, R8
 23867  	ADDSS (DX)(R8*4), X8
 23868  	MOVSS X8, (DX)(R8*4)
 23869  	ADDQ  BX, R8
 23870  	SUBQ  $0x08, SI
 23871  
 23872  check_limit_unroll:
 23873  	CMPQ SI, $0x08
 23874  	JHS  loop_unroll
 23875  	JMP  check_limit
 23876  
 23877  loop:
 23878  	MOVSS (AX)(DI*4), X1
 23879  	MULSS X0, X1
 23880  	ADDSS (DX)(R8*4), X1
 23881  	MOVSS X1, (DX)(R8*4)
 23882  	DECQ  SI
 23883  	ADDQ  CX, DI
 23884  	ADDQ  BX, R8
 23885  
 23886  check_limit:
 23887  	CMPQ SI, $0x00
 23888  	JHI  loop
 23889  	RET
 23890  
 23891  // func AmdAxpyUnsafeXInterleave_V4A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23892  // Requires: SSE
 23893  TEXT ·AmdAxpyUnsafeXInterleave_V4A14R8(SB), NOSPLIT, $0-48
 23894  	MOVSS alpha+0(FP), X0
 23895  	MOVQ  xs+8(FP), AX
 23896  	MOVQ  incx+16(FP), CX
 23897  	MOVQ  ys+24(FP), DX
 23898  	MOVQ  incy+32(FP), BX
 23899  	MOVQ  n+40(FP), SI
 23900  	XORQ  DI, DI
 23901  	XORQ  R8, R8
 23902  	JMP   check_limit_unroll
 23903  	PCALIGN $0x08
 23904  	NOP
 23905  	NOP
 23906  	NOP
 23907  	NOP
 23908  	NOP
 23909  	NOP
 23910  
 23911  loop_unroll:
 23912  	MOVSS (AX)(DI*4), X1
 23913  	ADDQ  CX, DI
 23914  	MOVSS (AX)(DI*4), X2
 23915  	ADDQ  CX, DI
 23916  	MOVSS (AX)(DI*4), X3
 23917  	ADDQ  CX, DI
 23918  	MOVSS (AX)(DI*4), X4
 23919  	ADDQ  CX, DI
 23920  	MOVSS (AX)(DI*4), X5
 23921  	ADDQ  CX, DI
 23922  	MOVSS (AX)(DI*4), X6
 23923  	ADDQ  CX, DI
 23924  	MOVSS (AX)(DI*4), X7
 23925  	ADDQ  CX, DI
 23926  	MOVSS (AX)(DI*4), X8
 23927  	ADDQ  CX, DI
 23928  	MULSS X0, X1
 23929  	MULSS X0, X2
 23930  	MULSS X0, X3
 23931  	MULSS X0, X4
 23932  	MULSS X0, X5
 23933  	MULSS X0, X6
 23934  	MULSS X0, X7
 23935  	MULSS X0, X8
 23936  	ADDSS (DX)(R8*4), X1
 23937  	MOVSS X1, (DX)(R8*4)
 23938  	ADDQ  BX, R8
 23939  	ADDSS (DX)(R8*4), X2
 23940  	MOVSS X2, (DX)(R8*4)
 23941  	ADDQ  BX, R8
 23942  	ADDSS (DX)(R8*4), X3
 23943  	MOVSS X3, (DX)(R8*4)
 23944  	ADDQ  BX, R8
 23945  	ADDSS (DX)(R8*4), X4
 23946  	MOVSS X4, (DX)(R8*4)
 23947  	ADDQ  BX, R8
 23948  	ADDSS (DX)(R8*4), X5
 23949  	MOVSS X5, (DX)(R8*4)
 23950  	ADDQ  BX, R8
 23951  	ADDSS (DX)(R8*4), X6
 23952  	MOVSS X6, (DX)(R8*4)
 23953  	ADDQ  BX, R8
 23954  	ADDSS (DX)(R8*4), X7
 23955  	MOVSS X7, (DX)(R8*4)
 23956  	ADDQ  BX, R8
 23957  	ADDSS (DX)(R8*4), X8
 23958  	MOVSS X8, (DX)(R8*4)
 23959  	ADDQ  BX, R8
 23960  	SUBQ  $0x08, SI
 23961  
 23962  check_limit_unroll:
 23963  	CMPQ SI, $0x08
 23964  	JHS  loop_unroll
 23965  	JMP  check_limit
 23966  
 23967  loop:
 23968  	MOVSS (AX)(DI*4), X1
 23969  	MULSS X0, X1
 23970  	ADDSS (DX)(R8*4), X1
 23971  	MOVSS X1, (DX)(R8*4)
 23972  	DECQ  SI
 23973  	ADDQ  CX, DI
 23974  	ADDQ  BX, R8
 23975  
 23976  check_limit:
 23977  	CMPQ SI, $0x00
 23978  	JHI  loop
 23979  	RET
 23980  
 23981  // func AmdAxpyUnsafeXInterleave_V5A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 23982  // Requires: SSE
 23983  TEXT ·AmdAxpyUnsafeXInterleave_V5A14R8(SB), NOSPLIT, $0-48
 23984  	MOVSS alpha+0(FP), X0
 23985  	MOVQ  xs+8(FP), AX
 23986  	MOVQ  incx+16(FP), CX
 23987  	MOVQ  ys+24(FP), DX
 23988  	MOVQ  incy+32(FP), BX
 23989  	MOVQ  n+40(FP), SI
 23990  	XORQ  DI, DI
 23991  	XORQ  R8, R8
 23992  	JMP   check_limit_unroll
 23993  	PCALIGN $0x08
 23994  	NOP
 23995  	NOP
 23996  	NOP
 23997  	NOP
 23998  	NOP
 23999  	NOP
 24000  
 24001  loop_unroll:
 24002  	MOVSS (AX)(DI*4), X1
 24003  	ADDQ  CX, DI
 24004  	MOVSS (AX)(DI*4), X2
 24005  	ADDQ  CX, DI
 24006  	MOVSS (AX)(DI*4), X3
 24007  	ADDQ  CX, DI
 24008  	MOVSS (AX)(DI*4), X4
 24009  	ADDQ  CX, DI
 24010  	MOVSS (AX)(DI*4), X5
 24011  	ADDQ  CX, DI
 24012  	MOVSS (AX)(DI*4), X6
 24013  	ADDQ  CX, DI
 24014  	MOVSS (AX)(DI*4), X7
 24015  	ADDQ  CX, DI
 24016  	MOVSS (AX)(DI*4), X8
 24017  	ADDQ  CX, DI
 24018  	MULSS X0, X1
 24019  	MULSS X0, X2
 24020  	MULSS X0, X3
 24021  	MULSS X0, X4
 24022  	MULSS X0, X5
 24023  	MULSS X0, X6
 24024  	MULSS X0, X7
 24025  	MULSS X0, X8
 24026  	ADDSS (DX)(R8*4), X1
 24027  	MOVSS X1, (DX)(R8*4)
 24028  	ADDQ  BX, R8
 24029  	ADDSS (DX)(R8*4), X2
 24030  	MOVSS X2, (DX)(R8*4)
 24031  	ADDQ  BX, R8
 24032  	ADDSS (DX)(R8*4), X3
 24033  	MOVSS X3, (DX)(R8*4)
 24034  	ADDQ  BX, R8
 24035  	ADDSS (DX)(R8*4), X4
 24036  	MOVSS X4, (DX)(R8*4)
 24037  	ADDQ  BX, R8
 24038  	ADDSS (DX)(R8*4), X5
 24039  	MOVSS X5, (DX)(R8*4)
 24040  	ADDQ  BX, R8
 24041  	ADDSS (DX)(R8*4), X6
 24042  	MOVSS X6, (DX)(R8*4)
 24043  	ADDQ  BX, R8
 24044  	ADDSS (DX)(R8*4), X7
 24045  	MOVSS X7, (DX)(R8*4)
 24046  	ADDQ  BX, R8
 24047  	ADDSS (DX)(R8*4), X8
 24048  	MOVSS X8, (DX)(R8*4)
 24049  	ADDQ  BX, R8
 24050  	SUBQ  $0x08, SI
 24051  
 24052  check_limit_unroll:
 24053  	CMPQ SI, $0x08
 24054  	JHS  loop_unroll
 24055  	JMP  check_limit
 24056  
 24057  loop:
 24058  	MOVSS (AX)(DI*4), X1
 24059  	MULSS X0, X1
 24060  	ADDSS (DX)(R8*4), X1
 24061  	MOVSS X1, (DX)(R8*4)
 24062  	DECQ  SI
 24063  	ADDQ  CX, DI
 24064  	ADDQ  BX, R8
 24065  
 24066  check_limit:
 24067  	CMPQ SI, $0x00
 24068  	JHI  loop
 24069  	RET
 24070  
 24071  // func AmdAxpyUnsafeXInterleave_V0A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24072  // Requires: SSE
 24073  TEXT ·AmdAxpyUnsafeXInterleave_V0A15R8(SB), NOSPLIT, $0-48
 24074  	MOVSS alpha+0(FP), X0
 24075  	MOVQ  xs+8(FP), AX
 24076  	MOVQ  incx+16(FP), CX
 24077  	MOVQ  ys+24(FP), DX
 24078  	MOVQ  incy+32(FP), BX
 24079  	MOVQ  n+40(FP), SI
 24080  	XORQ  DI, DI
 24081  	XORQ  R8, R8
 24082  	JMP   check_limit_unroll
 24083  	PCALIGN $0x08
 24084  	NOP
 24085  	NOP
 24086  	NOP
 24087  	NOP
 24088  	NOP
 24089  	NOP
 24090  	NOP
 24091  
 24092  loop_unroll:
 24093  	MOVSS (AX)(DI*4), X1
 24094  	ADDQ  CX, DI
 24095  	MOVSS (AX)(DI*4), X2
 24096  	ADDQ  CX, DI
 24097  	MOVSS (AX)(DI*4), X3
 24098  	ADDQ  CX, DI
 24099  	MOVSS (AX)(DI*4), X4
 24100  	ADDQ  CX, DI
 24101  	MOVSS (AX)(DI*4), X5
 24102  	ADDQ  CX, DI
 24103  	MOVSS (AX)(DI*4), X6
 24104  	ADDQ  CX, DI
 24105  	MOVSS (AX)(DI*4), X7
 24106  	ADDQ  CX, DI
 24107  	MOVSS (AX)(DI*4), X8
 24108  	ADDQ  CX, DI
 24109  	MULSS X0, X1
 24110  	MULSS X0, X2
 24111  	MULSS X0, X3
 24112  	MULSS X0, X4
 24113  	MULSS X0, X5
 24114  	MULSS X0, X6
 24115  	MULSS X0, X7
 24116  	MULSS X0, X8
 24117  	ADDSS (DX)(R8*4), X1
 24118  	MOVSS X1, (DX)(R8*4)
 24119  	ADDQ  BX, R8
 24120  	ADDSS (DX)(R8*4), X2
 24121  	MOVSS X2, (DX)(R8*4)
 24122  	ADDQ  BX, R8
 24123  	ADDSS (DX)(R8*4), X3
 24124  	MOVSS X3, (DX)(R8*4)
 24125  	ADDQ  BX, R8
 24126  	ADDSS (DX)(R8*4), X4
 24127  	MOVSS X4, (DX)(R8*4)
 24128  	ADDQ  BX, R8
 24129  	ADDSS (DX)(R8*4), X5
 24130  	MOVSS X5, (DX)(R8*4)
 24131  	ADDQ  BX, R8
 24132  	ADDSS (DX)(R8*4), X6
 24133  	MOVSS X6, (DX)(R8*4)
 24134  	ADDQ  BX, R8
 24135  	ADDSS (DX)(R8*4), X7
 24136  	MOVSS X7, (DX)(R8*4)
 24137  	ADDQ  BX, R8
 24138  	ADDSS (DX)(R8*4), X8
 24139  	MOVSS X8, (DX)(R8*4)
 24140  	ADDQ  BX, R8
 24141  	SUBQ  $0x08, SI
 24142  
 24143  check_limit_unroll:
 24144  	CMPQ SI, $0x08
 24145  	JHS  loop_unroll
 24146  	JMP  check_limit
 24147  
 24148  loop:
 24149  	MOVSS (AX)(DI*4), X1
 24150  	MULSS X0, X1
 24151  	ADDSS (DX)(R8*4), X1
 24152  	MOVSS X1, (DX)(R8*4)
 24153  	DECQ  SI
 24154  	ADDQ  CX, DI
 24155  	ADDQ  BX, R8
 24156  
 24157  check_limit:
 24158  	CMPQ SI, $0x00
 24159  	JHI  loop
 24160  	RET
 24161  
 24162  // func AmdAxpyUnsafeXInterleave_V1A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24163  // Requires: SSE
 24164  TEXT ·AmdAxpyUnsafeXInterleave_V1A15R8(SB), NOSPLIT, $0-48
 24165  	MOVSS alpha+0(FP), X0
 24166  	MOVQ  xs+8(FP), AX
 24167  	MOVQ  incx+16(FP), CX
 24168  	MOVQ  ys+24(FP), DX
 24169  	MOVQ  incy+32(FP), BX
 24170  	MOVQ  n+40(FP), SI
 24171  	XORQ  DI, DI
 24172  	XORQ  R8, R8
 24173  	JMP   check_limit_unroll
 24174  	PCALIGN $0x08
 24175  	NOP
 24176  	NOP
 24177  	NOP
 24178  	NOP
 24179  	NOP
 24180  	NOP
 24181  	NOP
 24182  
 24183  loop_unroll:
 24184  	MOVSS (AX)(DI*4), X1
 24185  	ADDQ  CX, DI
 24186  	MOVSS (AX)(DI*4), X2
 24187  	ADDQ  CX, DI
 24188  	MOVSS (AX)(DI*4), X3
 24189  	ADDQ  CX, DI
 24190  	MOVSS (AX)(DI*4), X4
 24191  	ADDQ  CX, DI
 24192  	MOVSS (AX)(DI*4), X5
 24193  	ADDQ  CX, DI
 24194  	MOVSS (AX)(DI*4), X6
 24195  	ADDQ  CX, DI
 24196  	MOVSS (AX)(DI*4), X7
 24197  	ADDQ  CX, DI
 24198  	MOVSS (AX)(DI*4), X8
 24199  	ADDQ  CX, DI
 24200  	MULSS X0, X1
 24201  	MULSS X0, X2
 24202  	MULSS X0, X3
 24203  	MULSS X0, X4
 24204  	MULSS X0, X5
 24205  	MULSS X0, X6
 24206  	MULSS X0, X7
 24207  	MULSS X0, X8
 24208  	ADDSS (DX)(R8*4), X1
 24209  	MOVSS X1, (DX)(R8*4)
 24210  	ADDQ  BX, R8
 24211  	ADDSS (DX)(R8*4), X2
 24212  	MOVSS X2, (DX)(R8*4)
 24213  	ADDQ  BX, R8
 24214  	ADDSS (DX)(R8*4), X3
 24215  	MOVSS X3, (DX)(R8*4)
 24216  	ADDQ  BX, R8
 24217  	ADDSS (DX)(R8*4), X4
 24218  	MOVSS X4, (DX)(R8*4)
 24219  	ADDQ  BX, R8
 24220  	ADDSS (DX)(R8*4), X5
 24221  	MOVSS X5, (DX)(R8*4)
 24222  	ADDQ  BX, R8
 24223  	ADDSS (DX)(R8*4), X6
 24224  	MOVSS X6, (DX)(R8*4)
 24225  	ADDQ  BX, R8
 24226  	ADDSS (DX)(R8*4), X7
 24227  	MOVSS X7, (DX)(R8*4)
 24228  	ADDQ  BX, R8
 24229  	ADDSS (DX)(R8*4), X8
 24230  	MOVSS X8, (DX)(R8*4)
 24231  	ADDQ  BX, R8
 24232  	SUBQ  $0x08, SI
 24233  
 24234  check_limit_unroll:
 24235  	CMPQ SI, $0x08
 24236  	JHS  loop_unroll
 24237  	JMP  check_limit
 24238  
 24239  loop:
 24240  	MOVSS (AX)(DI*4), X1
 24241  	MULSS X0, X1
 24242  	ADDSS (DX)(R8*4), X1
 24243  	MOVSS X1, (DX)(R8*4)
 24244  	DECQ  SI
 24245  	ADDQ  CX, DI
 24246  	ADDQ  BX, R8
 24247  
 24248  check_limit:
 24249  	CMPQ SI, $0x00
 24250  	JHI  loop
 24251  	RET
 24252  
 24253  // func AmdAxpyUnsafeXInterleave_V2A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24254  // Requires: SSE
 24255  TEXT ·AmdAxpyUnsafeXInterleave_V2A15R8(SB), NOSPLIT, $0-48
 24256  	MOVSS alpha+0(FP), X0
 24257  	MOVQ  xs+8(FP), AX
 24258  	MOVQ  incx+16(FP), CX
 24259  	MOVQ  ys+24(FP), DX
 24260  	MOVQ  incy+32(FP), BX
 24261  	MOVQ  n+40(FP), SI
 24262  	XORQ  DI, DI
 24263  	XORQ  R8, R8
 24264  	JMP   check_limit_unroll
 24265  	PCALIGN $0x08
 24266  	NOP
 24267  	NOP
 24268  	NOP
 24269  	NOP
 24270  	NOP
 24271  	NOP
 24272  	NOP
 24273  
 24274  loop_unroll:
 24275  	MOVSS (AX)(DI*4), X1
 24276  	ADDQ  CX, DI
 24277  	MOVSS (AX)(DI*4), X2
 24278  	ADDQ  CX, DI
 24279  	MOVSS (AX)(DI*4), X3
 24280  	ADDQ  CX, DI
 24281  	MOVSS (AX)(DI*4), X4
 24282  	ADDQ  CX, DI
 24283  	MOVSS (AX)(DI*4), X5
 24284  	ADDQ  CX, DI
 24285  	MOVSS (AX)(DI*4), X6
 24286  	ADDQ  CX, DI
 24287  	MOVSS (AX)(DI*4), X7
 24288  	ADDQ  CX, DI
 24289  	MOVSS (AX)(DI*4), X8
 24290  	ADDQ  CX, DI
 24291  	MULSS X0, X1
 24292  	MULSS X0, X2
 24293  	MULSS X0, X3
 24294  	MULSS X0, X4
 24295  	MULSS X0, X5
 24296  	MULSS X0, X6
 24297  	MULSS X0, X7
 24298  	MULSS X0, X8
 24299  	ADDSS (DX)(R8*4), X1
 24300  	MOVSS X1, (DX)(R8*4)
 24301  	ADDQ  BX, R8
 24302  	ADDSS (DX)(R8*4), X2
 24303  	MOVSS X2, (DX)(R8*4)
 24304  	ADDQ  BX, R8
 24305  	ADDSS (DX)(R8*4), X3
 24306  	MOVSS X3, (DX)(R8*4)
 24307  	ADDQ  BX, R8
 24308  	ADDSS (DX)(R8*4), X4
 24309  	MOVSS X4, (DX)(R8*4)
 24310  	ADDQ  BX, R8
 24311  	ADDSS (DX)(R8*4), X5
 24312  	MOVSS X5, (DX)(R8*4)
 24313  	ADDQ  BX, R8
 24314  	ADDSS (DX)(R8*4), X6
 24315  	MOVSS X6, (DX)(R8*4)
 24316  	ADDQ  BX, R8
 24317  	ADDSS (DX)(R8*4), X7
 24318  	MOVSS X7, (DX)(R8*4)
 24319  	ADDQ  BX, R8
 24320  	ADDSS (DX)(R8*4), X8
 24321  	MOVSS X8, (DX)(R8*4)
 24322  	ADDQ  BX, R8
 24323  	SUBQ  $0x08, SI
 24324  
 24325  check_limit_unroll:
 24326  	CMPQ SI, $0x08
 24327  	JHS  loop_unroll
 24328  	JMP  check_limit
 24329  
 24330  loop:
 24331  	MOVSS (AX)(DI*4), X1
 24332  	MULSS X0, X1
 24333  	ADDSS (DX)(R8*4), X1
 24334  	MOVSS X1, (DX)(R8*4)
 24335  	DECQ  SI
 24336  	ADDQ  CX, DI
 24337  	ADDQ  BX, R8
 24338  
 24339  check_limit:
 24340  	CMPQ SI, $0x00
 24341  	JHI  loop
 24342  	RET
 24343  
 24344  // func AmdAxpyUnsafeXInterleave_V3A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24345  // Requires: SSE
 24346  TEXT ·AmdAxpyUnsafeXInterleave_V3A15R8(SB), NOSPLIT, $0-48
 24347  	MOVSS alpha+0(FP), X0
 24348  	MOVQ  xs+8(FP), AX
 24349  	MOVQ  incx+16(FP), CX
 24350  	MOVQ  ys+24(FP), DX
 24351  	MOVQ  incy+32(FP), BX
 24352  	MOVQ  n+40(FP), SI
 24353  	XORQ  DI, DI
 24354  	XORQ  R8, R8
 24355  	JMP   check_limit_unroll
 24356  	PCALIGN $0x08
 24357  	NOP
 24358  	NOP
 24359  	NOP
 24360  	NOP
 24361  	NOP
 24362  	NOP
 24363  	NOP
 24364  
 24365  loop_unroll:
 24366  	MOVSS (AX)(DI*4), X1
 24367  	ADDQ  CX, DI
 24368  	MOVSS (AX)(DI*4), X2
 24369  	ADDQ  CX, DI
 24370  	MOVSS (AX)(DI*4), X3
 24371  	ADDQ  CX, DI
 24372  	MOVSS (AX)(DI*4), X4
 24373  	ADDQ  CX, DI
 24374  	MOVSS (AX)(DI*4), X5
 24375  	ADDQ  CX, DI
 24376  	MOVSS (AX)(DI*4), X6
 24377  	ADDQ  CX, DI
 24378  	MOVSS (AX)(DI*4), X7
 24379  	ADDQ  CX, DI
 24380  	MOVSS (AX)(DI*4), X8
 24381  	ADDQ  CX, DI
 24382  	MULSS X0, X1
 24383  	MULSS X0, X2
 24384  	MULSS X0, X3
 24385  	MULSS X0, X4
 24386  	MULSS X0, X5
 24387  	MULSS X0, X6
 24388  	MULSS X0, X7
 24389  	MULSS X0, X8
 24390  	ADDSS (DX)(R8*4), X1
 24391  	MOVSS X1, (DX)(R8*4)
 24392  	ADDQ  BX, R8
 24393  	ADDSS (DX)(R8*4), X2
 24394  	MOVSS X2, (DX)(R8*4)
 24395  	ADDQ  BX, R8
 24396  	ADDSS (DX)(R8*4), X3
 24397  	MOVSS X3, (DX)(R8*4)
 24398  	ADDQ  BX, R8
 24399  	ADDSS (DX)(R8*4), X4
 24400  	MOVSS X4, (DX)(R8*4)
 24401  	ADDQ  BX, R8
 24402  	ADDSS (DX)(R8*4), X5
 24403  	MOVSS X5, (DX)(R8*4)
 24404  	ADDQ  BX, R8
 24405  	ADDSS (DX)(R8*4), X6
 24406  	MOVSS X6, (DX)(R8*4)
 24407  	ADDQ  BX, R8
 24408  	ADDSS (DX)(R8*4), X7
 24409  	MOVSS X7, (DX)(R8*4)
 24410  	ADDQ  BX, R8
 24411  	ADDSS (DX)(R8*4), X8
 24412  	MOVSS X8, (DX)(R8*4)
 24413  	ADDQ  BX, R8
 24414  	SUBQ  $0x08, SI
 24415  
 24416  check_limit_unroll:
 24417  	CMPQ SI, $0x08
 24418  	JHS  loop_unroll
 24419  	JMP  check_limit
 24420  
 24421  loop:
 24422  	MOVSS (AX)(DI*4), X1
 24423  	MULSS X0, X1
 24424  	ADDSS (DX)(R8*4), X1
 24425  	MOVSS X1, (DX)(R8*4)
 24426  	DECQ  SI
 24427  	ADDQ  CX, DI
 24428  	ADDQ  BX, R8
 24429  
 24430  check_limit:
 24431  	CMPQ SI, $0x00
 24432  	JHI  loop
 24433  	RET
 24434  
 24435  // func AmdAxpyUnsafeXInterleave_V4A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24436  // Requires: SSE
 24437  TEXT ·AmdAxpyUnsafeXInterleave_V4A15R8(SB), NOSPLIT, $0-48
 24438  	MOVSS alpha+0(FP), X0
 24439  	MOVQ  xs+8(FP), AX
 24440  	MOVQ  incx+16(FP), CX
 24441  	MOVQ  ys+24(FP), DX
 24442  	MOVQ  incy+32(FP), BX
 24443  	MOVQ  n+40(FP), SI
 24444  	XORQ  DI, DI
 24445  	XORQ  R8, R8
 24446  	JMP   check_limit_unroll
 24447  	PCALIGN $0x08
 24448  	NOP
 24449  	NOP
 24450  	NOP
 24451  	NOP
 24452  	NOP
 24453  	NOP
 24454  	NOP
 24455  
 24456  loop_unroll:
 24457  	MOVSS (AX)(DI*4), X1
 24458  	ADDQ  CX, DI
 24459  	MOVSS (AX)(DI*4), X2
 24460  	ADDQ  CX, DI
 24461  	MOVSS (AX)(DI*4), X3
 24462  	ADDQ  CX, DI
 24463  	MOVSS (AX)(DI*4), X4
 24464  	ADDQ  CX, DI
 24465  	MOVSS (AX)(DI*4), X5
 24466  	ADDQ  CX, DI
 24467  	MOVSS (AX)(DI*4), X6
 24468  	ADDQ  CX, DI
 24469  	MOVSS (AX)(DI*4), X7
 24470  	ADDQ  CX, DI
 24471  	MOVSS (AX)(DI*4), X8
 24472  	ADDQ  CX, DI
 24473  	MULSS X0, X1
 24474  	MULSS X0, X2
 24475  	MULSS X0, X3
 24476  	MULSS X0, X4
 24477  	MULSS X0, X5
 24478  	MULSS X0, X6
 24479  	MULSS X0, X7
 24480  	MULSS X0, X8
 24481  	ADDSS (DX)(R8*4), X1
 24482  	MOVSS X1, (DX)(R8*4)
 24483  	ADDQ  BX, R8
 24484  	ADDSS (DX)(R8*4), X2
 24485  	MOVSS X2, (DX)(R8*4)
 24486  	ADDQ  BX, R8
 24487  	ADDSS (DX)(R8*4), X3
 24488  	MOVSS X3, (DX)(R8*4)
 24489  	ADDQ  BX, R8
 24490  	ADDSS (DX)(R8*4), X4
 24491  	MOVSS X4, (DX)(R8*4)
 24492  	ADDQ  BX, R8
 24493  	ADDSS (DX)(R8*4), X5
 24494  	MOVSS X5, (DX)(R8*4)
 24495  	ADDQ  BX, R8
 24496  	ADDSS (DX)(R8*4), X6
 24497  	MOVSS X6, (DX)(R8*4)
 24498  	ADDQ  BX, R8
 24499  	ADDSS (DX)(R8*4), X7
 24500  	MOVSS X7, (DX)(R8*4)
 24501  	ADDQ  BX, R8
 24502  	ADDSS (DX)(R8*4), X8
 24503  	MOVSS X8, (DX)(R8*4)
 24504  	ADDQ  BX, R8
 24505  	SUBQ  $0x08, SI
 24506  
 24507  check_limit_unroll:
 24508  	CMPQ SI, $0x08
 24509  	JHS  loop_unroll
 24510  	JMP  check_limit
 24511  
 24512  loop:
 24513  	MOVSS (AX)(DI*4), X1
 24514  	MULSS X0, X1
 24515  	ADDSS (DX)(R8*4), X1
 24516  	MOVSS X1, (DX)(R8*4)
 24517  	DECQ  SI
 24518  	ADDQ  CX, DI
 24519  	ADDQ  BX, R8
 24520  
 24521  check_limit:
 24522  	CMPQ SI, $0x00
 24523  	JHI  loop
 24524  	RET
 24525  
 24526  // func AmdAxpyUnsafeXInterleave_V5A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24527  // Requires: SSE
 24528  TEXT ·AmdAxpyUnsafeXInterleave_V5A15R8(SB), NOSPLIT, $0-48
 24529  	MOVSS alpha+0(FP), X0
 24530  	MOVQ  xs+8(FP), AX
 24531  	MOVQ  incx+16(FP), CX
 24532  	MOVQ  ys+24(FP), DX
 24533  	MOVQ  incy+32(FP), BX
 24534  	MOVQ  n+40(FP), SI
 24535  	XORQ  DI, DI
 24536  	XORQ  R8, R8
 24537  	JMP   check_limit_unroll
 24538  	PCALIGN $0x08
 24539  	NOP
 24540  	NOP
 24541  	NOP
 24542  	NOP
 24543  	NOP
 24544  	NOP
 24545  	NOP
 24546  
 24547  loop_unroll:
 24548  	MOVSS (AX)(DI*4), X1
 24549  	ADDQ  CX, DI
 24550  	MOVSS (AX)(DI*4), X2
 24551  	ADDQ  CX, DI
 24552  	MOVSS (AX)(DI*4), X3
 24553  	ADDQ  CX, DI
 24554  	MOVSS (AX)(DI*4), X4
 24555  	ADDQ  CX, DI
 24556  	MOVSS (AX)(DI*4), X5
 24557  	ADDQ  CX, DI
 24558  	MOVSS (AX)(DI*4), X6
 24559  	ADDQ  CX, DI
 24560  	MOVSS (AX)(DI*4), X7
 24561  	ADDQ  CX, DI
 24562  	MOVSS (AX)(DI*4), X8
 24563  	ADDQ  CX, DI
 24564  	MULSS X0, X1
 24565  	MULSS X0, X2
 24566  	MULSS X0, X3
 24567  	MULSS X0, X4
 24568  	MULSS X0, X5
 24569  	MULSS X0, X6
 24570  	MULSS X0, X7
 24571  	MULSS X0, X8
 24572  	ADDSS (DX)(R8*4), X1
 24573  	MOVSS X1, (DX)(R8*4)
 24574  	ADDQ  BX, R8
 24575  	ADDSS (DX)(R8*4), X2
 24576  	MOVSS X2, (DX)(R8*4)
 24577  	ADDQ  BX, R8
 24578  	ADDSS (DX)(R8*4), X3
 24579  	MOVSS X3, (DX)(R8*4)
 24580  	ADDQ  BX, R8
 24581  	ADDSS (DX)(R8*4), X4
 24582  	MOVSS X4, (DX)(R8*4)
 24583  	ADDQ  BX, R8
 24584  	ADDSS (DX)(R8*4), X5
 24585  	MOVSS X5, (DX)(R8*4)
 24586  	ADDQ  BX, R8
 24587  	ADDSS (DX)(R8*4), X6
 24588  	MOVSS X6, (DX)(R8*4)
 24589  	ADDQ  BX, R8
 24590  	ADDSS (DX)(R8*4), X7
 24591  	MOVSS X7, (DX)(R8*4)
 24592  	ADDQ  BX, R8
 24593  	ADDSS (DX)(R8*4), X8
 24594  	MOVSS X8, (DX)(R8*4)
 24595  	ADDQ  BX, R8
 24596  	SUBQ  $0x08, SI
 24597  
 24598  check_limit_unroll:
 24599  	CMPQ SI, $0x08
 24600  	JHS  loop_unroll
 24601  	JMP  check_limit
 24602  
 24603  loop:
 24604  	MOVSS (AX)(DI*4), X1
 24605  	MULSS X0, X1
 24606  	ADDSS (DX)(R8*4), X1
 24607  	MOVSS X1, (DX)(R8*4)
 24608  	DECQ  SI
 24609  	ADDQ  CX, DI
 24610  	ADDQ  BX, R8
 24611  
 24612  check_limit:
 24613  	CMPQ SI, $0x00
 24614  	JHI  loop
 24615  	RET
 24616  
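// Editorial note: the *A16R8 variants that follow use the same 8-way unrolled body as
// the A15 variants above; the only difference is the padding before loop_unroll, which
// here is a single PCALIGN $0x10 (16-byte alignment) with no trailing NOPs.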
 24617  // func AmdAxpyUnsafeXInterleave_V0A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24618  // Requires: SSE
 24619  TEXT ·AmdAxpyUnsafeXInterleave_V0A16R8(SB), NOSPLIT, $0-48
 24620  	MOVSS alpha+0(FP), X0
 24621  	MOVQ  xs+8(FP), AX
 24622  	MOVQ  incx+16(FP), CX
 24623  	MOVQ  ys+24(FP), DX
 24624  	MOVQ  incy+32(FP), BX
 24625  	MOVQ  n+40(FP), SI
 24626  	XORQ  DI, DI
 24627  	XORQ  R8, R8
 24628  	JMP   check_limit_unroll
 24629  	PCALIGN $0x10
 24630  
 24631  loop_unroll:
 24632  	MOVSS (AX)(DI*4), X1
 24633  	ADDQ  CX, DI
 24634  	MOVSS (AX)(DI*4), X2
 24635  	ADDQ  CX, DI
 24636  	MOVSS (AX)(DI*4), X3
 24637  	ADDQ  CX, DI
 24638  	MOVSS (AX)(DI*4), X4
 24639  	ADDQ  CX, DI
 24640  	MOVSS (AX)(DI*4), X5
 24641  	ADDQ  CX, DI
 24642  	MOVSS (AX)(DI*4), X6
 24643  	ADDQ  CX, DI
 24644  	MOVSS (AX)(DI*4), X7
 24645  	ADDQ  CX, DI
 24646  	MOVSS (AX)(DI*4), X8
 24647  	ADDQ  CX, DI
 24648  	MULSS X0, X1
 24649  	MULSS X0, X2
 24650  	MULSS X0, X3
 24651  	MULSS X0, X4
 24652  	MULSS X0, X5
 24653  	MULSS X0, X6
 24654  	MULSS X0, X7
 24655  	MULSS X0, X8
 24656  	ADDSS (DX)(R8*4), X1
 24657  	MOVSS X1, (DX)(R8*4)
 24658  	ADDQ  BX, R8
 24659  	ADDSS (DX)(R8*4), X2
 24660  	MOVSS X2, (DX)(R8*4)
 24661  	ADDQ  BX, R8
 24662  	ADDSS (DX)(R8*4), X3
 24663  	MOVSS X3, (DX)(R8*4)
 24664  	ADDQ  BX, R8
 24665  	ADDSS (DX)(R8*4), X4
 24666  	MOVSS X4, (DX)(R8*4)
 24667  	ADDQ  BX, R8
 24668  	ADDSS (DX)(R8*4), X5
 24669  	MOVSS X5, (DX)(R8*4)
 24670  	ADDQ  BX, R8
 24671  	ADDSS (DX)(R8*4), X6
 24672  	MOVSS X6, (DX)(R8*4)
 24673  	ADDQ  BX, R8
 24674  	ADDSS (DX)(R8*4), X7
 24675  	MOVSS X7, (DX)(R8*4)
 24676  	ADDQ  BX, R8
 24677  	ADDSS (DX)(R8*4), X8
 24678  	MOVSS X8, (DX)(R8*4)
 24679  	ADDQ  BX, R8
 24680  	SUBQ  $0x08, SI
 24681  
 24682  check_limit_unroll:
 24683  	CMPQ SI, $0x08
 24684  	JHS  loop_unroll
 24685  	JMP  check_limit
 24686  
 24687  loop:
 24688  	MOVSS (AX)(DI*4), X1
 24689  	MULSS X0, X1
 24690  	ADDSS (DX)(R8*4), X1
 24691  	MOVSS X1, (DX)(R8*4)
 24692  	DECQ  SI
 24693  	ADDQ  CX, DI
 24694  	ADDQ  BX, R8
 24695  
 24696  check_limit:
 24697  	CMPQ SI, $0x00
 24698  	JHI  loop
 24699  	RET
 24700  
 24701  // func AmdAxpyUnsafeXInterleave_V1A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24702  // Requires: SSE
 24703  TEXT ·AmdAxpyUnsafeXInterleave_V1A16R8(SB), NOSPLIT, $0-48
 24704  	MOVSS alpha+0(FP), X0
 24705  	MOVQ  xs+8(FP), AX
 24706  	MOVQ  incx+16(FP), CX
 24707  	MOVQ  ys+24(FP), DX
 24708  	MOVQ  incy+32(FP), BX
 24709  	MOVQ  n+40(FP), SI
 24710  	XORQ  DI, DI
 24711  	XORQ  R8, R8
 24712  	JMP   check_limit_unroll
 24713  	PCALIGN $0x10
 24714  
 24715  loop_unroll:
 24716  	MOVSS (AX)(DI*4), X1
 24717  	ADDQ  CX, DI
 24718  	MOVSS (AX)(DI*4), X2
 24719  	ADDQ  CX, DI
 24720  	MOVSS (AX)(DI*4), X3
 24721  	ADDQ  CX, DI
 24722  	MOVSS (AX)(DI*4), X4
 24723  	ADDQ  CX, DI
 24724  	MOVSS (AX)(DI*4), X5
 24725  	ADDQ  CX, DI
 24726  	MOVSS (AX)(DI*4), X6
 24727  	ADDQ  CX, DI
 24728  	MOVSS (AX)(DI*4), X7
 24729  	ADDQ  CX, DI
 24730  	MOVSS (AX)(DI*4), X8
 24731  	ADDQ  CX, DI
 24732  	MULSS X0, X1
 24733  	MULSS X0, X2
 24734  	MULSS X0, X3
 24735  	MULSS X0, X4
 24736  	MULSS X0, X5
 24737  	MULSS X0, X6
 24738  	MULSS X0, X7
 24739  	MULSS X0, X8
 24740  	ADDSS (DX)(R8*4), X1
 24741  	MOVSS X1, (DX)(R8*4)
 24742  	ADDQ  BX, R8
 24743  	ADDSS (DX)(R8*4), X2
 24744  	MOVSS X2, (DX)(R8*4)
 24745  	ADDQ  BX, R8
 24746  	ADDSS (DX)(R8*4), X3
 24747  	MOVSS X3, (DX)(R8*4)
 24748  	ADDQ  BX, R8
 24749  	ADDSS (DX)(R8*4), X4
 24750  	MOVSS X4, (DX)(R8*4)
 24751  	ADDQ  BX, R8
 24752  	ADDSS (DX)(R8*4), X5
 24753  	MOVSS X5, (DX)(R8*4)
 24754  	ADDQ  BX, R8
 24755  	ADDSS (DX)(R8*4), X6
 24756  	MOVSS X6, (DX)(R8*4)
 24757  	ADDQ  BX, R8
 24758  	ADDSS (DX)(R8*4), X7
 24759  	MOVSS X7, (DX)(R8*4)
 24760  	ADDQ  BX, R8
 24761  	ADDSS (DX)(R8*4), X8
 24762  	MOVSS X8, (DX)(R8*4)
 24763  	ADDQ  BX, R8
 24764  	SUBQ  $0x08, SI
 24765  
 24766  check_limit_unroll:
 24767  	CMPQ SI, $0x08
 24768  	JHS  loop_unroll
 24769  	JMP  check_limit
 24770  
 24771  loop:
 24772  	MOVSS (AX)(DI*4), X1
 24773  	MULSS X0, X1
 24774  	ADDSS (DX)(R8*4), X1
 24775  	MOVSS X1, (DX)(R8*4)
 24776  	DECQ  SI
 24777  	ADDQ  CX, DI
 24778  	ADDQ  BX, R8
 24779  
 24780  check_limit:
 24781  	CMPQ SI, $0x00
 24782  	JHI  loop
 24783  	RET
 24784  
 24785  // func AmdAxpyUnsafeXInterleave_V2A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24786  // Requires: SSE
 24787  TEXT ·AmdAxpyUnsafeXInterleave_V2A16R8(SB), NOSPLIT, $0-48
 24788  	MOVSS alpha+0(FP), X0
 24789  	MOVQ  xs+8(FP), AX
 24790  	MOVQ  incx+16(FP), CX
 24791  	MOVQ  ys+24(FP), DX
 24792  	MOVQ  incy+32(FP), BX
 24793  	MOVQ  n+40(FP), SI
 24794  	XORQ  DI, DI
 24795  	XORQ  R8, R8
 24796  	JMP   check_limit_unroll
 24797  	PCALIGN $0x10
 24798  
 24799  loop_unroll:
 24800  	MOVSS (AX)(DI*4), X1
 24801  	ADDQ  CX, DI
 24802  	MOVSS (AX)(DI*4), X2
 24803  	ADDQ  CX, DI
 24804  	MOVSS (AX)(DI*4), X3
 24805  	ADDQ  CX, DI
 24806  	MOVSS (AX)(DI*4), X4
 24807  	ADDQ  CX, DI
 24808  	MOVSS (AX)(DI*4), X5
 24809  	ADDQ  CX, DI
 24810  	MOVSS (AX)(DI*4), X6
 24811  	ADDQ  CX, DI
 24812  	MOVSS (AX)(DI*4), X7
 24813  	ADDQ  CX, DI
 24814  	MOVSS (AX)(DI*4), X8
 24815  	ADDQ  CX, DI
 24816  	MULSS X0, X1
 24817  	MULSS X0, X2
 24818  	MULSS X0, X3
 24819  	MULSS X0, X4
 24820  	MULSS X0, X5
 24821  	MULSS X0, X6
 24822  	MULSS X0, X7
 24823  	MULSS X0, X8
 24824  	ADDSS (DX)(R8*4), X1
 24825  	MOVSS X1, (DX)(R8*4)
 24826  	ADDQ  BX, R8
 24827  	ADDSS (DX)(R8*4), X2
 24828  	MOVSS X2, (DX)(R8*4)
 24829  	ADDQ  BX, R8
 24830  	ADDSS (DX)(R8*4), X3
 24831  	MOVSS X3, (DX)(R8*4)
 24832  	ADDQ  BX, R8
 24833  	ADDSS (DX)(R8*4), X4
 24834  	MOVSS X4, (DX)(R8*4)
 24835  	ADDQ  BX, R8
 24836  	ADDSS (DX)(R8*4), X5
 24837  	MOVSS X5, (DX)(R8*4)
 24838  	ADDQ  BX, R8
 24839  	ADDSS (DX)(R8*4), X6
 24840  	MOVSS X6, (DX)(R8*4)
 24841  	ADDQ  BX, R8
 24842  	ADDSS (DX)(R8*4), X7
 24843  	MOVSS X7, (DX)(R8*4)
 24844  	ADDQ  BX, R8
 24845  	ADDSS (DX)(R8*4), X8
 24846  	MOVSS X8, (DX)(R8*4)
 24847  	ADDQ  BX, R8
 24848  	SUBQ  $0x08, SI
 24849  
 24850  check_limit_unroll:
 24851  	CMPQ SI, $0x08
 24852  	JHS  loop_unroll
 24853  	JMP  check_limit
 24854  
 24855  loop:
 24856  	MOVSS (AX)(DI*4), X1
 24857  	MULSS X0, X1
 24858  	ADDSS (DX)(R8*4), X1
 24859  	MOVSS X1, (DX)(R8*4)
 24860  	DECQ  SI
 24861  	ADDQ  CX, DI
 24862  	ADDQ  BX, R8
 24863  
 24864  check_limit:
 24865  	CMPQ SI, $0x00
 24866  	JHI  loop
 24867  	RET
 24868  
 24869  // func AmdAxpyUnsafeXInterleave_V3A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24870  // Requires: SSE
 24871  TEXT ·AmdAxpyUnsafeXInterleave_V3A16R8(SB), NOSPLIT, $0-48
 24872  	MOVSS alpha+0(FP), X0
 24873  	MOVQ  xs+8(FP), AX
 24874  	MOVQ  incx+16(FP), CX
 24875  	MOVQ  ys+24(FP), DX
 24876  	MOVQ  incy+32(FP), BX
 24877  	MOVQ  n+40(FP), SI
 24878  	XORQ  DI, DI
 24879  	XORQ  R8, R8
 24880  	JMP   check_limit_unroll
 24881  	PCALIGN $0x10
 24882  
 24883  loop_unroll:
 24884  	MOVSS (AX)(DI*4), X1
 24885  	ADDQ  CX, DI
 24886  	MOVSS (AX)(DI*4), X2
 24887  	ADDQ  CX, DI
 24888  	MOVSS (AX)(DI*4), X3
 24889  	ADDQ  CX, DI
 24890  	MOVSS (AX)(DI*4), X4
 24891  	ADDQ  CX, DI
 24892  	MOVSS (AX)(DI*4), X5
 24893  	ADDQ  CX, DI
 24894  	MOVSS (AX)(DI*4), X6
 24895  	ADDQ  CX, DI
 24896  	MOVSS (AX)(DI*4), X7
 24897  	ADDQ  CX, DI
 24898  	MOVSS (AX)(DI*4), X8
 24899  	ADDQ  CX, DI
 24900  	MULSS X0, X1
 24901  	MULSS X0, X2
 24902  	MULSS X0, X3
 24903  	MULSS X0, X4
 24904  	MULSS X0, X5
 24905  	MULSS X0, X6
 24906  	MULSS X0, X7
 24907  	MULSS X0, X8
 24908  	ADDSS (DX)(R8*4), X1
 24909  	MOVSS X1, (DX)(R8*4)
 24910  	ADDQ  BX, R8
 24911  	ADDSS (DX)(R8*4), X2
 24912  	MOVSS X2, (DX)(R8*4)
 24913  	ADDQ  BX, R8
 24914  	ADDSS (DX)(R8*4), X3
 24915  	MOVSS X3, (DX)(R8*4)
 24916  	ADDQ  BX, R8
 24917  	ADDSS (DX)(R8*4), X4
 24918  	MOVSS X4, (DX)(R8*4)
 24919  	ADDQ  BX, R8
 24920  	ADDSS (DX)(R8*4), X5
 24921  	MOVSS X5, (DX)(R8*4)
 24922  	ADDQ  BX, R8
 24923  	ADDSS (DX)(R8*4), X6
 24924  	MOVSS X6, (DX)(R8*4)
 24925  	ADDQ  BX, R8
 24926  	ADDSS (DX)(R8*4), X7
 24927  	MOVSS X7, (DX)(R8*4)
 24928  	ADDQ  BX, R8
 24929  	ADDSS (DX)(R8*4), X8
 24930  	MOVSS X8, (DX)(R8*4)
 24931  	ADDQ  BX, R8
 24932  	SUBQ  $0x08, SI
 24933  
 24934  check_limit_unroll:
 24935  	CMPQ SI, $0x08
 24936  	JHS  loop_unroll
 24937  	JMP  check_limit
 24938  
 24939  loop:
 24940  	MOVSS (AX)(DI*4), X1
 24941  	MULSS X0, X1
 24942  	ADDSS (DX)(R8*4), X1
 24943  	MOVSS X1, (DX)(R8*4)
 24944  	DECQ  SI
 24945  	ADDQ  CX, DI
 24946  	ADDQ  BX, R8
 24947  
 24948  check_limit:
 24949  	CMPQ SI, $0x00
 24950  	JHI  loop
 24951  	RET
 24952  
 24953  // func AmdAxpyUnsafeXInterleave_V4A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 24954  // Requires: SSE
 24955  TEXT ·AmdAxpyUnsafeXInterleave_V4A16R8(SB), NOSPLIT, $0-48
 24956  	MOVSS alpha+0(FP), X0
 24957  	MOVQ  xs+8(FP), AX
 24958  	MOVQ  incx+16(FP), CX
 24959  	MOVQ  ys+24(FP), DX
 24960  	MOVQ  incy+32(FP), BX
 24961  	MOVQ  n+40(FP), SI
 24962  	XORQ  DI, DI
 24963  	XORQ  R8, R8
 24964  	JMP   check_limit_unroll
 24965  	PCALIGN $0x10
 24966  
 24967  loop_unroll:
 24968  	MOVSS (AX)(DI*4), X1
 24969  	ADDQ  CX, DI
 24970  	MOVSS (AX)(DI*4), X2
 24971  	ADDQ  CX, DI
 24972  	MOVSS (AX)(DI*4), X3
 24973  	ADDQ  CX, DI
 24974  	MOVSS (AX)(DI*4), X4
 24975  	ADDQ  CX, DI
 24976  	MOVSS (AX)(DI*4), X5
 24977  	ADDQ  CX, DI
 24978  	MOVSS (AX)(DI*4), X6
 24979  	ADDQ  CX, DI
 24980  	MOVSS (AX)(DI*4), X7
 24981  	ADDQ  CX, DI
 24982  	MOVSS (AX)(DI*4), X8
 24983  	ADDQ  CX, DI
 24984  	MULSS X0, X1
 24985  	MULSS X0, X2
 24986  	MULSS X0, X3
 24987  	MULSS X0, X4
 24988  	MULSS X0, X5
 24989  	MULSS X0, X6
 24990  	MULSS X0, X7
 24991  	MULSS X0, X8
 24992  	ADDSS (DX)(R8*4), X1
 24993  	MOVSS X1, (DX)(R8*4)
 24994  	ADDQ  BX, R8
 24995  	ADDSS (DX)(R8*4), X2
 24996  	MOVSS X2, (DX)(R8*4)
 24997  	ADDQ  BX, R8
 24998  	ADDSS (DX)(R8*4), X3
 24999  	MOVSS X3, (DX)(R8*4)
 25000  	ADDQ  BX, R8
 25001  	ADDSS (DX)(R8*4), X4
 25002  	MOVSS X4, (DX)(R8*4)
 25003  	ADDQ  BX, R8
 25004  	ADDSS (DX)(R8*4), X5
 25005  	MOVSS X5, (DX)(R8*4)
 25006  	ADDQ  BX, R8
 25007  	ADDSS (DX)(R8*4), X6
 25008  	MOVSS X6, (DX)(R8*4)
 25009  	ADDQ  BX, R8
 25010  	ADDSS (DX)(R8*4), X7
 25011  	MOVSS X7, (DX)(R8*4)
 25012  	ADDQ  BX, R8
 25013  	ADDSS (DX)(R8*4), X8
 25014  	MOVSS X8, (DX)(R8*4)
 25015  	ADDQ  BX, R8
 25016  	SUBQ  $0x08, SI
 25017  
 25018  check_limit_unroll:
 25019  	CMPQ SI, $0x08
 25020  	JHS  loop_unroll
 25021  	JMP  check_limit
 25022  
 25023  loop:
 25024  	MOVSS (AX)(DI*4), X1
 25025  	MULSS X0, X1
 25026  	ADDSS (DX)(R8*4), X1
 25027  	MOVSS X1, (DX)(R8*4)
 25028  	DECQ  SI
 25029  	ADDQ  CX, DI
 25030  	ADDQ  BX, R8
 25031  
 25032  check_limit:
 25033  	CMPQ SI, $0x00
 25034  	JHI  loop
 25035  	RET
 25036  
 25037  // func AmdAxpyUnsafeXInterleave_V5A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25038  // Requires: SSE
 25039  TEXT ·AmdAxpyUnsafeXInterleave_V5A16R8(SB), NOSPLIT, $0-48
 25040  	MOVSS alpha+0(FP), X0
 25041  	MOVQ  xs+8(FP), AX
 25042  	MOVQ  incx+16(FP), CX
 25043  	MOVQ  ys+24(FP), DX
 25044  	MOVQ  incy+32(FP), BX
 25045  	MOVQ  n+40(FP), SI
 25046  	XORQ  DI, DI
 25047  	XORQ  R8, R8
 25048  	JMP   check_limit_unroll
 25049  	PCALIGN $0x10
 25050  
 25051  loop_unroll:
 25052  	MOVSS (AX)(DI*4), X1
 25053  	ADDQ  CX, DI
 25054  	MOVSS (AX)(DI*4), X2
 25055  	ADDQ  CX, DI
 25056  	MOVSS (AX)(DI*4), X3
 25057  	ADDQ  CX, DI
 25058  	MOVSS (AX)(DI*4), X4
 25059  	ADDQ  CX, DI
 25060  	MOVSS (AX)(DI*4), X5
 25061  	ADDQ  CX, DI
 25062  	MOVSS (AX)(DI*4), X6
 25063  	ADDQ  CX, DI
 25064  	MOVSS (AX)(DI*4), X7
 25065  	ADDQ  CX, DI
 25066  	MOVSS (AX)(DI*4), X8
 25067  	ADDQ  CX, DI
 25068  	MULSS X0, X1
 25069  	MULSS X0, X2
 25070  	MULSS X0, X3
 25071  	MULSS X0, X4
 25072  	MULSS X0, X5
 25073  	MULSS X0, X6
 25074  	MULSS X0, X7
 25075  	MULSS X0, X8
 25076  	ADDSS (DX)(R8*4), X1
 25077  	MOVSS X1, (DX)(R8*4)
 25078  	ADDQ  BX, R8
 25079  	ADDSS (DX)(R8*4), X2
 25080  	MOVSS X2, (DX)(R8*4)
 25081  	ADDQ  BX, R8
 25082  	ADDSS (DX)(R8*4), X3
 25083  	MOVSS X3, (DX)(R8*4)
 25084  	ADDQ  BX, R8
 25085  	ADDSS (DX)(R8*4), X4
 25086  	MOVSS X4, (DX)(R8*4)
 25087  	ADDQ  BX, R8
 25088  	ADDSS (DX)(R8*4), X5
 25089  	MOVSS X5, (DX)(R8*4)
 25090  	ADDQ  BX, R8
 25091  	ADDSS (DX)(R8*4), X6
 25092  	MOVSS X6, (DX)(R8*4)
 25093  	ADDQ  BX, R8
 25094  	ADDSS (DX)(R8*4), X7
 25095  	MOVSS X7, (DX)(R8*4)
 25096  	ADDQ  BX, R8
 25097  	ADDSS (DX)(R8*4), X8
 25098  	MOVSS X8, (DX)(R8*4)
 25099  	ADDQ  BX, R8
 25100  	SUBQ  $0x08, SI
 25101  
 25102  check_limit_unroll:
 25103  	CMPQ SI, $0x08
 25104  	JHS  loop_unroll
 25105  	JMP  check_limit
 25106  
 25107  loop:
 25108  	MOVSS (AX)(DI*4), X1
 25109  	MULSS X0, X1
 25110  	ADDSS (DX)(R8*4), X1
 25111  	MOVSS X1, (DX)(R8*4)
 25112  	DECQ  SI
 25113  	ADDQ  CX, DI
 25114  	ADDQ  BX, R8
 25115  
 25116  check_limit:
 25117  	CMPQ SI, $0x00
 25118  	JHI  loop
 25119  	RET
 25120  
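// Editorial note: the AmdAxpyPointerLoopX_* variants below compute the same strided
// AXPY (ys[i*incy] += alpha * xs[i*incx]) but keep raw pointers instead of element
// indices: after each element the x and y pointers are advanced with
// LEAQ (AX)(CX*4), AX and LEAQ (DX)(BX*4), DX, i.e. by incx*4 and incy*4 bytes.
// U4 in the suffix denotes the 4-way unrolled main loop (with a scalar tail), and
// A0 means no alignment padding is emitted before loop_unroll.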
 25121  // func AmdAxpyPointerLoopX_V0A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25122  // Requires: SSE
 25123  TEXT ·AmdAxpyPointerLoopX_V0A0U4(SB), NOSPLIT, $0-48
 25124  	MOVSS alpha+0(FP), X0
 25125  	MOVQ  xs+8(FP), AX
 25126  	MOVQ  incx+16(FP), CX
 25127  	MOVQ  ys+24(FP), DX
 25128  	MOVQ  incy+32(FP), BX
 25129  	MOVQ  n+40(FP), SI
 25130  	JMP   check_limit_unroll
 25131  
 25132  loop_unroll:
 25133  	MOVSS (AX), X1
 25134  	MULSS X0, X1
 25135  	ADDSS (DX), X1
 25136  	MOVSS X1, (DX)
 25137  	LEAQ  (AX)(CX*4), AX
 25138  	LEAQ  (DX)(BX*4), DX
 25139  	MOVSS (AX), X1
 25140  	MULSS X0, X1
 25141  	ADDSS (DX), X1
 25142  	MOVSS X1, (DX)
 25143  	LEAQ  (AX)(CX*4), AX
 25144  	LEAQ  (DX)(BX*4), DX
 25145  	MOVSS (AX), X1
 25146  	MULSS X0, X1
 25147  	ADDSS (DX), X1
 25148  	MOVSS X1, (DX)
 25149  	LEAQ  (AX)(CX*4), AX
 25150  	LEAQ  (DX)(BX*4), DX
 25151  	MOVSS (AX), X1
 25152  	MULSS X0, X1
 25153  	ADDSS (DX), X1
 25154  	MOVSS X1, (DX)
 25155  	LEAQ  (AX)(CX*4), AX
 25156  	LEAQ  (DX)(BX*4), DX
 25157  	SUBQ  $0x04, SI
 25158  
 25159  check_limit_unroll:
 25160  	CMPQ SI, $0x04
 25161  	JHS  loop_unroll
 25162  	JMP  check_limit
 25163  
 25164  loop:
 25165  	MOVSS (AX), X1
 25166  	MULSS X0, X1
 25167  	ADDSS (DX), X1
 25168  	MOVSS X1, (DX)
 25169  	DECQ  SI
 25170  	LEAQ  (AX)(CX*4), AX
 25171  	LEAQ  (DX)(BX*4), DX
 25172  
 25173  check_limit:
 25174  	CMPQ SI, $0x00
 25175  	JHI  loop
 25176  	RET
 25177  
 25178  // func AmdAxpyPointerLoopX_V1A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25179  // Requires: SSE
 25180  TEXT ·AmdAxpyPointerLoopX_V1A0U4(SB), NOSPLIT, $0-48
 25181  	MOVSS alpha+0(FP), X0
 25182  	MOVQ  xs+8(FP), AX
 25183  	MOVQ  incx+16(FP), CX
 25184  	MOVQ  ys+24(FP), DX
 25185  	MOVQ  incy+32(FP), BX
 25186  	MOVQ  n+40(FP), SI
 25187  	JMP   check_limit_unroll
 25188  
 25189  loop_unroll:
 25190  	MOVSS (AX), X1
 25191  	MULSS X0, X1
 25192  	ADDSS (DX), X1
 25193  	MOVSS X1, (DX)
 25194  	LEAQ  (AX)(CX*4), AX
 25195  	LEAQ  (DX)(BX*4), DX
 25196  	MOVSS (AX), X1
 25197  	MULSS X0, X1
 25198  	ADDSS (DX), X1
 25199  	MOVSS X1, (DX)
 25200  	LEAQ  (AX)(CX*4), AX
 25201  	LEAQ  (DX)(BX*4), DX
 25202  	MOVSS (AX), X1
 25203  	MULSS X0, X1
 25204  	ADDSS (DX), X1
 25205  	MOVSS X1, (DX)
 25206  	LEAQ  (AX)(CX*4), AX
 25207  	LEAQ  (DX)(BX*4), DX
 25208  	MOVSS (AX), X1
 25209  	MULSS X0, X1
 25210  	ADDSS (DX), X1
 25211  	MOVSS X1, (DX)
 25212  	LEAQ  (AX)(CX*4), AX
 25213  	LEAQ  (DX)(BX*4), DX
 25214  	SUBQ  $0x04, SI
 25215  
 25216  check_limit_unroll:
 25217  	CMPQ SI, $0x04
 25218  	JHS  loop_unroll
 25219  	JMP  check_limit
 25220  
 25221  loop:
 25222  	MOVSS (AX), X1
 25223  	MULSS X0, X1
 25224  	ADDSS (DX), X1
 25225  	MOVSS X1, (DX)
 25226  	DECQ  SI
 25227  	LEAQ  (AX)(CX*4), AX
 25228  	LEAQ  (DX)(BX*4), DX
 25229  
 25230  check_limit:
 25231  	CMPQ SI, $0x00
 25232  	JHI  loop
 25233  	RET
 25234  
 25235  // func AmdAxpyPointerLoopX_V2A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25236  // Requires: SSE
 25237  TEXT ·AmdAxpyPointerLoopX_V2A0U4(SB), NOSPLIT, $0-48
 25238  	MOVSS alpha+0(FP), X0
 25239  	MOVQ  xs+8(FP), AX
 25240  	MOVQ  incx+16(FP), CX
 25241  	MOVQ  ys+24(FP), DX
 25242  	MOVQ  incy+32(FP), BX
 25243  	MOVQ  n+40(FP), SI
 25244  	JMP   check_limit_unroll
 25245  
 25246  loop_unroll:
 25247  	MOVSS (AX), X1
 25248  	MULSS X0, X1
 25249  	ADDSS (DX), X1
 25250  	MOVSS X1, (DX)
 25251  	LEAQ  (AX)(CX*4), AX
 25252  	LEAQ  (DX)(BX*4), DX
 25253  	MOVSS (AX), X1
 25254  	MULSS X0, X1
 25255  	ADDSS (DX), X1
 25256  	MOVSS X1, (DX)
 25257  	LEAQ  (AX)(CX*4), AX
 25258  	LEAQ  (DX)(BX*4), DX
 25259  	MOVSS (AX), X1
 25260  	MULSS X0, X1
 25261  	ADDSS (DX), X1
 25262  	MOVSS X1, (DX)
 25263  	LEAQ  (AX)(CX*4), AX
 25264  	LEAQ  (DX)(BX*4), DX
 25265  	MOVSS (AX), X1
 25266  	MULSS X0, X1
 25267  	ADDSS (DX), X1
 25268  	MOVSS X1, (DX)
 25269  	LEAQ  (AX)(CX*4), AX
 25270  	LEAQ  (DX)(BX*4), DX
 25271  	SUBQ  $0x04, SI
 25272  
 25273  check_limit_unroll:
 25274  	CMPQ SI, $0x04
 25275  	JHS  loop_unroll
 25276  	JMP  check_limit
 25277  
 25278  loop:
 25279  	MOVSS (AX), X1
 25280  	MULSS X0, X1
 25281  	ADDSS (DX), X1
 25282  	MOVSS X1, (DX)
 25283  	DECQ  SI
 25284  	LEAQ  (AX)(CX*4), AX
 25285  	LEAQ  (DX)(BX*4), DX
 25286  
 25287  check_limit:
 25288  	CMPQ SI, $0x00
 25289  	JHI  loop
 25290  	RET
 25291  
 25292  // func AmdAxpyPointerLoopX_V3A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25293  // Requires: SSE
 25294  TEXT ·AmdAxpyPointerLoopX_V3A0U4(SB), NOSPLIT, $0-48
 25295  	MOVSS alpha+0(FP), X0
 25296  	MOVQ  xs+8(FP), AX
 25297  	MOVQ  incx+16(FP), CX
 25298  	MOVQ  ys+24(FP), DX
 25299  	MOVQ  incy+32(FP), BX
 25300  	MOVQ  n+40(FP), SI
 25301  	JMP   check_limit_unroll
 25302  
 25303  loop_unroll:
 25304  	MOVSS (AX), X1
 25305  	MULSS X0, X1
 25306  	ADDSS (DX), X1
 25307  	MOVSS X1, (DX)
 25308  	LEAQ  (AX)(CX*4), AX
 25309  	LEAQ  (DX)(BX*4), DX
 25310  	MOVSS (AX), X1
 25311  	MULSS X0, X1
 25312  	ADDSS (DX), X1
 25313  	MOVSS X1, (DX)
 25314  	LEAQ  (AX)(CX*4), AX
 25315  	LEAQ  (DX)(BX*4), DX
 25316  	MOVSS (AX), X1
 25317  	MULSS X0, X1
 25318  	ADDSS (DX), X1
 25319  	MOVSS X1, (DX)
 25320  	LEAQ  (AX)(CX*4), AX
 25321  	LEAQ  (DX)(BX*4), DX
 25322  	MOVSS (AX), X1
 25323  	MULSS X0, X1
 25324  	ADDSS (DX), X1
 25325  	MOVSS X1, (DX)
 25326  	LEAQ  (AX)(CX*4), AX
 25327  	LEAQ  (DX)(BX*4), DX
 25328  	SUBQ  $0x04, SI
 25329  
 25330  check_limit_unroll:
 25331  	CMPQ SI, $0x04
 25332  	JHS  loop_unroll
 25333  	JMP  check_limit
 25334  
 25335  loop:
 25336  	MOVSS (AX), X1
 25337  	MULSS X0, X1
 25338  	ADDSS (DX), X1
 25339  	MOVSS X1, (DX)
 25340  	DECQ  SI
 25341  	LEAQ  (AX)(CX*4), AX
 25342  	LEAQ  (DX)(BX*4), DX
 25343  
 25344  check_limit:
 25345  	CMPQ SI, $0x00
 25346  	JHI  loop
 25347  	RET
 25348  
 25349  // func AmdAxpyPointerLoopX_V4A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25350  // Requires: SSE
 25351  TEXT ·AmdAxpyPointerLoopX_V4A0U4(SB), NOSPLIT, $0-48
 25352  	MOVSS alpha+0(FP), X0
 25353  	MOVQ  xs+8(FP), AX
 25354  	MOVQ  incx+16(FP), CX
 25355  	MOVQ  ys+24(FP), DX
 25356  	MOVQ  incy+32(FP), BX
 25357  	MOVQ  n+40(FP), SI
 25358  	JMP   check_limit_unroll
 25359  
 25360  loop_unroll:
 25361  	MOVSS (AX), X1
 25362  	MULSS X0, X1
 25363  	ADDSS (DX), X1
 25364  	MOVSS X1, (DX)
 25365  	LEAQ  (AX)(CX*4), AX
 25366  	LEAQ  (DX)(BX*4), DX
 25367  	MOVSS (AX), X1
 25368  	MULSS X0, X1
 25369  	ADDSS (DX), X1
 25370  	MOVSS X1, (DX)
 25371  	LEAQ  (AX)(CX*4), AX
 25372  	LEAQ  (DX)(BX*4), DX
 25373  	MOVSS (AX), X1
 25374  	MULSS X0, X1
 25375  	ADDSS (DX), X1
 25376  	MOVSS X1, (DX)
 25377  	LEAQ  (AX)(CX*4), AX
 25378  	LEAQ  (DX)(BX*4), DX
 25379  	MOVSS (AX), X1
 25380  	MULSS X0, X1
 25381  	ADDSS (DX), X1
 25382  	MOVSS X1, (DX)
 25383  	LEAQ  (AX)(CX*4), AX
 25384  	LEAQ  (DX)(BX*4), DX
 25385  	SUBQ  $0x04, SI
 25386  
 25387  check_limit_unroll:
 25388  	CMPQ SI, $0x04
 25389  	JHS  loop_unroll
 25390  	JMP  check_limit
 25391  
 25392  loop:
 25393  	MOVSS (AX), X1
 25394  	MULSS X0, X1
 25395  	ADDSS (DX), X1
 25396  	MOVSS X1, (DX)
 25397  	DECQ  SI
 25398  	LEAQ  (AX)(CX*4), AX
 25399  	LEAQ  (DX)(BX*4), DX
 25400  
 25401  check_limit:
 25402  	CMPQ SI, $0x00
 25403  	JHI  loop
 25404  	RET
 25405  
 25406  // func AmdAxpyPointerLoopX_V5A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25407  // Requires: SSE
 25408  TEXT ·AmdAxpyPointerLoopX_V5A0U4(SB), NOSPLIT, $0-48
 25409  	MOVSS alpha+0(FP), X0
 25410  	MOVQ  xs+8(FP), AX
 25411  	MOVQ  incx+16(FP), CX
 25412  	MOVQ  ys+24(FP), DX
 25413  	MOVQ  incy+32(FP), BX
 25414  	MOVQ  n+40(FP), SI
 25415  	JMP   check_limit_unroll
 25416  
 25417  loop_unroll:
 25418  	MOVSS (AX), X1
 25419  	MULSS X0, X1
 25420  	ADDSS (DX), X1
 25421  	MOVSS X1, (DX)
 25422  	LEAQ  (AX)(CX*4), AX
 25423  	LEAQ  (DX)(BX*4), DX
 25424  	MOVSS (AX), X1
 25425  	MULSS X0, X1
 25426  	ADDSS (DX), X1
 25427  	MOVSS X1, (DX)
 25428  	LEAQ  (AX)(CX*4), AX
 25429  	LEAQ  (DX)(BX*4), DX
 25430  	MOVSS (AX), X1
 25431  	MULSS X0, X1
 25432  	ADDSS (DX), X1
 25433  	MOVSS X1, (DX)
 25434  	LEAQ  (AX)(CX*4), AX
 25435  	LEAQ  (DX)(BX*4), DX
 25436  	MOVSS (AX), X1
 25437  	MULSS X0, X1
 25438  	ADDSS (DX), X1
 25439  	MOVSS X1, (DX)
 25440  	LEAQ  (AX)(CX*4), AX
 25441  	LEAQ  (DX)(BX*4), DX
 25442  	SUBQ  $0x04, SI
 25443  
 25444  check_limit_unroll:
 25445  	CMPQ SI, $0x04
 25446  	JHS  loop_unroll
 25447  	JMP  check_limit
 25448  
 25449  loop:
 25450  	MOVSS (AX), X1
 25451  	MULSS X0, X1
 25452  	ADDSS (DX), X1
 25453  	MOVSS X1, (DX)
 25454  	DECQ  SI
 25455  	LEAQ  (AX)(CX*4), AX
 25456  	LEAQ  (DX)(BX*4), DX
 25457  
 25458  check_limit:
 25459  	CMPQ SI, $0x00
 25460  	JHI  loop
 25461  	RET
 25462  
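// Editorial note: the *A8U4 variants below differ from the A0 variants above only in
// the PCALIGN $0x08 directive emitted before loop_unroll, which aligns the unrolled
// loop to an 8-byte boundary.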
 25463  // func AmdAxpyPointerLoopX_V0A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25464  // Requires: SSE
 25465  TEXT ·AmdAxpyPointerLoopX_V0A8U4(SB), NOSPLIT, $0-48
 25466  	MOVSS alpha+0(FP), X0
 25467  	MOVQ  xs+8(FP), AX
 25468  	MOVQ  incx+16(FP), CX
 25469  	MOVQ  ys+24(FP), DX
 25470  	MOVQ  incy+32(FP), BX
 25471  	MOVQ  n+40(FP), SI
 25472  	JMP   check_limit_unroll
 25473  	PCALIGN $0x08
 25474  
 25475  loop_unroll:
 25476  	MOVSS (AX), X1
 25477  	MULSS X0, X1
 25478  	ADDSS (DX), X1
 25479  	MOVSS X1, (DX)
 25480  	LEAQ  (AX)(CX*4), AX
 25481  	LEAQ  (DX)(BX*4), DX
 25482  	MOVSS (AX), X1
 25483  	MULSS X0, X1
 25484  	ADDSS (DX), X1
 25485  	MOVSS X1, (DX)
 25486  	LEAQ  (AX)(CX*4), AX
 25487  	LEAQ  (DX)(BX*4), DX
 25488  	MOVSS (AX), X1
 25489  	MULSS X0, X1
 25490  	ADDSS (DX), X1
 25491  	MOVSS X1, (DX)
 25492  	LEAQ  (AX)(CX*4), AX
 25493  	LEAQ  (DX)(BX*4), DX
 25494  	MOVSS (AX), X1
 25495  	MULSS X0, X1
 25496  	ADDSS (DX), X1
 25497  	MOVSS X1, (DX)
 25498  	LEAQ  (AX)(CX*4), AX
 25499  	LEAQ  (DX)(BX*4), DX
 25500  	SUBQ  $0x04, SI
 25501  
 25502  check_limit_unroll:
 25503  	CMPQ SI, $0x04
 25504  	JHS  loop_unroll
 25505  	JMP  check_limit
 25506  
 25507  loop:
 25508  	MOVSS (AX), X1
 25509  	MULSS X0, X1
 25510  	ADDSS (DX), X1
 25511  	MOVSS X1, (DX)
 25512  	DECQ  SI
 25513  	LEAQ  (AX)(CX*4), AX
 25514  	LEAQ  (DX)(BX*4), DX
 25515  
 25516  check_limit:
 25517  	CMPQ SI, $0x00
 25518  	JHI  loop
 25519  	RET
 25520  
 25521  // func AmdAxpyPointerLoopX_V1A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25522  // Requires: SSE
 25523  TEXT ·AmdAxpyPointerLoopX_V1A8U4(SB), NOSPLIT, $0-48
 25524  	MOVSS alpha+0(FP), X0
 25525  	MOVQ  xs+8(FP), AX
 25526  	MOVQ  incx+16(FP), CX
 25527  	MOVQ  ys+24(FP), DX
 25528  	MOVQ  incy+32(FP), BX
 25529  	MOVQ  n+40(FP), SI
 25530  	JMP   check_limit_unroll
 25531  	PCALIGN $0x08
 25532  
 25533  loop_unroll:
 25534  	MOVSS (AX), X1
 25535  	MULSS X0, X1
 25536  	ADDSS (DX), X1
 25537  	MOVSS X1, (DX)
 25538  	LEAQ  (AX)(CX*4), AX
 25539  	LEAQ  (DX)(BX*4), DX
 25540  	MOVSS (AX), X1
 25541  	MULSS X0, X1
 25542  	ADDSS (DX), X1
 25543  	MOVSS X1, (DX)
 25544  	LEAQ  (AX)(CX*4), AX
 25545  	LEAQ  (DX)(BX*4), DX
 25546  	MOVSS (AX), X1
 25547  	MULSS X0, X1
 25548  	ADDSS (DX), X1
 25549  	MOVSS X1, (DX)
 25550  	LEAQ  (AX)(CX*4), AX
 25551  	LEAQ  (DX)(BX*4), DX
 25552  	MOVSS (AX), X1
 25553  	MULSS X0, X1
 25554  	ADDSS (DX), X1
 25555  	MOVSS X1, (DX)
 25556  	LEAQ  (AX)(CX*4), AX
 25557  	LEAQ  (DX)(BX*4), DX
 25558  	SUBQ  $0x04, SI
 25559  
 25560  check_limit_unroll:
 25561  	CMPQ SI, $0x04
 25562  	JHS  loop_unroll
 25563  	JMP  check_limit
 25564  
 25565  loop:
 25566  	MOVSS (AX), X1
 25567  	MULSS X0, X1
 25568  	ADDSS (DX), X1
 25569  	MOVSS X1, (DX)
 25570  	DECQ  SI
 25571  	LEAQ  (AX)(CX*4), AX
 25572  	LEAQ  (DX)(BX*4), DX
 25573  
 25574  check_limit:
 25575  	CMPQ SI, $0x00
 25576  	JHI  loop
 25577  	RET
 25578  
 25579  // func AmdAxpyPointerLoopX_V2A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25580  // Requires: SSE
 25581  TEXT ·AmdAxpyPointerLoopX_V2A8U4(SB), NOSPLIT, $0-48
 25582  	MOVSS alpha+0(FP), X0
 25583  	MOVQ  xs+8(FP), AX
 25584  	MOVQ  incx+16(FP), CX
 25585  	MOVQ  ys+24(FP), DX
 25586  	MOVQ  incy+32(FP), BX
 25587  	MOVQ  n+40(FP), SI
 25588  	JMP   check_limit_unroll
 25589  	PCALIGN $0x08
 25590  
 25591  loop_unroll:
 25592  	MOVSS (AX), X1
 25593  	MULSS X0, X1
 25594  	ADDSS (DX), X1
 25595  	MOVSS X1, (DX)
 25596  	LEAQ  (AX)(CX*4), AX
 25597  	LEAQ  (DX)(BX*4), DX
 25598  	MOVSS (AX), X1
 25599  	MULSS X0, X1
 25600  	ADDSS (DX), X1
 25601  	MOVSS X1, (DX)
 25602  	LEAQ  (AX)(CX*4), AX
 25603  	LEAQ  (DX)(BX*4), DX
 25604  	MOVSS (AX), X1
 25605  	MULSS X0, X1
 25606  	ADDSS (DX), X1
 25607  	MOVSS X1, (DX)
 25608  	LEAQ  (AX)(CX*4), AX
 25609  	LEAQ  (DX)(BX*4), DX
 25610  	MOVSS (AX), X1
 25611  	MULSS X0, X1
 25612  	ADDSS (DX), X1
 25613  	MOVSS X1, (DX)
 25614  	LEAQ  (AX)(CX*4), AX
 25615  	LEAQ  (DX)(BX*4), DX
 25616  	SUBQ  $0x04, SI
 25617  
 25618  check_limit_unroll:
 25619  	CMPQ SI, $0x04
 25620  	JHS  loop_unroll
 25621  	JMP  check_limit
 25622  
 25623  loop:
 25624  	MOVSS (AX), X1
 25625  	MULSS X0, X1
 25626  	ADDSS (DX), X1
 25627  	MOVSS X1, (DX)
 25628  	DECQ  SI
 25629  	LEAQ  (AX)(CX*4), AX
 25630  	LEAQ  (DX)(BX*4), DX
 25631  
 25632  check_limit:
 25633  	CMPQ SI, $0x00
 25634  	JHI  loop
 25635  	RET
 25636  
 25637  // func AmdAxpyPointerLoopX_V3A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25638  // Requires: SSE
 25639  TEXT ·AmdAxpyPointerLoopX_V3A8U4(SB), NOSPLIT, $0-48
 25640  	MOVSS alpha+0(FP), X0
 25641  	MOVQ  xs+8(FP), AX
 25642  	MOVQ  incx+16(FP), CX
 25643  	MOVQ  ys+24(FP), DX
 25644  	MOVQ  incy+32(FP), BX
 25645  	MOVQ  n+40(FP), SI
 25646  	JMP   check_limit_unroll
 25647  	PCALIGN $0x08
 25648  
 25649  loop_unroll:
 25650  	MOVSS (AX), X1
 25651  	MULSS X0, X1
 25652  	ADDSS (DX), X1
 25653  	MOVSS X1, (DX)
 25654  	LEAQ  (AX)(CX*4), AX
 25655  	LEAQ  (DX)(BX*4), DX
 25656  	MOVSS (AX), X1
 25657  	MULSS X0, X1
 25658  	ADDSS (DX), X1
 25659  	MOVSS X1, (DX)
 25660  	LEAQ  (AX)(CX*4), AX
 25661  	LEAQ  (DX)(BX*4), DX
 25662  	MOVSS (AX), X1
 25663  	MULSS X0, X1
 25664  	ADDSS (DX), X1
 25665  	MOVSS X1, (DX)
 25666  	LEAQ  (AX)(CX*4), AX
 25667  	LEAQ  (DX)(BX*4), DX
 25668  	MOVSS (AX), X1
 25669  	MULSS X0, X1
 25670  	ADDSS (DX), X1
 25671  	MOVSS X1, (DX)
 25672  	LEAQ  (AX)(CX*4), AX
 25673  	LEAQ  (DX)(BX*4), DX
 25674  	SUBQ  $0x04, SI
 25675  
 25676  check_limit_unroll:
 25677  	CMPQ SI, $0x04
 25678  	JHS  loop_unroll
 25679  	JMP  check_limit
 25680  
 25681  loop:
 25682  	MOVSS (AX), X1
 25683  	MULSS X0, X1
 25684  	ADDSS (DX), X1
 25685  	MOVSS X1, (DX)
 25686  	DECQ  SI
 25687  	LEAQ  (AX)(CX*4), AX
 25688  	LEAQ  (DX)(BX*4), DX
 25689  
 25690  check_limit:
 25691  	CMPQ SI, $0x00
 25692  	JHI  loop
 25693  	RET
 25694  
 25695  // func AmdAxpyPointerLoopX_V4A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25696  // Requires: SSE
 25697  TEXT ·AmdAxpyPointerLoopX_V4A8U4(SB), NOSPLIT, $0-48
 25698  	MOVSS alpha+0(FP), X0
 25699  	MOVQ  xs+8(FP), AX
 25700  	MOVQ  incx+16(FP), CX
 25701  	MOVQ  ys+24(FP), DX
 25702  	MOVQ  incy+32(FP), BX
 25703  	MOVQ  n+40(FP), SI
 25704  	JMP   check_limit_unroll
 25705  	PCALIGN $0x08
 25706  
 25707  loop_unroll:
 25708  	MOVSS (AX), X1
 25709  	MULSS X0, X1
 25710  	ADDSS (DX), X1
 25711  	MOVSS X1, (DX)
 25712  	LEAQ  (AX)(CX*4), AX
 25713  	LEAQ  (DX)(BX*4), DX
 25714  	MOVSS (AX), X1
 25715  	MULSS X0, X1
 25716  	ADDSS (DX), X1
 25717  	MOVSS X1, (DX)
 25718  	LEAQ  (AX)(CX*4), AX
 25719  	LEAQ  (DX)(BX*4), DX
 25720  	MOVSS (AX), X1
 25721  	MULSS X0, X1
 25722  	ADDSS (DX), X1
 25723  	MOVSS X1, (DX)
 25724  	LEAQ  (AX)(CX*4), AX
 25725  	LEAQ  (DX)(BX*4), DX
 25726  	MOVSS (AX), X1
 25727  	MULSS X0, X1
 25728  	ADDSS (DX), X1
 25729  	MOVSS X1, (DX)
 25730  	LEAQ  (AX)(CX*4), AX
 25731  	LEAQ  (DX)(BX*4), DX
 25732  	SUBQ  $0x04, SI
 25733  
 25734  check_limit_unroll:
 25735  	CMPQ SI, $0x04
 25736  	JHS  loop_unroll
 25737  	JMP  check_limit
 25738  
 25739  loop:
 25740  	MOVSS (AX), X1
 25741  	MULSS X0, X1
 25742  	ADDSS (DX), X1
 25743  	MOVSS X1, (DX)
 25744  	DECQ  SI
 25745  	LEAQ  (AX)(CX*4), AX
 25746  	LEAQ  (DX)(BX*4), DX
 25747  
 25748  check_limit:
 25749  	CMPQ SI, $0x00
 25750  	JHI  loop
 25751  	RET
 25752  
 25753  // func AmdAxpyPointerLoopX_V5A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25754  // Requires: SSE
 25755  TEXT ·AmdAxpyPointerLoopX_V5A8U4(SB), NOSPLIT, $0-48
 25756  	MOVSS alpha+0(FP), X0
 25757  	MOVQ  xs+8(FP), AX
 25758  	MOVQ  incx+16(FP), CX
 25759  	MOVQ  ys+24(FP), DX
 25760  	MOVQ  incy+32(FP), BX
 25761  	MOVQ  n+40(FP), SI
 25762  	JMP   check_limit_unroll
 25763  	PCALIGN $0x08
 25764  
 25765  loop_unroll:
 25766  	MOVSS (AX), X1
 25767  	MULSS X0, X1
 25768  	ADDSS (DX), X1
 25769  	MOVSS X1, (DX)
 25770  	LEAQ  (AX)(CX*4), AX
 25771  	LEAQ  (DX)(BX*4), DX
 25772  	MOVSS (AX), X1
 25773  	MULSS X0, X1
 25774  	ADDSS (DX), X1
 25775  	MOVSS X1, (DX)
 25776  	LEAQ  (AX)(CX*4), AX
 25777  	LEAQ  (DX)(BX*4), DX
 25778  	MOVSS (AX), X1
 25779  	MULSS X0, X1
 25780  	ADDSS (DX), X1
 25781  	MOVSS X1, (DX)
 25782  	LEAQ  (AX)(CX*4), AX
 25783  	LEAQ  (DX)(BX*4), DX
 25784  	MOVSS (AX), X1
 25785  	MULSS X0, X1
 25786  	ADDSS (DX), X1
 25787  	MOVSS X1, (DX)
 25788  	LEAQ  (AX)(CX*4), AX
 25789  	LEAQ  (DX)(BX*4), DX
 25790  	SUBQ  $0x04, SI
 25791  
 25792  check_limit_unroll:
 25793  	CMPQ SI, $0x04
 25794  	JHS  loop_unroll
 25795  	JMP  check_limit
 25796  
 25797  loop:
 25798  	MOVSS (AX), X1
 25799  	MULSS X0, X1
 25800  	ADDSS (DX), X1
 25801  	MOVSS X1, (DX)
 25802  	DECQ  SI
 25803  	LEAQ  (AX)(CX*4), AX
 25804  	LEAQ  (DX)(BX*4), DX
 25805  
 25806  check_limit:
 25807  	CMPQ SI, $0x00
 25808  	JHI  loop
 25809  	RET
 25810  
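// Editorial note: in the *A9U4 variants below the padding before loop_unroll is
// PCALIGN $0x08 followed by one NOP; the A10 and A11 families further below add two
// and three NOPs respectively, so the A<n> suffix appears to encode the requested
// padding in bytes. The loop bodies themselves are unchanged.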
 25811  // func AmdAxpyPointerLoopX_V0A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25812  // Requires: SSE
 25813  TEXT ·AmdAxpyPointerLoopX_V0A9U4(SB), NOSPLIT, $0-48
 25814  	MOVSS alpha+0(FP), X0
 25815  	MOVQ  xs+8(FP), AX
 25816  	MOVQ  incx+16(FP), CX
 25817  	MOVQ  ys+24(FP), DX
 25818  	MOVQ  incy+32(FP), BX
 25819  	MOVQ  n+40(FP), SI
 25820  	JMP   check_limit_unroll
 25821  	PCALIGN $0x08
 25822  	NOP
 25823  
 25824  loop_unroll:
 25825  	MOVSS (AX), X1
 25826  	MULSS X0, X1
 25827  	ADDSS (DX), X1
 25828  	MOVSS X1, (DX)
 25829  	LEAQ  (AX)(CX*4), AX
 25830  	LEAQ  (DX)(BX*4), DX
 25831  	MOVSS (AX), X1
 25832  	MULSS X0, X1
 25833  	ADDSS (DX), X1
 25834  	MOVSS X1, (DX)
 25835  	LEAQ  (AX)(CX*4), AX
 25836  	LEAQ  (DX)(BX*4), DX
 25837  	MOVSS (AX), X1
 25838  	MULSS X0, X1
 25839  	ADDSS (DX), X1
 25840  	MOVSS X1, (DX)
 25841  	LEAQ  (AX)(CX*4), AX
 25842  	LEAQ  (DX)(BX*4), DX
 25843  	MOVSS (AX), X1
 25844  	MULSS X0, X1
 25845  	ADDSS (DX), X1
 25846  	MOVSS X1, (DX)
 25847  	LEAQ  (AX)(CX*4), AX
 25848  	LEAQ  (DX)(BX*4), DX
 25849  	SUBQ  $0x04, SI
 25850  
 25851  check_limit_unroll:
 25852  	CMPQ SI, $0x04
 25853  	JHS  loop_unroll
 25854  	JMP  check_limit
 25855  
 25856  loop:
 25857  	MOVSS (AX), X1
 25858  	MULSS X0, X1
 25859  	ADDSS (DX), X1
 25860  	MOVSS X1, (DX)
 25861  	DECQ  SI
 25862  	LEAQ  (AX)(CX*4), AX
 25863  	LEAQ  (DX)(BX*4), DX
 25864  
 25865  check_limit:
 25866  	CMPQ SI, $0x00
 25867  	JHI  loop
 25868  	RET
 25869  
 25870  // func AmdAxpyPointerLoopX_V1A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25871  // Requires: SSE
 25872  TEXT ·AmdAxpyPointerLoopX_V1A9U4(SB), NOSPLIT, $0-48
 25873  	MOVSS alpha+0(FP), X0
 25874  	MOVQ  xs+8(FP), AX
 25875  	MOVQ  incx+16(FP), CX
 25876  	MOVQ  ys+24(FP), DX
 25877  	MOVQ  incy+32(FP), BX
 25878  	MOVQ  n+40(FP), SI
 25879  	JMP   check_limit_unroll
 25880  	PCALIGN $0x08
 25881  	NOP
 25882  
 25883  loop_unroll:
 25884  	MOVSS (AX), X1
 25885  	MULSS X0, X1
 25886  	ADDSS (DX), X1
 25887  	MOVSS X1, (DX)
 25888  	LEAQ  (AX)(CX*4), AX
 25889  	LEAQ  (DX)(BX*4), DX
 25890  	MOVSS (AX), X1
 25891  	MULSS X0, X1
 25892  	ADDSS (DX), X1
 25893  	MOVSS X1, (DX)
 25894  	LEAQ  (AX)(CX*4), AX
 25895  	LEAQ  (DX)(BX*4), DX
 25896  	MOVSS (AX), X1
 25897  	MULSS X0, X1
 25898  	ADDSS (DX), X1
 25899  	MOVSS X1, (DX)
 25900  	LEAQ  (AX)(CX*4), AX
 25901  	LEAQ  (DX)(BX*4), DX
 25902  	MOVSS (AX), X1
 25903  	MULSS X0, X1
 25904  	ADDSS (DX), X1
 25905  	MOVSS X1, (DX)
 25906  	LEAQ  (AX)(CX*4), AX
 25907  	LEAQ  (DX)(BX*4), DX
 25908  	SUBQ  $0x04, SI
 25909  
 25910  check_limit_unroll:
 25911  	CMPQ SI, $0x04
 25912  	JHS  loop_unroll
 25913  	JMP  check_limit
 25914  
 25915  loop:
 25916  	MOVSS (AX), X1
 25917  	MULSS X0, X1
 25918  	ADDSS (DX), X1
 25919  	MOVSS X1, (DX)
 25920  	DECQ  SI
 25921  	LEAQ  (AX)(CX*4), AX
 25922  	LEAQ  (DX)(BX*4), DX
 25923  
 25924  check_limit:
 25925  	CMPQ SI, $0x00
 25926  	JHI  loop
 25927  	RET
 25928  
 25929  // func AmdAxpyPointerLoopX_V2A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25930  // Requires: SSE
 25931  TEXT ·AmdAxpyPointerLoopX_V2A9U4(SB), NOSPLIT, $0-48
 25932  	MOVSS alpha+0(FP), X0
 25933  	MOVQ  xs+8(FP), AX
 25934  	MOVQ  incx+16(FP), CX
 25935  	MOVQ  ys+24(FP), DX
 25936  	MOVQ  incy+32(FP), BX
 25937  	MOVQ  n+40(FP), SI
 25938  	JMP   check_limit_unroll
 25939  	PCALIGN $0x08
 25940  	NOP
 25941  
 25942  loop_unroll:
 25943  	MOVSS (AX), X1
 25944  	MULSS X0, X1
 25945  	ADDSS (DX), X1
 25946  	MOVSS X1, (DX)
 25947  	LEAQ  (AX)(CX*4), AX
 25948  	LEAQ  (DX)(BX*4), DX
 25949  	MOVSS (AX), X1
 25950  	MULSS X0, X1
 25951  	ADDSS (DX), X1
 25952  	MOVSS X1, (DX)
 25953  	LEAQ  (AX)(CX*4), AX
 25954  	LEAQ  (DX)(BX*4), DX
 25955  	MOVSS (AX), X1
 25956  	MULSS X0, X1
 25957  	ADDSS (DX), X1
 25958  	MOVSS X1, (DX)
 25959  	LEAQ  (AX)(CX*4), AX
 25960  	LEAQ  (DX)(BX*4), DX
 25961  	MOVSS (AX), X1
 25962  	MULSS X0, X1
 25963  	ADDSS (DX), X1
 25964  	MOVSS X1, (DX)
 25965  	LEAQ  (AX)(CX*4), AX
 25966  	LEAQ  (DX)(BX*4), DX
 25967  	SUBQ  $0x04, SI
 25968  
 25969  check_limit_unroll:
 25970  	CMPQ SI, $0x04
 25971  	JHS  loop_unroll
 25972  	JMP  check_limit
 25973  
 25974  loop:
 25975  	MOVSS (AX), X1
 25976  	MULSS X0, X1
 25977  	ADDSS (DX), X1
 25978  	MOVSS X1, (DX)
 25979  	DECQ  SI
 25980  	LEAQ  (AX)(CX*4), AX
 25981  	LEAQ  (DX)(BX*4), DX
 25982  
 25983  check_limit:
 25984  	CMPQ SI, $0x00
 25985  	JHI  loop
 25986  	RET
 25987  
 25988  // func AmdAxpyPointerLoopX_V3A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 25989  // Requires: SSE
 25990  TEXT ·AmdAxpyPointerLoopX_V3A9U4(SB), NOSPLIT, $0-48
 25991  	MOVSS alpha+0(FP), X0
 25992  	MOVQ  xs+8(FP), AX
 25993  	MOVQ  incx+16(FP), CX
 25994  	MOVQ  ys+24(FP), DX
 25995  	MOVQ  incy+32(FP), BX
 25996  	MOVQ  n+40(FP), SI
 25997  	JMP   check_limit_unroll
 25998  	PCALIGN $0x08
 25999  	NOP
 26000  
 26001  loop_unroll:
 26002  	MOVSS (AX), X1
 26003  	MULSS X0, X1
 26004  	ADDSS (DX), X1
 26005  	MOVSS X1, (DX)
 26006  	LEAQ  (AX)(CX*4), AX
 26007  	LEAQ  (DX)(BX*4), DX
 26008  	MOVSS (AX), X1
 26009  	MULSS X0, X1
 26010  	ADDSS (DX), X1
 26011  	MOVSS X1, (DX)
 26012  	LEAQ  (AX)(CX*4), AX
 26013  	LEAQ  (DX)(BX*4), DX
 26014  	MOVSS (AX), X1
 26015  	MULSS X0, X1
 26016  	ADDSS (DX), X1
 26017  	MOVSS X1, (DX)
 26018  	LEAQ  (AX)(CX*4), AX
 26019  	LEAQ  (DX)(BX*4), DX
 26020  	MOVSS (AX), X1
 26021  	MULSS X0, X1
 26022  	ADDSS (DX), X1
 26023  	MOVSS X1, (DX)
 26024  	LEAQ  (AX)(CX*4), AX
 26025  	LEAQ  (DX)(BX*4), DX
 26026  	SUBQ  $0x04, SI
 26027  
 26028  check_limit_unroll:
 26029  	CMPQ SI, $0x04
 26030  	JHS  loop_unroll
 26031  	JMP  check_limit
 26032  
 26033  loop:
 26034  	MOVSS (AX), X1
 26035  	MULSS X0, X1
 26036  	ADDSS (DX), X1
 26037  	MOVSS X1, (DX)
 26038  	DECQ  SI
 26039  	LEAQ  (AX)(CX*4), AX
 26040  	LEAQ  (DX)(BX*4), DX
 26041  
 26042  check_limit:
 26043  	CMPQ SI, $0x00
 26044  	JHI  loop
 26045  	RET
 26046  
 26047  // func AmdAxpyPointerLoopX_V4A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26048  // Requires: SSE
 26049  TEXT ·AmdAxpyPointerLoopX_V4A9U4(SB), NOSPLIT, $0-48
 26050  	MOVSS alpha+0(FP), X0
 26051  	MOVQ  xs+8(FP), AX
 26052  	MOVQ  incx+16(FP), CX
 26053  	MOVQ  ys+24(FP), DX
 26054  	MOVQ  incy+32(FP), BX
 26055  	MOVQ  n+40(FP), SI
 26056  	JMP   check_limit_unroll
 26057  	PCALIGN $0x08
 26058  	NOP
 26059  
 26060  loop_unroll:
 26061  	MOVSS (AX), X1
 26062  	MULSS X0, X1
 26063  	ADDSS (DX), X1
 26064  	MOVSS X1, (DX)
 26065  	LEAQ  (AX)(CX*4), AX
 26066  	LEAQ  (DX)(BX*4), DX
 26067  	MOVSS (AX), X1
 26068  	MULSS X0, X1
 26069  	ADDSS (DX), X1
 26070  	MOVSS X1, (DX)
 26071  	LEAQ  (AX)(CX*4), AX
 26072  	LEAQ  (DX)(BX*4), DX
 26073  	MOVSS (AX), X1
 26074  	MULSS X0, X1
 26075  	ADDSS (DX), X1
 26076  	MOVSS X1, (DX)
 26077  	LEAQ  (AX)(CX*4), AX
 26078  	LEAQ  (DX)(BX*4), DX
 26079  	MOVSS (AX), X1
 26080  	MULSS X0, X1
 26081  	ADDSS (DX), X1
 26082  	MOVSS X1, (DX)
 26083  	LEAQ  (AX)(CX*4), AX
 26084  	LEAQ  (DX)(BX*4), DX
 26085  	SUBQ  $0x04, SI
 26086  
 26087  check_limit_unroll:
 26088  	CMPQ SI, $0x04
 26089  	JHS  loop_unroll
 26090  	JMP  check_limit
 26091  
 26092  loop:
 26093  	MOVSS (AX), X1
 26094  	MULSS X0, X1
 26095  	ADDSS (DX), X1
 26096  	MOVSS X1, (DX)
 26097  	DECQ  SI
 26098  	LEAQ  (AX)(CX*4), AX
 26099  	LEAQ  (DX)(BX*4), DX
 26100  
 26101  check_limit:
 26102  	CMPQ SI, $0x00
 26103  	JHI  loop
 26104  	RET
 26105  
 26106  // func AmdAxpyPointerLoopX_V5A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26107  // Requires: SSE
 26108  TEXT ·AmdAxpyPointerLoopX_V5A9U4(SB), NOSPLIT, $0-48
 26109  	MOVSS alpha+0(FP), X0
 26110  	MOVQ  xs+8(FP), AX
 26111  	MOVQ  incx+16(FP), CX
 26112  	MOVQ  ys+24(FP), DX
 26113  	MOVQ  incy+32(FP), BX
 26114  	MOVQ  n+40(FP), SI
 26115  	JMP   check_limit_unroll
 26116  	PCALIGN $0x08
 26117  	NOP
 26118  
 26119  loop_unroll:
 26120  	MOVSS (AX), X1
 26121  	MULSS X0, X1
 26122  	ADDSS (DX), X1
 26123  	MOVSS X1, (DX)
 26124  	LEAQ  (AX)(CX*4), AX
 26125  	LEAQ  (DX)(BX*4), DX
 26126  	MOVSS (AX), X1
 26127  	MULSS X0, X1
 26128  	ADDSS (DX), X1
 26129  	MOVSS X1, (DX)
 26130  	LEAQ  (AX)(CX*4), AX
 26131  	LEAQ  (DX)(BX*4), DX
 26132  	MOVSS (AX), X1
 26133  	MULSS X0, X1
 26134  	ADDSS (DX), X1
 26135  	MOVSS X1, (DX)
 26136  	LEAQ  (AX)(CX*4), AX
 26137  	LEAQ  (DX)(BX*4), DX
 26138  	MOVSS (AX), X1
 26139  	MULSS X0, X1
 26140  	ADDSS (DX), X1
 26141  	MOVSS X1, (DX)
 26142  	LEAQ  (AX)(CX*4), AX
 26143  	LEAQ  (DX)(BX*4), DX
 26144  	SUBQ  $0x04, SI
 26145  
 26146  check_limit_unroll:
 26147  	CMPQ SI, $0x04
 26148  	JHS  loop_unroll
 26149  	JMP  check_limit
 26150  
 26151  loop:
 26152  	MOVSS (AX), X1
 26153  	MULSS X0, X1
 26154  	ADDSS (DX), X1
 26155  	MOVSS X1, (DX)
 26156  	DECQ  SI
 26157  	LEAQ  (AX)(CX*4), AX
 26158  	LEAQ  (DX)(BX*4), DX
 26159  
 26160  check_limit:
 26161  	CMPQ SI, $0x00
 26162  	JHI  loop
 26163  	RET
 26164  
 26165  // func AmdAxpyPointerLoopX_V0A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26166  // Requires: SSE
 26167  TEXT ·AmdAxpyPointerLoopX_V0A10U4(SB), NOSPLIT, $0-48
 26168  	MOVSS alpha+0(FP), X0
 26169  	MOVQ  xs+8(FP), AX
 26170  	MOVQ  incx+16(FP), CX
 26171  	MOVQ  ys+24(FP), DX
 26172  	MOVQ  incy+32(FP), BX
 26173  	MOVQ  n+40(FP), SI
 26174  	JMP   check_limit_unroll
 26175  	PCALIGN $0x08
 26176  	NOP
 26177  	NOP
 26178  
 26179  loop_unroll:
 26180  	MOVSS (AX), X1
 26181  	MULSS X0, X1
 26182  	ADDSS (DX), X1
 26183  	MOVSS X1, (DX)
 26184  	LEAQ  (AX)(CX*4), AX
 26185  	LEAQ  (DX)(BX*4), DX
 26186  	MOVSS (AX), X1
 26187  	MULSS X0, X1
 26188  	ADDSS (DX), X1
 26189  	MOVSS X1, (DX)
 26190  	LEAQ  (AX)(CX*4), AX
 26191  	LEAQ  (DX)(BX*4), DX
 26192  	MOVSS (AX), X1
 26193  	MULSS X0, X1
 26194  	ADDSS (DX), X1
 26195  	MOVSS X1, (DX)
 26196  	LEAQ  (AX)(CX*4), AX
 26197  	LEAQ  (DX)(BX*4), DX
 26198  	MOVSS (AX), X1
 26199  	MULSS X0, X1
 26200  	ADDSS (DX), X1
 26201  	MOVSS X1, (DX)
 26202  	LEAQ  (AX)(CX*4), AX
 26203  	LEAQ  (DX)(BX*4), DX
 26204  	SUBQ  $0x04, SI
 26205  
 26206  check_limit_unroll:
 26207  	CMPQ SI, $0x04
 26208  	JHS  loop_unroll
 26209  	JMP  check_limit
 26210  
 26211  loop:
 26212  	MOVSS (AX), X1
 26213  	MULSS X0, X1
 26214  	ADDSS (DX), X1
 26215  	MOVSS X1, (DX)
 26216  	DECQ  SI
 26217  	LEAQ  (AX)(CX*4), AX
 26218  	LEAQ  (DX)(BX*4), DX
 26219  
 26220  check_limit:
 26221  	CMPQ SI, $0x00
 26222  	JHI  loop
 26223  	RET
 26224  
 26225  // func AmdAxpyPointerLoopX_V1A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26226  // Requires: SSE
 26227  TEXT ·AmdAxpyPointerLoopX_V1A10U4(SB), NOSPLIT, $0-48
 26228  	MOVSS alpha+0(FP), X0
 26229  	MOVQ  xs+8(FP), AX
 26230  	MOVQ  incx+16(FP), CX
 26231  	MOVQ  ys+24(FP), DX
 26232  	MOVQ  incy+32(FP), BX
 26233  	MOVQ  n+40(FP), SI
 26234  	JMP   check_limit_unroll
 26235  	PCALIGN $0x08
 26236  	NOP
 26237  	NOP
 26238  
 26239  loop_unroll:
 26240  	MOVSS (AX), X1
 26241  	MULSS X0, X1
 26242  	ADDSS (DX), X1
 26243  	MOVSS X1, (DX)
 26244  	LEAQ  (AX)(CX*4), AX
 26245  	LEAQ  (DX)(BX*4), DX
 26246  	MOVSS (AX), X1
 26247  	MULSS X0, X1
 26248  	ADDSS (DX), X1
 26249  	MOVSS X1, (DX)
 26250  	LEAQ  (AX)(CX*4), AX
 26251  	LEAQ  (DX)(BX*4), DX
 26252  	MOVSS (AX), X1
 26253  	MULSS X0, X1
 26254  	ADDSS (DX), X1
 26255  	MOVSS X1, (DX)
 26256  	LEAQ  (AX)(CX*4), AX
 26257  	LEAQ  (DX)(BX*4), DX
 26258  	MOVSS (AX), X1
 26259  	MULSS X0, X1
 26260  	ADDSS (DX), X1
 26261  	MOVSS X1, (DX)
 26262  	LEAQ  (AX)(CX*4), AX
 26263  	LEAQ  (DX)(BX*4), DX
 26264  	SUBQ  $0x04, SI
 26265  
 26266  check_limit_unroll:
 26267  	CMPQ SI, $0x04
 26268  	JHS  loop_unroll
 26269  	JMP  check_limit
 26270  
 26271  loop:
 26272  	MOVSS (AX), X1
 26273  	MULSS X0, X1
 26274  	ADDSS (DX), X1
 26275  	MOVSS X1, (DX)
 26276  	DECQ  SI
 26277  	LEAQ  (AX)(CX*4), AX
 26278  	LEAQ  (DX)(BX*4), DX
 26279  
 26280  check_limit:
 26281  	CMPQ SI, $0x00
 26282  	JHI  loop
 26283  	RET
 26284  
 26285  // func AmdAxpyPointerLoopX_V2A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26286  // Requires: SSE
 26287  TEXT ·AmdAxpyPointerLoopX_V2A10U4(SB), NOSPLIT, $0-48
 26288  	MOVSS alpha+0(FP), X0
 26289  	MOVQ  xs+8(FP), AX
 26290  	MOVQ  incx+16(FP), CX
 26291  	MOVQ  ys+24(FP), DX
 26292  	MOVQ  incy+32(FP), BX
 26293  	MOVQ  n+40(FP), SI
 26294  	JMP   check_limit_unroll
 26295  	PCALIGN $0x08
 26296  	NOP
 26297  	NOP
 26298  
 26299  loop_unroll:
 26300  	MOVSS (AX), X1
 26301  	MULSS X0, X1
 26302  	ADDSS (DX), X1
 26303  	MOVSS X1, (DX)
 26304  	LEAQ  (AX)(CX*4), AX
 26305  	LEAQ  (DX)(BX*4), DX
 26306  	MOVSS (AX), X1
 26307  	MULSS X0, X1
 26308  	ADDSS (DX), X1
 26309  	MOVSS X1, (DX)
 26310  	LEAQ  (AX)(CX*4), AX
 26311  	LEAQ  (DX)(BX*4), DX
 26312  	MOVSS (AX), X1
 26313  	MULSS X0, X1
 26314  	ADDSS (DX), X1
 26315  	MOVSS X1, (DX)
 26316  	LEAQ  (AX)(CX*4), AX
 26317  	LEAQ  (DX)(BX*4), DX
 26318  	MOVSS (AX), X1
 26319  	MULSS X0, X1
 26320  	ADDSS (DX), X1
 26321  	MOVSS X1, (DX)
 26322  	LEAQ  (AX)(CX*4), AX
 26323  	LEAQ  (DX)(BX*4), DX
 26324  	SUBQ  $0x04, SI
 26325  
 26326  check_limit_unroll:
 26327  	CMPQ SI, $0x04
 26328  	JHS  loop_unroll
 26329  	JMP  check_limit
 26330  
 26331  loop:
 26332  	MOVSS (AX), X1
 26333  	MULSS X0, X1
 26334  	ADDSS (DX), X1
 26335  	MOVSS X1, (DX)
 26336  	DECQ  SI
 26337  	LEAQ  (AX)(CX*4), AX
 26338  	LEAQ  (DX)(BX*4), DX
 26339  
 26340  check_limit:
 26341  	CMPQ SI, $0x00
 26342  	JHI  loop
 26343  	RET
 26344  
 26345  // func AmdAxpyPointerLoopX_V3A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26346  // Requires: SSE
 26347  TEXT ·AmdAxpyPointerLoopX_V3A10U4(SB), NOSPLIT, $0-48
 26348  	MOVSS alpha+0(FP), X0
 26349  	MOVQ  xs+8(FP), AX
 26350  	MOVQ  incx+16(FP), CX
 26351  	MOVQ  ys+24(FP), DX
 26352  	MOVQ  incy+32(FP), BX
 26353  	MOVQ  n+40(FP), SI
 26354  	JMP   check_limit_unroll
 26355  	PCALIGN $0x08
 26356  	NOP
 26357  	NOP
 26358  
 26359  loop_unroll:
 26360  	MOVSS (AX), X1
 26361  	MULSS X0, X1
 26362  	ADDSS (DX), X1
 26363  	MOVSS X1, (DX)
 26364  	LEAQ  (AX)(CX*4), AX
 26365  	LEAQ  (DX)(BX*4), DX
 26366  	MOVSS (AX), X1
 26367  	MULSS X0, X1
 26368  	ADDSS (DX), X1
 26369  	MOVSS X1, (DX)
 26370  	LEAQ  (AX)(CX*4), AX
 26371  	LEAQ  (DX)(BX*4), DX
 26372  	MOVSS (AX), X1
 26373  	MULSS X0, X1
 26374  	ADDSS (DX), X1
 26375  	MOVSS X1, (DX)
 26376  	LEAQ  (AX)(CX*4), AX
 26377  	LEAQ  (DX)(BX*4), DX
 26378  	MOVSS (AX), X1
 26379  	MULSS X0, X1
 26380  	ADDSS (DX), X1
 26381  	MOVSS X1, (DX)
 26382  	LEAQ  (AX)(CX*4), AX
 26383  	LEAQ  (DX)(BX*4), DX
 26384  	SUBQ  $0x04, SI
 26385  
 26386  check_limit_unroll:
 26387  	CMPQ SI, $0x04
 26388  	JHS  loop_unroll
 26389  	JMP  check_limit
 26390  
 26391  loop:
 26392  	MOVSS (AX), X1
 26393  	MULSS X0, X1
 26394  	ADDSS (DX), X1
 26395  	MOVSS X1, (DX)
 26396  	DECQ  SI
 26397  	LEAQ  (AX)(CX*4), AX
 26398  	LEAQ  (DX)(BX*4), DX
 26399  
 26400  check_limit:
 26401  	CMPQ SI, $0x00
 26402  	JHI  loop
 26403  	RET
 26404  
 26405  // func AmdAxpyPointerLoopX_V4A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26406  // Requires: SSE
 26407  TEXT ·AmdAxpyPointerLoopX_V4A10U4(SB), NOSPLIT, $0-48
 26408  	MOVSS alpha+0(FP), X0
 26409  	MOVQ  xs+8(FP), AX
 26410  	MOVQ  incx+16(FP), CX
 26411  	MOVQ  ys+24(FP), DX
 26412  	MOVQ  incy+32(FP), BX
 26413  	MOVQ  n+40(FP), SI
 26414  	JMP   check_limit_unroll
 26415  	PCALIGN $0x08
 26416  	NOP
 26417  	NOP
 26418  
 26419  loop_unroll:
 26420  	MOVSS (AX), X1
 26421  	MULSS X0, X1
 26422  	ADDSS (DX), X1
 26423  	MOVSS X1, (DX)
 26424  	LEAQ  (AX)(CX*4), AX
 26425  	LEAQ  (DX)(BX*4), DX
 26426  	MOVSS (AX), X1
 26427  	MULSS X0, X1
 26428  	ADDSS (DX), X1
 26429  	MOVSS X1, (DX)
 26430  	LEAQ  (AX)(CX*4), AX
 26431  	LEAQ  (DX)(BX*4), DX
 26432  	MOVSS (AX), X1
 26433  	MULSS X0, X1
 26434  	ADDSS (DX), X1
 26435  	MOVSS X1, (DX)
 26436  	LEAQ  (AX)(CX*4), AX
 26437  	LEAQ  (DX)(BX*4), DX
 26438  	MOVSS (AX), X1
 26439  	MULSS X0, X1
 26440  	ADDSS (DX), X1
 26441  	MOVSS X1, (DX)
 26442  	LEAQ  (AX)(CX*4), AX
 26443  	LEAQ  (DX)(BX*4), DX
 26444  	SUBQ  $0x04, SI
 26445  
 26446  check_limit_unroll:
 26447  	CMPQ SI, $0x04
 26448  	JHS  loop_unroll
 26449  	JMP  check_limit
 26450  
 26451  loop:
 26452  	MOVSS (AX), X1
 26453  	MULSS X0, X1
 26454  	ADDSS (DX), X1
 26455  	MOVSS X1, (DX)
 26456  	DECQ  SI
 26457  	LEAQ  (AX)(CX*4), AX
 26458  	LEAQ  (DX)(BX*4), DX
 26459  
 26460  check_limit:
 26461  	CMPQ SI, $0x00
 26462  	JHI  loop
 26463  	RET
 26464  
 26465  // func AmdAxpyPointerLoopX_V5A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26466  // Requires: SSE
 26467  TEXT ·AmdAxpyPointerLoopX_V5A10U4(SB), NOSPLIT, $0-48
 26468  	MOVSS alpha+0(FP), X0
 26469  	MOVQ  xs+8(FP), AX
 26470  	MOVQ  incx+16(FP), CX
 26471  	MOVQ  ys+24(FP), DX
 26472  	MOVQ  incy+32(FP), BX
 26473  	MOVQ  n+40(FP), SI
 26474  	JMP   check_limit_unroll
 26475  	PCALIGN $0x08
 26476  	NOP
 26477  	NOP
 26478  
 26479  loop_unroll:
 26480  	MOVSS (AX), X1
 26481  	MULSS X0, X1
 26482  	ADDSS (DX), X1
 26483  	MOVSS X1, (DX)
 26484  	LEAQ  (AX)(CX*4), AX
 26485  	LEAQ  (DX)(BX*4), DX
 26486  	MOVSS (AX), X1
 26487  	MULSS X0, X1
 26488  	ADDSS (DX), X1
 26489  	MOVSS X1, (DX)
 26490  	LEAQ  (AX)(CX*4), AX
 26491  	LEAQ  (DX)(BX*4), DX
 26492  	MOVSS (AX), X1
 26493  	MULSS X0, X1
 26494  	ADDSS (DX), X1
 26495  	MOVSS X1, (DX)
 26496  	LEAQ  (AX)(CX*4), AX
 26497  	LEAQ  (DX)(BX*4), DX
 26498  	MOVSS (AX), X1
 26499  	MULSS X0, X1
 26500  	ADDSS (DX), X1
 26501  	MOVSS X1, (DX)
 26502  	LEAQ  (AX)(CX*4), AX
 26503  	LEAQ  (DX)(BX*4), DX
 26504  	SUBQ  $0x04, SI
 26505  
 26506  check_limit_unroll:
 26507  	CMPQ SI, $0x04
 26508  	JHS  loop_unroll
 26509  	JMP  check_limit
 26510  
 26511  loop:
 26512  	MOVSS (AX), X1
 26513  	MULSS X0, X1
 26514  	ADDSS (DX), X1
 26515  	MOVSS X1, (DX)
 26516  	DECQ  SI
 26517  	LEAQ  (AX)(CX*4), AX
 26518  	LEAQ  (DX)(BX*4), DX
 26519  
 26520  check_limit:
 26521  	CMPQ SI, $0x00
 26522  	JHI  loop
 26523  	RET
 26524  
 26525  // func AmdAxpyPointerLoopX_V0A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26526  // Requires: SSE
 26527  TEXT ·AmdAxpyPointerLoopX_V0A11U4(SB), NOSPLIT, $0-48
 26528  	MOVSS alpha+0(FP), X0
 26529  	MOVQ  xs+8(FP), AX
 26530  	MOVQ  incx+16(FP), CX
 26531  	MOVQ  ys+24(FP), DX
 26532  	MOVQ  incy+32(FP), BX
 26533  	MOVQ  n+40(FP), SI
 26534  	JMP   check_limit_unroll
 26535  	PCALIGN $0x08
 26536  	NOP
 26537  	NOP
 26538  	NOP
 26539  
 26540  loop_unroll:
 26541  	MOVSS (AX), X1
 26542  	MULSS X0, X1
 26543  	ADDSS (DX), X1
 26544  	MOVSS X1, (DX)
 26545  	LEAQ  (AX)(CX*4), AX
 26546  	LEAQ  (DX)(BX*4), DX
 26547  	MOVSS (AX), X1
 26548  	MULSS X0, X1
 26549  	ADDSS (DX), X1
 26550  	MOVSS X1, (DX)
 26551  	LEAQ  (AX)(CX*4), AX
 26552  	LEAQ  (DX)(BX*4), DX
 26553  	MOVSS (AX), X1
 26554  	MULSS X0, X1
 26555  	ADDSS (DX), X1
 26556  	MOVSS X1, (DX)
 26557  	LEAQ  (AX)(CX*4), AX
 26558  	LEAQ  (DX)(BX*4), DX
 26559  	MOVSS (AX), X1
 26560  	MULSS X0, X1
 26561  	ADDSS (DX), X1
 26562  	MOVSS X1, (DX)
 26563  	LEAQ  (AX)(CX*4), AX
 26564  	LEAQ  (DX)(BX*4), DX
 26565  	SUBQ  $0x04, SI
 26566  
 26567  check_limit_unroll:
 26568  	CMPQ SI, $0x04
 26569  	JHS  loop_unroll
 26570  	JMP  check_limit
 26571  
 26572  loop:
 26573  	MOVSS (AX), X1
 26574  	MULSS X0, X1
 26575  	ADDSS (DX), X1
 26576  	MOVSS X1, (DX)
 26577  	DECQ  SI
 26578  	LEAQ  (AX)(CX*4), AX
 26579  	LEAQ  (DX)(BX*4), DX
 26580  
 26581  check_limit:
 26582  	CMPQ SI, $0x00
 26583  	JHI  loop
 26584  	RET
 26585  
 26586  // func AmdAxpyPointerLoopX_V1A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26587  // Requires: SSE
 26588  TEXT ·AmdAxpyPointerLoopX_V1A11U4(SB), NOSPLIT, $0-48
 26589  	MOVSS alpha+0(FP), X0
 26590  	MOVQ  xs+8(FP), AX
 26591  	MOVQ  incx+16(FP), CX
 26592  	MOVQ  ys+24(FP), DX
 26593  	MOVQ  incy+32(FP), BX
 26594  	MOVQ  n+40(FP), SI
 26595  	JMP   check_limit_unroll
 26596  	PCALIGN $0x08
 26597  	NOP
 26598  	NOP
 26599  	NOP
 26600  
 26601  loop_unroll:
 26602  	MOVSS (AX), X1
 26603  	MULSS X0, X1
 26604  	ADDSS (DX), X1
 26605  	MOVSS X1, (DX)
 26606  	LEAQ  (AX)(CX*4), AX
 26607  	LEAQ  (DX)(BX*4), DX
 26608  	MOVSS (AX), X1
 26609  	MULSS X0, X1
 26610  	ADDSS (DX), X1
 26611  	MOVSS X1, (DX)
 26612  	LEAQ  (AX)(CX*4), AX
 26613  	LEAQ  (DX)(BX*4), DX
 26614  	MOVSS (AX), X1
 26615  	MULSS X0, X1
 26616  	ADDSS (DX), X1
 26617  	MOVSS X1, (DX)
 26618  	LEAQ  (AX)(CX*4), AX
 26619  	LEAQ  (DX)(BX*4), DX
 26620  	MOVSS (AX), X1
 26621  	MULSS X0, X1
 26622  	ADDSS (DX), X1
 26623  	MOVSS X1, (DX)
 26624  	LEAQ  (AX)(CX*4), AX
 26625  	LEAQ  (DX)(BX*4), DX
 26626  	SUBQ  $0x04, SI
 26627  
 26628  check_limit_unroll:
 26629  	CMPQ SI, $0x04
 26630  	JHS  loop_unroll
 26631  	JMP  check_limit
 26632  
 26633  loop:
 26634  	MOVSS (AX), X1
 26635  	MULSS X0, X1
 26636  	ADDSS (DX), X1
 26637  	MOVSS X1, (DX)
 26638  	DECQ  SI
 26639  	LEAQ  (AX)(CX*4), AX
 26640  	LEAQ  (DX)(BX*4), DX
 26641  
 26642  check_limit:
 26643  	CMPQ SI, $0x00
 26644  	JHI  loop
 26645  	RET
 26646  
 26647  // func AmdAxpyPointerLoopX_V2A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26648  // Requires: SSE
 26649  TEXT ·AmdAxpyPointerLoopX_V2A11U4(SB), NOSPLIT, $0-48
 26650  	MOVSS alpha+0(FP), X0
 26651  	MOVQ  xs+8(FP), AX
 26652  	MOVQ  incx+16(FP), CX
 26653  	MOVQ  ys+24(FP), DX
 26654  	MOVQ  incy+32(FP), BX
 26655  	MOVQ  n+40(FP), SI
 26656  	JMP   check_limit_unroll
 26657  	PCALIGN $0x08
 26658  	NOP
 26659  	NOP
 26660  	NOP
 26661  
 26662  loop_unroll:
 26663  	MOVSS (AX), X1
 26664  	MULSS X0, X1
 26665  	ADDSS (DX), X1
 26666  	MOVSS X1, (DX)
 26667  	LEAQ  (AX)(CX*4), AX
 26668  	LEAQ  (DX)(BX*4), DX
 26669  	MOVSS (AX), X1
 26670  	MULSS X0, X1
 26671  	ADDSS (DX), X1
 26672  	MOVSS X1, (DX)
 26673  	LEAQ  (AX)(CX*4), AX
 26674  	LEAQ  (DX)(BX*4), DX
 26675  	MOVSS (AX), X1
 26676  	MULSS X0, X1
 26677  	ADDSS (DX), X1
 26678  	MOVSS X1, (DX)
 26679  	LEAQ  (AX)(CX*4), AX
 26680  	LEAQ  (DX)(BX*4), DX
 26681  	MOVSS (AX), X1
 26682  	MULSS X0, X1
 26683  	ADDSS (DX), X1
 26684  	MOVSS X1, (DX)
 26685  	LEAQ  (AX)(CX*4), AX
 26686  	LEAQ  (DX)(BX*4), DX
 26687  	SUBQ  $0x04, SI
 26688  
 26689  check_limit_unroll:
 26690  	CMPQ SI, $0x04
 26691  	JHS  loop_unroll
 26692  	JMP  check_limit
 26693  
 26694  loop:
 26695  	MOVSS (AX), X1
 26696  	MULSS X0, X1
 26697  	ADDSS (DX), X1
 26698  	MOVSS X1, (DX)
 26699  	DECQ  SI
 26700  	LEAQ  (AX)(CX*4), AX
 26701  	LEAQ  (DX)(BX*4), DX
 26702  
 26703  check_limit:
 26704  	CMPQ SI, $0x00
 26705  	JHI  loop
 26706  	RET
 26707  
 26708  // func AmdAxpyPointerLoopX_V3A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26709  // Requires: SSE
 26710  TEXT ·AmdAxpyPointerLoopX_V3A11U4(SB), NOSPLIT, $0-48
 26711  	MOVSS alpha+0(FP), X0
 26712  	MOVQ  xs+8(FP), AX
 26713  	MOVQ  incx+16(FP), CX
 26714  	MOVQ  ys+24(FP), DX
 26715  	MOVQ  incy+32(FP), BX
 26716  	MOVQ  n+40(FP), SI
 26717  	JMP   check_limit_unroll
 26718  	PCALIGN $0x08
 26719  	NOP
 26720  	NOP
 26721  	NOP
 26722  
 26723  loop_unroll:
 26724  	MOVSS (AX), X1
 26725  	MULSS X0, X1
 26726  	ADDSS (DX), X1
 26727  	MOVSS X1, (DX)
 26728  	LEAQ  (AX)(CX*4), AX
 26729  	LEAQ  (DX)(BX*4), DX
 26730  	MOVSS (AX), X1
 26731  	MULSS X0, X1
 26732  	ADDSS (DX), X1
 26733  	MOVSS X1, (DX)
 26734  	LEAQ  (AX)(CX*4), AX
 26735  	LEAQ  (DX)(BX*4), DX
 26736  	MOVSS (AX), X1
 26737  	MULSS X0, X1
 26738  	ADDSS (DX), X1
 26739  	MOVSS X1, (DX)
 26740  	LEAQ  (AX)(CX*4), AX
 26741  	LEAQ  (DX)(BX*4), DX
 26742  	MOVSS (AX), X1
 26743  	MULSS X0, X1
 26744  	ADDSS (DX), X1
 26745  	MOVSS X1, (DX)
 26746  	LEAQ  (AX)(CX*4), AX
 26747  	LEAQ  (DX)(BX*4), DX
 26748  	SUBQ  $0x04, SI
 26749  
 26750  check_limit_unroll:
 26751  	CMPQ SI, $0x04
 26752  	JHS  loop_unroll
 26753  	JMP  check_limit
 26754  
 26755  loop:
 26756  	MOVSS (AX), X1
 26757  	MULSS X0, X1
 26758  	ADDSS (DX), X1
 26759  	MOVSS X1, (DX)
 26760  	DECQ  SI
 26761  	LEAQ  (AX)(CX*4), AX
 26762  	LEAQ  (DX)(BX*4), DX
 26763  
 26764  check_limit:
 26765  	CMPQ SI, $0x00
 26766  	JHI  loop
 26767  	RET
 26768  
 26769  // func AmdAxpyPointerLoopX_V4A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26770  // Requires: SSE
 26771  TEXT ·AmdAxpyPointerLoopX_V4A11U4(SB), NOSPLIT, $0-48
 26772  	MOVSS alpha+0(FP), X0
 26773  	MOVQ  xs+8(FP), AX
 26774  	MOVQ  incx+16(FP), CX
 26775  	MOVQ  ys+24(FP), DX
 26776  	MOVQ  incy+32(FP), BX
 26777  	MOVQ  n+40(FP), SI
 26778  	JMP   check_limit_unroll
 26779  	PCALIGN $0x08
 26780  	NOP
 26781  	NOP
 26782  	NOP
 26783  
 26784  loop_unroll:
 26785  	MOVSS (AX), X1
 26786  	MULSS X0, X1
 26787  	ADDSS (DX), X1
 26788  	MOVSS X1, (DX)
 26789  	LEAQ  (AX)(CX*4), AX
 26790  	LEAQ  (DX)(BX*4), DX
 26791  	MOVSS (AX), X1
 26792  	MULSS X0, X1
 26793  	ADDSS (DX), X1
 26794  	MOVSS X1, (DX)
 26795  	LEAQ  (AX)(CX*4), AX
 26796  	LEAQ  (DX)(BX*4), DX
 26797  	MOVSS (AX), X1
 26798  	MULSS X0, X1
 26799  	ADDSS (DX), X1
 26800  	MOVSS X1, (DX)
 26801  	LEAQ  (AX)(CX*4), AX
 26802  	LEAQ  (DX)(BX*4), DX
 26803  	MOVSS (AX), X1
 26804  	MULSS X0, X1
 26805  	ADDSS (DX), X1
 26806  	MOVSS X1, (DX)
 26807  	LEAQ  (AX)(CX*4), AX
 26808  	LEAQ  (DX)(BX*4), DX
 26809  	SUBQ  $0x04, SI
 26810  
 26811  check_limit_unroll:
 26812  	CMPQ SI, $0x04
 26813  	JHS  loop_unroll
 26814  	JMP  check_limit
 26815  
 26816  loop:
 26817  	MOVSS (AX), X1
 26818  	MULSS X0, X1
 26819  	ADDSS (DX), X1
 26820  	MOVSS X1, (DX)
 26821  	DECQ  SI
 26822  	LEAQ  (AX)(CX*4), AX
 26823  	LEAQ  (DX)(BX*4), DX
 26824  
 26825  check_limit:
 26826  	CMPQ SI, $0x00
 26827  	JHI  loop
 26828  	RET
 26829  
 26830  // func AmdAxpyPointerLoopX_V5A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26831  // Requires: SSE
 26832  TEXT ·AmdAxpyPointerLoopX_V5A11U4(SB), NOSPLIT, $0-48
 26833  	MOVSS alpha+0(FP), X0
 26834  	MOVQ  xs+8(FP), AX
 26835  	MOVQ  incx+16(FP), CX
 26836  	MOVQ  ys+24(FP), DX
 26837  	MOVQ  incy+32(FP), BX
 26838  	MOVQ  n+40(FP), SI
 26839  	JMP   check_limit_unroll
 26840  	PCALIGN $0x08
 26841  	NOP
 26842  	NOP
 26843  	NOP
 26844  
 26845  loop_unroll:
 26846  	MOVSS (AX), X1
 26847  	MULSS X0, X1
 26848  	ADDSS (DX), X1
 26849  	MOVSS X1, (DX)
 26850  	LEAQ  (AX)(CX*4), AX
 26851  	LEAQ  (DX)(BX*4), DX
 26852  	MOVSS (AX), X1
 26853  	MULSS X0, X1
 26854  	ADDSS (DX), X1
 26855  	MOVSS X1, (DX)
 26856  	LEAQ  (AX)(CX*4), AX
 26857  	LEAQ  (DX)(BX*4), DX
 26858  	MOVSS (AX), X1
 26859  	MULSS X0, X1
 26860  	ADDSS (DX), X1
 26861  	MOVSS X1, (DX)
 26862  	LEAQ  (AX)(CX*4), AX
 26863  	LEAQ  (DX)(BX*4), DX
 26864  	MOVSS (AX), X1
 26865  	MULSS X0, X1
 26866  	ADDSS (DX), X1
 26867  	MOVSS X1, (DX)
 26868  	LEAQ  (AX)(CX*4), AX
 26869  	LEAQ  (DX)(BX*4), DX
 26870  	SUBQ  $0x04, SI
 26871  
 26872  check_limit_unroll:
 26873  	CMPQ SI, $0x04
 26874  	JHS  loop_unroll
 26875  	JMP  check_limit
 26876  
 26877  loop:
 26878  	MOVSS (AX), X1
 26879  	MULSS X0, X1
 26880  	ADDSS (DX), X1
 26881  	MOVSS X1, (DX)
 26882  	DECQ  SI
 26883  	LEAQ  (AX)(CX*4), AX
 26884  	LEAQ  (DX)(BX*4), DX
 26885  
 26886  check_limit:
 26887  	CMPQ SI, $0x00
 26888  	JHI  loop
 26889  	RET
 26890  
 26891  // func AmdAxpyPointerLoopX_V0A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26892  // Requires: SSE
 26893  TEXT ·AmdAxpyPointerLoopX_V0A12U4(SB), NOSPLIT, $0-48
 26894  	MOVSS alpha+0(FP), X0
 26895  	MOVQ  xs+8(FP), AX
 26896  	MOVQ  incx+16(FP), CX
 26897  	MOVQ  ys+24(FP), DX
 26898  	MOVQ  incy+32(FP), BX
 26899  	MOVQ  n+40(FP), SI
 26900  	JMP   check_limit_unroll
 26901  	PCALIGN $0x08
 26902  	NOP
 26903  	NOP
 26904  	NOP
 26905  	NOP
 26906  
 26907  loop_unroll:
 26908  	MOVSS (AX), X1
 26909  	MULSS X0, X1
 26910  	ADDSS (DX), X1
 26911  	MOVSS X1, (DX)
 26912  	LEAQ  (AX)(CX*4), AX
 26913  	LEAQ  (DX)(BX*4), DX
 26914  	MOVSS (AX), X1
 26915  	MULSS X0, X1
 26916  	ADDSS (DX), X1
 26917  	MOVSS X1, (DX)
 26918  	LEAQ  (AX)(CX*4), AX
 26919  	LEAQ  (DX)(BX*4), DX
 26920  	MOVSS (AX), X1
 26921  	MULSS X0, X1
 26922  	ADDSS (DX), X1
 26923  	MOVSS X1, (DX)
 26924  	LEAQ  (AX)(CX*4), AX
 26925  	LEAQ  (DX)(BX*4), DX
 26926  	MOVSS (AX), X1
 26927  	MULSS X0, X1
 26928  	ADDSS (DX), X1
 26929  	MOVSS X1, (DX)
 26930  	LEAQ  (AX)(CX*4), AX
 26931  	LEAQ  (DX)(BX*4), DX
 26932  	SUBQ  $0x04, SI
 26933  
 26934  check_limit_unroll:
 26935  	CMPQ SI, $0x04
 26936  	JHS  loop_unroll
 26937  	JMP  check_limit
 26938  
 26939  loop:
 26940  	MOVSS (AX), X1
 26941  	MULSS X0, X1
 26942  	ADDSS (DX), X1
 26943  	MOVSS X1, (DX)
 26944  	DECQ  SI
 26945  	LEAQ  (AX)(CX*4), AX
 26946  	LEAQ  (DX)(BX*4), DX
 26947  
 26948  check_limit:
 26949  	CMPQ SI, $0x00
 26950  	JHI  loop
 26951  	RET
 26952  
 26953  // func AmdAxpyPointerLoopX_V1A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 26954  // Requires: SSE
 26955  TEXT ·AmdAxpyPointerLoopX_V1A12U4(SB), NOSPLIT, $0-48
 26956  	MOVSS alpha+0(FP), X0
 26957  	MOVQ  xs+8(FP), AX
 26958  	MOVQ  incx+16(FP), CX
 26959  	MOVQ  ys+24(FP), DX
 26960  	MOVQ  incy+32(FP), BX
 26961  	MOVQ  n+40(FP), SI
 26962  	JMP   check_limit_unroll
 26963  	PCALIGN $0x08
 26964  	NOP
 26965  	NOP
 26966  	NOP
 26967  	NOP
 26968  
 26969  loop_unroll:
 26970  	MOVSS (AX), X1
 26971  	MULSS X0, X1
 26972  	ADDSS (DX), X1
 26973  	MOVSS X1, (DX)
 26974  	LEAQ  (AX)(CX*4), AX
 26975  	LEAQ  (DX)(BX*4), DX
 26976  	MOVSS (AX), X1
 26977  	MULSS X0, X1
 26978  	ADDSS (DX), X1
 26979  	MOVSS X1, (DX)
 26980  	LEAQ  (AX)(CX*4), AX
 26981  	LEAQ  (DX)(BX*4), DX
 26982  	MOVSS (AX), X1
 26983  	MULSS X0, X1
 26984  	ADDSS (DX), X1
 26985  	MOVSS X1, (DX)
 26986  	LEAQ  (AX)(CX*4), AX
 26987  	LEAQ  (DX)(BX*4), DX
 26988  	MOVSS (AX), X1
 26989  	MULSS X0, X1
 26990  	ADDSS (DX), X1
 26991  	MOVSS X1, (DX)
 26992  	LEAQ  (AX)(CX*4), AX
 26993  	LEAQ  (DX)(BX*4), DX
 26994  	SUBQ  $0x04, SI
 26995  
 26996  check_limit_unroll:
 26997  	CMPQ SI, $0x04
 26998  	JHS  loop_unroll
 26999  	JMP  check_limit
 27000  
 27001  loop:
 27002  	MOVSS (AX), X1
 27003  	MULSS X0, X1
 27004  	ADDSS (DX), X1
 27005  	MOVSS X1, (DX)
 27006  	DECQ  SI
 27007  	LEAQ  (AX)(CX*4), AX
 27008  	LEAQ  (DX)(BX*4), DX
 27009  
 27010  check_limit:
 27011  	CMPQ SI, $0x00
 27012  	JHI  loop
 27013  	RET
 27014  
 27015  // func AmdAxpyPointerLoopX_V2A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27016  // Requires: SSE
 27017  TEXT ·AmdAxpyPointerLoopX_V2A12U4(SB), NOSPLIT, $0-48
 27018  	MOVSS alpha+0(FP), X0
 27019  	MOVQ  xs+8(FP), AX
 27020  	MOVQ  incx+16(FP), CX
 27021  	MOVQ  ys+24(FP), DX
 27022  	MOVQ  incy+32(FP), BX
 27023  	MOVQ  n+40(FP), SI
 27024  	JMP   check_limit_unroll
 27025  	PCALIGN $0x08
 27026  	NOP
 27027  	NOP
 27028  	NOP
 27029  	NOP
 27030  
 27031  loop_unroll:
 27032  	MOVSS (AX), X1
 27033  	MULSS X0, X1
 27034  	ADDSS (DX), X1
 27035  	MOVSS X1, (DX)
 27036  	LEAQ  (AX)(CX*4), AX
 27037  	LEAQ  (DX)(BX*4), DX
 27038  	MOVSS (AX), X1
 27039  	MULSS X0, X1
 27040  	ADDSS (DX), X1
 27041  	MOVSS X1, (DX)
 27042  	LEAQ  (AX)(CX*4), AX
 27043  	LEAQ  (DX)(BX*4), DX
 27044  	MOVSS (AX), X1
 27045  	MULSS X0, X1
 27046  	ADDSS (DX), X1
 27047  	MOVSS X1, (DX)
 27048  	LEAQ  (AX)(CX*4), AX
 27049  	LEAQ  (DX)(BX*4), DX
 27050  	MOVSS (AX), X1
 27051  	MULSS X0, X1
 27052  	ADDSS (DX), X1
 27053  	MOVSS X1, (DX)
 27054  	LEAQ  (AX)(CX*4), AX
 27055  	LEAQ  (DX)(BX*4), DX
 27056  	SUBQ  $0x04, SI
 27057  
 27058  check_limit_unroll:
 27059  	CMPQ SI, $0x04
 27060  	JHS  loop_unroll
 27061  	JMP  check_limit
 27062  
 27063  loop:
 27064  	MOVSS (AX), X1
 27065  	MULSS X0, X1
 27066  	ADDSS (DX), X1
 27067  	MOVSS X1, (DX)
 27068  	DECQ  SI
 27069  	LEAQ  (AX)(CX*4), AX
 27070  	LEAQ  (DX)(BX*4), DX
 27071  
 27072  check_limit:
 27073  	CMPQ SI, $0x00
 27074  	JHI  loop
 27075  	RET
 27076  
 27077  // func AmdAxpyPointerLoopX_V3A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27078  // Requires: SSE
 27079  TEXT ·AmdAxpyPointerLoopX_V3A12U4(SB), NOSPLIT, $0-48
 27080  	MOVSS alpha+0(FP), X0
 27081  	MOVQ  xs+8(FP), AX
 27082  	MOVQ  incx+16(FP), CX
 27083  	MOVQ  ys+24(FP), DX
 27084  	MOVQ  incy+32(FP), BX
 27085  	MOVQ  n+40(FP), SI
 27086  	JMP   check_limit_unroll
 27087  	PCALIGN $0x08
 27088  	NOP
 27089  	NOP
 27090  	NOP
 27091  	NOP
 27092  
 27093  loop_unroll:
 27094  	MOVSS (AX), X1
 27095  	MULSS X0, X1
 27096  	ADDSS (DX), X1
 27097  	MOVSS X1, (DX)
 27098  	LEAQ  (AX)(CX*4), AX
 27099  	LEAQ  (DX)(BX*4), DX
 27100  	MOVSS (AX), X1
 27101  	MULSS X0, X1
 27102  	ADDSS (DX), X1
 27103  	MOVSS X1, (DX)
 27104  	LEAQ  (AX)(CX*4), AX
 27105  	LEAQ  (DX)(BX*4), DX
 27106  	MOVSS (AX), X1
 27107  	MULSS X0, X1
 27108  	ADDSS (DX), X1
 27109  	MOVSS X1, (DX)
 27110  	LEAQ  (AX)(CX*4), AX
 27111  	LEAQ  (DX)(BX*4), DX
 27112  	MOVSS (AX), X1
 27113  	MULSS X0, X1
 27114  	ADDSS (DX), X1
 27115  	MOVSS X1, (DX)
 27116  	LEAQ  (AX)(CX*4), AX
 27117  	LEAQ  (DX)(BX*4), DX
 27118  	SUBQ  $0x04, SI
 27119  
 27120  check_limit_unroll:
 27121  	CMPQ SI, $0x04
 27122  	JHS  loop_unroll
 27123  	JMP  check_limit
 27124  
 27125  loop:
 27126  	MOVSS (AX), X1
 27127  	MULSS X0, X1
 27128  	ADDSS (DX), X1
 27129  	MOVSS X1, (DX)
 27130  	DECQ  SI
 27131  	LEAQ  (AX)(CX*4), AX
 27132  	LEAQ  (DX)(BX*4), DX
 27133  
 27134  check_limit:
 27135  	CMPQ SI, $0x00
 27136  	JHI  loop
 27137  	RET
 27138  
 27139  // func AmdAxpyPointerLoopX_V4A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27140  // Requires: SSE
 27141  TEXT ·AmdAxpyPointerLoopX_V4A12U4(SB), NOSPLIT, $0-48
 27142  	MOVSS alpha+0(FP), X0
 27143  	MOVQ  xs+8(FP), AX
 27144  	MOVQ  incx+16(FP), CX
 27145  	MOVQ  ys+24(FP), DX
 27146  	MOVQ  incy+32(FP), BX
 27147  	MOVQ  n+40(FP), SI
 27148  	JMP   check_limit_unroll
 27149  	PCALIGN $0x08
 27150  	NOP
 27151  	NOP
 27152  	NOP
 27153  	NOP
 27154  
 27155  loop_unroll:
 27156  	MOVSS (AX), X1
 27157  	MULSS X0, X1
 27158  	ADDSS (DX), X1
 27159  	MOVSS X1, (DX)
 27160  	LEAQ  (AX)(CX*4), AX
 27161  	LEAQ  (DX)(BX*4), DX
 27162  	MOVSS (AX), X1
 27163  	MULSS X0, X1
 27164  	ADDSS (DX), X1
 27165  	MOVSS X1, (DX)
 27166  	LEAQ  (AX)(CX*4), AX
 27167  	LEAQ  (DX)(BX*4), DX
 27168  	MOVSS (AX), X1
 27169  	MULSS X0, X1
 27170  	ADDSS (DX), X1
 27171  	MOVSS X1, (DX)
 27172  	LEAQ  (AX)(CX*4), AX
 27173  	LEAQ  (DX)(BX*4), DX
 27174  	MOVSS (AX), X1
 27175  	MULSS X0, X1
 27176  	ADDSS (DX), X1
 27177  	MOVSS X1, (DX)
 27178  	LEAQ  (AX)(CX*4), AX
 27179  	LEAQ  (DX)(BX*4), DX
 27180  	SUBQ  $0x04, SI
 27181  
 27182  check_limit_unroll:
 27183  	CMPQ SI, $0x04
 27184  	JHS  loop_unroll
 27185  	JMP  check_limit
 27186  
 27187  loop:
 27188  	MOVSS (AX), X1
 27189  	MULSS X0, X1
 27190  	ADDSS (DX), X1
 27191  	MOVSS X1, (DX)
 27192  	DECQ  SI
 27193  	LEAQ  (AX)(CX*4), AX
 27194  	LEAQ  (DX)(BX*4), DX
 27195  
 27196  check_limit:
 27197  	CMPQ SI, $0x00
 27198  	JHI  loop
 27199  	RET
 27200  
 27201  // func AmdAxpyPointerLoopX_V5A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27202  // Requires: SSE
 27203  TEXT ·AmdAxpyPointerLoopX_V5A12U4(SB), NOSPLIT, $0-48
 27204  	MOVSS alpha+0(FP), X0
 27205  	MOVQ  xs+8(FP), AX
 27206  	MOVQ  incx+16(FP), CX
 27207  	MOVQ  ys+24(FP), DX
 27208  	MOVQ  incy+32(FP), BX
 27209  	MOVQ  n+40(FP), SI
 27210  	JMP   check_limit_unroll
 27211  	PCALIGN $0x08
 27212  	NOP
 27213  	NOP
 27214  	NOP
 27215  	NOP
 27216  
 27217  loop_unroll:
 27218  	MOVSS (AX), X1
 27219  	MULSS X0, X1
 27220  	ADDSS (DX), X1
 27221  	MOVSS X1, (DX)
 27222  	LEAQ  (AX)(CX*4), AX
 27223  	LEAQ  (DX)(BX*4), DX
 27224  	MOVSS (AX), X1
 27225  	MULSS X0, X1
 27226  	ADDSS (DX), X1
 27227  	MOVSS X1, (DX)
 27228  	LEAQ  (AX)(CX*4), AX
 27229  	LEAQ  (DX)(BX*4), DX
 27230  	MOVSS (AX), X1
 27231  	MULSS X0, X1
 27232  	ADDSS (DX), X1
 27233  	MOVSS X1, (DX)
 27234  	LEAQ  (AX)(CX*4), AX
 27235  	LEAQ  (DX)(BX*4), DX
 27236  	MOVSS (AX), X1
 27237  	MULSS X0, X1
 27238  	ADDSS (DX), X1
 27239  	MOVSS X1, (DX)
 27240  	LEAQ  (AX)(CX*4), AX
 27241  	LEAQ  (DX)(BX*4), DX
 27242  	SUBQ  $0x04, SI
 27243  
 27244  check_limit_unroll:
 27245  	CMPQ SI, $0x04
 27246  	JHS  loop_unroll
 27247  	JMP  check_limit
 27248  
 27249  loop:
 27250  	MOVSS (AX), X1
 27251  	MULSS X0, X1
 27252  	ADDSS (DX), X1
 27253  	MOVSS X1, (DX)
 27254  	DECQ  SI
 27255  	LEAQ  (AX)(CX*4), AX
 27256  	LEAQ  (DX)(BX*4), DX
 27257  
 27258  check_limit:
 27259  	CMPQ SI, $0x00
 27260  	JHI  loop
 27261  	RET
 27262  
 27263  // func AmdAxpyPointerLoopX_V0A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27264  // Requires: SSE
 27265  TEXT ·AmdAxpyPointerLoopX_V0A13U4(SB), NOSPLIT, $0-48
 27266  	MOVSS alpha+0(FP), X0
 27267  	MOVQ  xs+8(FP), AX
 27268  	MOVQ  incx+16(FP), CX
 27269  	MOVQ  ys+24(FP), DX
 27270  	MOVQ  incy+32(FP), BX
 27271  	MOVQ  n+40(FP), SI
 27272  	JMP   check_limit_unroll
 27273  	PCALIGN $0x08
 27274  	NOP
 27275  	NOP
 27276  	NOP
 27277  	NOP
 27278  	NOP
 27279  
 27280  loop_unroll:
 27281  	MOVSS (AX), X1
 27282  	MULSS X0, X1
 27283  	ADDSS (DX), X1
 27284  	MOVSS X1, (DX)
 27285  	LEAQ  (AX)(CX*4), AX
 27286  	LEAQ  (DX)(BX*4), DX
 27287  	MOVSS (AX), X1
 27288  	MULSS X0, X1
 27289  	ADDSS (DX), X1
 27290  	MOVSS X1, (DX)
 27291  	LEAQ  (AX)(CX*4), AX
 27292  	LEAQ  (DX)(BX*4), DX
 27293  	MOVSS (AX), X1
 27294  	MULSS X0, X1
 27295  	ADDSS (DX), X1
 27296  	MOVSS X1, (DX)
 27297  	LEAQ  (AX)(CX*4), AX
 27298  	LEAQ  (DX)(BX*4), DX
 27299  	MOVSS (AX), X1
 27300  	MULSS X0, X1
 27301  	ADDSS (DX), X1
 27302  	MOVSS X1, (DX)
 27303  	LEAQ  (AX)(CX*4), AX
 27304  	LEAQ  (DX)(BX*4), DX
 27305  	SUBQ  $0x04, SI
 27306  
 27307  check_limit_unroll:
 27308  	CMPQ SI, $0x04
 27309  	JHS  loop_unroll
 27310  	JMP  check_limit
 27311  
 27312  loop:
 27313  	MOVSS (AX), X1
 27314  	MULSS X0, X1
 27315  	ADDSS (DX), X1
 27316  	MOVSS X1, (DX)
 27317  	DECQ  SI
 27318  	LEAQ  (AX)(CX*4), AX
 27319  	LEAQ  (DX)(BX*4), DX
 27320  
 27321  check_limit:
 27322  	CMPQ SI, $0x00
 27323  	JHI  loop
 27324  	RET
 27325  
 27326  // func AmdAxpyPointerLoopX_V1A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27327  // Requires: SSE
 27328  TEXT ·AmdAxpyPointerLoopX_V1A13U4(SB), NOSPLIT, $0-48
 27329  	MOVSS alpha+0(FP), X0
 27330  	MOVQ  xs+8(FP), AX
 27331  	MOVQ  incx+16(FP), CX
 27332  	MOVQ  ys+24(FP), DX
 27333  	MOVQ  incy+32(FP), BX
 27334  	MOVQ  n+40(FP), SI
 27335  	JMP   check_limit_unroll
 27336  	PCALIGN $0x08
 27337  	NOP
 27338  	NOP
 27339  	NOP
 27340  	NOP
 27341  	NOP
 27342  
 27343  loop_unroll:
 27344  	MOVSS (AX), X1
 27345  	MULSS X0, X1
 27346  	ADDSS (DX), X1
 27347  	MOVSS X1, (DX)
 27348  	LEAQ  (AX)(CX*4), AX
 27349  	LEAQ  (DX)(BX*4), DX
 27350  	MOVSS (AX), X1
 27351  	MULSS X0, X1
 27352  	ADDSS (DX), X1
 27353  	MOVSS X1, (DX)
 27354  	LEAQ  (AX)(CX*4), AX
 27355  	LEAQ  (DX)(BX*4), DX
 27356  	MOVSS (AX), X1
 27357  	MULSS X0, X1
 27358  	ADDSS (DX), X1
 27359  	MOVSS X1, (DX)
 27360  	LEAQ  (AX)(CX*4), AX
 27361  	LEAQ  (DX)(BX*4), DX
 27362  	MOVSS (AX), X1
 27363  	MULSS X0, X1
 27364  	ADDSS (DX), X1
 27365  	MOVSS X1, (DX)
 27366  	LEAQ  (AX)(CX*4), AX
 27367  	LEAQ  (DX)(BX*4), DX
 27368  	SUBQ  $0x04, SI
 27369  
 27370  check_limit_unroll:
 27371  	CMPQ SI, $0x04
 27372  	JHS  loop_unroll
 27373  	JMP  check_limit
 27374  
 27375  loop:
 27376  	MOVSS (AX), X1
 27377  	MULSS X0, X1
 27378  	ADDSS (DX), X1
 27379  	MOVSS X1, (DX)
 27380  	DECQ  SI
 27381  	LEAQ  (AX)(CX*4), AX
 27382  	LEAQ  (DX)(BX*4), DX
 27383  
 27384  check_limit:
 27385  	CMPQ SI, $0x00
 27386  	JHI  loop
 27387  	RET
 27388  
 27389  // func AmdAxpyPointerLoopX_V2A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27390  // Requires: SSE
 27391  TEXT ·AmdAxpyPointerLoopX_V2A13U4(SB), NOSPLIT, $0-48
 27392  	MOVSS alpha+0(FP), X0
 27393  	MOVQ  xs+8(FP), AX
 27394  	MOVQ  incx+16(FP), CX
 27395  	MOVQ  ys+24(FP), DX
 27396  	MOVQ  incy+32(FP), BX
 27397  	MOVQ  n+40(FP), SI
 27398  	JMP   check_limit_unroll
 27399  	PCALIGN $0x08
 27400  	NOP
 27401  	NOP
 27402  	NOP
 27403  	NOP
 27404  	NOP
 27405  
 27406  loop_unroll:
 27407  	MOVSS (AX), X1
 27408  	MULSS X0, X1
 27409  	ADDSS (DX), X1
 27410  	MOVSS X1, (DX)
 27411  	LEAQ  (AX)(CX*4), AX
 27412  	LEAQ  (DX)(BX*4), DX
 27413  	MOVSS (AX), X1
 27414  	MULSS X0, X1
 27415  	ADDSS (DX), X1
 27416  	MOVSS X1, (DX)
 27417  	LEAQ  (AX)(CX*4), AX
 27418  	LEAQ  (DX)(BX*4), DX
 27419  	MOVSS (AX), X1
 27420  	MULSS X0, X1
 27421  	ADDSS (DX), X1
 27422  	MOVSS X1, (DX)
 27423  	LEAQ  (AX)(CX*4), AX
 27424  	LEAQ  (DX)(BX*4), DX
 27425  	MOVSS (AX), X1
 27426  	MULSS X0, X1
 27427  	ADDSS (DX), X1
 27428  	MOVSS X1, (DX)
 27429  	LEAQ  (AX)(CX*4), AX
 27430  	LEAQ  (DX)(BX*4), DX
 27431  	SUBQ  $0x04, SI
 27432  
 27433  check_limit_unroll:
 27434  	CMPQ SI, $0x04
 27435  	JHS  loop_unroll
 27436  	JMP  check_limit
 27437  
 27438  loop:
 27439  	MOVSS (AX), X1
 27440  	MULSS X0, X1
 27441  	ADDSS (DX), X1
 27442  	MOVSS X1, (DX)
 27443  	DECQ  SI
 27444  	LEAQ  (AX)(CX*4), AX
 27445  	LEAQ  (DX)(BX*4), DX
 27446  
 27447  check_limit:
 27448  	CMPQ SI, $0x00
 27449  	JHI  loop
 27450  	RET
 27451  
 27452  // func AmdAxpyPointerLoopX_V3A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27453  // Requires: SSE
 27454  TEXT ·AmdAxpyPointerLoopX_V3A13U4(SB), NOSPLIT, $0-48
 27455  	MOVSS alpha+0(FP), X0
 27456  	MOVQ  xs+8(FP), AX
 27457  	MOVQ  incx+16(FP), CX
 27458  	MOVQ  ys+24(FP), DX
 27459  	MOVQ  incy+32(FP), BX
 27460  	MOVQ  n+40(FP), SI
 27461  	JMP   check_limit_unroll
 27462  	PCALIGN $0x08
 27463  	NOP
 27464  	NOP
 27465  	NOP
 27466  	NOP
 27467  	NOP
 27468  
 27469  loop_unroll:
 27470  	MOVSS (AX), X1
 27471  	MULSS X0, X1
 27472  	ADDSS (DX), X1
 27473  	MOVSS X1, (DX)
 27474  	LEAQ  (AX)(CX*4), AX
 27475  	LEAQ  (DX)(BX*4), DX
 27476  	MOVSS (AX), X1
 27477  	MULSS X0, X1
 27478  	ADDSS (DX), X1
 27479  	MOVSS X1, (DX)
 27480  	LEAQ  (AX)(CX*4), AX
 27481  	LEAQ  (DX)(BX*4), DX
 27482  	MOVSS (AX), X1
 27483  	MULSS X0, X1
 27484  	ADDSS (DX), X1
 27485  	MOVSS X1, (DX)
 27486  	LEAQ  (AX)(CX*4), AX
 27487  	LEAQ  (DX)(BX*4), DX
 27488  	MOVSS (AX), X1
 27489  	MULSS X0, X1
 27490  	ADDSS (DX), X1
 27491  	MOVSS X1, (DX)
 27492  	LEAQ  (AX)(CX*4), AX
 27493  	LEAQ  (DX)(BX*4), DX
 27494  	SUBQ  $0x04, SI
 27495  
 27496  check_limit_unroll:
 27497  	CMPQ SI, $0x04
 27498  	JHS  loop_unroll
 27499  	JMP  check_limit
 27500  
 27501  loop:
 27502  	MOVSS (AX), X1
 27503  	MULSS X0, X1
 27504  	ADDSS (DX), X1
 27505  	MOVSS X1, (DX)
 27506  	DECQ  SI
 27507  	LEAQ  (AX)(CX*4), AX
 27508  	LEAQ  (DX)(BX*4), DX
 27509  
 27510  check_limit:
 27511  	CMPQ SI, $0x00
 27512  	JHI  loop
 27513  	RET
 27514  
 27515  // func AmdAxpyPointerLoopX_V4A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27516  // Requires: SSE
 27517  TEXT ·AmdAxpyPointerLoopX_V4A13U4(SB), NOSPLIT, $0-48
 27518  	MOVSS alpha+0(FP), X0
 27519  	MOVQ  xs+8(FP), AX
 27520  	MOVQ  incx+16(FP), CX
 27521  	MOVQ  ys+24(FP), DX
 27522  	MOVQ  incy+32(FP), BX
 27523  	MOVQ  n+40(FP), SI
 27524  	JMP   check_limit_unroll
 27525  	PCALIGN $0x08
 27526  	NOP
 27527  	NOP
 27528  	NOP
 27529  	NOP
 27530  	NOP
 27531  
 27532  loop_unroll:
 27533  	MOVSS (AX), X1
 27534  	MULSS X0, X1
 27535  	ADDSS (DX), X1
 27536  	MOVSS X1, (DX)
 27537  	LEAQ  (AX)(CX*4), AX
 27538  	LEAQ  (DX)(BX*4), DX
 27539  	MOVSS (AX), X1
 27540  	MULSS X0, X1
 27541  	ADDSS (DX), X1
 27542  	MOVSS X1, (DX)
 27543  	LEAQ  (AX)(CX*4), AX
 27544  	LEAQ  (DX)(BX*4), DX
 27545  	MOVSS (AX), X1
 27546  	MULSS X0, X1
 27547  	ADDSS (DX), X1
 27548  	MOVSS X1, (DX)
 27549  	LEAQ  (AX)(CX*4), AX
 27550  	LEAQ  (DX)(BX*4), DX
 27551  	MOVSS (AX), X1
 27552  	MULSS X0, X1
 27553  	ADDSS (DX), X1
 27554  	MOVSS X1, (DX)
 27555  	LEAQ  (AX)(CX*4), AX
 27556  	LEAQ  (DX)(BX*4), DX
 27557  	SUBQ  $0x04, SI
 27558  
 27559  check_limit_unroll:
 27560  	CMPQ SI, $0x04
 27561  	JHS  loop_unroll
 27562  	JMP  check_limit
 27563  
 27564  loop:
 27565  	MOVSS (AX), X1
 27566  	MULSS X0, X1
 27567  	ADDSS (DX), X1
 27568  	MOVSS X1, (DX)
 27569  	DECQ  SI
 27570  	LEAQ  (AX)(CX*4), AX
 27571  	LEAQ  (DX)(BX*4), DX
 27572  
 27573  check_limit:
 27574  	CMPQ SI, $0x00
 27575  	JHI  loop
 27576  	RET
 27577  
 27578  // func AmdAxpyPointerLoopX_V5A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27579  // Requires: SSE
 27580  TEXT ·AmdAxpyPointerLoopX_V5A13U4(SB), NOSPLIT, $0-48
 27581  	MOVSS alpha+0(FP), X0
 27582  	MOVQ  xs+8(FP), AX
 27583  	MOVQ  incx+16(FP), CX
 27584  	MOVQ  ys+24(FP), DX
 27585  	MOVQ  incy+32(FP), BX
 27586  	MOVQ  n+40(FP), SI
 27587  	JMP   check_limit_unroll
 27588  	PCALIGN $0x08
 27589  	NOP
 27590  	NOP
 27591  	NOP
 27592  	NOP
 27593  	NOP
 27594  
 27595  loop_unroll:
 27596  	MOVSS (AX), X1
 27597  	MULSS X0, X1
 27598  	ADDSS (DX), X1
 27599  	MOVSS X1, (DX)
 27600  	LEAQ  (AX)(CX*4), AX
 27601  	LEAQ  (DX)(BX*4), DX
 27602  	MOVSS (AX), X1
 27603  	MULSS X0, X1
 27604  	ADDSS (DX), X1
 27605  	MOVSS X1, (DX)
 27606  	LEAQ  (AX)(CX*4), AX
 27607  	LEAQ  (DX)(BX*4), DX
 27608  	MOVSS (AX), X1
 27609  	MULSS X0, X1
 27610  	ADDSS (DX), X1
 27611  	MOVSS X1, (DX)
 27612  	LEAQ  (AX)(CX*4), AX
 27613  	LEAQ  (DX)(BX*4), DX
 27614  	MOVSS (AX), X1
 27615  	MULSS X0, X1
 27616  	ADDSS (DX), X1
 27617  	MOVSS X1, (DX)
 27618  	LEAQ  (AX)(CX*4), AX
 27619  	LEAQ  (DX)(BX*4), DX
 27620  	SUBQ  $0x04, SI
 27621  
 27622  check_limit_unroll:
 27623  	CMPQ SI, $0x04
 27624  	JHS  loop_unroll
 27625  	JMP  check_limit
 27626  
 27627  loop:
 27628  	MOVSS (AX), X1
 27629  	MULSS X0, X1
 27630  	ADDSS (DX), X1
 27631  	MOVSS X1, (DX)
 27632  	DECQ  SI
 27633  	LEAQ  (AX)(CX*4), AX
 27634  	LEAQ  (DX)(BX*4), DX
 27635  
 27636  check_limit:
 27637  	CMPQ SI, $0x00
 27638  	JHI  loop
 27639  	RET
 27640  
 27641  // func AmdAxpyPointerLoopX_V0A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27642  // Requires: SSE
 27643  TEXT ·AmdAxpyPointerLoopX_V0A14U4(SB), NOSPLIT, $0-48
 27644  	MOVSS alpha+0(FP), X0
 27645  	MOVQ  xs+8(FP), AX
 27646  	MOVQ  incx+16(FP), CX
 27647  	MOVQ  ys+24(FP), DX
 27648  	MOVQ  incy+32(FP), BX
 27649  	MOVQ  n+40(FP), SI
 27650  	JMP   check_limit_unroll
 27651  	PCALIGN $0x08
 27652  	NOP
 27653  	NOP
 27654  	NOP
 27655  	NOP
 27656  	NOP
 27657  	NOP
 27658  
 27659  loop_unroll:
 27660  	MOVSS (AX), X1
 27661  	MULSS X0, X1
 27662  	ADDSS (DX), X1
 27663  	MOVSS X1, (DX)
 27664  	LEAQ  (AX)(CX*4), AX
 27665  	LEAQ  (DX)(BX*4), DX
 27666  	MOVSS (AX), X1
 27667  	MULSS X0, X1
 27668  	ADDSS (DX), X1
 27669  	MOVSS X1, (DX)
 27670  	LEAQ  (AX)(CX*4), AX
 27671  	LEAQ  (DX)(BX*4), DX
 27672  	MOVSS (AX), X1
 27673  	MULSS X0, X1
 27674  	ADDSS (DX), X1
 27675  	MOVSS X1, (DX)
 27676  	LEAQ  (AX)(CX*4), AX
 27677  	LEAQ  (DX)(BX*4), DX
 27678  	MOVSS (AX), X1
 27679  	MULSS X0, X1
 27680  	ADDSS (DX), X1
 27681  	MOVSS X1, (DX)
 27682  	LEAQ  (AX)(CX*4), AX
 27683  	LEAQ  (DX)(BX*4), DX
 27684  	SUBQ  $0x04, SI
 27685  
 27686  check_limit_unroll:
 27687  	CMPQ SI, $0x04
 27688  	JHS  loop_unroll
 27689  	JMP  check_limit
 27690  
 27691  loop:
 27692  	MOVSS (AX), X1
 27693  	MULSS X0, X1
 27694  	ADDSS (DX), X1
 27695  	MOVSS X1, (DX)
 27696  	DECQ  SI
 27697  	LEAQ  (AX)(CX*4), AX
 27698  	LEAQ  (DX)(BX*4), DX
 27699  
 27700  check_limit:
 27701  	CMPQ SI, $0x00
 27702  	JHI  loop
 27703  	RET
 27704  
 27705  // func AmdAxpyPointerLoopX_V1A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27706  // Requires: SSE
 27707  TEXT ·AmdAxpyPointerLoopX_V1A14U4(SB), NOSPLIT, $0-48
 27708  	MOVSS alpha+0(FP), X0
 27709  	MOVQ  xs+8(FP), AX
 27710  	MOVQ  incx+16(FP), CX
 27711  	MOVQ  ys+24(FP), DX
 27712  	MOVQ  incy+32(FP), BX
 27713  	MOVQ  n+40(FP), SI
 27714  	JMP   check_limit_unroll
 27715  	PCALIGN $0x08
 27716  	NOP
 27717  	NOP
 27718  	NOP
 27719  	NOP
 27720  	NOP
 27721  	NOP
 27722  
 27723  loop_unroll:
 27724  	MOVSS (AX), X1
 27725  	MULSS X0, X1
 27726  	ADDSS (DX), X1
 27727  	MOVSS X1, (DX)
 27728  	LEAQ  (AX)(CX*4), AX
 27729  	LEAQ  (DX)(BX*4), DX
 27730  	MOVSS (AX), X1
 27731  	MULSS X0, X1
 27732  	ADDSS (DX), X1
 27733  	MOVSS X1, (DX)
 27734  	LEAQ  (AX)(CX*4), AX
 27735  	LEAQ  (DX)(BX*4), DX
 27736  	MOVSS (AX), X1
 27737  	MULSS X0, X1
 27738  	ADDSS (DX), X1
 27739  	MOVSS X1, (DX)
 27740  	LEAQ  (AX)(CX*4), AX
 27741  	LEAQ  (DX)(BX*4), DX
 27742  	MOVSS (AX), X1
 27743  	MULSS X0, X1
 27744  	ADDSS (DX), X1
 27745  	MOVSS X1, (DX)
 27746  	LEAQ  (AX)(CX*4), AX
 27747  	LEAQ  (DX)(BX*4), DX
 27748  	SUBQ  $0x04, SI
 27749  
 27750  check_limit_unroll:
 27751  	CMPQ SI, $0x04
 27752  	JHS  loop_unroll
 27753  	JMP  check_limit
 27754  
 27755  loop:
 27756  	MOVSS (AX), X1
 27757  	MULSS X0, X1
 27758  	ADDSS (DX), X1
 27759  	MOVSS X1, (DX)
 27760  	DECQ  SI
 27761  	LEAQ  (AX)(CX*4), AX
 27762  	LEAQ  (DX)(BX*4), DX
 27763  
 27764  check_limit:
 27765  	CMPQ SI, $0x00
 27766  	JHI  loop
 27767  	RET
 27768  
 27769  // func AmdAxpyPointerLoopX_V2A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27770  // Requires: SSE
 27771  TEXT ·AmdAxpyPointerLoopX_V2A14U4(SB), NOSPLIT, $0-48
 27772  	MOVSS alpha+0(FP), X0
 27773  	MOVQ  xs+8(FP), AX
 27774  	MOVQ  incx+16(FP), CX
 27775  	MOVQ  ys+24(FP), DX
 27776  	MOVQ  incy+32(FP), BX
 27777  	MOVQ  n+40(FP), SI
 27778  	JMP   check_limit_unroll
 27779  	PCALIGN $0x08
 27780  	NOP
 27781  	NOP
 27782  	NOP
 27783  	NOP
 27784  	NOP
 27785  	NOP
 27786  
 27787  loop_unroll:
 27788  	MOVSS (AX), X1
 27789  	MULSS X0, X1
 27790  	ADDSS (DX), X1
 27791  	MOVSS X1, (DX)
 27792  	LEAQ  (AX)(CX*4), AX
 27793  	LEAQ  (DX)(BX*4), DX
 27794  	MOVSS (AX), X1
 27795  	MULSS X0, X1
 27796  	ADDSS (DX), X1
 27797  	MOVSS X1, (DX)
 27798  	LEAQ  (AX)(CX*4), AX
 27799  	LEAQ  (DX)(BX*4), DX
 27800  	MOVSS (AX), X1
 27801  	MULSS X0, X1
 27802  	ADDSS (DX), X1
 27803  	MOVSS X1, (DX)
 27804  	LEAQ  (AX)(CX*4), AX
 27805  	LEAQ  (DX)(BX*4), DX
 27806  	MOVSS (AX), X1
 27807  	MULSS X0, X1
 27808  	ADDSS (DX), X1
 27809  	MOVSS X1, (DX)
 27810  	LEAQ  (AX)(CX*4), AX
 27811  	LEAQ  (DX)(BX*4), DX
 27812  	SUBQ  $0x04, SI
 27813  
 27814  check_limit_unroll:
 27815  	CMPQ SI, $0x04
 27816  	JHS  loop_unroll
 27817  	JMP  check_limit
 27818  
 27819  loop:
 27820  	MOVSS (AX), X1
 27821  	MULSS X0, X1
 27822  	ADDSS (DX), X1
 27823  	MOVSS X1, (DX)
 27824  	DECQ  SI
 27825  	LEAQ  (AX)(CX*4), AX
 27826  	LEAQ  (DX)(BX*4), DX
 27827  
 27828  check_limit:
 27829  	CMPQ SI, $0x00
 27830  	JHI  loop
 27831  	RET
 27832  
 27833  // func AmdAxpyPointerLoopX_V3A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27834  // Requires: SSE
 27835  TEXT ·AmdAxpyPointerLoopX_V3A14U4(SB), NOSPLIT, $0-48
 27836  	MOVSS alpha+0(FP), X0
 27837  	MOVQ  xs+8(FP), AX
 27838  	MOVQ  incx+16(FP), CX
 27839  	MOVQ  ys+24(FP), DX
 27840  	MOVQ  incy+32(FP), BX
 27841  	MOVQ  n+40(FP), SI
 27842  	JMP   check_limit_unroll
 27843  	PCALIGN $0x08
 27844  	NOP
 27845  	NOP
 27846  	NOP
 27847  	NOP
 27848  	NOP
 27849  	NOP
 27850  
 27851  loop_unroll:
 27852  	MOVSS (AX), X1
 27853  	MULSS X0, X1
 27854  	ADDSS (DX), X1
 27855  	MOVSS X1, (DX)
 27856  	LEAQ  (AX)(CX*4), AX
 27857  	LEAQ  (DX)(BX*4), DX
 27858  	MOVSS (AX), X1
 27859  	MULSS X0, X1
 27860  	ADDSS (DX), X1
 27861  	MOVSS X1, (DX)
 27862  	LEAQ  (AX)(CX*4), AX
 27863  	LEAQ  (DX)(BX*4), DX
 27864  	MOVSS (AX), X1
 27865  	MULSS X0, X1
 27866  	ADDSS (DX), X1
 27867  	MOVSS X1, (DX)
 27868  	LEAQ  (AX)(CX*4), AX
 27869  	LEAQ  (DX)(BX*4), DX
 27870  	MOVSS (AX), X1
 27871  	MULSS X0, X1
 27872  	ADDSS (DX), X1
 27873  	MOVSS X1, (DX)
 27874  	LEAQ  (AX)(CX*4), AX
 27875  	LEAQ  (DX)(BX*4), DX
 27876  	SUBQ  $0x04, SI
 27877  
 27878  check_limit_unroll:
 27879  	CMPQ SI, $0x04
 27880  	JHS  loop_unroll
 27881  	JMP  check_limit
 27882  
 27883  loop:
 27884  	MOVSS (AX), X1
 27885  	MULSS X0, X1
 27886  	ADDSS (DX), X1
 27887  	MOVSS X1, (DX)
 27888  	DECQ  SI
 27889  	LEAQ  (AX)(CX*4), AX
 27890  	LEAQ  (DX)(BX*4), DX
 27891  
 27892  check_limit:
 27893  	CMPQ SI, $0x00
 27894  	JHI  loop
 27895  	RET
 27896  
 27897  // func AmdAxpyPointerLoopX_V4A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27898  // Requires: SSE
 27899  TEXT ·AmdAxpyPointerLoopX_V4A14U4(SB), NOSPLIT, $0-48
 27900  	MOVSS alpha+0(FP), X0
 27901  	MOVQ  xs+8(FP), AX
 27902  	MOVQ  incx+16(FP), CX
 27903  	MOVQ  ys+24(FP), DX
 27904  	MOVQ  incy+32(FP), BX
 27905  	MOVQ  n+40(FP), SI
 27906  	JMP   check_limit_unroll
 27907  	PCALIGN $0x08
 27908  	NOP
 27909  	NOP
 27910  	NOP
 27911  	NOP
 27912  	NOP
 27913  	NOP
 27914  
 27915  loop_unroll:
 27916  	MOVSS (AX), X1
 27917  	MULSS X0, X1
 27918  	ADDSS (DX), X1
 27919  	MOVSS X1, (DX)
 27920  	LEAQ  (AX)(CX*4), AX
 27921  	LEAQ  (DX)(BX*4), DX
 27922  	MOVSS (AX), X1
 27923  	MULSS X0, X1
 27924  	ADDSS (DX), X1
 27925  	MOVSS X1, (DX)
 27926  	LEAQ  (AX)(CX*4), AX
 27927  	LEAQ  (DX)(BX*4), DX
 27928  	MOVSS (AX), X1
 27929  	MULSS X0, X1
 27930  	ADDSS (DX), X1
 27931  	MOVSS X1, (DX)
 27932  	LEAQ  (AX)(CX*4), AX
 27933  	LEAQ  (DX)(BX*4), DX
 27934  	MOVSS (AX), X1
 27935  	MULSS X0, X1
 27936  	ADDSS (DX), X1
 27937  	MOVSS X1, (DX)
 27938  	LEAQ  (AX)(CX*4), AX
 27939  	LEAQ  (DX)(BX*4), DX
 27940  	SUBQ  $0x04, SI
 27941  
 27942  check_limit_unroll:
 27943  	CMPQ SI, $0x04
 27944  	JHS  loop_unroll
 27945  	JMP  check_limit
 27946  
 27947  loop:
 27948  	MOVSS (AX), X1
 27949  	MULSS X0, X1
 27950  	ADDSS (DX), X1
 27951  	MOVSS X1, (DX)
 27952  	DECQ  SI
 27953  	LEAQ  (AX)(CX*4), AX
 27954  	LEAQ  (DX)(BX*4), DX
 27955  
 27956  check_limit:
 27957  	CMPQ SI, $0x00
 27958  	JHI  loop
 27959  	RET
 27960  
 27961  // func AmdAxpyPointerLoopX_V5A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 27962  // Requires: SSE
 27963  TEXT ·AmdAxpyPointerLoopX_V5A14U4(SB), NOSPLIT, $0-48
 27964  	MOVSS alpha+0(FP), X0
 27965  	MOVQ  xs+8(FP), AX
 27966  	MOVQ  incx+16(FP), CX
 27967  	MOVQ  ys+24(FP), DX
 27968  	MOVQ  incy+32(FP), BX
 27969  	MOVQ  n+40(FP), SI
 27970  	JMP   check_limit_unroll
 27971  	PCALIGN $0x08
 27972  	NOP
 27973  	NOP
 27974  	NOP
 27975  	NOP
 27976  	NOP
 27977  	NOP
 27978  
 27979  loop_unroll:
 27980  	MOVSS (AX), X1
 27981  	MULSS X0, X1
 27982  	ADDSS (DX), X1
 27983  	MOVSS X1, (DX)
 27984  	LEAQ  (AX)(CX*4), AX
 27985  	LEAQ  (DX)(BX*4), DX
 27986  	MOVSS (AX), X1
 27987  	MULSS X0, X1
 27988  	ADDSS (DX), X1
 27989  	MOVSS X1, (DX)
 27990  	LEAQ  (AX)(CX*4), AX
 27991  	LEAQ  (DX)(BX*4), DX
 27992  	MOVSS (AX), X1
 27993  	MULSS X0, X1
 27994  	ADDSS (DX), X1
 27995  	MOVSS X1, (DX)
 27996  	LEAQ  (AX)(CX*4), AX
 27997  	LEAQ  (DX)(BX*4), DX
 27998  	MOVSS (AX), X1
 27999  	MULSS X0, X1
 28000  	ADDSS (DX), X1
 28001  	MOVSS X1, (DX)
 28002  	LEAQ  (AX)(CX*4), AX
 28003  	LEAQ  (DX)(BX*4), DX
 28004  	SUBQ  $0x04, SI
 28005  
 28006  check_limit_unroll:
 28007  	CMPQ SI, $0x04
 28008  	JHS  loop_unroll
 28009  	JMP  check_limit
 28010  
 28011  loop:
 28012  	MOVSS (AX), X1
 28013  	MULSS X0, X1
 28014  	ADDSS (DX), X1
 28015  	MOVSS X1, (DX)
 28016  	DECQ  SI
 28017  	LEAQ  (AX)(CX*4), AX
 28018  	LEAQ  (DX)(BX*4), DX
 28019  
 28020  check_limit:
 28021  	CMPQ SI, $0x00
 28022  	JHI  loop
 28023  	RET
 28024  
 28025  // func AmdAxpyPointerLoopX_V0A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28026  // Requires: SSE
 28027  TEXT ·AmdAxpyPointerLoopX_V0A15U4(SB), NOSPLIT, $0-48
 28028  	MOVSS alpha+0(FP), X0
 28029  	MOVQ  xs+8(FP), AX
 28030  	MOVQ  incx+16(FP), CX
 28031  	MOVQ  ys+24(FP), DX
 28032  	MOVQ  incy+32(FP), BX
 28033  	MOVQ  n+40(FP), SI
 28034  	JMP   check_limit_unroll
 28035  	PCALIGN $0x08
 28036  	NOP
 28037  	NOP
 28038  	NOP
 28039  	NOP
 28040  	NOP
 28041  	NOP
 28042  	NOP
 28043  
 28044  loop_unroll:
 28045  	MOVSS (AX), X1
 28046  	MULSS X0, X1
 28047  	ADDSS (DX), X1
 28048  	MOVSS X1, (DX)
 28049  	LEAQ  (AX)(CX*4), AX
 28050  	LEAQ  (DX)(BX*4), DX
 28051  	MOVSS (AX), X1
 28052  	MULSS X0, X1
 28053  	ADDSS (DX), X1
 28054  	MOVSS X1, (DX)
 28055  	LEAQ  (AX)(CX*4), AX
 28056  	LEAQ  (DX)(BX*4), DX
 28057  	MOVSS (AX), X1
 28058  	MULSS X0, X1
 28059  	ADDSS (DX), X1
 28060  	MOVSS X1, (DX)
 28061  	LEAQ  (AX)(CX*4), AX
 28062  	LEAQ  (DX)(BX*4), DX
 28063  	MOVSS (AX), X1
 28064  	MULSS X0, X1
 28065  	ADDSS (DX), X1
 28066  	MOVSS X1, (DX)
 28067  	LEAQ  (AX)(CX*4), AX
 28068  	LEAQ  (DX)(BX*4), DX
 28069  	SUBQ  $0x04, SI
 28070  
 28071  check_limit_unroll:
 28072  	CMPQ SI, $0x04
 28073  	JHS  loop_unroll
 28074  	JMP  check_limit
 28075  
 28076  loop:
 28077  	MOVSS (AX), X1
 28078  	MULSS X0, X1
 28079  	ADDSS (DX), X1
 28080  	MOVSS X1, (DX)
 28081  	DECQ  SI
 28082  	LEAQ  (AX)(CX*4), AX
 28083  	LEAQ  (DX)(BX*4), DX
 28084  
 28085  check_limit:
 28086  	CMPQ SI, $0x00
 28087  	JHI  loop
 28088  	RET
 28089  
 28090  // func AmdAxpyPointerLoopX_V1A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28091  // Requires: SSE
 28092  TEXT ·AmdAxpyPointerLoopX_V1A15U4(SB), NOSPLIT, $0-48
 28093  	MOVSS alpha+0(FP), X0
 28094  	MOVQ  xs+8(FP), AX
 28095  	MOVQ  incx+16(FP), CX
 28096  	MOVQ  ys+24(FP), DX
 28097  	MOVQ  incy+32(FP), BX
 28098  	MOVQ  n+40(FP), SI
 28099  	JMP   check_limit_unroll
 28100  	PCALIGN $0x08
 28101  	NOP
 28102  	NOP
 28103  	NOP
 28104  	NOP
 28105  	NOP
 28106  	NOP
 28107  	NOP
 28108  
 28109  loop_unroll:
 28110  	MOVSS (AX), X1
 28111  	MULSS X0, X1
 28112  	ADDSS (DX), X1
 28113  	MOVSS X1, (DX)
 28114  	LEAQ  (AX)(CX*4), AX
 28115  	LEAQ  (DX)(BX*4), DX
 28116  	MOVSS (AX), X1
 28117  	MULSS X0, X1
 28118  	ADDSS (DX), X1
 28119  	MOVSS X1, (DX)
 28120  	LEAQ  (AX)(CX*4), AX
 28121  	LEAQ  (DX)(BX*4), DX
 28122  	MOVSS (AX), X1
 28123  	MULSS X0, X1
 28124  	ADDSS (DX), X1
 28125  	MOVSS X1, (DX)
 28126  	LEAQ  (AX)(CX*4), AX
 28127  	LEAQ  (DX)(BX*4), DX
 28128  	MOVSS (AX), X1
 28129  	MULSS X0, X1
 28130  	ADDSS (DX), X1
 28131  	MOVSS X1, (DX)
 28132  	LEAQ  (AX)(CX*4), AX
 28133  	LEAQ  (DX)(BX*4), DX
 28134  	SUBQ  $0x04, SI
 28135  
 28136  check_limit_unroll:
 28137  	CMPQ SI, $0x04
 28138  	JHS  loop_unroll
 28139  	JMP  check_limit
 28140  
 28141  loop:
 28142  	MOVSS (AX), X1
 28143  	MULSS X0, X1
 28144  	ADDSS (DX), X1
 28145  	MOVSS X1, (DX)
 28146  	DECQ  SI
 28147  	LEAQ  (AX)(CX*4), AX
 28148  	LEAQ  (DX)(BX*4), DX
 28149  
 28150  check_limit:
 28151  	CMPQ SI, $0x00
 28152  	JHI  loop
 28153  	RET
 28154  
 28155  // func AmdAxpyPointerLoopX_V2A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28156  // Requires: SSE
 28157  TEXT ·AmdAxpyPointerLoopX_V2A15U4(SB), NOSPLIT, $0-48
 28158  	MOVSS alpha+0(FP), X0
 28159  	MOVQ  xs+8(FP), AX
 28160  	MOVQ  incx+16(FP), CX
 28161  	MOVQ  ys+24(FP), DX
 28162  	MOVQ  incy+32(FP), BX
 28163  	MOVQ  n+40(FP), SI
 28164  	JMP   check_limit_unroll
 28165  	PCALIGN $0x08
 28166  	NOP
 28167  	NOP
 28168  	NOP
 28169  	NOP
 28170  	NOP
 28171  	NOP
 28172  	NOP
 28173  
 28174  loop_unroll:
 28175  	MOVSS (AX), X1
 28176  	MULSS X0, X1
 28177  	ADDSS (DX), X1
 28178  	MOVSS X1, (DX)
 28179  	LEAQ  (AX)(CX*4), AX
 28180  	LEAQ  (DX)(BX*4), DX
 28181  	MOVSS (AX), X1
 28182  	MULSS X0, X1
 28183  	ADDSS (DX), X1
 28184  	MOVSS X1, (DX)
 28185  	LEAQ  (AX)(CX*4), AX
 28186  	LEAQ  (DX)(BX*4), DX
 28187  	MOVSS (AX), X1
 28188  	MULSS X0, X1
 28189  	ADDSS (DX), X1
 28190  	MOVSS X1, (DX)
 28191  	LEAQ  (AX)(CX*4), AX
 28192  	LEAQ  (DX)(BX*4), DX
 28193  	MOVSS (AX), X1
 28194  	MULSS X0, X1
 28195  	ADDSS (DX), X1
 28196  	MOVSS X1, (DX)
 28197  	LEAQ  (AX)(CX*4), AX
 28198  	LEAQ  (DX)(BX*4), DX
 28199  	SUBQ  $0x04, SI
 28200  
 28201  check_limit_unroll:
 28202  	CMPQ SI, $0x04
 28203  	JHS  loop_unroll
 28204  	JMP  check_limit
 28205  
 28206  loop:
 28207  	MOVSS (AX), X1
 28208  	MULSS X0, X1
 28209  	ADDSS (DX), X1
 28210  	MOVSS X1, (DX)
 28211  	DECQ  SI
 28212  	LEAQ  (AX)(CX*4), AX
 28213  	LEAQ  (DX)(BX*4), DX
 28214  
 28215  check_limit:
 28216  	CMPQ SI, $0x00
 28217  	JHI  loop
 28218  	RET
 28219  
 28220  // func AmdAxpyPointerLoopX_V3A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28221  // Requires: SSE
 28222  TEXT ·AmdAxpyPointerLoopX_V3A15U4(SB), NOSPLIT, $0-48
 28223  	MOVSS alpha+0(FP), X0
 28224  	MOVQ  xs+8(FP), AX
 28225  	MOVQ  incx+16(FP), CX
 28226  	MOVQ  ys+24(FP), DX
 28227  	MOVQ  incy+32(FP), BX
 28228  	MOVQ  n+40(FP), SI
 28229  	JMP   check_limit_unroll
 28230  	PCALIGN $0x08
 28231  	NOP
 28232  	NOP
 28233  	NOP
 28234  	NOP
 28235  	NOP
 28236  	NOP
 28237  	NOP
 28238  
 28239  loop_unroll:
 28240  	MOVSS (AX), X1
 28241  	MULSS X0, X1
 28242  	ADDSS (DX), X1
 28243  	MOVSS X1, (DX)
 28244  	LEAQ  (AX)(CX*4), AX
 28245  	LEAQ  (DX)(BX*4), DX
 28246  	MOVSS (AX), X1
 28247  	MULSS X0, X1
 28248  	ADDSS (DX), X1
 28249  	MOVSS X1, (DX)
 28250  	LEAQ  (AX)(CX*4), AX
 28251  	LEAQ  (DX)(BX*4), DX
 28252  	MOVSS (AX), X1
 28253  	MULSS X0, X1
 28254  	ADDSS (DX), X1
 28255  	MOVSS X1, (DX)
 28256  	LEAQ  (AX)(CX*4), AX
 28257  	LEAQ  (DX)(BX*4), DX
 28258  	MOVSS (AX), X1
 28259  	MULSS X0, X1
 28260  	ADDSS (DX), X1
 28261  	MOVSS X1, (DX)
 28262  	LEAQ  (AX)(CX*4), AX
 28263  	LEAQ  (DX)(BX*4), DX
 28264  	SUBQ  $0x04, SI
 28265  
 28266  check_limit_unroll:
 28267  	CMPQ SI, $0x04
 28268  	JHS  loop_unroll
 28269  	JMP  check_limit
 28270  
 28271  loop:
 28272  	MOVSS (AX), X1
 28273  	MULSS X0, X1
 28274  	ADDSS (DX), X1
 28275  	MOVSS X1, (DX)
 28276  	DECQ  SI
 28277  	LEAQ  (AX)(CX*4), AX
 28278  	LEAQ  (DX)(BX*4), DX
 28279  
 28280  check_limit:
 28281  	CMPQ SI, $0x00
 28282  	JHI  loop
 28283  	RET
 28284  
 28285  // func AmdAxpyPointerLoopX_V4A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28286  // Requires: SSE
 28287  TEXT ·AmdAxpyPointerLoopX_V4A15U4(SB), NOSPLIT, $0-48
 28288  	MOVSS alpha+0(FP), X0
 28289  	MOVQ  xs+8(FP), AX
 28290  	MOVQ  incx+16(FP), CX
 28291  	MOVQ  ys+24(FP), DX
 28292  	MOVQ  incy+32(FP), BX
 28293  	MOVQ  n+40(FP), SI
 28294  	JMP   check_limit_unroll
 28295  	PCALIGN $0x08
 28296  	NOP
 28297  	NOP
 28298  	NOP
 28299  	NOP
 28300  	NOP
 28301  	NOP
 28302  	NOP
 28303  
 28304  loop_unroll:
 28305  	MOVSS (AX), X1
 28306  	MULSS X0, X1
 28307  	ADDSS (DX), X1
 28308  	MOVSS X1, (DX)
 28309  	LEAQ  (AX)(CX*4), AX
 28310  	LEAQ  (DX)(BX*4), DX
 28311  	MOVSS (AX), X1
 28312  	MULSS X0, X1
 28313  	ADDSS (DX), X1
 28314  	MOVSS X1, (DX)
 28315  	LEAQ  (AX)(CX*4), AX
 28316  	LEAQ  (DX)(BX*4), DX
 28317  	MOVSS (AX), X1
 28318  	MULSS X0, X1
 28319  	ADDSS (DX), X1
 28320  	MOVSS X1, (DX)
 28321  	LEAQ  (AX)(CX*4), AX
 28322  	LEAQ  (DX)(BX*4), DX
 28323  	MOVSS (AX), X1
 28324  	MULSS X0, X1
 28325  	ADDSS (DX), X1
 28326  	MOVSS X1, (DX)
 28327  	LEAQ  (AX)(CX*4), AX
 28328  	LEAQ  (DX)(BX*4), DX
 28329  	SUBQ  $0x04, SI
 28330  
 28331  check_limit_unroll:
 28332  	CMPQ SI, $0x04
 28333  	JHS  loop_unroll
 28334  	JMP  check_limit
 28335  
 28336  loop:
 28337  	MOVSS (AX), X1
 28338  	MULSS X0, X1
 28339  	ADDSS (DX), X1
 28340  	MOVSS X1, (DX)
 28341  	DECQ  SI
 28342  	LEAQ  (AX)(CX*4), AX
 28343  	LEAQ  (DX)(BX*4), DX
 28344  
 28345  check_limit:
 28346  	CMPQ SI, $0x00
 28347  	JHI  loop
 28348  	RET
 28349  
 28350  // func AmdAxpyPointerLoopX_V5A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28351  // Requires: SSE
 28352  TEXT ·AmdAxpyPointerLoopX_V5A15U4(SB), NOSPLIT, $0-48
 28353  	MOVSS alpha+0(FP), X0
 28354  	MOVQ  xs+8(FP), AX
 28355  	MOVQ  incx+16(FP), CX
 28356  	MOVQ  ys+24(FP), DX
 28357  	MOVQ  incy+32(FP), BX
 28358  	MOVQ  n+40(FP), SI
 28359  	JMP   check_limit_unroll
 28360  	PCALIGN $0x08
 28361  	NOP
 28362  	NOP
 28363  	NOP
 28364  	NOP
 28365  	NOP
 28366  	NOP
 28367  	NOP
 28368  
 28369  loop_unroll:
 28370  	MOVSS (AX), X1
 28371  	MULSS X0, X1
 28372  	ADDSS (DX), X1
 28373  	MOVSS X1, (DX)
 28374  	LEAQ  (AX)(CX*4), AX
 28375  	LEAQ  (DX)(BX*4), DX
 28376  	MOVSS (AX), X1
 28377  	MULSS X0, X1
 28378  	ADDSS (DX), X1
 28379  	MOVSS X1, (DX)
 28380  	LEAQ  (AX)(CX*4), AX
 28381  	LEAQ  (DX)(BX*4), DX
 28382  	MOVSS (AX), X1
 28383  	MULSS X0, X1
 28384  	ADDSS (DX), X1
 28385  	MOVSS X1, (DX)
 28386  	LEAQ  (AX)(CX*4), AX
 28387  	LEAQ  (DX)(BX*4), DX
 28388  	MOVSS (AX), X1
 28389  	MULSS X0, X1
 28390  	ADDSS (DX), X1
 28391  	MOVSS X1, (DX)
 28392  	LEAQ  (AX)(CX*4), AX
 28393  	LEAQ  (DX)(BX*4), DX
 28394  	SUBQ  $0x04, SI
 28395  
 28396  check_limit_unroll:
 28397  	CMPQ SI, $0x04
 28398  	JHS  loop_unroll
 28399  	JMP  check_limit
 28400  
 28401  loop:
 28402  	MOVSS (AX), X1
 28403  	MULSS X0, X1
 28404  	ADDSS (DX), X1
 28405  	MOVSS X1, (DX)
 28406  	DECQ  SI
 28407  	LEAQ  (AX)(CX*4), AX
 28408  	LEAQ  (DX)(BX*4), DX
 28409  
 28410  check_limit:
 28411  	CMPQ SI, $0x00
 28412  	JHI  loop
 28413  	RET
 28414  
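// From here on, the A16 variants request loop alignment with PCALIGN $0x10
// (16-byte alignment) directly, instead of PCALIGN $0x08 followed by NOP
// padding as in the preceding variants; the loop body itself is unchanged.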
 28415  // func AmdAxpyPointerLoopX_V0A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28416  // Requires: SSE
 28417  TEXT ·AmdAxpyPointerLoopX_V0A16U4(SB), NOSPLIT, $0-48
 28418  	MOVSS alpha+0(FP), X0
 28419  	MOVQ  xs+8(FP), AX
 28420  	MOVQ  incx+16(FP), CX
 28421  	MOVQ  ys+24(FP), DX
 28422  	MOVQ  incy+32(FP), BX
 28423  	MOVQ  n+40(FP), SI
 28424  	JMP   check_limit_unroll
 28425  	PCALIGN $0x10
 28426  
 28427  loop_unroll:
 28428  	MOVSS (AX), X1
 28429  	MULSS X0, X1
 28430  	ADDSS (DX), X1
 28431  	MOVSS X1, (DX)
 28432  	LEAQ  (AX)(CX*4), AX
 28433  	LEAQ  (DX)(BX*4), DX
 28434  	MOVSS (AX), X1
 28435  	MULSS X0, X1
 28436  	ADDSS (DX), X1
 28437  	MOVSS X1, (DX)
 28438  	LEAQ  (AX)(CX*4), AX
 28439  	LEAQ  (DX)(BX*4), DX
 28440  	MOVSS (AX), X1
 28441  	MULSS X0, X1
 28442  	ADDSS (DX), X1
 28443  	MOVSS X1, (DX)
 28444  	LEAQ  (AX)(CX*4), AX
 28445  	LEAQ  (DX)(BX*4), DX
 28446  	MOVSS (AX), X1
 28447  	MULSS X0, X1
 28448  	ADDSS (DX), X1
 28449  	MOVSS X1, (DX)
 28450  	LEAQ  (AX)(CX*4), AX
 28451  	LEAQ  (DX)(BX*4), DX
 28452  	SUBQ  $0x04, SI
 28453  
 28454  check_limit_unroll:
 28455  	CMPQ SI, $0x04
 28456  	JHS  loop_unroll
 28457  	JMP  check_limit
 28458  
 28459  loop:
 28460  	MOVSS (AX), X1
 28461  	MULSS X0, X1
 28462  	ADDSS (DX), X1
 28463  	MOVSS X1, (DX)
 28464  	DECQ  SI
 28465  	LEAQ  (AX)(CX*4), AX
 28466  	LEAQ  (DX)(BX*4), DX
 28467  
 28468  check_limit:
 28469  	CMPQ SI, $0x00
 28470  	JHI  loop
 28471  	RET
 28472  
 28473  // func AmdAxpyPointerLoopX_V1A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28474  // Requires: SSE
 28475  TEXT ·AmdAxpyPointerLoopX_V1A16U4(SB), NOSPLIT, $0-48
 28476  	MOVSS alpha+0(FP), X0
 28477  	MOVQ  xs+8(FP), AX
 28478  	MOVQ  incx+16(FP), CX
 28479  	MOVQ  ys+24(FP), DX
 28480  	MOVQ  incy+32(FP), BX
 28481  	MOVQ  n+40(FP), SI
 28482  	JMP   check_limit_unroll
 28483  	PCALIGN $0x10
 28484  
 28485  loop_unroll:
 28486  	MOVSS (AX), X1
 28487  	MULSS X0, X1
 28488  	ADDSS (DX), X1
 28489  	MOVSS X1, (DX)
 28490  	LEAQ  (AX)(CX*4), AX
 28491  	LEAQ  (DX)(BX*4), DX
 28492  	MOVSS (AX), X1
 28493  	MULSS X0, X1
 28494  	ADDSS (DX), X1
 28495  	MOVSS X1, (DX)
 28496  	LEAQ  (AX)(CX*4), AX
 28497  	LEAQ  (DX)(BX*4), DX
 28498  	MOVSS (AX), X1
 28499  	MULSS X0, X1
 28500  	ADDSS (DX), X1
 28501  	MOVSS X1, (DX)
 28502  	LEAQ  (AX)(CX*4), AX
 28503  	LEAQ  (DX)(BX*4), DX
 28504  	MOVSS (AX), X1
 28505  	MULSS X0, X1
 28506  	ADDSS (DX), X1
 28507  	MOVSS X1, (DX)
 28508  	LEAQ  (AX)(CX*4), AX
 28509  	LEAQ  (DX)(BX*4), DX
 28510  	SUBQ  $0x04, SI
 28511  
 28512  check_limit_unroll:
 28513  	CMPQ SI, $0x04
 28514  	JHS  loop_unroll
 28515  	JMP  check_limit
 28516  
 28517  loop:
 28518  	MOVSS (AX), X1
 28519  	MULSS X0, X1
 28520  	ADDSS (DX), X1
 28521  	MOVSS X1, (DX)
 28522  	DECQ  SI
 28523  	LEAQ  (AX)(CX*4), AX
 28524  	LEAQ  (DX)(BX*4), DX
 28525  
 28526  check_limit:
 28527  	CMPQ SI, $0x00
 28528  	JHI  loop
 28529  	RET
 28530  
 28531  // func AmdAxpyPointerLoopX_V2A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28532  // Requires: SSE
 28533  TEXT ·AmdAxpyPointerLoopX_V2A16U4(SB), NOSPLIT, $0-48
 28534  	MOVSS alpha+0(FP), X0
 28535  	MOVQ  xs+8(FP), AX
 28536  	MOVQ  incx+16(FP), CX
 28537  	MOVQ  ys+24(FP), DX
 28538  	MOVQ  incy+32(FP), BX
 28539  	MOVQ  n+40(FP), SI
 28540  	JMP   check_limit_unroll
 28541  	PCALIGN $0x10
 28542  
 28543  loop_unroll:
 28544  	MOVSS (AX), X1
 28545  	MULSS X0, X1
 28546  	ADDSS (DX), X1
 28547  	MOVSS X1, (DX)
 28548  	LEAQ  (AX)(CX*4), AX
 28549  	LEAQ  (DX)(BX*4), DX
 28550  	MOVSS (AX), X1
 28551  	MULSS X0, X1
 28552  	ADDSS (DX), X1
 28553  	MOVSS X1, (DX)
 28554  	LEAQ  (AX)(CX*4), AX
 28555  	LEAQ  (DX)(BX*4), DX
 28556  	MOVSS (AX), X1
 28557  	MULSS X0, X1
 28558  	ADDSS (DX), X1
 28559  	MOVSS X1, (DX)
 28560  	LEAQ  (AX)(CX*4), AX
 28561  	LEAQ  (DX)(BX*4), DX
 28562  	MOVSS (AX), X1
 28563  	MULSS X0, X1
 28564  	ADDSS (DX), X1
 28565  	MOVSS X1, (DX)
 28566  	LEAQ  (AX)(CX*4), AX
 28567  	LEAQ  (DX)(BX*4), DX
 28568  	SUBQ  $0x04, SI
 28569  
 28570  check_limit_unroll:
 28571  	CMPQ SI, $0x04
 28572  	JHS  loop_unroll
 28573  	JMP  check_limit
 28574  
 28575  loop:
 28576  	MOVSS (AX), X1
 28577  	MULSS X0, X1
 28578  	ADDSS (DX), X1
 28579  	MOVSS X1, (DX)
 28580  	DECQ  SI
 28581  	LEAQ  (AX)(CX*4), AX
 28582  	LEAQ  (DX)(BX*4), DX
 28583  
 28584  check_limit:
 28585  	CMPQ SI, $0x00
 28586  	JHI  loop
 28587  	RET
 28588  
 28589  // func AmdAxpyPointerLoopX_V3A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28590  // Requires: SSE
 28591  TEXT ·AmdAxpyPointerLoopX_V3A16U4(SB), NOSPLIT, $0-48
 28592  	MOVSS alpha+0(FP), X0
 28593  	MOVQ  xs+8(FP), AX
 28594  	MOVQ  incx+16(FP), CX
 28595  	MOVQ  ys+24(FP), DX
 28596  	MOVQ  incy+32(FP), BX
 28597  	MOVQ  n+40(FP), SI
 28598  	JMP   check_limit_unroll
 28599  	PCALIGN $0x10
 28600  
 28601  loop_unroll:
 28602  	MOVSS (AX), X1
 28603  	MULSS X0, X1
 28604  	ADDSS (DX), X1
 28605  	MOVSS X1, (DX)
 28606  	LEAQ  (AX)(CX*4), AX
 28607  	LEAQ  (DX)(BX*4), DX
 28608  	MOVSS (AX), X1
 28609  	MULSS X0, X1
 28610  	ADDSS (DX), X1
 28611  	MOVSS X1, (DX)
 28612  	LEAQ  (AX)(CX*4), AX
 28613  	LEAQ  (DX)(BX*4), DX
 28614  	MOVSS (AX), X1
 28615  	MULSS X0, X1
 28616  	ADDSS (DX), X1
 28617  	MOVSS X1, (DX)
 28618  	LEAQ  (AX)(CX*4), AX
 28619  	LEAQ  (DX)(BX*4), DX
 28620  	MOVSS (AX), X1
 28621  	MULSS X0, X1
 28622  	ADDSS (DX), X1
 28623  	MOVSS X1, (DX)
 28624  	LEAQ  (AX)(CX*4), AX
 28625  	LEAQ  (DX)(BX*4), DX
 28626  	SUBQ  $0x04, SI
 28627  
 28628  check_limit_unroll:
 28629  	CMPQ SI, $0x04
 28630  	JHS  loop_unroll
 28631  	JMP  check_limit
 28632  
 28633  loop:
 28634  	MOVSS (AX), X1
 28635  	MULSS X0, X1
 28636  	ADDSS (DX), X1
 28637  	MOVSS X1, (DX)
 28638  	DECQ  SI
 28639  	LEAQ  (AX)(CX*4), AX
 28640  	LEAQ  (DX)(BX*4), DX
 28641  
 28642  check_limit:
 28643  	CMPQ SI, $0x00
 28644  	JHI  loop
 28645  	RET
 28646  
 28647  // func AmdAxpyPointerLoopX_V4A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28648  // Requires: SSE
 28649  TEXT ·AmdAxpyPointerLoopX_V4A16U4(SB), NOSPLIT, $0-48
 28650  	MOVSS alpha+0(FP), X0
 28651  	MOVQ  xs+8(FP), AX
 28652  	MOVQ  incx+16(FP), CX
 28653  	MOVQ  ys+24(FP), DX
 28654  	MOVQ  incy+32(FP), BX
 28655  	MOVQ  n+40(FP), SI
 28656  	JMP   check_limit_unroll
 28657  	PCALIGN $0x10
 28658  
 28659  loop_unroll:
 28660  	MOVSS (AX), X1
 28661  	MULSS X0, X1
 28662  	ADDSS (DX), X1
 28663  	MOVSS X1, (DX)
 28664  	LEAQ  (AX)(CX*4), AX
 28665  	LEAQ  (DX)(BX*4), DX
 28666  	MOVSS (AX), X1
 28667  	MULSS X0, X1
 28668  	ADDSS (DX), X1
 28669  	MOVSS X1, (DX)
 28670  	LEAQ  (AX)(CX*4), AX
 28671  	LEAQ  (DX)(BX*4), DX
 28672  	MOVSS (AX), X1
 28673  	MULSS X0, X1
 28674  	ADDSS (DX), X1
 28675  	MOVSS X1, (DX)
 28676  	LEAQ  (AX)(CX*4), AX
 28677  	LEAQ  (DX)(BX*4), DX
 28678  	MOVSS (AX), X1
 28679  	MULSS X0, X1
 28680  	ADDSS (DX), X1
 28681  	MOVSS X1, (DX)
 28682  	LEAQ  (AX)(CX*4), AX
 28683  	LEAQ  (DX)(BX*4), DX
 28684  	SUBQ  $0x04, SI
 28685  
 28686  check_limit_unroll:
 28687  	CMPQ SI, $0x04
 28688  	JHS  loop_unroll
 28689  	JMP  check_limit
 28690  
 28691  loop:
 28692  	MOVSS (AX), X1
 28693  	MULSS X0, X1
 28694  	ADDSS (DX), X1
 28695  	MOVSS X1, (DX)
 28696  	DECQ  SI
 28697  	LEAQ  (AX)(CX*4), AX
 28698  	LEAQ  (DX)(BX*4), DX
 28699  
 28700  check_limit:
 28701  	CMPQ SI, $0x00
 28702  	JHI  loop
 28703  	RET
 28704  
 28705  // func AmdAxpyPointerLoopX_V5A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28706  // Requires: SSE
 28707  TEXT ·AmdAxpyPointerLoopX_V5A16U4(SB), NOSPLIT, $0-48
 28708  	MOVSS alpha+0(FP), X0
 28709  	MOVQ  xs+8(FP), AX
 28710  	MOVQ  incx+16(FP), CX
 28711  	MOVQ  ys+24(FP), DX
 28712  	MOVQ  incy+32(FP), BX
 28713  	MOVQ  n+40(FP), SI
 28714  	JMP   check_limit_unroll
 28715  	PCALIGN $0x10
 28716  
 28717  loop_unroll:
 28718  	MOVSS (AX), X1
 28719  	MULSS X0, X1
 28720  	ADDSS (DX), X1
 28721  	MOVSS X1, (DX)
 28722  	LEAQ  (AX)(CX*4), AX
 28723  	LEAQ  (DX)(BX*4), DX
 28724  	MOVSS (AX), X1
 28725  	MULSS X0, X1
 28726  	ADDSS (DX), X1
 28727  	MOVSS X1, (DX)
 28728  	LEAQ  (AX)(CX*4), AX
 28729  	LEAQ  (DX)(BX*4), DX
 28730  	MOVSS (AX), X1
 28731  	MULSS X0, X1
 28732  	ADDSS (DX), X1
 28733  	MOVSS X1, (DX)
 28734  	LEAQ  (AX)(CX*4), AX
 28735  	LEAQ  (DX)(BX*4), DX
 28736  	MOVSS (AX), X1
 28737  	MULSS X0, X1
 28738  	ADDSS (DX), X1
 28739  	MOVSS X1, (DX)
 28740  	LEAQ  (AX)(CX*4), AX
 28741  	LEAQ  (DX)(BX*4), DX
 28742  	SUBQ  $0x04, SI
 28743  
 28744  check_limit_unroll:
 28745  	CMPQ SI, $0x04
 28746  	JHS  loop_unroll
 28747  	JMP  check_limit
 28748  
 28749  loop:
 28750  	MOVSS (AX), X1
 28751  	MULSS X0, X1
 28752  	ADDSS (DX), X1
 28753  	MOVSS X1, (DX)
 28754  	DECQ  SI
 28755  	LEAQ  (AX)(CX*4), AX
 28756  	LEAQ  (DX)(BX*4), DX
 28757  
 28758  check_limit:
 28759  	CMPQ SI, $0x00
 28760  	JHI  loop
 28761  	RET
 28762  
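// The *A0U8 groups that follow unroll the scalar loop 8x (SUBQ $0x08 per
// pass) and place loop_unroll directly after the entry JMP, with no
// alignment padding.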
 28763  // func AmdAxpyPointerLoopX_V0A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28764  // Requires: SSE
 28765  TEXT ·AmdAxpyPointerLoopX_V0A0U8(SB), NOSPLIT, $0-48
 28766  	MOVSS alpha+0(FP), X0
 28767  	MOVQ  xs+8(FP), AX
 28768  	MOVQ  incx+16(FP), CX
 28769  	MOVQ  ys+24(FP), DX
 28770  	MOVQ  incy+32(FP), BX
 28771  	MOVQ  n+40(FP), SI
 28772  	JMP   check_limit_unroll
 28773  
 28774  loop_unroll:
 28775  	MOVSS (AX), X1
 28776  	MULSS X0, X1
 28777  	ADDSS (DX), X1
 28778  	MOVSS X1, (DX)
 28779  	LEAQ  (AX)(CX*4), AX
 28780  	LEAQ  (DX)(BX*4), DX
 28781  	MOVSS (AX), X1
 28782  	MULSS X0, X1
 28783  	ADDSS (DX), X1
 28784  	MOVSS X1, (DX)
 28785  	LEAQ  (AX)(CX*4), AX
 28786  	LEAQ  (DX)(BX*4), DX
 28787  	MOVSS (AX), X1
 28788  	MULSS X0, X1
 28789  	ADDSS (DX), X1
 28790  	MOVSS X1, (DX)
 28791  	LEAQ  (AX)(CX*4), AX
 28792  	LEAQ  (DX)(BX*4), DX
 28793  	MOVSS (AX), X1
 28794  	MULSS X0, X1
 28795  	ADDSS (DX), X1
 28796  	MOVSS X1, (DX)
 28797  	LEAQ  (AX)(CX*4), AX
 28798  	LEAQ  (DX)(BX*4), DX
 28799  	MOVSS (AX), X1
 28800  	MULSS X0, X1
 28801  	ADDSS (DX), X1
 28802  	MOVSS X1, (DX)
 28803  	LEAQ  (AX)(CX*4), AX
 28804  	LEAQ  (DX)(BX*4), DX
 28805  	MOVSS (AX), X1
 28806  	MULSS X0, X1
 28807  	ADDSS (DX), X1
 28808  	MOVSS X1, (DX)
 28809  	LEAQ  (AX)(CX*4), AX
 28810  	LEAQ  (DX)(BX*4), DX
 28811  	MOVSS (AX), X1
 28812  	MULSS X0, X1
 28813  	ADDSS (DX), X1
 28814  	MOVSS X1, (DX)
 28815  	LEAQ  (AX)(CX*4), AX
 28816  	LEAQ  (DX)(BX*4), DX
 28817  	MOVSS (AX), X1
 28818  	MULSS X0, X1
 28819  	ADDSS (DX), X1
 28820  	MOVSS X1, (DX)
 28821  	LEAQ  (AX)(CX*4), AX
 28822  	LEAQ  (DX)(BX*4), DX
 28823  	SUBQ  $0x08, SI
 28824  
 28825  check_limit_unroll:
 28826  	CMPQ SI, $0x08
 28827  	JHS  loop_unroll
 28828  	JMP  check_limit
 28829  
 28830  loop:
 28831  	MOVSS (AX), X1
 28832  	MULSS X0, X1
 28833  	ADDSS (DX), X1
 28834  	MOVSS X1, (DX)
 28835  	DECQ  SI
 28836  	LEAQ  (AX)(CX*4), AX
 28837  	LEAQ  (DX)(BX*4), DX
 28838  
 28839  check_limit:
 28840  	CMPQ SI, $0x00
 28841  	JHI  loop
 28842  	RET
 28843  
 28844  // func AmdAxpyPointerLoopX_V1A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28845  // Requires: SSE
 28846  TEXT ·AmdAxpyPointerLoopX_V1A0U8(SB), NOSPLIT, $0-48
 28847  	MOVSS alpha+0(FP), X0
 28848  	MOVQ  xs+8(FP), AX
 28849  	MOVQ  incx+16(FP), CX
 28850  	MOVQ  ys+24(FP), DX
 28851  	MOVQ  incy+32(FP), BX
 28852  	MOVQ  n+40(FP), SI
 28853  	JMP   check_limit_unroll
 28854  
 28855  loop_unroll:
 28856  	MOVSS (AX), X1
 28857  	MULSS X0, X1
 28858  	ADDSS (DX), X1
 28859  	MOVSS X1, (DX)
 28860  	LEAQ  (AX)(CX*4), AX
 28861  	LEAQ  (DX)(BX*4), DX
 28862  	MOVSS (AX), X1
 28863  	MULSS X0, X1
 28864  	ADDSS (DX), X1
 28865  	MOVSS X1, (DX)
 28866  	LEAQ  (AX)(CX*4), AX
 28867  	LEAQ  (DX)(BX*4), DX
 28868  	MOVSS (AX), X1
 28869  	MULSS X0, X1
 28870  	ADDSS (DX), X1
 28871  	MOVSS X1, (DX)
 28872  	LEAQ  (AX)(CX*4), AX
 28873  	LEAQ  (DX)(BX*4), DX
 28874  	MOVSS (AX), X1
 28875  	MULSS X0, X1
 28876  	ADDSS (DX), X1
 28877  	MOVSS X1, (DX)
 28878  	LEAQ  (AX)(CX*4), AX
 28879  	LEAQ  (DX)(BX*4), DX
 28880  	MOVSS (AX), X1
 28881  	MULSS X0, X1
 28882  	ADDSS (DX), X1
 28883  	MOVSS X1, (DX)
 28884  	LEAQ  (AX)(CX*4), AX
 28885  	LEAQ  (DX)(BX*4), DX
 28886  	MOVSS (AX), X1
 28887  	MULSS X0, X1
 28888  	ADDSS (DX), X1
 28889  	MOVSS X1, (DX)
 28890  	LEAQ  (AX)(CX*4), AX
 28891  	LEAQ  (DX)(BX*4), DX
 28892  	MOVSS (AX), X1
 28893  	MULSS X0, X1
 28894  	ADDSS (DX), X1
 28895  	MOVSS X1, (DX)
 28896  	LEAQ  (AX)(CX*4), AX
 28897  	LEAQ  (DX)(BX*4), DX
 28898  	MOVSS (AX), X1
 28899  	MULSS X0, X1
 28900  	ADDSS (DX), X1
 28901  	MOVSS X1, (DX)
 28902  	LEAQ  (AX)(CX*4), AX
 28903  	LEAQ  (DX)(BX*4), DX
 28904  	SUBQ  $0x08, SI
 28905  
 28906  check_limit_unroll:
 28907  	CMPQ SI, $0x08
 28908  	JHS  loop_unroll
 28909  	JMP  check_limit
 28910  
 28911  loop:
 28912  	MOVSS (AX), X1
 28913  	MULSS X0, X1
 28914  	ADDSS (DX), X1
 28915  	MOVSS X1, (DX)
 28916  	DECQ  SI
 28917  	LEAQ  (AX)(CX*4), AX
 28918  	LEAQ  (DX)(BX*4), DX
 28919  
 28920  check_limit:
 28921  	CMPQ SI, $0x00
 28922  	JHI  loop
 28923  	RET
 28924  
 28925  // func AmdAxpyPointerLoopX_V2A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 28926  // Requires: SSE
 28927  TEXT ·AmdAxpyPointerLoopX_V2A0U8(SB), NOSPLIT, $0-48
 28928  	MOVSS alpha+0(FP), X0
 28929  	MOVQ  xs+8(FP), AX
 28930  	MOVQ  incx+16(FP), CX
 28931  	MOVQ  ys+24(FP), DX
 28932  	MOVQ  incy+32(FP), BX
 28933  	MOVQ  n+40(FP), SI
 28934  	JMP   check_limit_unroll
 28935  
 28936  loop_unroll:
 28937  	MOVSS (AX), X1
 28938  	MULSS X0, X1
 28939  	ADDSS (DX), X1
 28940  	MOVSS X1, (DX)
 28941  	LEAQ  (AX)(CX*4), AX
 28942  	LEAQ  (DX)(BX*4), DX
 28943  	MOVSS (AX), X1
 28944  	MULSS X0, X1
 28945  	ADDSS (DX), X1
 28946  	MOVSS X1, (DX)
 28947  	LEAQ  (AX)(CX*4), AX
 28948  	LEAQ  (DX)(BX*4), DX
 28949  	MOVSS (AX), X1
 28950  	MULSS X0, X1
 28951  	ADDSS (DX), X1
 28952  	MOVSS X1, (DX)
 28953  	LEAQ  (AX)(CX*4), AX
 28954  	LEAQ  (DX)(BX*4), DX
 28955  	MOVSS (AX), X1
 28956  	MULSS X0, X1
 28957  	ADDSS (DX), X1
 28958  	MOVSS X1, (DX)
 28959  	LEAQ  (AX)(CX*4), AX
 28960  	LEAQ  (DX)(BX*4), DX
 28961  	MOVSS (AX), X1
 28962  	MULSS X0, X1
 28963  	ADDSS (DX), X1
 28964  	MOVSS X1, (DX)
 28965  	LEAQ  (AX)(CX*4), AX
 28966  	LEAQ  (DX)(BX*4), DX
 28967  	MOVSS (AX), X1
 28968  	MULSS X0, X1
 28969  	ADDSS (DX), X1
 28970  	MOVSS X1, (DX)
 28971  	LEAQ  (AX)(CX*4), AX
 28972  	LEAQ  (DX)(BX*4), DX
 28973  	MOVSS (AX), X1
 28974  	MULSS X0, X1
 28975  	ADDSS (DX), X1
 28976  	MOVSS X1, (DX)
 28977  	LEAQ  (AX)(CX*4), AX
 28978  	LEAQ  (DX)(BX*4), DX
 28979  	MOVSS (AX), X1
 28980  	MULSS X0, X1
 28981  	ADDSS (DX), X1
 28982  	MOVSS X1, (DX)
 28983  	LEAQ  (AX)(CX*4), AX
 28984  	LEAQ  (DX)(BX*4), DX
 28985  	SUBQ  $0x08, SI
 28986  
 28987  check_limit_unroll:
 28988  	CMPQ SI, $0x08
 28989  	JHS  loop_unroll
 28990  	JMP  check_limit
 28991  
 28992  loop:
 28993  	MOVSS (AX), X1
 28994  	MULSS X0, X1
 28995  	ADDSS (DX), X1
 28996  	MOVSS X1, (DX)
 28997  	DECQ  SI
 28998  	LEAQ  (AX)(CX*4), AX
 28999  	LEAQ  (DX)(BX*4), DX
 29000  
 29001  check_limit:
 29002  	CMPQ SI, $0x00
 29003  	JHI  loop
 29004  	RET
 29005  
 29006  // func AmdAxpyPointerLoopX_V3A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29007  // Requires: SSE
 29008  TEXT ·AmdAxpyPointerLoopX_V3A0U8(SB), NOSPLIT, $0-48
 29009  	MOVSS alpha+0(FP), X0
 29010  	MOVQ  xs+8(FP), AX
 29011  	MOVQ  incx+16(FP), CX
 29012  	MOVQ  ys+24(FP), DX
 29013  	MOVQ  incy+32(FP), BX
 29014  	MOVQ  n+40(FP), SI
 29015  	JMP   check_limit_unroll
 29016  
 29017  loop_unroll:
 29018  	MOVSS (AX), X1
 29019  	MULSS X0, X1
 29020  	ADDSS (DX), X1
 29021  	MOVSS X1, (DX)
 29022  	LEAQ  (AX)(CX*4), AX
 29023  	LEAQ  (DX)(BX*4), DX
 29024  	MOVSS (AX), X1
 29025  	MULSS X0, X1
 29026  	ADDSS (DX), X1
 29027  	MOVSS X1, (DX)
 29028  	LEAQ  (AX)(CX*4), AX
 29029  	LEAQ  (DX)(BX*4), DX
 29030  	MOVSS (AX), X1
 29031  	MULSS X0, X1
 29032  	ADDSS (DX), X1
 29033  	MOVSS X1, (DX)
 29034  	LEAQ  (AX)(CX*4), AX
 29035  	LEAQ  (DX)(BX*4), DX
 29036  	MOVSS (AX), X1
 29037  	MULSS X0, X1
 29038  	ADDSS (DX), X1
 29039  	MOVSS X1, (DX)
 29040  	LEAQ  (AX)(CX*4), AX
 29041  	LEAQ  (DX)(BX*4), DX
 29042  	MOVSS (AX), X1
 29043  	MULSS X0, X1
 29044  	ADDSS (DX), X1
 29045  	MOVSS X1, (DX)
 29046  	LEAQ  (AX)(CX*4), AX
 29047  	LEAQ  (DX)(BX*4), DX
 29048  	MOVSS (AX), X1
 29049  	MULSS X0, X1
 29050  	ADDSS (DX), X1
 29051  	MOVSS X1, (DX)
 29052  	LEAQ  (AX)(CX*4), AX
 29053  	LEAQ  (DX)(BX*4), DX
 29054  	MOVSS (AX), X1
 29055  	MULSS X0, X1
 29056  	ADDSS (DX), X1
 29057  	MOVSS X1, (DX)
 29058  	LEAQ  (AX)(CX*4), AX
 29059  	LEAQ  (DX)(BX*4), DX
 29060  	MOVSS (AX), X1
 29061  	MULSS X0, X1
 29062  	ADDSS (DX), X1
 29063  	MOVSS X1, (DX)
 29064  	LEAQ  (AX)(CX*4), AX
 29065  	LEAQ  (DX)(BX*4), DX
 29066  	SUBQ  $0x08, SI
 29067  
 29068  check_limit_unroll:
 29069  	CMPQ SI, $0x08
 29070  	JHS  loop_unroll
 29071  	JMP  check_limit
 29072  
 29073  loop:
 29074  	MOVSS (AX), X1
 29075  	MULSS X0, X1
 29076  	ADDSS (DX), X1
 29077  	MOVSS X1, (DX)
 29078  	DECQ  SI
 29079  	LEAQ  (AX)(CX*4), AX
 29080  	LEAQ  (DX)(BX*4), DX
 29081  
 29082  check_limit:
 29083  	CMPQ SI, $0x00
 29084  	JHI  loop
 29085  	RET
 29086  
 29087  // func AmdAxpyPointerLoopX_V4A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29088  // Requires: SSE
 29089  TEXT ·AmdAxpyPointerLoopX_V4A0U8(SB), NOSPLIT, $0-48
 29090  	MOVSS alpha+0(FP), X0
 29091  	MOVQ  xs+8(FP), AX
 29092  	MOVQ  incx+16(FP), CX
 29093  	MOVQ  ys+24(FP), DX
 29094  	MOVQ  incy+32(FP), BX
 29095  	MOVQ  n+40(FP), SI
 29096  	JMP   check_limit_unroll
 29097  
 29098  loop_unroll:
 29099  	MOVSS (AX), X1
 29100  	MULSS X0, X1
 29101  	ADDSS (DX), X1
 29102  	MOVSS X1, (DX)
 29103  	LEAQ  (AX)(CX*4), AX
 29104  	LEAQ  (DX)(BX*4), DX
 29105  	MOVSS (AX), X1
 29106  	MULSS X0, X1
 29107  	ADDSS (DX), X1
 29108  	MOVSS X1, (DX)
 29109  	LEAQ  (AX)(CX*4), AX
 29110  	LEAQ  (DX)(BX*4), DX
 29111  	MOVSS (AX), X1
 29112  	MULSS X0, X1
 29113  	ADDSS (DX), X1
 29114  	MOVSS X1, (DX)
 29115  	LEAQ  (AX)(CX*4), AX
 29116  	LEAQ  (DX)(BX*4), DX
 29117  	MOVSS (AX), X1
 29118  	MULSS X0, X1
 29119  	ADDSS (DX), X1
 29120  	MOVSS X1, (DX)
 29121  	LEAQ  (AX)(CX*4), AX
 29122  	LEAQ  (DX)(BX*4), DX
 29123  	MOVSS (AX), X1
 29124  	MULSS X0, X1
 29125  	ADDSS (DX), X1
 29126  	MOVSS X1, (DX)
 29127  	LEAQ  (AX)(CX*4), AX
 29128  	LEAQ  (DX)(BX*4), DX
 29129  	MOVSS (AX), X1
 29130  	MULSS X0, X1
 29131  	ADDSS (DX), X1
 29132  	MOVSS X1, (DX)
 29133  	LEAQ  (AX)(CX*4), AX
 29134  	LEAQ  (DX)(BX*4), DX
 29135  	MOVSS (AX), X1
 29136  	MULSS X0, X1
 29137  	ADDSS (DX), X1
 29138  	MOVSS X1, (DX)
 29139  	LEAQ  (AX)(CX*4), AX
 29140  	LEAQ  (DX)(BX*4), DX
 29141  	MOVSS (AX), X1
 29142  	MULSS X0, X1
 29143  	ADDSS (DX), X1
 29144  	MOVSS X1, (DX)
 29145  	LEAQ  (AX)(CX*4), AX
 29146  	LEAQ  (DX)(BX*4), DX
 29147  	SUBQ  $0x08, SI
 29148  
 29149  check_limit_unroll:
 29150  	CMPQ SI, $0x08
 29151  	JHS  loop_unroll
 29152  	JMP  check_limit
 29153  
 29154  loop:
 29155  	MOVSS (AX), X1
 29156  	MULSS X0, X1
 29157  	ADDSS (DX), X1
 29158  	MOVSS X1, (DX)
 29159  	DECQ  SI
 29160  	LEAQ  (AX)(CX*4), AX
 29161  	LEAQ  (DX)(BX*4), DX
 29162  
 29163  check_limit:
 29164  	CMPQ SI, $0x00
 29165  	JHI  loop
 29166  	RET
 29167  
 29168  // func AmdAxpyPointerLoopX_V5A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29169  // Requires: SSE
 29170  TEXT ·AmdAxpyPointerLoopX_V5A0U8(SB), NOSPLIT, $0-48
 29171  	MOVSS alpha+0(FP), X0
 29172  	MOVQ  xs+8(FP), AX
 29173  	MOVQ  incx+16(FP), CX
 29174  	MOVQ  ys+24(FP), DX
 29175  	MOVQ  incy+32(FP), BX
 29176  	MOVQ  n+40(FP), SI
 29177  	JMP   check_limit_unroll
 29178  
 29179  loop_unroll:
 29180  	MOVSS (AX), X1
 29181  	MULSS X0, X1
 29182  	ADDSS (DX), X1
 29183  	MOVSS X1, (DX)
 29184  	LEAQ  (AX)(CX*4), AX
 29185  	LEAQ  (DX)(BX*4), DX
 29186  	MOVSS (AX), X1
 29187  	MULSS X0, X1
 29188  	ADDSS (DX), X1
 29189  	MOVSS X1, (DX)
 29190  	LEAQ  (AX)(CX*4), AX
 29191  	LEAQ  (DX)(BX*4), DX
 29192  	MOVSS (AX), X1
 29193  	MULSS X0, X1
 29194  	ADDSS (DX), X1
 29195  	MOVSS X1, (DX)
 29196  	LEAQ  (AX)(CX*4), AX
 29197  	LEAQ  (DX)(BX*4), DX
 29198  	MOVSS (AX), X1
 29199  	MULSS X0, X1
 29200  	ADDSS (DX), X1
 29201  	MOVSS X1, (DX)
 29202  	LEAQ  (AX)(CX*4), AX
 29203  	LEAQ  (DX)(BX*4), DX
 29204  	MOVSS (AX), X1
 29205  	MULSS X0, X1
 29206  	ADDSS (DX), X1
 29207  	MOVSS X1, (DX)
 29208  	LEAQ  (AX)(CX*4), AX
 29209  	LEAQ  (DX)(BX*4), DX
 29210  	MOVSS (AX), X1
 29211  	MULSS X0, X1
 29212  	ADDSS (DX), X1
 29213  	MOVSS X1, (DX)
 29214  	LEAQ  (AX)(CX*4), AX
 29215  	LEAQ  (DX)(BX*4), DX
 29216  	MOVSS (AX), X1
 29217  	MULSS X0, X1
 29218  	ADDSS (DX), X1
 29219  	MOVSS X1, (DX)
 29220  	LEAQ  (AX)(CX*4), AX
 29221  	LEAQ  (DX)(BX*4), DX
 29222  	MOVSS (AX), X1
 29223  	MULSS X0, X1
 29224  	ADDSS (DX), X1
 29225  	MOVSS X1, (DX)
 29226  	LEAQ  (AX)(CX*4), AX
 29227  	LEAQ  (DX)(BX*4), DX
 29228  	SUBQ  $0x08, SI
 29229  
 29230  check_limit_unroll:
 29231  	CMPQ SI, $0x08
 29232  	JHS  loop_unroll
 29233  	JMP  check_limit
 29234  
 29235  loop:
 29236  	MOVSS (AX), X1
 29237  	MULSS X0, X1
 29238  	ADDSS (DX), X1
 29239  	MOVSS X1, (DX)
 29240  	DECQ  SI
 29241  	LEAQ  (AX)(CX*4), AX
 29242  	LEAQ  (DX)(BX*4), DX
 29243  
 29244  check_limit:
 29245  	CMPQ SI, $0x00
 29246  	JHI  loop
 29247  	RET
 29248  
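// The *A8U8 groups pad with PCALIGN $0x08, so loop_unroll starts on an
// 8-byte boundary.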
 29249  // func AmdAxpyPointerLoopX_V0A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29250  // Requires: SSE
 29251  TEXT ·AmdAxpyPointerLoopX_V0A8U8(SB), NOSPLIT, $0-48
 29252  	MOVSS alpha+0(FP), X0
 29253  	MOVQ  xs+8(FP), AX
 29254  	MOVQ  incx+16(FP), CX
 29255  	MOVQ  ys+24(FP), DX
 29256  	MOVQ  incy+32(FP), BX
 29257  	MOVQ  n+40(FP), SI
 29258  	JMP   check_limit_unroll
 29259  	PCALIGN $0x08
 29260  
 29261  loop_unroll:
 29262  	MOVSS (AX), X1
 29263  	MULSS X0, X1
 29264  	ADDSS (DX), X1
 29265  	MOVSS X1, (DX)
 29266  	LEAQ  (AX)(CX*4), AX
 29267  	LEAQ  (DX)(BX*4), DX
 29268  	MOVSS (AX), X1
 29269  	MULSS X0, X1
 29270  	ADDSS (DX), X1
 29271  	MOVSS X1, (DX)
 29272  	LEAQ  (AX)(CX*4), AX
 29273  	LEAQ  (DX)(BX*4), DX
 29274  	MOVSS (AX), X1
 29275  	MULSS X0, X1
 29276  	ADDSS (DX), X1
 29277  	MOVSS X1, (DX)
 29278  	LEAQ  (AX)(CX*4), AX
 29279  	LEAQ  (DX)(BX*4), DX
 29280  	MOVSS (AX), X1
 29281  	MULSS X0, X1
 29282  	ADDSS (DX), X1
 29283  	MOVSS X1, (DX)
 29284  	LEAQ  (AX)(CX*4), AX
 29285  	LEAQ  (DX)(BX*4), DX
 29286  	MOVSS (AX), X1
 29287  	MULSS X0, X1
 29288  	ADDSS (DX), X1
 29289  	MOVSS X1, (DX)
 29290  	LEAQ  (AX)(CX*4), AX
 29291  	LEAQ  (DX)(BX*4), DX
 29292  	MOVSS (AX), X1
 29293  	MULSS X0, X1
 29294  	ADDSS (DX), X1
 29295  	MOVSS X1, (DX)
 29296  	LEAQ  (AX)(CX*4), AX
 29297  	LEAQ  (DX)(BX*4), DX
 29298  	MOVSS (AX), X1
 29299  	MULSS X0, X1
 29300  	ADDSS (DX), X1
 29301  	MOVSS X1, (DX)
 29302  	LEAQ  (AX)(CX*4), AX
 29303  	LEAQ  (DX)(BX*4), DX
 29304  	MOVSS (AX), X1
 29305  	MULSS X0, X1
 29306  	ADDSS (DX), X1
 29307  	MOVSS X1, (DX)
 29308  	LEAQ  (AX)(CX*4), AX
 29309  	LEAQ  (DX)(BX*4), DX
 29310  	SUBQ  $0x08, SI
 29311  
 29312  check_limit_unroll:
 29313  	CMPQ SI, $0x08
 29314  	JHS  loop_unroll
 29315  	JMP  check_limit
 29316  
 29317  loop:
 29318  	MOVSS (AX), X1
 29319  	MULSS X0, X1
 29320  	ADDSS (DX), X1
 29321  	MOVSS X1, (DX)
 29322  	DECQ  SI
 29323  	LEAQ  (AX)(CX*4), AX
 29324  	LEAQ  (DX)(BX*4), DX
 29325  
 29326  check_limit:
 29327  	CMPQ SI, $0x00
 29328  	JHI  loop
 29329  	RET
 29330  
 29331  // func AmdAxpyPointerLoopX_V1A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29332  // Requires: SSE
 29333  TEXT ·AmdAxpyPointerLoopX_V1A8U8(SB), NOSPLIT, $0-48
 29334  	MOVSS alpha+0(FP), X0
 29335  	MOVQ  xs+8(FP), AX
 29336  	MOVQ  incx+16(FP), CX
 29337  	MOVQ  ys+24(FP), DX
 29338  	MOVQ  incy+32(FP), BX
 29339  	MOVQ  n+40(FP), SI
 29340  	JMP   check_limit_unroll
 29341  	PCALIGN $0x08
 29342  
 29343  loop_unroll:
 29344  	MOVSS (AX), X1
 29345  	MULSS X0, X1
 29346  	ADDSS (DX), X1
 29347  	MOVSS X1, (DX)
 29348  	LEAQ  (AX)(CX*4), AX
 29349  	LEAQ  (DX)(BX*4), DX
 29350  	MOVSS (AX), X1
 29351  	MULSS X0, X1
 29352  	ADDSS (DX), X1
 29353  	MOVSS X1, (DX)
 29354  	LEAQ  (AX)(CX*4), AX
 29355  	LEAQ  (DX)(BX*4), DX
 29356  	MOVSS (AX), X1
 29357  	MULSS X0, X1
 29358  	ADDSS (DX), X1
 29359  	MOVSS X1, (DX)
 29360  	LEAQ  (AX)(CX*4), AX
 29361  	LEAQ  (DX)(BX*4), DX
 29362  	MOVSS (AX), X1
 29363  	MULSS X0, X1
 29364  	ADDSS (DX), X1
 29365  	MOVSS X1, (DX)
 29366  	LEAQ  (AX)(CX*4), AX
 29367  	LEAQ  (DX)(BX*4), DX
 29368  	MOVSS (AX), X1
 29369  	MULSS X0, X1
 29370  	ADDSS (DX), X1
 29371  	MOVSS X1, (DX)
 29372  	LEAQ  (AX)(CX*4), AX
 29373  	LEAQ  (DX)(BX*4), DX
 29374  	MOVSS (AX), X1
 29375  	MULSS X0, X1
 29376  	ADDSS (DX), X1
 29377  	MOVSS X1, (DX)
 29378  	LEAQ  (AX)(CX*4), AX
 29379  	LEAQ  (DX)(BX*4), DX
 29380  	MOVSS (AX), X1
 29381  	MULSS X0, X1
 29382  	ADDSS (DX), X1
 29383  	MOVSS X1, (DX)
 29384  	LEAQ  (AX)(CX*4), AX
 29385  	LEAQ  (DX)(BX*4), DX
 29386  	MOVSS (AX), X1
 29387  	MULSS X0, X1
 29388  	ADDSS (DX), X1
 29389  	MOVSS X1, (DX)
 29390  	LEAQ  (AX)(CX*4), AX
 29391  	LEAQ  (DX)(BX*4), DX
 29392  	SUBQ  $0x08, SI
 29393  
 29394  check_limit_unroll:
 29395  	CMPQ SI, $0x08
 29396  	JHS  loop_unroll
 29397  	JMP  check_limit
 29398  
 29399  loop:
 29400  	MOVSS (AX), X1
 29401  	MULSS X0, X1
 29402  	ADDSS (DX), X1
 29403  	MOVSS X1, (DX)
 29404  	DECQ  SI
 29405  	LEAQ  (AX)(CX*4), AX
 29406  	LEAQ  (DX)(BX*4), DX
 29407  
 29408  check_limit:
 29409  	CMPQ SI, $0x00
 29410  	JHI  loop
 29411  	RET
 29412  
 29413  // func AmdAxpyPointerLoopX_V2A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29414  // Requires: SSE
 29415  TEXT ·AmdAxpyPointerLoopX_V2A8U8(SB), NOSPLIT, $0-48
 29416  	MOVSS alpha+0(FP), X0
 29417  	MOVQ  xs+8(FP), AX
 29418  	MOVQ  incx+16(FP), CX
 29419  	MOVQ  ys+24(FP), DX
 29420  	MOVQ  incy+32(FP), BX
 29421  	MOVQ  n+40(FP), SI
 29422  	JMP   check_limit_unroll
 29423  	PCALIGN $0x08
 29424  
 29425  loop_unroll:
 29426  	MOVSS (AX), X1
 29427  	MULSS X0, X1
 29428  	ADDSS (DX), X1
 29429  	MOVSS X1, (DX)
 29430  	LEAQ  (AX)(CX*4), AX
 29431  	LEAQ  (DX)(BX*4), DX
 29432  	MOVSS (AX), X1
 29433  	MULSS X0, X1
 29434  	ADDSS (DX), X1
 29435  	MOVSS X1, (DX)
 29436  	LEAQ  (AX)(CX*4), AX
 29437  	LEAQ  (DX)(BX*4), DX
 29438  	MOVSS (AX), X1
 29439  	MULSS X0, X1
 29440  	ADDSS (DX), X1
 29441  	MOVSS X1, (DX)
 29442  	LEAQ  (AX)(CX*4), AX
 29443  	LEAQ  (DX)(BX*4), DX
 29444  	MOVSS (AX), X1
 29445  	MULSS X0, X1
 29446  	ADDSS (DX), X1
 29447  	MOVSS X1, (DX)
 29448  	LEAQ  (AX)(CX*4), AX
 29449  	LEAQ  (DX)(BX*4), DX
 29450  	MOVSS (AX), X1
 29451  	MULSS X0, X1
 29452  	ADDSS (DX), X1
 29453  	MOVSS X1, (DX)
 29454  	LEAQ  (AX)(CX*4), AX
 29455  	LEAQ  (DX)(BX*4), DX
 29456  	MOVSS (AX), X1
 29457  	MULSS X0, X1
 29458  	ADDSS (DX), X1
 29459  	MOVSS X1, (DX)
 29460  	LEAQ  (AX)(CX*4), AX
 29461  	LEAQ  (DX)(BX*4), DX
 29462  	MOVSS (AX), X1
 29463  	MULSS X0, X1
 29464  	ADDSS (DX), X1
 29465  	MOVSS X1, (DX)
 29466  	LEAQ  (AX)(CX*4), AX
 29467  	LEAQ  (DX)(BX*4), DX
 29468  	MOVSS (AX), X1
 29469  	MULSS X0, X1
 29470  	ADDSS (DX), X1
 29471  	MOVSS X1, (DX)
 29472  	LEAQ  (AX)(CX*4), AX
 29473  	LEAQ  (DX)(BX*4), DX
 29474  	SUBQ  $0x08, SI
 29475  
 29476  check_limit_unroll:
 29477  	CMPQ SI, $0x08
 29478  	JHS  loop_unroll
 29479  	JMP  check_limit
 29480  
 29481  loop:
 29482  	MOVSS (AX), X1
 29483  	MULSS X0, X1
 29484  	ADDSS (DX), X1
 29485  	MOVSS X1, (DX)
 29486  	DECQ  SI
 29487  	LEAQ  (AX)(CX*4), AX
 29488  	LEAQ  (DX)(BX*4), DX
 29489  
 29490  check_limit:
 29491  	CMPQ SI, $0x00
 29492  	JHI  loop
 29493  	RET
 29494  
 29495  // func AmdAxpyPointerLoopX_V3A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29496  // Requires: SSE
 29497  TEXT ·AmdAxpyPointerLoopX_V3A8U8(SB), NOSPLIT, $0-48
 29498  	MOVSS alpha+0(FP), X0
 29499  	MOVQ  xs+8(FP), AX
 29500  	MOVQ  incx+16(FP), CX
 29501  	MOVQ  ys+24(FP), DX
 29502  	MOVQ  incy+32(FP), BX
 29503  	MOVQ  n+40(FP), SI
 29504  	JMP   check_limit_unroll
 29505  	PCALIGN $0x08
 29506  
 29507  loop_unroll:
 29508  	MOVSS (AX), X1
 29509  	MULSS X0, X1
 29510  	ADDSS (DX), X1
 29511  	MOVSS X1, (DX)
 29512  	LEAQ  (AX)(CX*4), AX
 29513  	LEAQ  (DX)(BX*4), DX
 29514  	MOVSS (AX), X1
 29515  	MULSS X0, X1
 29516  	ADDSS (DX), X1
 29517  	MOVSS X1, (DX)
 29518  	LEAQ  (AX)(CX*4), AX
 29519  	LEAQ  (DX)(BX*4), DX
 29520  	MOVSS (AX), X1
 29521  	MULSS X0, X1
 29522  	ADDSS (DX), X1
 29523  	MOVSS X1, (DX)
 29524  	LEAQ  (AX)(CX*4), AX
 29525  	LEAQ  (DX)(BX*4), DX
 29526  	MOVSS (AX), X1
 29527  	MULSS X0, X1
 29528  	ADDSS (DX), X1
 29529  	MOVSS X1, (DX)
 29530  	LEAQ  (AX)(CX*4), AX
 29531  	LEAQ  (DX)(BX*4), DX
 29532  	MOVSS (AX), X1
 29533  	MULSS X0, X1
 29534  	ADDSS (DX), X1
 29535  	MOVSS X1, (DX)
 29536  	LEAQ  (AX)(CX*4), AX
 29537  	LEAQ  (DX)(BX*4), DX
 29538  	MOVSS (AX), X1
 29539  	MULSS X0, X1
 29540  	ADDSS (DX), X1
 29541  	MOVSS X1, (DX)
 29542  	LEAQ  (AX)(CX*4), AX
 29543  	LEAQ  (DX)(BX*4), DX
 29544  	MOVSS (AX), X1
 29545  	MULSS X0, X1
 29546  	ADDSS (DX), X1
 29547  	MOVSS X1, (DX)
 29548  	LEAQ  (AX)(CX*4), AX
 29549  	LEAQ  (DX)(BX*4), DX
 29550  	MOVSS (AX), X1
 29551  	MULSS X0, X1
 29552  	ADDSS (DX), X1
 29553  	MOVSS X1, (DX)
 29554  	LEAQ  (AX)(CX*4), AX
 29555  	LEAQ  (DX)(BX*4), DX
 29556  	SUBQ  $0x08, SI
 29557  
 29558  check_limit_unroll:
 29559  	CMPQ SI, $0x08
 29560  	JHS  loop_unroll
 29561  	JMP  check_limit
 29562  
 29563  loop:
 29564  	MOVSS (AX), X1
 29565  	MULSS X0, X1
 29566  	ADDSS (DX), X1
 29567  	MOVSS X1, (DX)
 29568  	DECQ  SI
 29569  	LEAQ  (AX)(CX*4), AX
 29570  	LEAQ  (DX)(BX*4), DX
 29571  
 29572  check_limit:
 29573  	CMPQ SI, $0x00
 29574  	JHI  loop
 29575  	RET
 29576  
 29577  // func AmdAxpyPointerLoopX_V4A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29578  // Requires: SSE
 29579  TEXT ·AmdAxpyPointerLoopX_V4A8U8(SB), NOSPLIT, $0-48
 29580  	MOVSS alpha+0(FP), X0
 29581  	MOVQ  xs+8(FP), AX
 29582  	MOVQ  incx+16(FP), CX
 29583  	MOVQ  ys+24(FP), DX
 29584  	MOVQ  incy+32(FP), BX
 29585  	MOVQ  n+40(FP), SI
 29586  	JMP   check_limit_unroll
 29587  	PCALIGN $0x08
 29588  
 29589  loop_unroll:
 29590  	MOVSS (AX), X1
 29591  	MULSS X0, X1
 29592  	ADDSS (DX), X1
 29593  	MOVSS X1, (DX)
 29594  	LEAQ  (AX)(CX*4), AX
 29595  	LEAQ  (DX)(BX*4), DX
 29596  	MOVSS (AX), X1
 29597  	MULSS X0, X1
 29598  	ADDSS (DX), X1
 29599  	MOVSS X1, (DX)
 29600  	LEAQ  (AX)(CX*4), AX
 29601  	LEAQ  (DX)(BX*4), DX
 29602  	MOVSS (AX), X1
 29603  	MULSS X0, X1
 29604  	ADDSS (DX), X1
 29605  	MOVSS X1, (DX)
 29606  	LEAQ  (AX)(CX*4), AX
 29607  	LEAQ  (DX)(BX*4), DX
 29608  	MOVSS (AX), X1
 29609  	MULSS X0, X1
 29610  	ADDSS (DX), X1
 29611  	MOVSS X1, (DX)
 29612  	LEAQ  (AX)(CX*4), AX
 29613  	LEAQ  (DX)(BX*4), DX
 29614  	MOVSS (AX), X1
 29615  	MULSS X0, X1
 29616  	ADDSS (DX), X1
 29617  	MOVSS X1, (DX)
 29618  	LEAQ  (AX)(CX*4), AX
 29619  	LEAQ  (DX)(BX*4), DX
 29620  	MOVSS (AX), X1
 29621  	MULSS X0, X1
 29622  	ADDSS (DX), X1
 29623  	MOVSS X1, (DX)
 29624  	LEAQ  (AX)(CX*4), AX
 29625  	LEAQ  (DX)(BX*4), DX
 29626  	MOVSS (AX), X1
 29627  	MULSS X0, X1
 29628  	ADDSS (DX), X1
 29629  	MOVSS X1, (DX)
 29630  	LEAQ  (AX)(CX*4), AX
 29631  	LEAQ  (DX)(BX*4), DX
 29632  	MOVSS (AX), X1
 29633  	MULSS X0, X1
 29634  	ADDSS (DX), X1
 29635  	MOVSS X1, (DX)
 29636  	LEAQ  (AX)(CX*4), AX
 29637  	LEAQ  (DX)(BX*4), DX
 29638  	SUBQ  $0x08, SI
 29639  
 29640  check_limit_unroll:
 29641  	CMPQ SI, $0x08
 29642  	JHS  loop_unroll
 29643  	JMP  check_limit
 29644  
 29645  loop:
 29646  	MOVSS (AX), X1
 29647  	MULSS X0, X1
 29648  	ADDSS (DX), X1
 29649  	MOVSS X1, (DX)
 29650  	DECQ  SI
 29651  	LEAQ  (AX)(CX*4), AX
 29652  	LEAQ  (DX)(BX*4), DX
 29653  
 29654  check_limit:
 29655  	CMPQ SI, $0x00
 29656  	JHI  loop
 29657  	RET
 29658  
 29659  // func AmdAxpyPointerLoopX_V5A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29660  // Requires: SSE
 29661  TEXT ·AmdAxpyPointerLoopX_V5A8U8(SB), NOSPLIT, $0-48
 29662  	MOVSS alpha+0(FP), X0
 29663  	MOVQ  xs+8(FP), AX
 29664  	MOVQ  incx+16(FP), CX
 29665  	MOVQ  ys+24(FP), DX
 29666  	MOVQ  incy+32(FP), BX
 29667  	MOVQ  n+40(FP), SI
 29668  	JMP   check_limit_unroll
 29669  	PCALIGN $0x08
 29670  
 29671  loop_unroll:
 29672  	MOVSS (AX), X1
 29673  	MULSS X0, X1
 29674  	ADDSS (DX), X1
 29675  	MOVSS X1, (DX)
 29676  	LEAQ  (AX)(CX*4), AX
 29677  	LEAQ  (DX)(BX*4), DX
 29678  	MOVSS (AX), X1
 29679  	MULSS X0, X1
 29680  	ADDSS (DX), X1
 29681  	MOVSS X1, (DX)
 29682  	LEAQ  (AX)(CX*4), AX
 29683  	LEAQ  (DX)(BX*4), DX
 29684  	MOVSS (AX), X1
 29685  	MULSS X0, X1
 29686  	ADDSS (DX), X1
 29687  	MOVSS X1, (DX)
 29688  	LEAQ  (AX)(CX*4), AX
 29689  	LEAQ  (DX)(BX*4), DX
 29690  	MOVSS (AX), X1
 29691  	MULSS X0, X1
 29692  	ADDSS (DX), X1
 29693  	MOVSS X1, (DX)
 29694  	LEAQ  (AX)(CX*4), AX
 29695  	LEAQ  (DX)(BX*4), DX
 29696  	MOVSS (AX), X1
 29697  	MULSS X0, X1
 29698  	ADDSS (DX), X1
 29699  	MOVSS X1, (DX)
 29700  	LEAQ  (AX)(CX*4), AX
 29701  	LEAQ  (DX)(BX*4), DX
 29702  	MOVSS (AX), X1
 29703  	MULSS X0, X1
 29704  	ADDSS (DX), X1
 29705  	MOVSS X1, (DX)
 29706  	LEAQ  (AX)(CX*4), AX
 29707  	LEAQ  (DX)(BX*4), DX
 29708  	MOVSS (AX), X1
 29709  	MULSS X0, X1
 29710  	ADDSS (DX), X1
 29711  	MOVSS X1, (DX)
 29712  	LEAQ  (AX)(CX*4), AX
 29713  	LEAQ  (DX)(BX*4), DX
 29714  	MOVSS (AX), X1
 29715  	MULSS X0, X1
 29716  	ADDSS (DX), X1
 29717  	MOVSS X1, (DX)
 29718  	LEAQ  (AX)(CX*4), AX
 29719  	LEAQ  (DX)(BX*4), DX
 29720  	SUBQ  $0x08, SI
 29721  
 29722  check_limit_unroll:
 29723  	CMPQ SI, $0x08
 29724  	JHS  loop_unroll
 29725  	JMP  check_limit
 29726  
 29727  loop:
 29728  	MOVSS (AX), X1
 29729  	MULSS X0, X1
 29730  	ADDSS (DX), X1
 29731  	MOVSS X1, (DX)
 29732  	DECQ  SI
 29733  	LEAQ  (AX)(CX*4), AX
 29734  	LEAQ  (DX)(BX*4), DX
 29735  
 29736  check_limit:
 29737  	CMPQ SI, $0x00
 29738  	JHI  loop
 29739  	RET
 29740  
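// The *A9U8 groups follow the PCALIGN $0x08 with a single NOP, apparently
// to nudge the padding in front of loop_unroll by one more step; the A10
// and A11 groups further below do the same with two and three NOPs.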
 29741  // func AmdAxpyPointerLoopX_V0A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29742  // Requires: SSE
 29743  TEXT ·AmdAxpyPointerLoopX_V0A9U8(SB), NOSPLIT, $0-48
 29744  	MOVSS alpha+0(FP), X0
 29745  	MOVQ  xs+8(FP), AX
 29746  	MOVQ  incx+16(FP), CX
 29747  	MOVQ  ys+24(FP), DX
 29748  	MOVQ  incy+32(FP), BX
 29749  	MOVQ  n+40(FP), SI
 29750  	JMP   check_limit_unroll
 29751  	PCALIGN $0x08
 29752  	NOP
 29753  
 29754  loop_unroll:
 29755  	MOVSS (AX), X1
 29756  	MULSS X0, X1
 29757  	ADDSS (DX), X1
 29758  	MOVSS X1, (DX)
 29759  	LEAQ  (AX)(CX*4), AX
 29760  	LEAQ  (DX)(BX*4), DX
 29761  	MOVSS (AX), X1
 29762  	MULSS X0, X1
 29763  	ADDSS (DX), X1
 29764  	MOVSS X1, (DX)
 29765  	LEAQ  (AX)(CX*4), AX
 29766  	LEAQ  (DX)(BX*4), DX
 29767  	MOVSS (AX), X1
 29768  	MULSS X0, X1
 29769  	ADDSS (DX), X1
 29770  	MOVSS X1, (DX)
 29771  	LEAQ  (AX)(CX*4), AX
 29772  	LEAQ  (DX)(BX*4), DX
 29773  	MOVSS (AX), X1
 29774  	MULSS X0, X1
 29775  	ADDSS (DX), X1
 29776  	MOVSS X1, (DX)
 29777  	LEAQ  (AX)(CX*4), AX
 29778  	LEAQ  (DX)(BX*4), DX
 29779  	MOVSS (AX), X1
 29780  	MULSS X0, X1
 29781  	ADDSS (DX), X1
 29782  	MOVSS X1, (DX)
 29783  	LEAQ  (AX)(CX*4), AX
 29784  	LEAQ  (DX)(BX*4), DX
 29785  	MOVSS (AX), X1
 29786  	MULSS X0, X1
 29787  	ADDSS (DX), X1
 29788  	MOVSS X1, (DX)
 29789  	LEAQ  (AX)(CX*4), AX
 29790  	LEAQ  (DX)(BX*4), DX
 29791  	MOVSS (AX), X1
 29792  	MULSS X0, X1
 29793  	ADDSS (DX), X1
 29794  	MOVSS X1, (DX)
 29795  	LEAQ  (AX)(CX*4), AX
 29796  	LEAQ  (DX)(BX*4), DX
 29797  	MOVSS (AX), X1
 29798  	MULSS X0, X1
 29799  	ADDSS (DX), X1
 29800  	MOVSS X1, (DX)
 29801  	LEAQ  (AX)(CX*4), AX
 29802  	LEAQ  (DX)(BX*4), DX
 29803  	SUBQ  $0x08, SI
 29804  
 29805  check_limit_unroll:
 29806  	CMPQ SI, $0x08
 29807  	JHS  loop_unroll
 29808  	JMP  check_limit
 29809  
 29810  loop:
 29811  	MOVSS (AX), X1
 29812  	MULSS X0, X1
 29813  	ADDSS (DX), X1
 29814  	MOVSS X1, (DX)
 29815  	DECQ  SI
 29816  	LEAQ  (AX)(CX*4), AX
 29817  	LEAQ  (DX)(BX*4), DX
 29818  
 29819  check_limit:
 29820  	CMPQ SI, $0x00
 29821  	JHI  loop
 29822  	RET
 29823  
 29824  // func AmdAxpyPointerLoopX_V1A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29825  // Requires: SSE
 29826  TEXT ·AmdAxpyPointerLoopX_V1A9U8(SB), NOSPLIT, $0-48
 29827  	MOVSS alpha+0(FP), X0
 29828  	MOVQ  xs+8(FP), AX
 29829  	MOVQ  incx+16(FP), CX
 29830  	MOVQ  ys+24(FP), DX
 29831  	MOVQ  incy+32(FP), BX
 29832  	MOVQ  n+40(FP), SI
 29833  	JMP   check_limit_unroll
 29834  	PCALIGN $0x08
 29835  	NOP
 29836  
 29837  loop_unroll:
 29838  	MOVSS (AX), X1
 29839  	MULSS X0, X1
 29840  	ADDSS (DX), X1
 29841  	MOVSS X1, (DX)
 29842  	LEAQ  (AX)(CX*4), AX
 29843  	LEAQ  (DX)(BX*4), DX
 29844  	MOVSS (AX), X1
 29845  	MULSS X0, X1
 29846  	ADDSS (DX), X1
 29847  	MOVSS X1, (DX)
 29848  	LEAQ  (AX)(CX*4), AX
 29849  	LEAQ  (DX)(BX*4), DX
 29850  	MOVSS (AX), X1
 29851  	MULSS X0, X1
 29852  	ADDSS (DX), X1
 29853  	MOVSS X1, (DX)
 29854  	LEAQ  (AX)(CX*4), AX
 29855  	LEAQ  (DX)(BX*4), DX
 29856  	MOVSS (AX), X1
 29857  	MULSS X0, X1
 29858  	ADDSS (DX), X1
 29859  	MOVSS X1, (DX)
 29860  	LEAQ  (AX)(CX*4), AX
 29861  	LEAQ  (DX)(BX*4), DX
 29862  	MOVSS (AX), X1
 29863  	MULSS X0, X1
 29864  	ADDSS (DX), X1
 29865  	MOVSS X1, (DX)
 29866  	LEAQ  (AX)(CX*4), AX
 29867  	LEAQ  (DX)(BX*4), DX
 29868  	MOVSS (AX), X1
 29869  	MULSS X0, X1
 29870  	ADDSS (DX), X1
 29871  	MOVSS X1, (DX)
 29872  	LEAQ  (AX)(CX*4), AX
 29873  	LEAQ  (DX)(BX*4), DX
 29874  	MOVSS (AX), X1
 29875  	MULSS X0, X1
 29876  	ADDSS (DX), X1
 29877  	MOVSS X1, (DX)
 29878  	LEAQ  (AX)(CX*4), AX
 29879  	LEAQ  (DX)(BX*4), DX
 29880  	MOVSS (AX), X1
 29881  	MULSS X0, X1
 29882  	ADDSS (DX), X1
 29883  	MOVSS X1, (DX)
 29884  	LEAQ  (AX)(CX*4), AX
 29885  	LEAQ  (DX)(BX*4), DX
 29886  	SUBQ  $0x08, SI
 29887  
 29888  check_limit_unroll:
 29889  	CMPQ SI, $0x08
 29890  	JHS  loop_unroll
 29891  	JMP  check_limit
 29892  
 29893  loop:
 29894  	MOVSS (AX), X1
 29895  	MULSS X0, X1
 29896  	ADDSS (DX), X1
 29897  	MOVSS X1, (DX)
 29898  	DECQ  SI
 29899  	LEAQ  (AX)(CX*4), AX
 29900  	LEAQ  (DX)(BX*4), DX
 29901  
 29902  check_limit:
 29903  	CMPQ SI, $0x00
 29904  	JHI  loop
 29905  	RET
 29906  
 29907  // func AmdAxpyPointerLoopX_V2A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29908  // Requires: SSE
 29909  TEXT ·AmdAxpyPointerLoopX_V2A9U8(SB), NOSPLIT, $0-48
 29910  	MOVSS alpha+0(FP), X0
 29911  	MOVQ  xs+8(FP), AX
 29912  	MOVQ  incx+16(FP), CX
 29913  	MOVQ  ys+24(FP), DX
 29914  	MOVQ  incy+32(FP), BX
 29915  	MOVQ  n+40(FP), SI
 29916  	JMP   check_limit_unroll
 29917  	PCALIGN $0x08
 29918  	NOP
 29919  
 29920  loop_unroll:
 29921  	MOVSS (AX), X1
 29922  	MULSS X0, X1
 29923  	ADDSS (DX), X1
 29924  	MOVSS X1, (DX)
 29925  	LEAQ  (AX)(CX*4), AX
 29926  	LEAQ  (DX)(BX*4), DX
 29927  	MOVSS (AX), X1
 29928  	MULSS X0, X1
 29929  	ADDSS (DX), X1
 29930  	MOVSS X1, (DX)
 29931  	LEAQ  (AX)(CX*4), AX
 29932  	LEAQ  (DX)(BX*4), DX
 29933  	MOVSS (AX), X1
 29934  	MULSS X0, X1
 29935  	ADDSS (DX), X1
 29936  	MOVSS X1, (DX)
 29937  	LEAQ  (AX)(CX*4), AX
 29938  	LEAQ  (DX)(BX*4), DX
 29939  	MOVSS (AX), X1
 29940  	MULSS X0, X1
 29941  	ADDSS (DX), X1
 29942  	MOVSS X1, (DX)
 29943  	LEAQ  (AX)(CX*4), AX
 29944  	LEAQ  (DX)(BX*4), DX
 29945  	MOVSS (AX), X1
 29946  	MULSS X0, X1
 29947  	ADDSS (DX), X1
 29948  	MOVSS X1, (DX)
 29949  	LEAQ  (AX)(CX*4), AX
 29950  	LEAQ  (DX)(BX*4), DX
 29951  	MOVSS (AX), X1
 29952  	MULSS X0, X1
 29953  	ADDSS (DX), X1
 29954  	MOVSS X1, (DX)
 29955  	LEAQ  (AX)(CX*4), AX
 29956  	LEAQ  (DX)(BX*4), DX
 29957  	MOVSS (AX), X1
 29958  	MULSS X0, X1
 29959  	ADDSS (DX), X1
 29960  	MOVSS X1, (DX)
 29961  	LEAQ  (AX)(CX*4), AX
 29962  	LEAQ  (DX)(BX*4), DX
 29963  	MOVSS (AX), X1
 29964  	MULSS X0, X1
 29965  	ADDSS (DX), X1
 29966  	MOVSS X1, (DX)
 29967  	LEAQ  (AX)(CX*4), AX
 29968  	LEAQ  (DX)(BX*4), DX
 29969  	SUBQ  $0x08, SI
 29970  
 29971  check_limit_unroll:
 29972  	CMPQ SI, $0x08
 29973  	JHS  loop_unroll
 29974  	JMP  check_limit
 29975  
 29976  loop:
 29977  	MOVSS (AX), X1
 29978  	MULSS X0, X1
 29979  	ADDSS (DX), X1
 29980  	MOVSS X1, (DX)
 29981  	DECQ  SI
 29982  	LEAQ  (AX)(CX*4), AX
 29983  	LEAQ  (DX)(BX*4), DX
 29984  
 29985  check_limit:
 29986  	CMPQ SI, $0x00
 29987  	JHI  loop
 29988  	RET
 29989  
 29990  // func AmdAxpyPointerLoopX_V3A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 29991  // Requires: SSE
 29992  TEXT ·AmdAxpyPointerLoopX_V3A9U8(SB), NOSPLIT, $0-48
 29993  	MOVSS alpha+0(FP), X0
 29994  	MOVQ  xs+8(FP), AX
 29995  	MOVQ  incx+16(FP), CX
 29996  	MOVQ  ys+24(FP), DX
 29997  	MOVQ  incy+32(FP), BX
 29998  	MOVQ  n+40(FP), SI
 29999  	JMP   check_limit_unroll
 30000  	PCALIGN $0x08
 30001  	NOP
 30002  
 30003  loop_unroll:
 30004  	MOVSS (AX), X1
 30005  	MULSS X0, X1
 30006  	ADDSS (DX), X1
 30007  	MOVSS X1, (DX)
 30008  	LEAQ  (AX)(CX*4), AX
 30009  	LEAQ  (DX)(BX*4), DX
 30010  	MOVSS (AX), X1
 30011  	MULSS X0, X1
 30012  	ADDSS (DX), X1
 30013  	MOVSS X1, (DX)
 30014  	LEAQ  (AX)(CX*4), AX
 30015  	LEAQ  (DX)(BX*4), DX
 30016  	MOVSS (AX), X1
 30017  	MULSS X0, X1
 30018  	ADDSS (DX), X1
 30019  	MOVSS X1, (DX)
 30020  	LEAQ  (AX)(CX*4), AX
 30021  	LEAQ  (DX)(BX*4), DX
 30022  	MOVSS (AX), X1
 30023  	MULSS X0, X1
 30024  	ADDSS (DX), X1
 30025  	MOVSS X1, (DX)
 30026  	LEAQ  (AX)(CX*4), AX
 30027  	LEAQ  (DX)(BX*4), DX
 30028  	MOVSS (AX), X1
 30029  	MULSS X0, X1
 30030  	ADDSS (DX), X1
 30031  	MOVSS X1, (DX)
 30032  	LEAQ  (AX)(CX*4), AX
 30033  	LEAQ  (DX)(BX*4), DX
 30034  	MOVSS (AX), X1
 30035  	MULSS X0, X1
 30036  	ADDSS (DX), X1
 30037  	MOVSS X1, (DX)
 30038  	LEAQ  (AX)(CX*4), AX
 30039  	LEAQ  (DX)(BX*4), DX
 30040  	MOVSS (AX), X1
 30041  	MULSS X0, X1
 30042  	ADDSS (DX), X1
 30043  	MOVSS X1, (DX)
 30044  	LEAQ  (AX)(CX*4), AX
 30045  	LEAQ  (DX)(BX*4), DX
 30046  	MOVSS (AX), X1
 30047  	MULSS X0, X1
 30048  	ADDSS (DX), X1
 30049  	MOVSS X1, (DX)
 30050  	LEAQ  (AX)(CX*4), AX
 30051  	LEAQ  (DX)(BX*4), DX
 30052  	SUBQ  $0x08, SI
 30053  
 30054  check_limit_unroll:
 30055  	CMPQ SI, $0x08
 30056  	JHS  loop_unroll
 30057  	JMP  check_limit
 30058  
 30059  loop:
 30060  	MOVSS (AX), X1
 30061  	MULSS X0, X1
 30062  	ADDSS (DX), X1
 30063  	MOVSS X1, (DX)
 30064  	DECQ  SI
 30065  	LEAQ  (AX)(CX*4), AX
 30066  	LEAQ  (DX)(BX*4), DX
 30067  
 30068  check_limit:
 30069  	CMPQ SI, $0x00
 30070  	JHI  loop
 30071  	RET
 30072  
 30073  // func AmdAxpyPointerLoopX_V4A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30074  // Requires: SSE
 30075  TEXT ·AmdAxpyPointerLoopX_V4A9U8(SB), NOSPLIT, $0-48
 30076  	MOVSS alpha+0(FP), X0
 30077  	MOVQ  xs+8(FP), AX
 30078  	MOVQ  incx+16(FP), CX
 30079  	MOVQ  ys+24(FP), DX
 30080  	MOVQ  incy+32(FP), BX
 30081  	MOVQ  n+40(FP), SI
 30082  	JMP   check_limit_unroll
 30083  	PCALIGN $0x08
 30084  	NOP
 30085  
 30086  loop_unroll:
 30087  	MOVSS (AX), X1
 30088  	MULSS X0, X1
 30089  	ADDSS (DX), X1
 30090  	MOVSS X1, (DX)
 30091  	LEAQ  (AX)(CX*4), AX
 30092  	LEAQ  (DX)(BX*4), DX
 30093  	MOVSS (AX), X1
 30094  	MULSS X0, X1
 30095  	ADDSS (DX), X1
 30096  	MOVSS X1, (DX)
 30097  	LEAQ  (AX)(CX*4), AX
 30098  	LEAQ  (DX)(BX*4), DX
 30099  	MOVSS (AX), X1
 30100  	MULSS X0, X1
 30101  	ADDSS (DX), X1
 30102  	MOVSS X1, (DX)
 30103  	LEAQ  (AX)(CX*4), AX
 30104  	LEAQ  (DX)(BX*4), DX
 30105  	MOVSS (AX), X1
 30106  	MULSS X0, X1
 30107  	ADDSS (DX), X1
 30108  	MOVSS X1, (DX)
 30109  	LEAQ  (AX)(CX*4), AX
 30110  	LEAQ  (DX)(BX*4), DX
 30111  	MOVSS (AX), X1
 30112  	MULSS X0, X1
 30113  	ADDSS (DX), X1
 30114  	MOVSS X1, (DX)
 30115  	LEAQ  (AX)(CX*4), AX
 30116  	LEAQ  (DX)(BX*4), DX
 30117  	MOVSS (AX), X1
 30118  	MULSS X0, X1
 30119  	ADDSS (DX), X1
 30120  	MOVSS X1, (DX)
 30121  	LEAQ  (AX)(CX*4), AX
 30122  	LEAQ  (DX)(BX*4), DX
 30123  	MOVSS (AX), X1
 30124  	MULSS X0, X1
 30125  	ADDSS (DX), X1
 30126  	MOVSS X1, (DX)
 30127  	LEAQ  (AX)(CX*4), AX
 30128  	LEAQ  (DX)(BX*4), DX
 30129  	MOVSS (AX), X1
 30130  	MULSS X0, X1
 30131  	ADDSS (DX), X1
 30132  	MOVSS X1, (DX)
 30133  	LEAQ  (AX)(CX*4), AX
 30134  	LEAQ  (DX)(BX*4), DX
 30135  	SUBQ  $0x08, SI
 30136  
 30137  check_limit_unroll:
 30138  	CMPQ SI, $0x08
 30139  	JHS  loop_unroll
 30140  	JMP  check_limit
 30141  
 30142  loop:
 30143  	MOVSS (AX), X1
 30144  	MULSS X0, X1
 30145  	ADDSS (DX), X1
 30146  	MOVSS X1, (DX)
 30147  	DECQ  SI
 30148  	LEAQ  (AX)(CX*4), AX
 30149  	LEAQ  (DX)(BX*4), DX
 30150  
 30151  check_limit:
 30152  	CMPQ SI, $0x00
 30153  	JHI  loop
 30154  	RET
 30155  
 30156  // func AmdAxpyPointerLoopX_V5A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30157  // Requires: SSE
 30158  TEXT ·AmdAxpyPointerLoopX_V5A9U8(SB), NOSPLIT, $0-48
 30159  	MOVSS alpha+0(FP), X0
 30160  	MOVQ  xs+8(FP), AX
 30161  	MOVQ  incx+16(FP), CX
 30162  	MOVQ  ys+24(FP), DX
 30163  	MOVQ  incy+32(FP), BX
 30164  	MOVQ  n+40(FP), SI
 30165  	JMP   check_limit_unroll
 30166  	PCALIGN $0x08
 30167  	NOP
 30168  
 30169  loop_unroll:
 30170  	MOVSS (AX), X1
 30171  	MULSS X0, X1
 30172  	ADDSS (DX), X1
 30173  	MOVSS X1, (DX)
 30174  	LEAQ  (AX)(CX*4), AX
 30175  	LEAQ  (DX)(BX*4), DX
 30176  	MOVSS (AX), X1
 30177  	MULSS X0, X1
 30178  	ADDSS (DX), X1
 30179  	MOVSS X1, (DX)
 30180  	LEAQ  (AX)(CX*4), AX
 30181  	LEAQ  (DX)(BX*4), DX
 30182  	MOVSS (AX), X1
 30183  	MULSS X0, X1
 30184  	ADDSS (DX), X1
 30185  	MOVSS X1, (DX)
 30186  	LEAQ  (AX)(CX*4), AX
 30187  	LEAQ  (DX)(BX*4), DX
 30188  	MOVSS (AX), X1
 30189  	MULSS X0, X1
 30190  	ADDSS (DX), X1
 30191  	MOVSS X1, (DX)
 30192  	LEAQ  (AX)(CX*4), AX
 30193  	LEAQ  (DX)(BX*4), DX
 30194  	MOVSS (AX), X1
 30195  	MULSS X0, X1
 30196  	ADDSS (DX), X1
 30197  	MOVSS X1, (DX)
 30198  	LEAQ  (AX)(CX*4), AX
 30199  	LEAQ  (DX)(BX*4), DX
 30200  	MOVSS (AX), X1
 30201  	MULSS X0, X1
 30202  	ADDSS (DX), X1
 30203  	MOVSS X1, (DX)
 30204  	LEAQ  (AX)(CX*4), AX
 30205  	LEAQ  (DX)(BX*4), DX
 30206  	MOVSS (AX), X1
 30207  	MULSS X0, X1
 30208  	ADDSS (DX), X1
 30209  	MOVSS X1, (DX)
 30210  	LEAQ  (AX)(CX*4), AX
 30211  	LEAQ  (DX)(BX*4), DX
 30212  	MOVSS (AX), X1
 30213  	MULSS X0, X1
 30214  	ADDSS (DX), X1
 30215  	MOVSS X1, (DX)
 30216  	LEAQ  (AX)(CX*4), AX
 30217  	LEAQ  (DX)(BX*4), DX
 30218  	SUBQ  $0x08, SI
 30219  
 30220  check_limit_unroll:
 30221  	CMPQ SI, $0x08
 30222  	JHS  loop_unroll
 30223  	JMP  check_limit
 30224  
 30225  loop:
 30226  	MOVSS (AX), X1
 30227  	MULSS X0, X1
 30228  	ADDSS (DX), X1
 30229  	MOVSS X1, (DX)
 30230  	DECQ  SI
 30231  	LEAQ  (AX)(CX*4), AX
 30232  	LEAQ  (DX)(BX*4), DX
 30233  
 30234  check_limit:
 30235  	CMPQ SI, $0x00
 30236  	JHI  loop
 30237  	RET
 30238  
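// The *A10U8 groups use PCALIGN $0x08 followed by two NOPs.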
 30239  // func AmdAxpyPointerLoopX_V0A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30240  // Requires: SSE
 30241  TEXT ·AmdAxpyPointerLoopX_V0A10U8(SB), NOSPLIT, $0-48
 30242  	MOVSS alpha+0(FP), X0
 30243  	MOVQ  xs+8(FP), AX
 30244  	MOVQ  incx+16(FP), CX
 30245  	MOVQ  ys+24(FP), DX
 30246  	MOVQ  incy+32(FP), BX
 30247  	MOVQ  n+40(FP), SI
 30248  	JMP   check_limit_unroll
 30249  	PCALIGN $0x08
 30250  	NOP
 30251  	NOP
 30252  
 30253  loop_unroll:
 30254  	MOVSS (AX), X1
 30255  	MULSS X0, X1
 30256  	ADDSS (DX), X1
 30257  	MOVSS X1, (DX)
 30258  	LEAQ  (AX)(CX*4), AX
 30259  	LEAQ  (DX)(BX*4), DX
 30260  	MOVSS (AX), X1
 30261  	MULSS X0, X1
 30262  	ADDSS (DX), X1
 30263  	MOVSS X1, (DX)
 30264  	LEAQ  (AX)(CX*4), AX
 30265  	LEAQ  (DX)(BX*4), DX
 30266  	MOVSS (AX), X1
 30267  	MULSS X0, X1
 30268  	ADDSS (DX), X1
 30269  	MOVSS X1, (DX)
 30270  	LEAQ  (AX)(CX*4), AX
 30271  	LEAQ  (DX)(BX*4), DX
 30272  	MOVSS (AX), X1
 30273  	MULSS X0, X1
 30274  	ADDSS (DX), X1
 30275  	MOVSS X1, (DX)
 30276  	LEAQ  (AX)(CX*4), AX
 30277  	LEAQ  (DX)(BX*4), DX
 30278  	MOVSS (AX), X1
 30279  	MULSS X0, X1
 30280  	ADDSS (DX), X1
 30281  	MOVSS X1, (DX)
 30282  	LEAQ  (AX)(CX*4), AX
 30283  	LEAQ  (DX)(BX*4), DX
 30284  	MOVSS (AX), X1
 30285  	MULSS X0, X1
 30286  	ADDSS (DX), X1
 30287  	MOVSS X1, (DX)
 30288  	LEAQ  (AX)(CX*4), AX
 30289  	LEAQ  (DX)(BX*4), DX
 30290  	MOVSS (AX), X1
 30291  	MULSS X0, X1
 30292  	ADDSS (DX), X1
 30293  	MOVSS X1, (DX)
 30294  	LEAQ  (AX)(CX*4), AX
 30295  	LEAQ  (DX)(BX*4), DX
 30296  	MOVSS (AX), X1
 30297  	MULSS X0, X1
 30298  	ADDSS (DX), X1
 30299  	MOVSS X1, (DX)
 30300  	LEAQ  (AX)(CX*4), AX
 30301  	LEAQ  (DX)(BX*4), DX
 30302  	SUBQ  $0x08, SI
 30303  
 30304  check_limit_unroll:
 30305  	CMPQ SI, $0x08
 30306  	JHS  loop_unroll
 30307  	JMP  check_limit
 30308  
 30309  loop:
 30310  	MOVSS (AX), X1
 30311  	MULSS X0, X1
 30312  	ADDSS (DX), X1
 30313  	MOVSS X1, (DX)
 30314  	DECQ  SI
 30315  	LEAQ  (AX)(CX*4), AX
 30316  	LEAQ  (DX)(BX*4), DX
 30317  
 30318  check_limit:
 30319  	CMPQ SI, $0x00
 30320  	JHI  loop
 30321  	RET
 30322  
 30323  // func AmdAxpyPointerLoopX_V1A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30324  // Requires: SSE
 30325  TEXT ·AmdAxpyPointerLoopX_V1A10U8(SB), NOSPLIT, $0-48
 30326  	MOVSS alpha+0(FP), X0
 30327  	MOVQ  xs+8(FP), AX
 30328  	MOVQ  incx+16(FP), CX
 30329  	MOVQ  ys+24(FP), DX
 30330  	MOVQ  incy+32(FP), BX
 30331  	MOVQ  n+40(FP), SI
 30332  	JMP   check_limit_unroll
 30333  	PCALIGN $0x08
 30334  	NOP
 30335  	NOP
 30336  
 30337  loop_unroll:
 30338  	MOVSS (AX), X1
 30339  	MULSS X0, X1
 30340  	ADDSS (DX), X1
 30341  	MOVSS X1, (DX)
 30342  	LEAQ  (AX)(CX*4), AX
 30343  	LEAQ  (DX)(BX*4), DX
 30344  	MOVSS (AX), X1
 30345  	MULSS X0, X1
 30346  	ADDSS (DX), X1
 30347  	MOVSS X1, (DX)
 30348  	LEAQ  (AX)(CX*4), AX
 30349  	LEAQ  (DX)(BX*4), DX
 30350  	MOVSS (AX), X1
 30351  	MULSS X0, X1
 30352  	ADDSS (DX), X1
 30353  	MOVSS X1, (DX)
 30354  	LEAQ  (AX)(CX*4), AX
 30355  	LEAQ  (DX)(BX*4), DX
 30356  	MOVSS (AX), X1
 30357  	MULSS X0, X1
 30358  	ADDSS (DX), X1
 30359  	MOVSS X1, (DX)
 30360  	LEAQ  (AX)(CX*4), AX
 30361  	LEAQ  (DX)(BX*4), DX
 30362  	MOVSS (AX), X1
 30363  	MULSS X0, X1
 30364  	ADDSS (DX), X1
 30365  	MOVSS X1, (DX)
 30366  	LEAQ  (AX)(CX*4), AX
 30367  	LEAQ  (DX)(BX*4), DX
 30368  	MOVSS (AX), X1
 30369  	MULSS X0, X1
 30370  	ADDSS (DX), X1
 30371  	MOVSS X1, (DX)
 30372  	LEAQ  (AX)(CX*4), AX
 30373  	LEAQ  (DX)(BX*4), DX
 30374  	MOVSS (AX), X1
 30375  	MULSS X0, X1
 30376  	ADDSS (DX), X1
 30377  	MOVSS X1, (DX)
 30378  	LEAQ  (AX)(CX*4), AX
 30379  	LEAQ  (DX)(BX*4), DX
 30380  	MOVSS (AX), X1
 30381  	MULSS X0, X1
 30382  	ADDSS (DX), X1
 30383  	MOVSS X1, (DX)
 30384  	LEAQ  (AX)(CX*4), AX
 30385  	LEAQ  (DX)(BX*4), DX
 30386  	SUBQ  $0x08, SI
 30387  
 30388  check_limit_unroll:
 30389  	CMPQ SI, $0x08
 30390  	JHS  loop_unroll
 30391  	JMP  check_limit
 30392  
 30393  loop:
 30394  	MOVSS (AX), X1
 30395  	MULSS X0, X1
 30396  	ADDSS (DX), X1
 30397  	MOVSS X1, (DX)
 30398  	DECQ  SI
 30399  	LEAQ  (AX)(CX*4), AX
 30400  	LEAQ  (DX)(BX*4), DX
 30401  
 30402  check_limit:
 30403  	CMPQ SI, $0x00
 30404  	JHI  loop
 30405  	RET
 30406  
 30407  // func AmdAxpyPointerLoopX_V2A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30408  // Requires: SSE
 30409  TEXT ·AmdAxpyPointerLoopX_V2A10U8(SB), NOSPLIT, $0-48
 30410  	MOVSS alpha+0(FP), X0
 30411  	MOVQ  xs+8(FP), AX
 30412  	MOVQ  incx+16(FP), CX
 30413  	MOVQ  ys+24(FP), DX
 30414  	MOVQ  incy+32(FP), BX
 30415  	MOVQ  n+40(FP), SI
 30416  	JMP   check_limit_unroll
 30417  	PCALIGN $0x08
 30418  	NOP
 30419  	NOP
 30420  
 30421  loop_unroll:
 30422  	MOVSS (AX), X1
 30423  	MULSS X0, X1
 30424  	ADDSS (DX), X1
 30425  	MOVSS X1, (DX)
 30426  	LEAQ  (AX)(CX*4), AX
 30427  	LEAQ  (DX)(BX*4), DX
 30428  	MOVSS (AX), X1
 30429  	MULSS X0, X1
 30430  	ADDSS (DX), X1
 30431  	MOVSS X1, (DX)
 30432  	LEAQ  (AX)(CX*4), AX
 30433  	LEAQ  (DX)(BX*4), DX
 30434  	MOVSS (AX), X1
 30435  	MULSS X0, X1
 30436  	ADDSS (DX), X1
 30437  	MOVSS X1, (DX)
 30438  	LEAQ  (AX)(CX*4), AX
 30439  	LEAQ  (DX)(BX*4), DX
 30440  	MOVSS (AX), X1
 30441  	MULSS X0, X1
 30442  	ADDSS (DX), X1
 30443  	MOVSS X1, (DX)
 30444  	LEAQ  (AX)(CX*4), AX
 30445  	LEAQ  (DX)(BX*4), DX
 30446  	MOVSS (AX), X1
 30447  	MULSS X0, X1
 30448  	ADDSS (DX), X1
 30449  	MOVSS X1, (DX)
 30450  	LEAQ  (AX)(CX*4), AX
 30451  	LEAQ  (DX)(BX*4), DX
 30452  	MOVSS (AX), X1
 30453  	MULSS X0, X1
 30454  	ADDSS (DX), X1
 30455  	MOVSS X1, (DX)
 30456  	LEAQ  (AX)(CX*4), AX
 30457  	LEAQ  (DX)(BX*4), DX
 30458  	MOVSS (AX), X1
 30459  	MULSS X0, X1
 30460  	ADDSS (DX), X1
 30461  	MOVSS X1, (DX)
 30462  	LEAQ  (AX)(CX*4), AX
 30463  	LEAQ  (DX)(BX*4), DX
 30464  	MOVSS (AX), X1
 30465  	MULSS X0, X1
 30466  	ADDSS (DX), X1
 30467  	MOVSS X1, (DX)
 30468  	LEAQ  (AX)(CX*4), AX
 30469  	LEAQ  (DX)(BX*4), DX
 30470  	SUBQ  $0x08, SI
 30471  
 30472  check_limit_unroll:
 30473  	CMPQ SI, $0x08
 30474  	JHS  loop_unroll
 30475  	JMP  check_limit
 30476  
 30477  loop:
 30478  	MOVSS (AX), X1
 30479  	MULSS X0, X1
 30480  	ADDSS (DX), X1
 30481  	MOVSS X1, (DX)
 30482  	DECQ  SI
 30483  	LEAQ  (AX)(CX*4), AX
 30484  	LEAQ  (DX)(BX*4), DX
 30485  
 30486  check_limit:
 30487  	CMPQ SI, $0x00
 30488  	JHI  loop
 30489  	RET
 30490  
 30491  // func AmdAxpyPointerLoopX_V3A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30492  // Requires: SSE
 30493  TEXT ·AmdAxpyPointerLoopX_V3A10U8(SB), NOSPLIT, $0-48
 30494  	MOVSS alpha+0(FP), X0
 30495  	MOVQ  xs+8(FP), AX
 30496  	MOVQ  incx+16(FP), CX
 30497  	MOVQ  ys+24(FP), DX
 30498  	MOVQ  incy+32(FP), BX
 30499  	MOVQ  n+40(FP), SI
 30500  	JMP   check_limit_unroll
 30501  	PCALIGN $0x08
 30502  	NOP
 30503  	NOP
 30504  
 30505  loop_unroll:
 30506  	MOVSS (AX), X1
 30507  	MULSS X0, X1
 30508  	ADDSS (DX), X1
 30509  	MOVSS X1, (DX)
 30510  	LEAQ  (AX)(CX*4), AX
 30511  	LEAQ  (DX)(BX*4), DX
 30512  	MOVSS (AX), X1
 30513  	MULSS X0, X1
 30514  	ADDSS (DX), X1
 30515  	MOVSS X1, (DX)
 30516  	LEAQ  (AX)(CX*4), AX
 30517  	LEAQ  (DX)(BX*4), DX
 30518  	MOVSS (AX), X1
 30519  	MULSS X0, X1
 30520  	ADDSS (DX), X1
 30521  	MOVSS X1, (DX)
 30522  	LEAQ  (AX)(CX*4), AX
 30523  	LEAQ  (DX)(BX*4), DX
 30524  	MOVSS (AX), X1
 30525  	MULSS X0, X1
 30526  	ADDSS (DX), X1
 30527  	MOVSS X1, (DX)
 30528  	LEAQ  (AX)(CX*4), AX
 30529  	LEAQ  (DX)(BX*4), DX
 30530  	MOVSS (AX), X1
 30531  	MULSS X0, X1
 30532  	ADDSS (DX), X1
 30533  	MOVSS X1, (DX)
 30534  	LEAQ  (AX)(CX*4), AX
 30535  	LEAQ  (DX)(BX*4), DX
 30536  	MOVSS (AX), X1
 30537  	MULSS X0, X1
 30538  	ADDSS (DX), X1
 30539  	MOVSS X1, (DX)
 30540  	LEAQ  (AX)(CX*4), AX
 30541  	LEAQ  (DX)(BX*4), DX
 30542  	MOVSS (AX), X1
 30543  	MULSS X0, X1
 30544  	ADDSS (DX), X1
 30545  	MOVSS X1, (DX)
 30546  	LEAQ  (AX)(CX*4), AX
 30547  	LEAQ  (DX)(BX*4), DX
 30548  	MOVSS (AX), X1
 30549  	MULSS X0, X1
 30550  	ADDSS (DX), X1
 30551  	MOVSS X1, (DX)
 30552  	LEAQ  (AX)(CX*4), AX
 30553  	LEAQ  (DX)(BX*4), DX
 30554  	SUBQ  $0x08, SI
 30555  
 30556  check_limit_unroll:
 30557  	CMPQ SI, $0x08
 30558  	JHS  loop_unroll
 30559  	JMP  check_limit
 30560  
 30561  loop:
 30562  	MOVSS (AX), X1
 30563  	MULSS X0, X1
 30564  	ADDSS (DX), X1
 30565  	MOVSS X1, (DX)
 30566  	DECQ  SI
 30567  	LEAQ  (AX)(CX*4), AX
 30568  	LEAQ  (DX)(BX*4), DX
 30569  
 30570  check_limit:
 30571  	CMPQ SI, $0x00
 30572  	JHI  loop
 30573  	RET
 30574  
 30575  // func AmdAxpyPointerLoopX_V4A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30576  // Requires: SSE
 30577  TEXT ·AmdAxpyPointerLoopX_V4A10U8(SB), NOSPLIT, $0-48
 30578  	MOVSS alpha+0(FP), X0
 30579  	MOVQ  xs+8(FP), AX
 30580  	MOVQ  incx+16(FP), CX
 30581  	MOVQ  ys+24(FP), DX
 30582  	MOVQ  incy+32(FP), BX
 30583  	MOVQ  n+40(FP), SI
 30584  	JMP   check_limit_unroll
 30585  	PCALIGN $0x08
 30586  	NOP
 30587  	NOP
 30588  
 30589  loop_unroll:
 30590  	MOVSS (AX), X1
 30591  	MULSS X0, X1
 30592  	ADDSS (DX), X1
 30593  	MOVSS X1, (DX)
 30594  	LEAQ  (AX)(CX*4), AX
 30595  	LEAQ  (DX)(BX*4), DX
 30596  	MOVSS (AX), X1
 30597  	MULSS X0, X1
 30598  	ADDSS (DX), X1
 30599  	MOVSS X1, (DX)
 30600  	LEAQ  (AX)(CX*4), AX
 30601  	LEAQ  (DX)(BX*4), DX
 30602  	MOVSS (AX), X1
 30603  	MULSS X0, X1
 30604  	ADDSS (DX), X1
 30605  	MOVSS X1, (DX)
 30606  	LEAQ  (AX)(CX*4), AX
 30607  	LEAQ  (DX)(BX*4), DX
 30608  	MOVSS (AX), X1
 30609  	MULSS X0, X1
 30610  	ADDSS (DX), X1
 30611  	MOVSS X1, (DX)
 30612  	LEAQ  (AX)(CX*4), AX
 30613  	LEAQ  (DX)(BX*4), DX
 30614  	MOVSS (AX), X1
 30615  	MULSS X0, X1
 30616  	ADDSS (DX), X1
 30617  	MOVSS X1, (DX)
 30618  	LEAQ  (AX)(CX*4), AX
 30619  	LEAQ  (DX)(BX*4), DX
 30620  	MOVSS (AX), X1
 30621  	MULSS X0, X1
 30622  	ADDSS (DX), X1
 30623  	MOVSS X1, (DX)
 30624  	LEAQ  (AX)(CX*4), AX
 30625  	LEAQ  (DX)(BX*4), DX
 30626  	MOVSS (AX), X1
 30627  	MULSS X0, X1
 30628  	ADDSS (DX), X1
 30629  	MOVSS X1, (DX)
 30630  	LEAQ  (AX)(CX*4), AX
 30631  	LEAQ  (DX)(BX*4), DX
 30632  	MOVSS (AX), X1
 30633  	MULSS X0, X1
 30634  	ADDSS (DX), X1
 30635  	MOVSS X1, (DX)
 30636  	LEAQ  (AX)(CX*4), AX
 30637  	LEAQ  (DX)(BX*4), DX
 30638  	SUBQ  $0x08, SI
 30639  
 30640  check_limit_unroll:
 30641  	CMPQ SI, $0x08
 30642  	JHS  loop_unroll
 30643  	JMP  check_limit
 30644  
 30645  loop:
 30646  	MOVSS (AX), X1
 30647  	MULSS X0, X1
 30648  	ADDSS (DX), X1
 30649  	MOVSS X1, (DX)
 30650  	DECQ  SI
 30651  	LEAQ  (AX)(CX*4), AX
 30652  	LEAQ  (DX)(BX*4), DX
 30653  
 30654  check_limit:
 30655  	CMPQ SI, $0x00
 30656  	JHI  loop
 30657  	RET
 30658  
 30659  // func AmdAxpyPointerLoopX_V5A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30660  // Requires: SSE
 30661  TEXT ·AmdAxpyPointerLoopX_V5A10U8(SB), NOSPLIT, $0-48
 30662  	MOVSS alpha+0(FP), X0
 30663  	MOVQ  xs+8(FP), AX
 30664  	MOVQ  incx+16(FP), CX
 30665  	MOVQ  ys+24(FP), DX
 30666  	MOVQ  incy+32(FP), BX
 30667  	MOVQ  n+40(FP), SI
 30668  	JMP   check_limit_unroll
 30669  	PCALIGN $0x08
 30670  	NOP
 30671  	NOP
 30672  
 30673  loop_unroll:
 30674  	MOVSS (AX), X1
 30675  	MULSS X0, X1
 30676  	ADDSS (DX), X1
 30677  	MOVSS X1, (DX)
 30678  	LEAQ  (AX)(CX*4), AX
 30679  	LEAQ  (DX)(BX*4), DX
 30680  	MOVSS (AX), X1
 30681  	MULSS X0, X1
 30682  	ADDSS (DX), X1
 30683  	MOVSS X1, (DX)
 30684  	LEAQ  (AX)(CX*4), AX
 30685  	LEAQ  (DX)(BX*4), DX
 30686  	MOVSS (AX), X1
 30687  	MULSS X0, X1
 30688  	ADDSS (DX), X1
 30689  	MOVSS X1, (DX)
 30690  	LEAQ  (AX)(CX*4), AX
 30691  	LEAQ  (DX)(BX*4), DX
 30692  	MOVSS (AX), X1
 30693  	MULSS X0, X1
 30694  	ADDSS (DX), X1
 30695  	MOVSS X1, (DX)
 30696  	LEAQ  (AX)(CX*4), AX
 30697  	LEAQ  (DX)(BX*4), DX
 30698  	MOVSS (AX), X1
 30699  	MULSS X0, X1
 30700  	ADDSS (DX), X1
 30701  	MOVSS X1, (DX)
 30702  	LEAQ  (AX)(CX*4), AX
 30703  	LEAQ  (DX)(BX*4), DX
 30704  	MOVSS (AX), X1
 30705  	MULSS X0, X1
 30706  	ADDSS (DX), X1
 30707  	MOVSS X1, (DX)
 30708  	LEAQ  (AX)(CX*4), AX
 30709  	LEAQ  (DX)(BX*4), DX
 30710  	MOVSS (AX), X1
 30711  	MULSS X0, X1
 30712  	ADDSS (DX), X1
 30713  	MOVSS X1, (DX)
 30714  	LEAQ  (AX)(CX*4), AX
 30715  	LEAQ  (DX)(BX*4), DX
 30716  	MOVSS (AX), X1
 30717  	MULSS X0, X1
 30718  	ADDSS (DX), X1
 30719  	MOVSS X1, (DX)
 30720  	LEAQ  (AX)(CX*4), AX
 30721  	LEAQ  (DX)(BX*4), DX
 30722  	SUBQ  $0x08, SI
 30723  
 30724  check_limit_unroll:
 30725  	CMPQ SI, $0x08
 30726  	JHS  loop_unroll
 30727  	JMP  check_limit
 30728  
 30729  loop:
 30730  	MOVSS (AX), X1
 30731  	MULSS X0, X1
 30732  	ADDSS (DX), X1
 30733  	MOVSS X1, (DX)
 30734  	DECQ  SI
 30735  	LEAQ  (AX)(CX*4), AX
 30736  	LEAQ  (DX)(BX*4), DX
 30737  
 30738  check_limit:
 30739  	CMPQ SI, $0x00
 30740  	JHI  loop
 30741  	RET
 30742  
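// The *A11U8 groups use PCALIGN $0x08 followed by three NOPs.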
 30743  // func AmdAxpyPointerLoopX_V0A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30744  // Requires: SSE
 30745  TEXT ·AmdAxpyPointerLoopX_V0A11U8(SB), NOSPLIT, $0-48
 30746  	MOVSS alpha+0(FP), X0
 30747  	MOVQ  xs+8(FP), AX
 30748  	MOVQ  incx+16(FP), CX
 30749  	MOVQ  ys+24(FP), DX
 30750  	MOVQ  incy+32(FP), BX
 30751  	MOVQ  n+40(FP), SI
 30752  	JMP   check_limit_unroll
 30753  	PCALIGN $0x08
 30754  	NOP
 30755  	NOP
 30756  	NOP
 30757  
 30758  loop_unroll:
 30759  	MOVSS (AX), X1
 30760  	MULSS X0, X1
 30761  	ADDSS (DX), X1
 30762  	MOVSS X1, (DX)
 30763  	LEAQ  (AX)(CX*4), AX
 30764  	LEAQ  (DX)(BX*4), DX
 30765  	MOVSS (AX), X1
 30766  	MULSS X0, X1
 30767  	ADDSS (DX), X1
 30768  	MOVSS X1, (DX)
 30769  	LEAQ  (AX)(CX*4), AX
 30770  	LEAQ  (DX)(BX*4), DX
 30771  	MOVSS (AX), X1
 30772  	MULSS X0, X1
 30773  	ADDSS (DX), X1
 30774  	MOVSS X1, (DX)
 30775  	LEAQ  (AX)(CX*4), AX
 30776  	LEAQ  (DX)(BX*4), DX
 30777  	MOVSS (AX), X1
 30778  	MULSS X0, X1
 30779  	ADDSS (DX), X1
 30780  	MOVSS X1, (DX)
 30781  	LEAQ  (AX)(CX*4), AX
 30782  	LEAQ  (DX)(BX*4), DX
 30783  	MOVSS (AX), X1
 30784  	MULSS X0, X1
 30785  	ADDSS (DX), X1
 30786  	MOVSS X1, (DX)
 30787  	LEAQ  (AX)(CX*4), AX
 30788  	LEAQ  (DX)(BX*4), DX
 30789  	MOVSS (AX), X1
 30790  	MULSS X0, X1
 30791  	ADDSS (DX), X1
 30792  	MOVSS X1, (DX)
 30793  	LEAQ  (AX)(CX*4), AX
 30794  	LEAQ  (DX)(BX*4), DX
 30795  	MOVSS (AX), X1
 30796  	MULSS X0, X1
 30797  	ADDSS (DX), X1
 30798  	MOVSS X1, (DX)
 30799  	LEAQ  (AX)(CX*4), AX
 30800  	LEAQ  (DX)(BX*4), DX
 30801  	MOVSS (AX), X1
 30802  	MULSS X0, X1
 30803  	ADDSS (DX), X1
 30804  	MOVSS X1, (DX)
 30805  	LEAQ  (AX)(CX*4), AX
 30806  	LEAQ  (DX)(BX*4), DX
 30807  	SUBQ  $0x08, SI
 30808  
 30809  check_limit_unroll:
 30810  	CMPQ SI, $0x08
 30811  	JHS  loop_unroll
 30812  	JMP  check_limit
 30813  
 30814  loop:
 30815  	MOVSS (AX), X1
 30816  	MULSS X0, X1
 30817  	ADDSS (DX), X1
 30818  	MOVSS X1, (DX)
 30819  	DECQ  SI
 30820  	LEAQ  (AX)(CX*4), AX
 30821  	LEAQ  (DX)(BX*4), DX
 30822  
 30823  check_limit:
 30824  	CMPQ SI, $0x00
 30825  	JHI  loop
 30826  	RET
 30827  
 30828  // func AmdAxpyPointerLoopX_V1A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30829  // Requires: SSE
 30830  TEXT ·AmdAxpyPointerLoopX_V1A11U8(SB), NOSPLIT, $0-48
 30831  	MOVSS alpha+0(FP), X0
 30832  	MOVQ  xs+8(FP), AX
 30833  	MOVQ  incx+16(FP), CX
 30834  	MOVQ  ys+24(FP), DX
 30835  	MOVQ  incy+32(FP), BX
 30836  	MOVQ  n+40(FP), SI
 30837  	JMP   check_limit_unroll
 30838  	PCALIGN $0x08
 30839  	NOP
 30840  	NOP
 30841  	NOP
 30842  
 30843  loop_unroll:
 30844  	MOVSS (AX), X1
 30845  	MULSS X0, X1
 30846  	ADDSS (DX), X1
 30847  	MOVSS X1, (DX)
 30848  	LEAQ  (AX)(CX*4), AX
 30849  	LEAQ  (DX)(BX*4), DX
 30850  	MOVSS (AX), X1
 30851  	MULSS X0, X1
 30852  	ADDSS (DX), X1
 30853  	MOVSS X1, (DX)
 30854  	LEAQ  (AX)(CX*4), AX
 30855  	LEAQ  (DX)(BX*4), DX
 30856  	MOVSS (AX), X1
 30857  	MULSS X0, X1
 30858  	ADDSS (DX), X1
 30859  	MOVSS X1, (DX)
 30860  	LEAQ  (AX)(CX*4), AX
 30861  	LEAQ  (DX)(BX*4), DX
 30862  	MOVSS (AX), X1
 30863  	MULSS X0, X1
 30864  	ADDSS (DX), X1
 30865  	MOVSS X1, (DX)
 30866  	LEAQ  (AX)(CX*4), AX
 30867  	LEAQ  (DX)(BX*4), DX
 30868  	MOVSS (AX), X1
 30869  	MULSS X0, X1
 30870  	ADDSS (DX), X1
 30871  	MOVSS X1, (DX)
 30872  	LEAQ  (AX)(CX*4), AX
 30873  	LEAQ  (DX)(BX*4), DX
 30874  	MOVSS (AX), X1
 30875  	MULSS X0, X1
 30876  	ADDSS (DX), X1
 30877  	MOVSS X1, (DX)
 30878  	LEAQ  (AX)(CX*4), AX
 30879  	LEAQ  (DX)(BX*4), DX
 30880  	MOVSS (AX), X1
 30881  	MULSS X0, X1
 30882  	ADDSS (DX), X1
 30883  	MOVSS X1, (DX)
 30884  	LEAQ  (AX)(CX*4), AX
 30885  	LEAQ  (DX)(BX*4), DX
 30886  	MOVSS (AX), X1
 30887  	MULSS X0, X1
 30888  	ADDSS (DX), X1
 30889  	MOVSS X1, (DX)
 30890  	LEAQ  (AX)(CX*4), AX
 30891  	LEAQ  (DX)(BX*4), DX
 30892  	SUBQ  $0x08, SI
 30893  
 30894  check_limit_unroll:
 30895  	CMPQ SI, $0x08
 30896  	JHS  loop_unroll
 30897  	JMP  check_limit
 30898  
 30899  loop:
 30900  	MOVSS (AX), X1
 30901  	MULSS X0, X1
 30902  	ADDSS (DX), X1
 30903  	MOVSS X1, (DX)
 30904  	DECQ  SI
 30905  	LEAQ  (AX)(CX*4), AX
 30906  	LEAQ  (DX)(BX*4), DX
 30907  
 30908  check_limit:
 30909  	CMPQ SI, $0x00
 30910  	JHI  loop
 30911  	RET
 30912  
 30913  // func AmdAxpyPointerLoopX_V2A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30914  // Requires: SSE
 30915  TEXT ·AmdAxpyPointerLoopX_V2A11U8(SB), NOSPLIT, $0-48
 30916  	MOVSS alpha+0(FP), X0
 30917  	MOVQ  xs+8(FP), AX
 30918  	MOVQ  incx+16(FP), CX
 30919  	MOVQ  ys+24(FP), DX
 30920  	MOVQ  incy+32(FP), BX
 30921  	MOVQ  n+40(FP), SI
 30922  	JMP   check_limit_unroll
 30923  	PCALIGN $0x08
 30924  	NOP
 30925  	NOP
 30926  	NOP
 30927  
 30928  loop_unroll:
 30929  	MOVSS (AX), X1
 30930  	MULSS X0, X1
 30931  	ADDSS (DX), X1
 30932  	MOVSS X1, (DX)
 30933  	LEAQ  (AX)(CX*4), AX
 30934  	LEAQ  (DX)(BX*4), DX
 30935  	MOVSS (AX), X1
 30936  	MULSS X0, X1
 30937  	ADDSS (DX), X1
 30938  	MOVSS X1, (DX)
 30939  	LEAQ  (AX)(CX*4), AX
 30940  	LEAQ  (DX)(BX*4), DX
 30941  	MOVSS (AX), X1
 30942  	MULSS X0, X1
 30943  	ADDSS (DX), X1
 30944  	MOVSS X1, (DX)
 30945  	LEAQ  (AX)(CX*4), AX
 30946  	LEAQ  (DX)(BX*4), DX
 30947  	MOVSS (AX), X1
 30948  	MULSS X0, X1
 30949  	ADDSS (DX), X1
 30950  	MOVSS X1, (DX)
 30951  	LEAQ  (AX)(CX*4), AX
 30952  	LEAQ  (DX)(BX*4), DX
 30953  	MOVSS (AX), X1
 30954  	MULSS X0, X1
 30955  	ADDSS (DX), X1
 30956  	MOVSS X1, (DX)
 30957  	LEAQ  (AX)(CX*4), AX
 30958  	LEAQ  (DX)(BX*4), DX
 30959  	MOVSS (AX), X1
 30960  	MULSS X0, X1
 30961  	ADDSS (DX), X1
 30962  	MOVSS X1, (DX)
 30963  	LEAQ  (AX)(CX*4), AX
 30964  	LEAQ  (DX)(BX*4), DX
 30965  	MOVSS (AX), X1
 30966  	MULSS X0, X1
 30967  	ADDSS (DX), X1
 30968  	MOVSS X1, (DX)
 30969  	LEAQ  (AX)(CX*4), AX
 30970  	LEAQ  (DX)(BX*4), DX
 30971  	MOVSS (AX), X1
 30972  	MULSS X0, X1
 30973  	ADDSS (DX), X1
 30974  	MOVSS X1, (DX)
 30975  	LEAQ  (AX)(CX*4), AX
 30976  	LEAQ  (DX)(BX*4), DX
 30977  	SUBQ  $0x08, SI
 30978  
 30979  check_limit_unroll:
 30980  	CMPQ SI, $0x08
 30981  	JHS  loop_unroll
 30982  	JMP  check_limit
 30983  
 30984  loop:
 30985  	MOVSS (AX), X1
 30986  	MULSS X0, X1
 30987  	ADDSS (DX), X1
 30988  	MOVSS X1, (DX)
 30989  	DECQ  SI
 30990  	LEAQ  (AX)(CX*4), AX
 30991  	LEAQ  (DX)(BX*4), DX
 30992  
 30993  check_limit:
 30994  	CMPQ SI, $0x00
 30995  	JHI  loop
 30996  	RET
 30997  
 30998  // func AmdAxpyPointerLoopX_V3A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 30999  // Requires: SSE
 31000  TEXT ·AmdAxpyPointerLoopX_V3A11U8(SB), NOSPLIT, $0-48
 31001  	MOVSS alpha+0(FP), X0
 31002  	MOVQ  xs+8(FP), AX
 31003  	MOVQ  incx+16(FP), CX
 31004  	MOVQ  ys+24(FP), DX
 31005  	MOVQ  incy+32(FP), BX
 31006  	MOVQ  n+40(FP), SI
 31007  	JMP   check_limit_unroll
 31008  	PCALIGN $0x08
 31009  	NOP
 31010  	NOP
 31011  	NOP
 31012  
 31013  loop_unroll:
 31014  	MOVSS (AX), X1
 31015  	MULSS X0, X1
 31016  	ADDSS (DX), X1
 31017  	MOVSS X1, (DX)
 31018  	LEAQ  (AX)(CX*4), AX
 31019  	LEAQ  (DX)(BX*4), DX
 31020  	MOVSS (AX), X1
 31021  	MULSS X0, X1
 31022  	ADDSS (DX), X1
 31023  	MOVSS X1, (DX)
 31024  	LEAQ  (AX)(CX*4), AX
 31025  	LEAQ  (DX)(BX*4), DX
 31026  	MOVSS (AX), X1
 31027  	MULSS X0, X1
 31028  	ADDSS (DX), X1
 31029  	MOVSS X1, (DX)
 31030  	LEAQ  (AX)(CX*4), AX
 31031  	LEAQ  (DX)(BX*4), DX
 31032  	MOVSS (AX), X1
 31033  	MULSS X0, X1
 31034  	ADDSS (DX), X1
 31035  	MOVSS X1, (DX)
 31036  	LEAQ  (AX)(CX*4), AX
 31037  	LEAQ  (DX)(BX*4), DX
 31038  	MOVSS (AX), X1
 31039  	MULSS X0, X1
 31040  	ADDSS (DX), X1
 31041  	MOVSS X1, (DX)
 31042  	LEAQ  (AX)(CX*4), AX
 31043  	LEAQ  (DX)(BX*4), DX
 31044  	MOVSS (AX), X1
 31045  	MULSS X0, X1
 31046  	ADDSS (DX), X1
 31047  	MOVSS X1, (DX)
 31048  	LEAQ  (AX)(CX*4), AX
 31049  	LEAQ  (DX)(BX*4), DX
 31050  	MOVSS (AX), X1
 31051  	MULSS X0, X1
 31052  	ADDSS (DX), X1
 31053  	MOVSS X1, (DX)
 31054  	LEAQ  (AX)(CX*4), AX
 31055  	LEAQ  (DX)(BX*4), DX
 31056  	MOVSS (AX), X1
 31057  	MULSS X0, X1
 31058  	ADDSS (DX), X1
 31059  	MOVSS X1, (DX)
 31060  	LEAQ  (AX)(CX*4), AX
 31061  	LEAQ  (DX)(BX*4), DX
 31062  	SUBQ  $0x08, SI
 31063  
 31064  check_limit_unroll:
 31065  	CMPQ SI, $0x08
 31066  	JHS  loop_unroll
 31067  	JMP  check_limit
 31068  
 31069  loop:
 31070  	MOVSS (AX), X1
 31071  	MULSS X0, X1
 31072  	ADDSS (DX), X1
 31073  	MOVSS X1, (DX)
 31074  	DECQ  SI
 31075  	LEAQ  (AX)(CX*4), AX
 31076  	LEAQ  (DX)(BX*4), DX
 31077  
 31078  check_limit:
 31079  	CMPQ SI, $0x00
 31080  	JHI  loop
 31081  	RET
 31082  
 31083  // func AmdAxpyPointerLoopX_V4A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31084  // Requires: SSE
 31085  TEXT ·AmdAxpyPointerLoopX_V4A11U8(SB), NOSPLIT, $0-48
 31086  	MOVSS alpha+0(FP), X0
 31087  	MOVQ  xs+8(FP), AX
 31088  	MOVQ  incx+16(FP), CX
 31089  	MOVQ  ys+24(FP), DX
 31090  	MOVQ  incy+32(FP), BX
 31091  	MOVQ  n+40(FP), SI
 31092  	JMP   check_limit_unroll
 31093  	PCALIGN $0x08
 31094  	NOP
 31095  	NOP
 31096  	NOP
 31097  
 31098  loop_unroll:
 31099  	MOVSS (AX), X1
 31100  	MULSS X0, X1
 31101  	ADDSS (DX), X1
 31102  	MOVSS X1, (DX)
 31103  	LEAQ  (AX)(CX*4), AX
 31104  	LEAQ  (DX)(BX*4), DX
 31105  	MOVSS (AX), X1
 31106  	MULSS X0, X1
 31107  	ADDSS (DX), X1
 31108  	MOVSS X1, (DX)
 31109  	LEAQ  (AX)(CX*4), AX
 31110  	LEAQ  (DX)(BX*4), DX
 31111  	MOVSS (AX), X1
 31112  	MULSS X0, X1
 31113  	ADDSS (DX), X1
 31114  	MOVSS X1, (DX)
 31115  	LEAQ  (AX)(CX*4), AX
 31116  	LEAQ  (DX)(BX*4), DX
 31117  	MOVSS (AX), X1
 31118  	MULSS X0, X1
 31119  	ADDSS (DX), X1
 31120  	MOVSS X1, (DX)
 31121  	LEAQ  (AX)(CX*4), AX
 31122  	LEAQ  (DX)(BX*4), DX
 31123  	MOVSS (AX), X1
 31124  	MULSS X0, X1
 31125  	ADDSS (DX), X1
 31126  	MOVSS X1, (DX)
 31127  	LEAQ  (AX)(CX*4), AX
 31128  	LEAQ  (DX)(BX*4), DX
 31129  	MOVSS (AX), X1
 31130  	MULSS X0, X1
 31131  	ADDSS (DX), X1
 31132  	MOVSS X1, (DX)
 31133  	LEAQ  (AX)(CX*4), AX
 31134  	LEAQ  (DX)(BX*4), DX
 31135  	MOVSS (AX), X1
 31136  	MULSS X0, X1
 31137  	ADDSS (DX), X1
 31138  	MOVSS X1, (DX)
 31139  	LEAQ  (AX)(CX*4), AX
 31140  	LEAQ  (DX)(BX*4), DX
 31141  	MOVSS (AX), X1
 31142  	MULSS X0, X1
 31143  	ADDSS (DX), X1
 31144  	MOVSS X1, (DX)
 31145  	LEAQ  (AX)(CX*4), AX
 31146  	LEAQ  (DX)(BX*4), DX
 31147  	SUBQ  $0x08, SI
 31148  
 31149  check_limit_unroll:
 31150  	CMPQ SI, $0x08
 31151  	JHS  loop_unroll
 31152  	JMP  check_limit
 31153  
 31154  loop:
 31155  	MOVSS (AX), X1
 31156  	MULSS X0, X1
 31157  	ADDSS (DX), X1
 31158  	MOVSS X1, (DX)
 31159  	DECQ  SI
 31160  	LEAQ  (AX)(CX*4), AX
 31161  	LEAQ  (DX)(BX*4), DX
 31162  
 31163  check_limit:
 31164  	CMPQ SI, $0x00
 31165  	JHI  loop
 31166  	RET
 31167  
 31168  // func AmdAxpyPointerLoopX_V5A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31169  // Requires: SSE
 31170  TEXT ·AmdAxpyPointerLoopX_V5A11U8(SB), NOSPLIT, $0-48
 31171  	MOVSS alpha+0(FP), X0
 31172  	MOVQ  xs+8(FP), AX
 31173  	MOVQ  incx+16(FP), CX
 31174  	MOVQ  ys+24(FP), DX
 31175  	MOVQ  incy+32(FP), BX
 31176  	MOVQ  n+40(FP), SI
 31177  	JMP   check_limit_unroll
 31178  	PCALIGN $0x08
 31179  	NOP
 31180  	NOP
 31181  	NOP
 31182  
 31183  loop_unroll:
 31184  	MOVSS (AX), X1
 31185  	MULSS X0, X1
 31186  	ADDSS (DX), X1
 31187  	MOVSS X1, (DX)
 31188  	LEAQ  (AX)(CX*4), AX
 31189  	LEAQ  (DX)(BX*4), DX
 31190  	MOVSS (AX), X1
 31191  	MULSS X0, X1
 31192  	ADDSS (DX), X1
 31193  	MOVSS X1, (DX)
 31194  	LEAQ  (AX)(CX*4), AX
 31195  	LEAQ  (DX)(BX*4), DX
 31196  	MOVSS (AX), X1
 31197  	MULSS X0, X1
 31198  	ADDSS (DX), X1
 31199  	MOVSS X1, (DX)
 31200  	LEAQ  (AX)(CX*4), AX
 31201  	LEAQ  (DX)(BX*4), DX
 31202  	MOVSS (AX), X1
 31203  	MULSS X0, X1
 31204  	ADDSS (DX), X1
 31205  	MOVSS X1, (DX)
 31206  	LEAQ  (AX)(CX*4), AX
 31207  	LEAQ  (DX)(BX*4), DX
 31208  	MOVSS (AX), X1
 31209  	MULSS X0, X1
 31210  	ADDSS (DX), X1
 31211  	MOVSS X1, (DX)
 31212  	LEAQ  (AX)(CX*4), AX
 31213  	LEAQ  (DX)(BX*4), DX
 31214  	MOVSS (AX), X1
 31215  	MULSS X0, X1
 31216  	ADDSS (DX), X1
 31217  	MOVSS X1, (DX)
 31218  	LEAQ  (AX)(CX*4), AX
 31219  	LEAQ  (DX)(BX*4), DX
 31220  	MOVSS (AX), X1
 31221  	MULSS X0, X1
 31222  	ADDSS (DX), X1
 31223  	MOVSS X1, (DX)
 31224  	LEAQ  (AX)(CX*4), AX
 31225  	LEAQ  (DX)(BX*4), DX
 31226  	MOVSS (AX), X1
 31227  	MULSS X0, X1
 31228  	ADDSS (DX), X1
 31229  	MOVSS X1, (DX)
 31230  	LEAQ  (AX)(CX*4), AX
 31231  	LEAQ  (DX)(BX*4), DX
 31232  	SUBQ  $0x08, SI
 31233  
 31234  check_limit_unroll:
 31235  	CMPQ SI, $0x08
 31236  	JHS  loop_unroll
 31237  	JMP  check_limit
 31238  
 31239  loop:
 31240  	MOVSS (AX), X1
 31241  	MULSS X0, X1
 31242  	ADDSS (DX), X1
 31243  	MOVSS X1, (DX)
 31244  	DECQ  SI
 31245  	LEAQ  (AX)(CX*4), AX
 31246  	LEAQ  (DX)(BX*4), DX
 31247  
 31248  check_limit:
 31249  	CMPQ SI, $0x00
 31250  	JHI  loop
 31251  	RET
 31252  
 31253  // func AmdAxpyPointerLoopX_V0A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31254  // Requires: SSE
 31255  TEXT ·AmdAxpyPointerLoopX_V0A12U8(SB), NOSPLIT, $0-48
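	// Same 8x-unrolled axpy kernel as the A11 variants above; only the loop-entry padding differs (PCALIGN $8 plus four NOPs).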
 31256  	MOVSS alpha+0(FP), X0
 31257  	MOVQ  xs+8(FP), AX
 31258  	MOVQ  incx+16(FP), CX
 31259  	MOVQ  ys+24(FP), DX
 31260  	MOVQ  incy+32(FP), BX
 31261  	MOVQ  n+40(FP), SI
 31262  	JMP   check_limit_unroll
 31263  	PCALIGN $0x08
 31264  	NOP
 31265  	NOP
 31266  	NOP
 31267  	NOP
 31268  
 31269  loop_unroll:
 31270  	MOVSS (AX), X1
 31271  	MULSS X0, X1
 31272  	ADDSS (DX), X1
 31273  	MOVSS X1, (DX)
 31274  	LEAQ  (AX)(CX*4), AX
 31275  	LEAQ  (DX)(BX*4), DX
 31276  	MOVSS (AX), X1
 31277  	MULSS X0, X1
 31278  	ADDSS (DX), X1
 31279  	MOVSS X1, (DX)
 31280  	LEAQ  (AX)(CX*4), AX
 31281  	LEAQ  (DX)(BX*4), DX
 31282  	MOVSS (AX), X1
 31283  	MULSS X0, X1
 31284  	ADDSS (DX), X1
 31285  	MOVSS X1, (DX)
 31286  	LEAQ  (AX)(CX*4), AX
 31287  	LEAQ  (DX)(BX*4), DX
 31288  	MOVSS (AX), X1
 31289  	MULSS X0, X1
 31290  	ADDSS (DX), X1
 31291  	MOVSS X1, (DX)
 31292  	LEAQ  (AX)(CX*4), AX
 31293  	LEAQ  (DX)(BX*4), DX
 31294  	MOVSS (AX), X1
 31295  	MULSS X0, X1
 31296  	ADDSS (DX), X1
 31297  	MOVSS X1, (DX)
 31298  	LEAQ  (AX)(CX*4), AX
 31299  	LEAQ  (DX)(BX*4), DX
 31300  	MOVSS (AX), X1
 31301  	MULSS X0, X1
 31302  	ADDSS (DX), X1
 31303  	MOVSS X1, (DX)
 31304  	LEAQ  (AX)(CX*4), AX
 31305  	LEAQ  (DX)(BX*4), DX
 31306  	MOVSS (AX), X1
 31307  	MULSS X0, X1
 31308  	ADDSS (DX), X1
 31309  	MOVSS X1, (DX)
 31310  	LEAQ  (AX)(CX*4), AX
 31311  	LEAQ  (DX)(BX*4), DX
 31312  	MOVSS (AX), X1
 31313  	MULSS X0, X1
 31314  	ADDSS (DX), X1
 31315  	MOVSS X1, (DX)
 31316  	LEAQ  (AX)(CX*4), AX
 31317  	LEAQ  (DX)(BX*4), DX
 31318  	SUBQ  $0x08, SI
 31319  
 31320  check_limit_unroll:
 31321  	CMPQ SI, $0x08
 31322  	JHS  loop_unroll
 31323  	JMP  check_limit
 31324  
 31325  loop:
 31326  	MOVSS (AX), X1
 31327  	MULSS X0, X1
 31328  	ADDSS (DX), X1
 31329  	MOVSS X1, (DX)
 31330  	DECQ  SI
 31331  	LEAQ  (AX)(CX*4), AX
 31332  	LEAQ  (DX)(BX*4), DX
 31333  
 31334  check_limit:
 31335  	CMPQ SI, $0x00
 31336  	JHI  loop
 31337  	RET
 31338  
 31339  // func AmdAxpyPointerLoopX_V1A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31340  // Requires: SSE
 31341  TEXT ·AmdAxpyPointerLoopX_V1A12U8(SB), NOSPLIT, $0-48
 31342  	MOVSS alpha+0(FP), X0
 31343  	MOVQ  xs+8(FP), AX
 31344  	MOVQ  incx+16(FP), CX
 31345  	MOVQ  ys+24(FP), DX
 31346  	MOVQ  incy+32(FP), BX
 31347  	MOVQ  n+40(FP), SI
 31348  	JMP   check_limit_unroll
 31349  	PCALIGN $0x08
 31350  	NOP
 31351  	NOP
 31352  	NOP
 31353  	NOP
 31354  
 31355  loop_unroll:
 31356  	MOVSS (AX), X1
 31357  	MULSS X0, X1
 31358  	ADDSS (DX), X1
 31359  	MOVSS X1, (DX)
 31360  	LEAQ  (AX)(CX*4), AX
 31361  	LEAQ  (DX)(BX*4), DX
 31362  	MOVSS (AX), X1
 31363  	MULSS X0, X1
 31364  	ADDSS (DX), X1
 31365  	MOVSS X1, (DX)
 31366  	LEAQ  (AX)(CX*4), AX
 31367  	LEAQ  (DX)(BX*4), DX
 31368  	MOVSS (AX), X1
 31369  	MULSS X0, X1
 31370  	ADDSS (DX), X1
 31371  	MOVSS X1, (DX)
 31372  	LEAQ  (AX)(CX*4), AX
 31373  	LEAQ  (DX)(BX*4), DX
 31374  	MOVSS (AX), X1
 31375  	MULSS X0, X1
 31376  	ADDSS (DX), X1
 31377  	MOVSS X1, (DX)
 31378  	LEAQ  (AX)(CX*4), AX
 31379  	LEAQ  (DX)(BX*4), DX
 31380  	MOVSS (AX), X1
 31381  	MULSS X0, X1
 31382  	ADDSS (DX), X1
 31383  	MOVSS X1, (DX)
 31384  	LEAQ  (AX)(CX*4), AX
 31385  	LEAQ  (DX)(BX*4), DX
 31386  	MOVSS (AX), X1
 31387  	MULSS X0, X1
 31388  	ADDSS (DX), X1
 31389  	MOVSS X1, (DX)
 31390  	LEAQ  (AX)(CX*4), AX
 31391  	LEAQ  (DX)(BX*4), DX
 31392  	MOVSS (AX), X1
 31393  	MULSS X0, X1
 31394  	ADDSS (DX), X1
 31395  	MOVSS X1, (DX)
 31396  	LEAQ  (AX)(CX*4), AX
 31397  	LEAQ  (DX)(BX*4), DX
 31398  	MOVSS (AX), X1
 31399  	MULSS X0, X1
 31400  	ADDSS (DX), X1
 31401  	MOVSS X1, (DX)
 31402  	LEAQ  (AX)(CX*4), AX
 31403  	LEAQ  (DX)(BX*4), DX
 31404  	SUBQ  $0x08, SI
 31405  
 31406  check_limit_unroll:
 31407  	CMPQ SI, $0x08
 31408  	JHS  loop_unroll
 31409  	JMP  check_limit
 31410  
 31411  loop:
 31412  	MOVSS (AX), X1
 31413  	MULSS X0, X1
 31414  	ADDSS (DX), X1
 31415  	MOVSS X1, (DX)
 31416  	DECQ  SI
 31417  	LEAQ  (AX)(CX*4), AX
 31418  	LEAQ  (DX)(BX*4), DX
 31419  
 31420  check_limit:
 31421  	CMPQ SI, $0x00
 31422  	JHI  loop
 31423  	RET
 31424  
 31425  // func AmdAxpyPointerLoopX_V2A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31426  // Requires: SSE
 31427  TEXT ·AmdAxpyPointerLoopX_V2A12U8(SB), NOSPLIT, $0-48
 31428  	MOVSS alpha+0(FP), X0
 31429  	MOVQ  xs+8(FP), AX
 31430  	MOVQ  incx+16(FP), CX
 31431  	MOVQ  ys+24(FP), DX
 31432  	MOVQ  incy+32(FP), BX
 31433  	MOVQ  n+40(FP), SI
 31434  	JMP   check_limit_unroll
 31435  	PCALIGN $0x08
 31436  	NOP
 31437  	NOP
 31438  	NOP
 31439  	NOP
 31440  
 31441  loop_unroll:
 31442  	MOVSS (AX), X1
 31443  	MULSS X0, X1
 31444  	ADDSS (DX), X1
 31445  	MOVSS X1, (DX)
 31446  	LEAQ  (AX)(CX*4), AX
 31447  	LEAQ  (DX)(BX*4), DX
 31448  	MOVSS (AX), X1
 31449  	MULSS X0, X1
 31450  	ADDSS (DX), X1
 31451  	MOVSS X1, (DX)
 31452  	LEAQ  (AX)(CX*4), AX
 31453  	LEAQ  (DX)(BX*4), DX
 31454  	MOVSS (AX), X1
 31455  	MULSS X0, X1
 31456  	ADDSS (DX), X1
 31457  	MOVSS X1, (DX)
 31458  	LEAQ  (AX)(CX*4), AX
 31459  	LEAQ  (DX)(BX*4), DX
 31460  	MOVSS (AX), X1
 31461  	MULSS X0, X1
 31462  	ADDSS (DX), X1
 31463  	MOVSS X1, (DX)
 31464  	LEAQ  (AX)(CX*4), AX
 31465  	LEAQ  (DX)(BX*4), DX
 31466  	MOVSS (AX), X1
 31467  	MULSS X0, X1
 31468  	ADDSS (DX), X1
 31469  	MOVSS X1, (DX)
 31470  	LEAQ  (AX)(CX*4), AX
 31471  	LEAQ  (DX)(BX*4), DX
 31472  	MOVSS (AX), X1
 31473  	MULSS X0, X1
 31474  	ADDSS (DX), X1
 31475  	MOVSS X1, (DX)
 31476  	LEAQ  (AX)(CX*4), AX
 31477  	LEAQ  (DX)(BX*4), DX
 31478  	MOVSS (AX), X1
 31479  	MULSS X0, X1
 31480  	ADDSS (DX), X1
 31481  	MOVSS X1, (DX)
 31482  	LEAQ  (AX)(CX*4), AX
 31483  	LEAQ  (DX)(BX*4), DX
 31484  	MOVSS (AX), X1
 31485  	MULSS X0, X1
 31486  	ADDSS (DX), X1
 31487  	MOVSS X1, (DX)
 31488  	LEAQ  (AX)(CX*4), AX
 31489  	LEAQ  (DX)(BX*4), DX
 31490  	SUBQ  $0x08, SI
 31491  
 31492  check_limit_unroll:
 31493  	CMPQ SI, $0x08
 31494  	JHS  loop_unroll
 31495  	JMP  check_limit
 31496  
 31497  loop:
 31498  	MOVSS (AX), X1
 31499  	MULSS X0, X1
 31500  	ADDSS (DX), X1
 31501  	MOVSS X1, (DX)
 31502  	DECQ  SI
 31503  	LEAQ  (AX)(CX*4), AX
 31504  	LEAQ  (DX)(BX*4), DX
 31505  
 31506  check_limit:
 31507  	CMPQ SI, $0x00
 31508  	JHI  loop
 31509  	RET
 31510  
 31511  // func AmdAxpyPointerLoopX_V3A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31512  // Requires: SSE
 31513  TEXT ·AmdAxpyPointerLoopX_V3A12U8(SB), NOSPLIT, $0-48
 31514  	MOVSS alpha+0(FP), X0
 31515  	MOVQ  xs+8(FP), AX
 31516  	MOVQ  incx+16(FP), CX
 31517  	MOVQ  ys+24(FP), DX
 31518  	MOVQ  incy+32(FP), BX
 31519  	MOVQ  n+40(FP), SI
 31520  	JMP   check_limit_unroll
 31521  	PCALIGN $0x08
 31522  	NOP
 31523  	NOP
 31524  	NOP
 31525  	NOP
 31526  
 31527  loop_unroll:
 31528  	MOVSS (AX), X1
 31529  	MULSS X0, X1
 31530  	ADDSS (DX), X1
 31531  	MOVSS X1, (DX)
 31532  	LEAQ  (AX)(CX*4), AX
 31533  	LEAQ  (DX)(BX*4), DX
 31534  	MOVSS (AX), X1
 31535  	MULSS X0, X1
 31536  	ADDSS (DX), X1
 31537  	MOVSS X1, (DX)
 31538  	LEAQ  (AX)(CX*4), AX
 31539  	LEAQ  (DX)(BX*4), DX
 31540  	MOVSS (AX), X1
 31541  	MULSS X0, X1
 31542  	ADDSS (DX), X1
 31543  	MOVSS X1, (DX)
 31544  	LEAQ  (AX)(CX*4), AX
 31545  	LEAQ  (DX)(BX*4), DX
 31546  	MOVSS (AX), X1
 31547  	MULSS X0, X1
 31548  	ADDSS (DX), X1
 31549  	MOVSS X1, (DX)
 31550  	LEAQ  (AX)(CX*4), AX
 31551  	LEAQ  (DX)(BX*4), DX
 31552  	MOVSS (AX), X1
 31553  	MULSS X0, X1
 31554  	ADDSS (DX), X1
 31555  	MOVSS X1, (DX)
 31556  	LEAQ  (AX)(CX*4), AX
 31557  	LEAQ  (DX)(BX*4), DX
 31558  	MOVSS (AX), X1
 31559  	MULSS X0, X1
 31560  	ADDSS (DX), X1
 31561  	MOVSS X1, (DX)
 31562  	LEAQ  (AX)(CX*4), AX
 31563  	LEAQ  (DX)(BX*4), DX
 31564  	MOVSS (AX), X1
 31565  	MULSS X0, X1
 31566  	ADDSS (DX), X1
 31567  	MOVSS X1, (DX)
 31568  	LEAQ  (AX)(CX*4), AX
 31569  	LEAQ  (DX)(BX*4), DX
 31570  	MOVSS (AX), X1
 31571  	MULSS X0, X1
 31572  	ADDSS (DX), X1
 31573  	MOVSS X1, (DX)
 31574  	LEAQ  (AX)(CX*4), AX
 31575  	LEAQ  (DX)(BX*4), DX
 31576  	SUBQ  $0x08, SI
 31577  
 31578  check_limit_unroll:
 31579  	CMPQ SI, $0x08
 31580  	JHS  loop_unroll
 31581  	JMP  check_limit
 31582  
 31583  loop:
 31584  	MOVSS (AX), X1
 31585  	MULSS X0, X1
 31586  	ADDSS (DX), X1
 31587  	MOVSS X1, (DX)
 31588  	DECQ  SI
 31589  	LEAQ  (AX)(CX*4), AX
 31590  	LEAQ  (DX)(BX*4), DX
 31591  
 31592  check_limit:
 31593  	CMPQ SI, $0x00
 31594  	JHI  loop
 31595  	RET
 31596  
 31597  // func AmdAxpyPointerLoopX_V4A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31598  // Requires: SSE
 31599  TEXT ·AmdAxpyPointerLoopX_V4A12U8(SB), NOSPLIT, $0-48
 31600  	MOVSS alpha+0(FP), X0
 31601  	MOVQ  xs+8(FP), AX
 31602  	MOVQ  incx+16(FP), CX
 31603  	MOVQ  ys+24(FP), DX
 31604  	MOVQ  incy+32(FP), BX
 31605  	MOVQ  n+40(FP), SI
 31606  	JMP   check_limit_unroll
 31607  	PCALIGN $0x08
 31608  	NOP
 31609  	NOP
 31610  	NOP
 31611  	NOP
 31612  
 31613  loop_unroll:
 31614  	MOVSS (AX), X1
 31615  	MULSS X0, X1
 31616  	ADDSS (DX), X1
 31617  	MOVSS X1, (DX)
 31618  	LEAQ  (AX)(CX*4), AX
 31619  	LEAQ  (DX)(BX*4), DX
 31620  	MOVSS (AX), X1
 31621  	MULSS X0, X1
 31622  	ADDSS (DX), X1
 31623  	MOVSS X1, (DX)
 31624  	LEAQ  (AX)(CX*4), AX
 31625  	LEAQ  (DX)(BX*4), DX
 31626  	MOVSS (AX), X1
 31627  	MULSS X0, X1
 31628  	ADDSS (DX), X1
 31629  	MOVSS X1, (DX)
 31630  	LEAQ  (AX)(CX*4), AX
 31631  	LEAQ  (DX)(BX*4), DX
 31632  	MOVSS (AX), X1
 31633  	MULSS X0, X1
 31634  	ADDSS (DX), X1
 31635  	MOVSS X1, (DX)
 31636  	LEAQ  (AX)(CX*4), AX
 31637  	LEAQ  (DX)(BX*4), DX
 31638  	MOVSS (AX), X1
 31639  	MULSS X0, X1
 31640  	ADDSS (DX), X1
 31641  	MOVSS X1, (DX)
 31642  	LEAQ  (AX)(CX*4), AX
 31643  	LEAQ  (DX)(BX*4), DX
 31644  	MOVSS (AX), X1
 31645  	MULSS X0, X1
 31646  	ADDSS (DX), X1
 31647  	MOVSS X1, (DX)
 31648  	LEAQ  (AX)(CX*4), AX
 31649  	LEAQ  (DX)(BX*4), DX
 31650  	MOVSS (AX), X1
 31651  	MULSS X0, X1
 31652  	ADDSS (DX), X1
 31653  	MOVSS X1, (DX)
 31654  	LEAQ  (AX)(CX*4), AX
 31655  	LEAQ  (DX)(BX*4), DX
 31656  	MOVSS (AX), X1
 31657  	MULSS X0, X1
 31658  	ADDSS (DX), X1
 31659  	MOVSS X1, (DX)
 31660  	LEAQ  (AX)(CX*4), AX
 31661  	LEAQ  (DX)(BX*4), DX
 31662  	SUBQ  $0x08, SI
 31663  
 31664  check_limit_unroll:
 31665  	CMPQ SI, $0x08
 31666  	JHS  loop_unroll
 31667  	JMP  check_limit
 31668  
 31669  loop:
 31670  	MOVSS (AX), X1
 31671  	MULSS X0, X1
 31672  	ADDSS (DX), X1
 31673  	MOVSS X1, (DX)
 31674  	DECQ  SI
 31675  	LEAQ  (AX)(CX*4), AX
 31676  	LEAQ  (DX)(BX*4), DX
 31677  
 31678  check_limit:
 31679  	CMPQ SI, $0x00
 31680  	JHI  loop
 31681  	RET
 31682  
 31683  // func AmdAxpyPointerLoopX_V5A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31684  // Requires: SSE
 31685  TEXT ·AmdAxpyPointerLoopX_V5A12U8(SB), NOSPLIT, $0-48
 31686  	MOVSS alpha+0(FP), X0
 31687  	MOVQ  xs+8(FP), AX
 31688  	MOVQ  incx+16(FP), CX
 31689  	MOVQ  ys+24(FP), DX
 31690  	MOVQ  incy+32(FP), BX
 31691  	MOVQ  n+40(FP), SI
 31692  	JMP   check_limit_unroll
 31693  	PCALIGN $0x08
 31694  	NOP
 31695  	NOP
 31696  	NOP
 31697  	NOP
 31698  
 31699  loop_unroll:
 31700  	MOVSS (AX), X1
 31701  	MULSS X0, X1
 31702  	ADDSS (DX), X1
 31703  	MOVSS X1, (DX)
 31704  	LEAQ  (AX)(CX*4), AX
 31705  	LEAQ  (DX)(BX*4), DX
 31706  	MOVSS (AX), X1
 31707  	MULSS X0, X1
 31708  	ADDSS (DX), X1
 31709  	MOVSS X1, (DX)
 31710  	LEAQ  (AX)(CX*4), AX
 31711  	LEAQ  (DX)(BX*4), DX
 31712  	MOVSS (AX), X1
 31713  	MULSS X0, X1
 31714  	ADDSS (DX), X1
 31715  	MOVSS X1, (DX)
 31716  	LEAQ  (AX)(CX*4), AX
 31717  	LEAQ  (DX)(BX*4), DX
 31718  	MOVSS (AX), X1
 31719  	MULSS X0, X1
 31720  	ADDSS (DX), X1
 31721  	MOVSS X1, (DX)
 31722  	LEAQ  (AX)(CX*4), AX
 31723  	LEAQ  (DX)(BX*4), DX
 31724  	MOVSS (AX), X1
 31725  	MULSS X0, X1
 31726  	ADDSS (DX), X1
 31727  	MOVSS X1, (DX)
 31728  	LEAQ  (AX)(CX*4), AX
 31729  	LEAQ  (DX)(BX*4), DX
 31730  	MOVSS (AX), X1
 31731  	MULSS X0, X1
 31732  	ADDSS (DX), X1
 31733  	MOVSS X1, (DX)
 31734  	LEAQ  (AX)(CX*4), AX
 31735  	LEAQ  (DX)(BX*4), DX
 31736  	MOVSS (AX), X1
 31737  	MULSS X0, X1
 31738  	ADDSS (DX), X1
 31739  	MOVSS X1, (DX)
 31740  	LEAQ  (AX)(CX*4), AX
 31741  	LEAQ  (DX)(BX*4), DX
 31742  	MOVSS (AX), X1
 31743  	MULSS X0, X1
 31744  	ADDSS (DX), X1
 31745  	MOVSS X1, (DX)
 31746  	LEAQ  (AX)(CX*4), AX
 31747  	LEAQ  (DX)(BX*4), DX
 31748  	SUBQ  $0x08, SI
 31749  
 31750  check_limit_unroll:
 31751  	CMPQ SI, $0x08
 31752  	JHS  loop_unroll
 31753  	JMP  check_limit
 31754  
 31755  loop:
 31756  	MOVSS (AX), X1
 31757  	MULSS X0, X1
 31758  	ADDSS (DX), X1
 31759  	MOVSS X1, (DX)
 31760  	DECQ  SI
 31761  	LEAQ  (AX)(CX*4), AX
 31762  	LEAQ  (DX)(BX*4), DX
 31763  
 31764  check_limit:
 31765  	CMPQ SI, $0x00
 31766  	JHI  loop
 31767  	RET
 31768  
 31769  // func AmdAxpyPointerLoopX_V0A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31770  // Requires: SSE
 31771  TEXT ·AmdAxpyPointerLoopX_V0A13U8(SB), NOSPLIT, $0-48
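	// Same 8x-unrolled axpy kernel; this A13 variant pads the loop entry with five NOPs after PCALIGN $8.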
 31772  	MOVSS alpha+0(FP), X0
 31773  	MOVQ  xs+8(FP), AX
 31774  	MOVQ  incx+16(FP), CX
 31775  	MOVQ  ys+24(FP), DX
 31776  	MOVQ  incy+32(FP), BX
 31777  	MOVQ  n+40(FP), SI
 31778  	JMP   check_limit_unroll
 31779  	PCALIGN $0x08
 31780  	NOP
 31781  	NOP
 31782  	NOP
 31783  	NOP
 31784  	NOP
 31785  
 31786  loop_unroll:
 31787  	MOVSS (AX), X1
 31788  	MULSS X0, X1
 31789  	ADDSS (DX), X1
 31790  	MOVSS X1, (DX)
 31791  	LEAQ  (AX)(CX*4), AX
 31792  	LEAQ  (DX)(BX*4), DX
 31793  	MOVSS (AX), X1
 31794  	MULSS X0, X1
 31795  	ADDSS (DX), X1
 31796  	MOVSS X1, (DX)
 31797  	LEAQ  (AX)(CX*4), AX
 31798  	LEAQ  (DX)(BX*4), DX
 31799  	MOVSS (AX), X1
 31800  	MULSS X0, X1
 31801  	ADDSS (DX), X1
 31802  	MOVSS X1, (DX)
 31803  	LEAQ  (AX)(CX*4), AX
 31804  	LEAQ  (DX)(BX*4), DX
 31805  	MOVSS (AX), X1
 31806  	MULSS X0, X1
 31807  	ADDSS (DX), X1
 31808  	MOVSS X1, (DX)
 31809  	LEAQ  (AX)(CX*4), AX
 31810  	LEAQ  (DX)(BX*4), DX
 31811  	MOVSS (AX), X1
 31812  	MULSS X0, X1
 31813  	ADDSS (DX), X1
 31814  	MOVSS X1, (DX)
 31815  	LEAQ  (AX)(CX*4), AX
 31816  	LEAQ  (DX)(BX*4), DX
 31817  	MOVSS (AX), X1
 31818  	MULSS X0, X1
 31819  	ADDSS (DX), X1
 31820  	MOVSS X1, (DX)
 31821  	LEAQ  (AX)(CX*4), AX
 31822  	LEAQ  (DX)(BX*4), DX
 31823  	MOVSS (AX), X1
 31824  	MULSS X0, X1
 31825  	ADDSS (DX), X1
 31826  	MOVSS X1, (DX)
 31827  	LEAQ  (AX)(CX*4), AX
 31828  	LEAQ  (DX)(BX*4), DX
 31829  	MOVSS (AX), X1
 31830  	MULSS X0, X1
 31831  	ADDSS (DX), X1
 31832  	MOVSS X1, (DX)
 31833  	LEAQ  (AX)(CX*4), AX
 31834  	LEAQ  (DX)(BX*4), DX
 31835  	SUBQ  $0x08, SI
 31836  
 31837  check_limit_unroll:
 31838  	CMPQ SI, $0x08
 31839  	JHS  loop_unroll
 31840  	JMP  check_limit
 31841  
 31842  loop:
 31843  	MOVSS (AX), X1
 31844  	MULSS X0, X1
 31845  	ADDSS (DX), X1
 31846  	MOVSS X1, (DX)
 31847  	DECQ  SI
 31848  	LEAQ  (AX)(CX*4), AX
 31849  	LEAQ  (DX)(BX*4), DX
 31850  
 31851  check_limit:
 31852  	CMPQ SI, $0x00
 31853  	JHI  loop
 31854  	RET
 31855  
 31856  // func AmdAxpyPointerLoopX_V1A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31857  // Requires: SSE
 31858  TEXT ·AmdAxpyPointerLoopX_V1A13U8(SB), NOSPLIT, $0-48
 31859  	MOVSS alpha+0(FP), X0
 31860  	MOVQ  xs+8(FP), AX
 31861  	MOVQ  incx+16(FP), CX
 31862  	MOVQ  ys+24(FP), DX
 31863  	MOVQ  incy+32(FP), BX
 31864  	MOVQ  n+40(FP), SI
 31865  	JMP   check_limit_unroll
 31866  	PCALIGN $0x08
 31867  	NOP
 31868  	NOP
 31869  	NOP
 31870  	NOP
 31871  	NOP
 31872  
 31873  loop_unroll:
 31874  	MOVSS (AX), X1
 31875  	MULSS X0, X1
 31876  	ADDSS (DX), X1
 31877  	MOVSS X1, (DX)
 31878  	LEAQ  (AX)(CX*4), AX
 31879  	LEAQ  (DX)(BX*4), DX
 31880  	MOVSS (AX), X1
 31881  	MULSS X0, X1
 31882  	ADDSS (DX), X1
 31883  	MOVSS X1, (DX)
 31884  	LEAQ  (AX)(CX*4), AX
 31885  	LEAQ  (DX)(BX*4), DX
 31886  	MOVSS (AX), X1
 31887  	MULSS X0, X1
 31888  	ADDSS (DX), X1
 31889  	MOVSS X1, (DX)
 31890  	LEAQ  (AX)(CX*4), AX
 31891  	LEAQ  (DX)(BX*4), DX
 31892  	MOVSS (AX), X1
 31893  	MULSS X0, X1
 31894  	ADDSS (DX), X1
 31895  	MOVSS X1, (DX)
 31896  	LEAQ  (AX)(CX*4), AX
 31897  	LEAQ  (DX)(BX*4), DX
 31898  	MOVSS (AX), X1
 31899  	MULSS X0, X1
 31900  	ADDSS (DX), X1
 31901  	MOVSS X1, (DX)
 31902  	LEAQ  (AX)(CX*4), AX
 31903  	LEAQ  (DX)(BX*4), DX
 31904  	MOVSS (AX), X1
 31905  	MULSS X0, X1
 31906  	ADDSS (DX), X1
 31907  	MOVSS X1, (DX)
 31908  	LEAQ  (AX)(CX*4), AX
 31909  	LEAQ  (DX)(BX*4), DX
 31910  	MOVSS (AX), X1
 31911  	MULSS X0, X1
 31912  	ADDSS (DX), X1
 31913  	MOVSS X1, (DX)
 31914  	LEAQ  (AX)(CX*4), AX
 31915  	LEAQ  (DX)(BX*4), DX
 31916  	MOVSS (AX), X1
 31917  	MULSS X0, X1
 31918  	ADDSS (DX), X1
 31919  	MOVSS X1, (DX)
 31920  	LEAQ  (AX)(CX*4), AX
 31921  	LEAQ  (DX)(BX*4), DX
 31922  	SUBQ  $0x08, SI
 31923  
 31924  check_limit_unroll:
 31925  	CMPQ SI, $0x08
 31926  	JHS  loop_unroll
 31927  	JMP  check_limit
 31928  
 31929  loop:
 31930  	MOVSS (AX), X1
 31931  	MULSS X0, X1
 31932  	ADDSS (DX), X1
 31933  	MOVSS X1, (DX)
 31934  	DECQ  SI
 31935  	LEAQ  (AX)(CX*4), AX
 31936  	LEAQ  (DX)(BX*4), DX
 31937  
 31938  check_limit:
 31939  	CMPQ SI, $0x00
 31940  	JHI  loop
 31941  	RET
 31942  
 31943  // func AmdAxpyPointerLoopX_V2A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 31944  // Requires: SSE
 31945  TEXT ·AmdAxpyPointerLoopX_V2A13U8(SB), NOSPLIT, $0-48
 31946  	MOVSS alpha+0(FP), X0
 31947  	MOVQ  xs+8(FP), AX
 31948  	MOVQ  incx+16(FP), CX
 31949  	MOVQ  ys+24(FP), DX
 31950  	MOVQ  incy+32(FP), BX
 31951  	MOVQ  n+40(FP), SI
 31952  	JMP   check_limit_unroll
 31953  	PCALIGN $0x08
 31954  	NOP
 31955  	NOP
 31956  	NOP
 31957  	NOP
 31958  	NOP
 31959  
 31960  loop_unroll:
 31961  	MOVSS (AX), X1
 31962  	MULSS X0, X1
 31963  	ADDSS (DX), X1
 31964  	MOVSS X1, (DX)
 31965  	LEAQ  (AX)(CX*4), AX
 31966  	LEAQ  (DX)(BX*4), DX
 31967  	MOVSS (AX), X1
 31968  	MULSS X0, X1
 31969  	ADDSS (DX), X1
 31970  	MOVSS X1, (DX)
 31971  	LEAQ  (AX)(CX*4), AX
 31972  	LEAQ  (DX)(BX*4), DX
 31973  	MOVSS (AX), X1
 31974  	MULSS X0, X1
 31975  	ADDSS (DX), X1
 31976  	MOVSS X1, (DX)
 31977  	LEAQ  (AX)(CX*4), AX
 31978  	LEAQ  (DX)(BX*4), DX
 31979  	MOVSS (AX), X1
 31980  	MULSS X0, X1
 31981  	ADDSS (DX), X1
 31982  	MOVSS X1, (DX)
 31983  	LEAQ  (AX)(CX*4), AX
 31984  	LEAQ  (DX)(BX*4), DX
 31985  	MOVSS (AX), X1
 31986  	MULSS X0, X1
 31987  	ADDSS (DX), X1
 31988  	MOVSS X1, (DX)
 31989  	LEAQ  (AX)(CX*4), AX
 31990  	LEAQ  (DX)(BX*4), DX
 31991  	MOVSS (AX), X1
 31992  	MULSS X0, X1
 31993  	ADDSS (DX), X1
 31994  	MOVSS X1, (DX)
 31995  	LEAQ  (AX)(CX*4), AX
 31996  	LEAQ  (DX)(BX*4), DX
 31997  	MOVSS (AX), X1
 31998  	MULSS X0, X1
 31999  	ADDSS (DX), X1
 32000  	MOVSS X1, (DX)
 32001  	LEAQ  (AX)(CX*4), AX
 32002  	LEAQ  (DX)(BX*4), DX
 32003  	MOVSS (AX), X1
 32004  	MULSS X0, X1
 32005  	ADDSS (DX), X1
 32006  	MOVSS X1, (DX)
 32007  	LEAQ  (AX)(CX*4), AX
 32008  	LEAQ  (DX)(BX*4), DX
 32009  	SUBQ  $0x08, SI
 32010  
 32011  check_limit_unroll:
 32012  	CMPQ SI, $0x08
 32013  	JHS  loop_unroll
 32014  	JMP  check_limit
 32015  
 32016  loop:
 32017  	MOVSS (AX), X1
 32018  	MULSS X0, X1
 32019  	ADDSS (DX), X1
 32020  	MOVSS X1, (DX)
 32021  	DECQ  SI
 32022  	LEAQ  (AX)(CX*4), AX
 32023  	LEAQ  (DX)(BX*4), DX
 32024  
 32025  check_limit:
 32026  	CMPQ SI, $0x00
 32027  	JHI  loop
 32028  	RET
 32029  
 32030  // func AmdAxpyPointerLoopX_V3A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32031  // Requires: SSE
 32032  TEXT ·AmdAxpyPointerLoopX_V3A13U8(SB), NOSPLIT, $0-48
 32033  	MOVSS alpha+0(FP), X0
 32034  	MOVQ  xs+8(FP), AX
 32035  	MOVQ  incx+16(FP), CX
 32036  	MOVQ  ys+24(FP), DX
 32037  	MOVQ  incy+32(FP), BX
 32038  	MOVQ  n+40(FP), SI
 32039  	JMP   check_limit_unroll
 32040  	PCALIGN $0x08
 32041  	NOP
 32042  	NOP
 32043  	NOP
 32044  	NOP
 32045  	NOP
 32046  
 32047  loop_unroll:
 32048  	MOVSS (AX), X1
 32049  	MULSS X0, X1
 32050  	ADDSS (DX), X1
 32051  	MOVSS X1, (DX)
 32052  	LEAQ  (AX)(CX*4), AX
 32053  	LEAQ  (DX)(BX*4), DX
 32054  	MOVSS (AX), X1
 32055  	MULSS X0, X1
 32056  	ADDSS (DX), X1
 32057  	MOVSS X1, (DX)
 32058  	LEAQ  (AX)(CX*4), AX
 32059  	LEAQ  (DX)(BX*4), DX
 32060  	MOVSS (AX), X1
 32061  	MULSS X0, X1
 32062  	ADDSS (DX), X1
 32063  	MOVSS X1, (DX)
 32064  	LEAQ  (AX)(CX*4), AX
 32065  	LEAQ  (DX)(BX*4), DX
 32066  	MOVSS (AX), X1
 32067  	MULSS X0, X1
 32068  	ADDSS (DX), X1
 32069  	MOVSS X1, (DX)
 32070  	LEAQ  (AX)(CX*4), AX
 32071  	LEAQ  (DX)(BX*4), DX
 32072  	MOVSS (AX), X1
 32073  	MULSS X0, X1
 32074  	ADDSS (DX), X1
 32075  	MOVSS X1, (DX)
 32076  	LEAQ  (AX)(CX*4), AX
 32077  	LEAQ  (DX)(BX*4), DX
 32078  	MOVSS (AX), X1
 32079  	MULSS X0, X1
 32080  	ADDSS (DX), X1
 32081  	MOVSS X1, (DX)
 32082  	LEAQ  (AX)(CX*4), AX
 32083  	LEAQ  (DX)(BX*4), DX
 32084  	MOVSS (AX), X1
 32085  	MULSS X0, X1
 32086  	ADDSS (DX), X1
 32087  	MOVSS X1, (DX)
 32088  	LEAQ  (AX)(CX*4), AX
 32089  	LEAQ  (DX)(BX*4), DX
 32090  	MOVSS (AX), X1
 32091  	MULSS X0, X1
 32092  	ADDSS (DX), X1
 32093  	MOVSS X1, (DX)
 32094  	LEAQ  (AX)(CX*4), AX
 32095  	LEAQ  (DX)(BX*4), DX
 32096  	SUBQ  $0x08, SI
 32097  
 32098  check_limit_unroll:
 32099  	CMPQ SI, $0x08
 32100  	JHS  loop_unroll
 32101  	JMP  check_limit
 32102  
 32103  loop:
 32104  	MOVSS (AX), X1
 32105  	MULSS X0, X1
 32106  	ADDSS (DX), X1
 32107  	MOVSS X1, (DX)
 32108  	DECQ  SI
 32109  	LEAQ  (AX)(CX*4), AX
 32110  	LEAQ  (DX)(BX*4), DX
 32111  
 32112  check_limit:
 32113  	CMPQ SI, $0x00
 32114  	JHI  loop
 32115  	RET
 32116  
 32117  // func AmdAxpyPointerLoopX_V4A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32118  // Requires: SSE
 32119  TEXT ·AmdAxpyPointerLoopX_V4A13U8(SB), NOSPLIT, $0-48
 32120  	MOVSS alpha+0(FP), X0
 32121  	MOVQ  xs+8(FP), AX
 32122  	MOVQ  incx+16(FP), CX
 32123  	MOVQ  ys+24(FP), DX
 32124  	MOVQ  incy+32(FP), BX
 32125  	MOVQ  n+40(FP), SI
 32126  	JMP   check_limit_unroll
 32127  	PCALIGN $0x08
 32128  	NOP
 32129  	NOP
 32130  	NOP
 32131  	NOP
 32132  	NOP
 32133  
 32134  loop_unroll:
 32135  	MOVSS (AX), X1
 32136  	MULSS X0, X1
 32137  	ADDSS (DX), X1
 32138  	MOVSS X1, (DX)
 32139  	LEAQ  (AX)(CX*4), AX
 32140  	LEAQ  (DX)(BX*4), DX
 32141  	MOVSS (AX), X1
 32142  	MULSS X0, X1
 32143  	ADDSS (DX), X1
 32144  	MOVSS X1, (DX)
 32145  	LEAQ  (AX)(CX*4), AX
 32146  	LEAQ  (DX)(BX*4), DX
 32147  	MOVSS (AX), X1
 32148  	MULSS X0, X1
 32149  	ADDSS (DX), X1
 32150  	MOVSS X1, (DX)
 32151  	LEAQ  (AX)(CX*4), AX
 32152  	LEAQ  (DX)(BX*4), DX
 32153  	MOVSS (AX), X1
 32154  	MULSS X0, X1
 32155  	ADDSS (DX), X1
 32156  	MOVSS X1, (DX)
 32157  	LEAQ  (AX)(CX*4), AX
 32158  	LEAQ  (DX)(BX*4), DX
 32159  	MOVSS (AX), X1
 32160  	MULSS X0, X1
 32161  	ADDSS (DX), X1
 32162  	MOVSS X1, (DX)
 32163  	LEAQ  (AX)(CX*4), AX
 32164  	LEAQ  (DX)(BX*4), DX
 32165  	MOVSS (AX), X1
 32166  	MULSS X0, X1
 32167  	ADDSS (DX), X1
 32168  	MOVSS X1, (DX)
 32169  	LEAQ  (AX)(CX*4), AX
 32170  	LEAQ  (DX)(BX*4), DX
 32171  	MOVSS (AX), X1
 32172  	MULSS X0, X1
 32173  	ADDSS (DX), X1
 32174  	MOVSS X1, (DX)
 32175  	LEAQ  (AX)(CX*4), AX
 32176  	LEAQ  (DX)(BX*4), DX
 32177  	MOVSS (AX), X1
 32178  	MULSS X0, X1
 32179  	ADDSS (DX), X1
 32180  	MOVSS X1, (DX)
 32181  	LEAQ  (AX)(CX*4), AX
 32182  	LEAQ  (DX)(BX*4), DX
 32183  	SUBQ  $0x08, SI
 32184  
 32185  check_limit_unroll:
 32186  	CMPQ SI, $0x08
 32187  	JHS  loop_unroll
 32188  	JMP  check_limit
 32189  
 32190  loop:
 32191  	MOVSS (AX), X1
 32192  	MULSS X0, X1
 32193  	ADDSS (DX), X1
 32194  	MOVSS X1, (DX)
 32195  	DECQ  SI
 32196  	LEAQ  (AX)(CX*4), AX
 32197  	LEAQ  (DX)(BX*4), DX
 32198  
 32199  check_limit:
 32200  	CMPQ SI, $0x00
 32201  	JHI  loop
 32202  	RET
 32203  
 32204  // func AmdAxpyPointerLoopX_V5A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32205  // Requires: SSE
 32206  TEXT ·AmdAxpyPointerLoopX_V5A13U8(SB), NOSPLIT, $0-48
 32207  	MOVSS alpha+0(FP), X0
 32208  	MOVQ  xs+8(FP), AX
 32209  	MOVQ  incx+16(FP), CX
 32210  	MOVQ  ys+24(FP), DX
 32211  	MOVQ  incy+32(FP), BX
 32212  	MOVQ  n+40(FP), SI
 32213  	JMP   check_limit_unroll
 32214  	PCALIGN $0x08
 32215  	NOP
 32216  	NOP
 32217  	NOP
 32218  	NOP
 32219  	NOP
 32220  
 32221  loop_unroll:
 32222  	MOVSS (AX), X1
 32223  	MULSS X0, X1
 32224  	ADDSS (DX), X1
 32225  	MOVSS X1, (DX)
 32226  	LEAQ  (AX)(CX*4), AX
 32227  	LEAQ  (DX)(BX*4), DX
 32228  	MOVSS (AX), X1
 32229  	MULSS X0, X1
 32230  	ADDSS (DX), X1
 32231  	MOVSS X1, (DX)
 32232  	LEAQ  (AX)(CX*4), AX
 32233  	LEAQ  (DX)(BX*4), DX
 32234  	MOVSS (AX), X1
 32235  	MULSS X0, X1
 32236  	ADDSS (DX), X1
 32237  	MOVSS X1, (DX)
 32238  	LEAQ  (AX)(CX*4), AX
 32239  	LEAQ  (DX)(BX*4), DX
 32240  	MOVSS (AX), X1
 32241  	MULSS X0, X1
 32242  	ADDSS (DX), X1
 32243  	MOVSS X1, (DX)
 32244  	LEAQ  (AX)(CX*4), AX
 32245  	LEAQ  (DX)(BX*4), DX
 32246  	MOVSS (AX), X1
 32247  	MULSS X0, X1
 32248  	ADDSS (DX), X1
 32249  	MOVSS X1, (DX)
 32250  	LEAQ  (AX)(CX*4), AX
 32251  	LEAQ  (DX)(BX*4), DX
 32252  	MOVSS (AX), X1
 32253  	MULSS X0, X1
 32254  	ADDSS (DX), X1
 32255  	MOVSS X1, (DX)
 32256  	LEAQ  (AX)(CX*4), AX
 32257  	LEAQ  (DX)(BX*4), DX
 32258  	MOVSS (AX), X1
 32259  	MULSS X0, X1
 32260  	ADDSS (DX), X1
 32261  	MOVSS X1, (DX)
 32262  	LEAQ  (AX)(CX*4), AX
 32263  	LEAQ  (DX)(BX*4), DX
 32264  	MOVSS (AX), X1
 32265  	MULSS X0, X1
 32266  	ADDSS (DX), X1
 32267  	MOVSS X1, (DX)
 32268  	LEAQ  (AX)(CX*4), AX
 32269  	LEAQ  (DX)(BX*4), DX
 32270  	SUBQ  $0x08, SI
 32271  
 32272  check_limit_unroll:
 32273  	CMPQ SI, $0x08
 32274  	JHS  loop_unroll
 32275  	JMP  check_limit
 32276  
 32277  loop:
 32278  	MOVSS (AX), X1
 32279  	MULSS X0, X1
 32280  	ADDSS (DX), X1
 32281  	MOVSS X1, (DX)
 32282  	DECQ  SI
 32283  	LEAQ  (AX)(CX*4), AX
 32284  	LEAQ  (DX)(BX*4), DX
 32285  
 32286  check_limit:
 32287  	CMPQ SI, $0x00
 32288  	JHI  loop
 32289  	RET
 32290  
 32291  // func AmdAxpyPointerLoopX_V0A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32292  // Requires: SSE
 32293  TEXT ·AmdAxpyPointerLoopX_V0A14U8(SB), NOSPLIT, $0-48
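	// Same 8x-unrolled axpy kernel; this A14 variant pads the loop entry with six NOPs after PCALIGN $8.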
 32294  	MOVSS alpha+0(FP), X0
 32295  	MOVQ  xs+8(FP), AX
 32296  	MOVQ  incx+16(FP), CX
 32297  	MOVQ  ys+24(FP), DX
 32298  	MOVQ  incy+32(FP), BX
 32299  	MOVQ  n+40(FP), SI
 32300  	JMP   check_limit_unroll
 32301  	PCALIGN $0x08
 32302  	NOP
 32303  	NOP
 32304  	NOP
 32305  	NOP
 32306  	NOP
 32307  	NOP
 32308  
 32309  loop_unroll:
 32310  	MOVSS (AX), X1
 32311  	MULSS X0, X1
 32312  	ADDSS (DX), X1
 32313  	MOVSS X1, (DX)
 32314  	LEAQ  (AX)(CX*4), AX
 32315  	LEAQ  (DX)(BX*4), DX
 32316  	MOVSS (AX), X1
 32317  	MULSS X0, X1
 32318  	ADDSS (DX), X1
 32319  	MOVSS X1, (DX)
 32320  	LEAQ  (AX)(CX*4), AX
 32321  	LEAQ  (DX)(BX*4), DX
 32322  	MOVSS (AX), X1
 32323  	MULSS X0, X1
 32324  	ADDSS (DX), X1
 32325  	MOVSS X1, (DX)
 32326  	LEAQ  (AX)(CX*4), AX
 32327  	LEAQ  (DX)(BX*4), DX
 32328  	MOVSS (AX), X1
 32329  	MULSS X0, X1
 32330  	ADDSS (DX), X1
 32331  	MOVSS X1, (DX)
 32332  	LEAQ  (AX)(CX*4), AX
 32333  	LEAQ  (DX)(BX*4), DX
 32334  	MOVSS (AX), X1
 32335  	MULSS X0, X1
 32336  	ADDSS (DX), X1
 32337  	MOVSS X1, (DX)
 32338  	LEAQ  (AX)(CX*4), AX
 32339  	LEAQ  (DX)(BX*4), DX
 32340  	MOVSS (AX), X1
 32341  	MULSS X0, X1
 32342  	ADDSS (DX), X1
 32343  	MOVSS X1, (DX)
 32344  	LEAQ  (AX)(CX*4), AX
 32345  	LEAQ  (DX)(BX*4), DX
 32346  	MOVSS (AX), X1
 32347  	MULSS X0, X1
 32348  	ADDSS (DX), X1
 32349  	MOVSS X1, (DX)
 32350  	LEAQ  (AX)(CX*4), AX
 32351  	LEAQ  (DX)(BX*4), DX
 32352  	MOVSS (AX), X1
 32353  	MULSS X0, X1
 32354  	ADDSS (DX), X1
 32355  	MOVSS X1, (DX)
 32356  	LEAQ  (AX)(CX*4), AX
 32357  	LEAQ  (DX)(BX*4), DX
 32358  	SUBQ  $0x08, SI
 32359  
 32360  check_limit_unroll:
 32361  	CMPQ SI, $0x08
 32362  	JHS  loop_unroll
 32363  	JMP  check_limit
 32364  
 32365  loop:
 32366  	MOVSS (AX), X1
 32367  	MULSS X0, X1
 32368  	ADDSS (DX), X1
 32369  	MOVSS X1, (DX)
 32370  	DECQ  SI
 32371  	LEAQ  (AX)(CX*4), AX
 32372  	LEAQ  (DX)(BX*4), DX
 32373  
 32374  check_limit:
 32375  	CMPQ SI, $0x00
 32376  	JHI  loop
 32377  	RET
 32378  
 32379  // func AmdAxpyPointerLoopX_V1A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32380  // Requires: SSE
 32381  TEXT ·AmdAxpyPointerLoopX_V1A14U8(SB), NOSPLIT, $0-48
 32382  	MOVSS alpha+0(FP), X0
 32383  	MOVQ  xs+8(FP), AX
 32384  	MOVQ  incx+16(FP), CX
 32385  	MOVQ  ys+24(FP), DX
 32386  	MOVQ  incy+32(FP), BX
 32387  	MOVQ  n+40(FP), SI
 32388  	JMP   check_limit_unroll
 32389  	PCALIGN $0x08
 32390  	NOP
 32391  	NOP
 32392  	NOP
 32393  	NOP
 32394  	NOP
 32395  	NOP
 32396  
 32397  loop_unroll:
 32398  	MOVSS (AX), X1
 32399  	MULSS X0, X1
 32400  	ADDSS (DX), X1
 32401  	MOVSS X1, (DX)
 32402  	LEAQ  (AX)(CX*4), AX
 32403  	LEAQ  (DX)(BX*4), DX
 32404  	MOVSS (AX), X1
 32405  	MULSS X0, X1
 32406  	ADDSS (DX), X1
 32407  	MOVSS X1, (DX)
 32408  	LEAQ  (AX)(CX*4), AX
 32409  	LEAQ  (DX)(BX*4), DX
 32410  	MOVSS (AX), X1
 32411  	MULSS X0, X1
 32412  	ADDSS (DX), X1
 32413  	MOVSS X1, (DX)
 32414  	LEAQ  (AX)(CX*4), AX
 32415  	LEAQ  (DX)(BX*4), DX
 32416  	MOVSS (AX), X1
 32417  	MULSS X0, X1
 32418  	ADDSS (DX), X1
 32419  	MOVSS X1, (DX)
 32420  	LEAQ  (AX)(CX*4), AX
 32421  	LEAQ  (DX)(BX*4), DX
 32422  	MOVSS (AX), X1
 32423  	MULSS X0, X1
 32424  	ADDSS (DX), X1
 32425  	MOVSS X1, (DX)
 32426  	LEAQ  (AX)(CX*4), AX
 32427  	LEAQ  (DX)(BX*4), DX
 32428  	MOVSS (AX), X1
 32429  	MULSS X0, X1
 32430  	ADDSS (DX), X1
 32431  	MOVSS X1, (DX)
 32432  	LEAQ  (AX)(CX*4), AX
 32433  	LEAQ  (DX)(BX*4), DX
 32434  	MOVSS (AX), X1
 32435  	MULSS X0, X1
 32436  	ADDSS (DX), X1
 32437  	MOVSS X1, (DX)
 32438  	LEAQ  (AX)(CX*4), AX
 32439  	LEAQ  (DX)(BX*4), DX
 32440  	MOVSS (AX), X1
 32441  	MULSS X0, X1
 32442  	ADDSS (DX), X1
 32443  	MOVSS X1, (DX)
 32444  	LEAQ  (AX)(CX*4), AX
 32445  	LEAQ  (DX)(BX*4), DX
 32446  	SUBQ  $0x08, SI
 32447  
 32448  check_limit_unroll:
 32449  	CMPQ SI, $0x08
 32450  	JHS  loop_unroll
 32451  	JMP  check_limit
 32452  
 32453  loop:
 32454  	MOVSS (AX), X1
 32455  	MULSS X0, X1
 32456  	ADDSS (DX), X1
 32457  	MOVSS X1, (DX)
 32458  	DECQ  SI
 32459  	LEAQ  (AX)(CX*4), AX
 32460  	LEAQ  (DX)(BX*4), DX
 32461  
 32462  check_limit:
 32463  	CMPQ SI, $0x00
 32464  	JHI  loop
 32465  	RET
 32466  
 32467  // func AmdAxpyPointerLoopX_V2A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32468  // Requires: SSE
 32469  TEXT ·AmdAxpyPointerLoopX_V2A14U8(SB), NOSPLIT, $0-48
 32470  	MOVSS alpha+0(FP), X0
 32471  	MOVQ  xs+8(FP), AX
 32472  	MOVQ  incx+16(FP), CX
 32473  	MOVQ  ys+24(FP), DX
 32474  	MOVQ  incy+32(FP), BX
 32475  	MOVQ  n+40(FP), SI
 32476  	JMP   check_limit_unroll
 32477  	PCALIGN $0x08
 32478  	NOP
 32479  	NOP
 32480  	NOP
 32481  	NOP
 32482  	NOP
 32483  	NOP
 32484  
 32485  loop_unroll:
 32486  	MOVSS (AX), X1
 32487  	MULSS X0, X1
 32488  	ADDSS (DX), X1
 32489  	MOVSS X1, (DX)
 32490  	LEAQ  (AX)(CX*4), AX
 32491  	LEAQ  (DX)(BX*4), DX
 32492  	MOVSS (AX), X1
 32493  	MULSS X0, X1
 32494  	ADDSS (DX), X1
 32495  	MOVSS X1, (DX)
 32496  	LEAQ  (AX)(CX*4), AX
 32497  	LEAQ  (DX)(BX*4), DX
 32498  	MOVSS (AX), X1
 32499  	MULSS X0, X1
 32500  	ADDSS (DX), X1
 32501  	MOVSS X1, (DX)
 32502  	LEAQ  (AX)(CX*4), AX
 32503  	LEAQ  (DX)(BX*4), DX
 32504  	MOVSS (AX), X1
 32505  	MULSS X0, X1
 32506  	ADDSS (DX), X1
 32507  	MOVSS X1, (DX)
 32508  	LEAQ  (AX)(CX*4), AX
 32509  	LEAQ  (DX)(BX*4), DX
 32510  	MOVSS (AX), X1
 32511  	MULSS X0, X1
 32512  	ADDSS (DX), X1
 32513  	MOVSS X1, (DX)
 32514  	LEAQ  (AX)(CX*4), AX
 32515  	LEAQ  (DX)(BX*4), DX
 32516  	MOVSS (AX), X1
 32517  	MULSS X0, X1
 32518  	ADDSS (DX), X1
 32519  	MOVSS X1, (DX)
 32520  	LEAQ  (AX)(CX*4), AX
 32521  	LEAQ  (DX)(BX*4), DX
 32522  	MOVSS (AX), X1
 32523  	MULSS X0, X1
 32524  	ADDSS (DX), X1
 32525  	MOVSS X1, (DX)
 32526  	LEAQ  (AX)(CX*4), AX
 32527  	LEAQ  (DX)(BX*4), DX
 32528  	MOVSS (AX), X1
 32529  	MULSS X0, X1
 32530  	ADDSS (DX), X1
 32531  	MOVSS X1, (DX)
 32532  	LEAQ  (AX)(CX*4), AX
 32533  	LEAQ  (DX)(BX*4), DX
 32534  	SUBQ  $0x08, SI
 32535  
 32536  check_limit_unroll:
 32537  	CMPQ SI, $0x08
 32538  	JHS  loop_unroll
 32539  	JMP  check_limit
 32540  
 32541  loop:
 32542  	MOVSS (AX), X1
 32543  	MULSS X0, X1
 32544  	ADDSS (DX), X1
 32545  	MOVSS X1, (DX)
 32546  	DECQ  SI
 32547  	LEAQ  (AX)(CX*4), AX
 32548  	LEAQ  (DX)(BX*4), DX
 32549  
 32550  check_limit:
 32551  	CMPQ SI, $0x00
 32552  	JHI  loop
 32553  	RET
 32554  
 32555  // func AmdAxpyPointerLoopX_V3A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32556  // Requires: SSE
 32557  TEXT ·AmdAxpyPointerLoopX_V3A14U8(SB), NOSPLIT, $0-48
 32558  	MOVSS alpha+0(FP), X0
 32559  	MOVQ  xs+8(FP), AX
 32560  	MOVQ  incx+16(FP), CX
 32561  	MOVQ  ys+24(FP), DX
 32562  	MOVQ  incy+32(FP), BX
 32563  	MOVQ  n+40(FP), SI
 32564  	JMP   check_limit_unroll
 32565  	PCALIGN $0x08
 32566  	NOP
 32567  	NOP
 32568  	NOP
 32569  	NOP
 32570  	NOP
 32571  	NOP
 32572  
 32573  loop_unroll:
 32574  	MOVSS (AX), X1
 32575  	MULSS X0, X1
 32576  	ADDSS (DX), X1
 32577  	MOVSS X1, (DX)
 32578  	LEAQ  (AX)(CX*4), AX
 32579  	LEAQ  (DX)(BX*4), DX
 32580  	MOVSS (AX), X1
 32581  	MULSS X0, X1
 32582  	ADDSS (DX), X1
 32583  	MOVSS X1, (DX)
 32584  	LEAQ  (AX)(CX*4), AX
 32585  	LEAQ  (DX)(BX*4), DX
 32586  	MOVSS (AX), X1
 32587  	MULSS X0, X1
 32588  	ADDSS (DX), X1
 32589  	MOVSS X1, (DX)
 32590  	LEAQ  (AX)(CX*4), AX
 32591  	LEAQ  (DX)(BX*4), DX
 32592  	MOVSS (AX), X1
 32593  	MULSS X0, X1
 32594  	ADDSS (DX), X1
 32595  	MOVSS X1, (DX)
 32596  	LEAQ  (AX)(CX*4), AX
 32597  	LEAQ  (DX)(BX*4), DX
 32598  	MOVSS (AX), X1
 32599  	MULSS X0, X1
 32600  	ADDSS (DX), X1
 32601  	MOVSS X1, (DX)
 32602  	LEAQ  (AX)(CX*4), AX
 32603  	LEAQ  (DX)(BX*4), DX
 32604  	MOVSS (AX), X1
 32605  	MULSS X0, X1
 32606  	ADDSS (DX), X1
 32607  	MOVSS X1, (DX)
 32608  	LEAQ  (AX)(CX*4), AX
 32609  	LEAQ  (DX)(BX*4), DX
 32610  	MOVSS (AX), X1
 32611  	MULSS X0, X1
 32612  	ADDSS (DX), X1
 32613  	MOVSS X1, (DX)
 32614  	LEAQ  (AX)(CX*4), AX
 32615  	LEAQ  (DX)(BX*4), DX
 32616  	MOVSS (AX), X1
 32617  	MULSS X0, X1
 32618  	ADDSS (DX), X1
 32619  	MOVSS X1, (DX)
 32620  	LEAQ  (AX)(CX*4), AX
 32621  	LEAQ  (DX)(BX*4), DX
 32622  	SUBQ  $0x08, SI
 32623  
 32624  check_limit_unroll:
 32625  	CMPQ SI, $0x08
 32626  	JHS  loop_unroll
 32627  	JMP  check_limit
 32628  
 32629  loop:
 32630  	MOVSS (AX), X1
 32631  	MULSS X0, X1
 32632  	ADDSS (DX), X1
 32633  	MOVSS X1, (DX)
 32634  	DECQ  SI
 32635  	LEAQ  (AX)(CX*4), AX
 32636  	LEAQ  (DX)(BX*4), DX
 32637  
 32638  check_limit:
 32639  	CMPQ SI, $0x00
 32640  	JHI  loop
 32641  	RET
 32642  
 32643  // func AmdAxpyPointerLoopX_V4A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32644  // Requires: SSE
 32645  TEXT ·AmdAxpyPointerLoopX_V4A14U8(SB), NOSPLIT, $0-48
 32646  	MOVSS alpha+0(FP), X0
 32647  	MOVQ  xs+8(FP), AX
 32648  	MOVQ  incx+16(FP), CX
 32649  	MOVQ  ys+24(FP), DX
 32650  	MOVQ  incy+32(FP), BX
 32651  	MOVQ  n+40(FP), SI
 32652  	JMP   check_limit_unroll
 32653  	PCALIGN $0x08
 32654  	NOP
 32655  	NOP
 32656  	NOP
 32657  	NOP
 32658  	NOP
 32659  	NOP
 32660  
 32661  loop_unroll:
 32662  	MOVSS (AX), X1
 32663  	MULSS X0, X1
 32664  	ADDSS (DX), X1
 32665  	MOVSS X1, (DX)
 32666  	LEAQ  (AX)(CX*4), AX
 32667  	LEAQ  (DX)(BX*4), DX
 32668  	MOVSS (AX), X1
 32669  	MULSS X0, X1
 32670  	ADDSS (DX), X1
 32671  	MOVSS X1, (DX)
 32672  	LEAQ  (AX)(CX*4), AX
 32673  	LEAQ  (DX)(BX*4), DX
 32674  	MOVSS (AX), X1
 32675  	MULSS X0, X1
 32676  	ADDSS (DX), X1
 32677  	MOVSS X1, (DX)
 32678  	LEAQ  (AX)(CX*4), AX
 32679  	LEAQ  (DX)(BX*4), DX
 32680  	MOVSS (AX), X1
 32681  	MULSS X0, X1
 32682  	ADDSS (DX), X1
 32683  	MOVSS X1, (DX)
 32684  	LEAQ  (AX)(CX*4), AX
 32685  	LEAQ  (DX)(BX*4), DX
 32686  	MOVSS (AX), X1
 32687  	MULSS X0, X1
 32688  	ADDSS (DX), X1
 32689  	MOVSS X1, (DX)
 32690  	LEAQ  (AX)(CX*4), AX
 32691  	LEAQ  (DX)(BX*4), DX
 32692  	MOVSS (AX), X1
 32693  	MULSS X0, X1
 32694  	ADDSS (DX), X1
 32695  	MOVSS X1, (DX)
 32696  	LEAQ  (AX)(CX*4), AX
 32697  	LEAQ  (DX)(BX*4), DX
 32698  	MOVSS (AX), X1
 32699  	MULSS X0, X1
 32700  	ADDSS (DX), X1
 32701  	MOVSS X1, (DX)
 32702  	LEAQ  (AX)(CX*4), AX
 32703  	LEAQ  (DX)(BX*4), DX
 32704  	MOVSS (AX), X1
 32705  	MULSS X0, X1
 32706  	ADDSS (DX), X1
 32707  	MOVSS X1, (DX)
 32708  	LEAQ  (AX)(CX*4), AX
 32709  	LEAQ  (DX)(BX*4), DX
 32710  	SUBQ  $0x08, SI
 32711  
 32712  check_limit_unroll:
 32713  	CMPQ SI, $0x08
 32714  	JHS  loop_unroll
 32715  	JMP  check_limit
 32716  
 32717  loop:
 32718  	MOVSS (AX), X1
 32719  	MULSS X0, X1
 32720  	ADDSS (DX), X1
 32721  	MOVSS X1, (DX)
 32722  	DECQ  SI
 32723  	LEAQ  (AX)(CX*4), AX
 32724  	LEAQ  (DX)(BX*4), DX
 32725  
 32726  check_limit:
 32727  	CMPQ SI, $0x00
 32728  	JHI  loop
 32729  	RET
 32730  
 32731  // func AmdAxpyPointerLoopX_V5A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32732  // Requires: SSE
 32733  TEXT ·AmdAxpyPointerLoopX_V5A14U8(SB), NOSPLIT, $0-48
 32734  	MOVSS alpha+0(FP), X0
 32735  	MOVQ  xs+8(FP), AX
 32736  	MOVQ  incx+16(FP), CX
 32737  	MOVQ  ys+24(FP), DX
 32738  	MOVQ  incy+32(FP), BX
 32739  	MOVQ  n+40(FP), SI
 32740  	JMP   check_limit_unroll
 32741  	PCALIGN $0x08
 32742  	NOP
 32743  	NOP
 32744  	NOP
 32745  	NOP
 32746  	NOP
 32747  	NOP
 32748  
 32749  loop_unroll:
 32750  	MOVSS (AX), X1
 32751  	MULSS X0, X1
 32752  	ADDSS (DX), X1
 32753  	MOVSS X1, (DX)
 32754  	LEAQ  (AX)(CX*4), AX
 32755  	LEAQ  (DX)(BX*4), DX
 32756  	MOVSS (AX), X1
 32757  	MULSS X0, X1
 32758  	ADDSS (DX), X1
 32759  	MOVSS X1, (DX)
 32760  	LEAQ  (AX)(CX*4), AX
 32761  	LEAQ  (DX)(BX*4), DX
 32762  	MOVSS (AX), X1
 32763  	MULSS X0, X1
 32764  	ADDSS (DX), X1
 32765  	MOVSS X1, (DX)
 32766  	LEAQ  (AX)(CX*4), AX
 32767  	LEAQ  (DX)(BX*4), DX
 32768  	MOVSS (AX), X1
 32769  	MULSS X0, X1
 32770  	ADDSS (DX), X1
 32771  	MOVSS X1, (DX)
 32772  	LEAQ  (AX)(CX*4), AX
 32773  	LEAQ  (DX)(BX*4), DX
 32774  	MOVSS (AX), X1
 32775  	MULSS X0, X1
 32776  	ADDSS (DX), X1
 32777  	MOVSS X1, (DX)
 32778  	LEAQ  (AX)(CX*4), AX
 32779  	LEAQ  (DX)(BX*4), DX
 32780  	MOVSS (AX), X1
 32781  	MULSS X0, X1
 32782  	ADDSS (DX), X1
 32783  	MOVSS X1, (DX)
 32784  	LEAQ  (AX)(CX*4), AX
 32785  	LEAQ  (DX)(BX*4), DX
 32786  	MOVSS (AX), X1
 32787  	MULSS X0, X1
 32788  	ADDSS (DX), X1
 32789  	MOVSS X1, (DX)
 32790  	LEAQ  (AX)(CX*4), AX
 32791  	LEAQ  (DX)(BX*4), DX
 32792  	MOVSS (AX), X1
 32793  	MULSS X0, X1
 32794  	ADDSS (DX), X1
 32795  	MOVSS X1, (DX)
 32796  	LEAQ  (AX)(CX*4), AX
 32797  	LEAQ  (DX)(BX*4), DX
 32798  	SUBQ  $0x08, SI
 32799  
 32800  check_limit_unroll:
 32801  	CMPQ SI, $0x08
 32802  	JHS  loop_unroll
 32803  	JMP  check_limit
 32804  
 32805  loop:
 32806  	MOVSS (AX), X1
 32807  	MULSS X0, X1
 32808  	ADDSS (DX), X1
 32809  	MOVSS X1, (DX)
 32810  	DECQ  SI
 32811  	LEAQ  (AX)(CX*4), AX
 32812  	LEAQ  (DX)(BX*4), DX
 32813  
 32814  check_limit:
 32815  	CMPQ SI, $0x00
 32816  	JHI  loop
 32817  	RET
 32818  
 32819  // func AmdAxpyPointerLoopX_V0A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32820  // Requires: SSE
 32821  TEXT ·AmdAxpyPointerLoopX_V0A15U8(SB), NOSPLIT, $0-48
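	// Same 8x-unrolled axpy kernel; this A15 variant pads the loop entry with seven NOPs after PCALIGN $8.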
 32822  	MOVSS alpha+0(FP), X0
 32823  	MOVQ  xs+8(FP), AX
 32824  	MOVQ  incx+16(FP), CX
 32825  	MOVQ  ys+24(FP), DX
 32826  	MOVQ  incy+32(FP), BX
 32827  	MOVQ  n+40(FP), SI
 32828  	JMP   check_limit_unroll
 32829  	PCALIGN $0x08
 32830  	NOP
 32831  	NOP
 32832  	NOP
 32833  	NOP
 32834  	NOP
 32835  	NOP
 32836  	NOP
 32837  
 32838  loop_unroll:
 32839  	MOVSS (AX), X1
 32840  	MULSS X0, X1
 32841  	ADDSS (DX), X1
 32842  	MOVSS X1, (DX)
 32843  	LEAQ  (AX)(CX*4), AX
 32844  	LEAQ  (DX)(BX*4), DX
 32845  	MOVSS (AX), X1
 32846  	MULSS X0, X1
 32847  	ADDSS (DX), X1
 32848  	MOVSS X1, (DX)
 32849  	LEAQ  (AX)(CX*4), AX
 32850  	LEAQ  (DX)(BX*4), DX
 32851  	MOVSS (AX), X1
 32852  	MULSS X0, X1
 32853  	ADDSS (DX), X1
 32854  	MOVSS X1, (DX)
 32855  	LEAQ  (AX)(CX*4), AX
 32856  	LEAQ  (DX)(BX*4), DX
 32857  	MOVSS (AX), X1
 32858  	MULSS X0, X1
 32859  	ADDSS (DX), X1
 32860  	MOVSS X1, (DX)
 32861  	LEAQ  (AX)(CX*4), AX
 32862  	LEAQ  (DX)(BX*4), DX
 32863  	MOVSS (AX), X1
 32864  	MULSS X0, X1
 32865  	ADDSS (DX), X1
 32866  	MOVSS X1, (DX)
 32867  	LEAQ  (AX)(CX*4), AX
 32868  	LEAQ  (DX)(BX*4), DX
 32869  	MOVSS (AX), X1
 32870  	MULSS X0, X1
 32871  	ADDSS (DX), X1
 32872  	MOVSS X1, (DX)
 32873  	LEAQ  (AX)(CX*4), AX
 32874  	LEAQ  (DX)(BX*4), DX
 32875  	MOVSS (AX), X1
 32876  	MULSS X0, X1
 32877  	ADDSS (DX), X1
 32878  	MOVSS X1, (DX)
 32879  	LEAQ  (AX)(CX*4), AX
 32880  	LEAQ  (DX)(BX*4), DX
 32881  	MOVSS (AX), X1
 32882  	MULSS X0, X1
 32883  	ADDSS (DX), X1
 32884  	MOVSS X1, (DX)
 32885  	LEAQ  (AX)(CX*4), AX
 32886  	LEAQ  (DX)(BX*4), DX
 32887  	SUBQ  $0x08, SI
 32888  
 32889  check_limit_unroll:
 32890  	CMPQ SI, $0x08
 32891  	JHS  loop_unroll
 32892  	JMP  check_limit
 32893  
 32894  loop:
 32895  	MOVSS (AX), X1
 32896  	MULSS X0, X1
 32897  	ADDSS (DX), X1
 32898  	MOVSS X1, (DX)
 32899  	DECQ  SI
 32900  	LEAQ  (AX)(CX*4), AX
 32901  	LEAQ  (DX)(BX*4), DX
 32902  
 32903  check_limit:
 32904  	CMPQ SI, $0x00
 32905  	JHI  loop
 32906  	RET
 32907  
 32908  // func AmdAxpyPointerLoopX_V1A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32909  // Requires: SSE
 32910  TEXT ·AmdAxpyPointerLoopX_V1A15U8(SB), NOSPLIT, $0-48
 32911  	MOVSS alpha+0(FP), X0
 32912  	MOVQ  xs+8(FP), AX
 32913  	MOVQ  incx+16(FP), CX
 32914  	MOVQ  ys+24(FP), DX
 32915  	MOVQ  incy+32(FP), BX
 32916  	MOVQ  n+40(FP), SI
 32917  	JMP   check_limit_unroll
 32918  	PCALIGN $0x08
 32919  	NOP
 32920  	NOP
 32921  	NOP
 32922  	NOP
 32923  	NOP
 32924  	NOP
 32925  	NOP
 32926  
 32927  loop_unroll:
 32928  	MOVSS (AX), X1
 32929  	MULSS X0, X1
 32930  	ADDSS (DX), X1
 32931  	MOVSS X1, (DX)
 32932  	LEAQ  (AX)(CX*4), AX
 32933  	LEAQ  (DX)(BX*4), DX
 32934  	MOVSS (AX), X1
 32935  	MULSS X0, X1
 32936  	ADDSS (DX), X1
 32937  	MOVSS X1, (DX)
 32938  	LEAQ  (AX)(CX*4), AX
 32939  	LEAQ  (DX)(BX*4), DX
 32940  	MOVSS (AX), X1
 32941  	MULSS X0, X1
 32942  	ADDSS (DX), X1
 32943  	MOVSS X1, (DX)
 32944  	LEAQ  (AX)(CX*4), AX
 32945  	LEAQ  (DX)(BX*4), DX
 32946  	MOVSS (AX), X1
 32947  	MULSS X0, X1
 32948  	ADDSS (DX), X1
 32949  	MOVSS X1, (DX)
 32950  	LEAQ  (AX)(CX*4), AX
 32951  	LEAQ  (DX)(BX*4), DX
 32952  	MOVSS (AX), X1
 32953  	MULSS X0, X1
 32954  	ADDSS (DX), X1
 32955  	MOVSS X1, (DX)
 32956  	LEAQ  (AX)(CX*4), AX
 32957  	LEAQ  (DX)(BX*4), DX
 32958  	MOVSS (AX), X1
 32959  	MULSS X0, X1
 32960  	ADDSS (DX), X1
 32961  	MOVSS X1, (DX)
 32962  	LEAQ  (AX)(CX*4), AX
 32963  	LEAQ  (DX)(BX*4), DX
 32964  	MOVSS (AX), X1
 32965  	MULSS X0, X1
 32966  	ADDSS (DX), X1
 32967  	MOVSS X1, (DX)
 32968  	LEAQ  (AX)(CX*4), AX
 32969  	LEAQ  (DX)(BX*4), DX
 32970  	MOVSS (AX), X1
 32971  	MULSS X0, X1
 32972  	ADDSS (DX), X1
 32973  	MOVSS X1, (DX)
 32974  	LEAQ  (AX)(CX*4), AX
 32975  	LEAQ  (DX)(BX*4), DX
 32976  	SUBQ  $0x08, SI
 32977  
 32978  check_limit_unroll:
 32979  	CMPQ SI, $0x08
 32980  	JHS  loop_unroll
 32981  	JMP  check_limit
 32982  
 32983  loop:
 32984  	MOVSS (AX), X1
 32985  	MULSS X0, X1
 32986  	ADDSS (DX), X1
 32987  	MOVSS X1, (DX)
 32988  	DECQ  SI
 32989  	LEAQ  (AX)(CX*4), AX
 32990  	LEAQ  (DX)(BX*4), DX
 32991  
 32992  check_limit:
 32993  	CMPQ SI, $0x00
 32994  	JHI  loop
 32995  	RET
 32996  
 32997  // func AmdAxpyPointerLoopX_V2A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 32998  // Requires: SSE
 32999  TEXT ·AmdAxpyPointerLoopX_V2A15U8(SB), NOSPLIT, $0-48
 33000  	MOVSS alpha+0(FP), X0
 33001  	MOVQ  xs+8(FP), AX
 33002  	MOVQ  incx+16(FP), CX
 33003  	MOVQ  ys+24(FP), DX
 33004  	MOVQ  incy+32(FP), BX
 33005  	MOVQ  n+40(FP), SI
 33006  	JMP   check_limit_unroll
 33007  	PCALIGN $0x08
 33008  	NOP
 33009  	NOP
 33010  	NOP
 33011  	NOP
 33012  	NOP
 33013  	NOP
 33014  	NOP
 33015  
 33016  loop_unroll:
 33017  	MOVSS (AX), X1
 33018  	MULSS X0, X1
 33019  	ADDSS (DX), X1
 33020  	MOVSS X1, (DX)
 33021  	LEAQ  (AX)(CX*4), AX
 33022  	LEAQ  (DX)(BX*4), DX
 33023  	MOVSS (AX), X1
 33024  	MULSS X0, X1
 33025  	ADDSS (DX), X1
 33026  	MOVSS X1, (DX)
 33027  	LEAQ  (AX)(CX*4), AX
 33028  	LEAQ  (DX)(BX*4), DX
 33029  	MOVSS (AX), X1
 33030  	MULSS X0, X1
 33031  	ADDSS (DX), X1
 33032  	MOVSS X1, (DX)
 33033  	LEAQ  (AX)(CX*4), AX
 33034  	LEAQ  (DX)(BX*4), DX
 33035  	MOVSS (AX), X1
 33036  	MULSS X0, X1
 33037  	ADDSS (DX), X1
 33038  	MOVSS X1, (DX)
 33039  	LEAQ  (AX)(CX*4), AX
 33040  	LEAQ  (DX)(BX*4), DX
 33041  	MOVSS (AX), X1
 33042  	MULSS X0, X1
 33043  	ADDSS (DX), X1
 33044  	MOVSS X1, (DX)
 33045  	LEAQ  (AX)(CX*4), AX
 33046  	LEAQ  (DX)(BX*4), DX
 33047  	MOVSS (AX), X1
 33048  	MULSS X0, X1
 33049  	ADDSS (DX), X1
 33050  	MOVSS X1, (DX)
 33051  	LEAQ  (AX)(CX*4), AX
 33052  	LEAQ  (DX)(BX*4), DX
 33053  	MOVSS (AX), X1
 33054  	MULSS X0, X1
 33055  	ADDSS (DX), X1
 33056  	MOVSS X1, (DX)
 33057  	LEAQ  (AX)(CX*4), AX
 33058  	LEAQ  (DX)(BX*4), DX
 33059  	MOVSS (AX), X1
 33060  	MULSS X0, X1
 33061  	ADDSS (DX), X1
 33062  	MOVSS X1, (DX)
 33063  	LEAQ  (AX)(CX*4), AX
 33064  	LEAQ  (DX)(BX*4), DX
 33065  	SUBQ  $0x08, SI
 33066  
 33067  check_limit_unroll:
 33068  	CMPQ SI, $0x08
 33069  	JHS  loop_unroll
 33070  	JMP  check_limit
 33071  
 33072  loop:
 33073  	MOVSS (AX), X1
 33074  	MULSS X0, X1
 33075  	ADDSS (DX), X1
 33076  	MOVSS X1, (DX)
 33077  	DECQ  SI
 33078  	LEAQ  (AX)(CX*4), AX
 33079  	LEAQ  (DX)(BX*4), DX
 33080  
 33081  check_limit:
 33082  	CMPQ SI, $0x00
 33083  	JHI  loop
 33084  	RET
 33085  
 33086  // func AmdAxpyPointerLoopX_V3A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33087  // Requires: SSE
 33088  TEXT ·AmdAxpyPointerLoopX_V3A15U8(SB), NOSPLIT, $0-48
 33089  	MOVSS alpha+0(FP), X0
 33090  	MOVQ  xs+8(FP), AX
 33091  	MOVQ  incx+16(FP), CX
 33092  	MOVQ  ys+24(FP), DX
 33093  	MOVQ  incy+32(FP), BX
 33094  	MOVQ  n+40(FP), SI
 33095  	JMP   check_limit_unroll
 33096  	PCALIGN $0x08
 33097  	NOP
 33098  	NOP
 33099  	NOP
 33100  	NOP
 33101  	NOP
 33102  	NOP
 33103  	NOP
 33104  
 33105  loop_unroll:
 33106  	MOVSS (AX), X1
 33107  	MULSS X0, X1
 33108  	ADDSS (DX), X1
 33109  	MOVSS X1, (DX)
 33110  	LEAQ  (AX)(CX*4), AX
 33111  	LEAQ  (DX)(BX*4), DX
 33112  	MOVSS (AX), X1
 33113  	MULSS X0, X1
 33114  	ADDSS (DX), X1
 33115  	MOVSS X1, (DX)
 33116  	LEAQ  (AX)(CX*4), AX
 33117  	LEAQ  (DX)(BX*4), DX
 33118  	MOVSS (AX), X1
 33119  	MULSS X0, X1
 33120  	ADDSS (DX), X1
 33121  	MOVSS X1, (DX)
 33122  	LEAQ  (AX)(CX*4), AX
 33123  	LEAQ  (DX)(BX*4), DX
 33124  	MOVSS (AX), X1
 33125  	MULSS X0, X1
 33126  	ADDSS (DX), X1
 33127  	MOVSS X1, (DX)
 33128  	LEAQ  (AX)(CX*4), AX
 33129  	LEAQ  (DX)(BX*4), DX
 33130  	MOVSS (AX), X1
 33131  	MULSS X0, X1
 33132  	ADDSS (DX), X1
 33133  	MOVSS X1, (DX)
 33134  	LEAQ  (AX)(CX*4), AX
 33135  	LEAQ  (DX)(BX*4), DX
 33136  	MOVSS (AX), X1
 33137  	MULSS X0, X1
 33138  	ADDSS (DX), X1
 33139  	MOVSS X1, (DX)
 33140  	LEAQ  (AX)(CX*4), AX
 33141  	LEAQ  (DX)(BX*4), DX
 33142  	MOVSS (AX), X1
 33143  	MULSS X0, X1
 33144  	ADDSS (DX), X1
 33145  	MOVSS X1, (DX)
 33146  	LEAQ  (AX)(CX*4), AX
 33147  	LEAQ  (DX)(BX*4), DX
 33148  	MOVSS (AX), X1
 33149  	MULSS X0, X1
 33150  	ADDSS (DX), X1
 33151  	MOVSS X1, (DX)
 33152  	LEAQ  (AX)(CX*4), AX
 33153  	LEAQ  (DX)(BX*4), DX
 33154  	SUBQ  $0x08, SI
 33155  
 33156  check_limit_unroll:
 33157  	CMPQ SI, $0x08
 33158  	JHS  loop_unroll
 33159  	JMP  check_limit
 33160  
 33161  loop:
 33162  	MOVSS (AX), X1
 33163  	MULSS X0, X1
 33164  	ADDSS (DX), X1
 33165  	MOVSS X1, (DX)
 33166  	DECQ  SI
 33167  	LEAQ  (AX)(CX*4), AX
 33168  	LEAQ  (DX)(BX*4), DX
 33169  
 33170  check_limit:
 33171  	CMPQ SI, $0x00
 33172  	JHI  loop
 33173  	RET
 33174  
 33175  // func AmdAxpyPointerLoopX_V4A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33176  // Requires: SSE
 33177  TEXT ·AmdAxpyPointerLoopX_V4A15U8(SB), NOSPLIT, $0-48
 33178  	MOVSS alpha+0(FP), X0
 33179  	MOVQ  xs+8(FP), AX
 33180  	MOVQ  incx+16(FP), CX
 33181  	MOVQ  ys+24(FP), DX
 33182  	MOVQ  incy+32(FP), BX
 33183  	MOVQ  n+40(FP), SI
 33184  	JMP   check_limit_unroll
 33185  	PCALIGN $0x08
 33186  	NOP
 33187  	NOP
 33188  	NOP
 33189  	NOP
 33190  	NOP
 33191  	NOP
 33192  	NOP
 33193  
 33194  loop_unroll:
 33195  	MOVSS (AX), X1
 33196  	MULSS X0, X1
 33197  	ADDSS (DX), X1
 33198  	MOVSS X1, (DX)
 33199  	LEAQ  (AX)(CX*4), AX
 33200  	LEAQ  (DX)(BX*4), DX
 33201  	MOVSS (AX), X1
 33202  	MULSS X0, X1
 33203  	ADDSS (DX), X1
 33204  	MOVSS X1, (DX)
 33205  	LEAQ  (AX)(CX*4), AX
 33206  	LEAQ  (DX)(BX*4), DX
 33207  	MOVSS (AX), X1
 33208  	MULSS X0, X1
 33209  	ADDSS (DX), X1
 33210  	MOVSS X1, (DX)
 33211  	LEAQ  (AX)(CX*4), AX
 33212  	LEAQ  (DX)(BX*4), DX
 33213  	MOVSS (AX), X1
 33214  	MULSS X0, X1
 33215  	ADDSS (DX), X1
 33216  	MOVSS X1, (DX)
 33217  	LEAQ  (AX)(CX*4), AX
 33218  	LEAQ  (DX)(BX*4), DX
 33219  	MOVSS (AX), X1
 33220  	MULSS X0, X1
 33221  	ADDSS (DX), X1
 33222  	MOVSS X1, (DX)
 33223  	LEAQ  (AX)(CX*4), AX
 33224  	LEAQ  (DX)(BX*4), DX
 33225  	MOVSS (AX), X1
 33226  	MULSS X0, X1
 33227  	ADDSS (DX), X1
 33228  	MOVSS X1, (DX)
 33229  	LEAQ  (AX)(CX*4), AX
 33230  	LEAQ  (DX)(BX*4), DX
 33231  	MOVSS (AX), X1
 33232  	MULSS X0, X1
 33233  	ADDSS (DX), X1
 33234  	MOVSS X1, (DX)
 33235  	LEAQ  (AX)(CX*4), AX
 33236  	LEAQ  (DX)(BX*4), DX
 33237  	MOVSS (AX), X1
 33238  	MULSS X0, X1
 33239  	ADDSS (DX), X1
 33240  	MOVSS X1, (DX)
 33241  	LEAQ  (AX)(CX*4), AX
 33242  	LEAQ  (DX)(BX*4), DX
 33243  	SUBQ  $0x08, SI
 33244  
 33245  check_limit_unroll:
 33246  	CMPQ SI, $0x08
 33247  	JHS  loop_unroll
 33248  	JMP  check_limit
 33249  
 33250  loop:
 33251  	MOVSS (AX), X1
 33252  	MULSS X0, X1
 33253  	ADDSS (DX), X1
 33254  	MOVSS X1, (DX)
 33255  	DECQ  SI
 33256  	LEAQ  (AX)(CX*4), AX
 33257  	LEAQ  (DX)(BX*4), DX
 33258  
 33259  check_limit:
 33260  	CMPQ SI, $0x00
 33261  	JHI  loop
 33262  	RET
 33263  
 33264  // func AmdAxpyPointerLoopX_V5A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33265  // Requires: SSE
 33266  TEXT ·AmdAxpyPointerLoopX_V5A15U8(SB), NOSPLIT, $0-48
 33267  	MOVSS alpha+0(FP), X0
 33268  	MOVQ  xs+8(FP), AX
 33269  	MOVQ  incx+16(FP), CX
 33270  	MOVQ  ys+24(FP), DX
 33271  	MOVQ  incy+32(FP), BX
 33272  	MOVQ  n+40(FP), SI
 33273  	JMP   check_limit_unroll
 33274  	PCALIGN $0x08
 33275  	NOP
 33276  	NOP
 33277  	NOP
 33278  	NOP
 33279  	NOP
 33280  	NOP
 33281  	NOP
 33282  
 33283  loop_unroll:
 33284  	MOVSS (AX), X1
 33285  	MULSS X0, X1
 33286  	ADDSS (DX), X1
 33287  	MOVSS X1, (DX)
 33288  	LEAQ  (AX)(CX*4), AX
 33289  	LEAQ  (DX)(BX*4), DX
 33290  	MOVSS (AX), X1
 33291  	MULSS X0, X1
 33292  	ADDSS (DX), X1
 33293  	MOVSS X1, (DX)
 33294  	LEAQ  (AX)(CX*4), AX
 33295  	LEAQ  (DX)(BX*4), DX
 33296  	MOVSS (AX), X1
 33297  	MULSS X0, X1
 33298  	ADDSS (DX), X1
 33299  	MOVSS X1, (DX)
 33300  	LEAQ  (AX)(CX*4), AX
 33301  	LEAQ  (DX)(BX*4), DX
 33302  	MOVSS (AX), X1
 33303  	MULSS X0, X1
 33304  	ADDSS (DX), X1
 33305  	MOVSS X1, (DX)
 33306  	LEAQ  (AX)(CX*4), AX
 33307  	LEAQ  (DX)(BX*4), DX
 33308  	MOVSS (AX), X1
 33309  	MULSS X0, X1
 33310  	ADDSS (DX), X1
 33311  	MOVSS X1, (DX)
 33312  	LEAQ  (AX)(CX*4), AX
 33313  	LEAQ  (DX)(BX*4), DX
 33314  	MOVSS (AX), X1
 33315  	MULSS X0, X1
 33316  	ADDSS (DX), X1
 33317  	MOVSS X1, (DX)
 33318  	LEAQ  (AX)(CX*4), AX
 33319  	LEAQ  (DX)(BX*4), DX
 33320  	MOVSS (AX), X1
 33321  	MULSS X0, X1
 33322  	ADDSS (DX), X1
 33323  	MOVSS X1, (DX)
 33324  	LEAQ  (AX)(CX*4), AX
 33325  	LEAQ  (DX)(BX*4), DX
 33326  	MOVSS (AX), X1
 33327  	MULSS X0, X1
 33328  	ADDSS (DX), X1
 33329  	MOVSS X1, (DX)
 33330  	LEAQ  (AX)(CX*4), AX
 33331  	LEAQ  (DX)(BX*4), DX
 33332  	SUBQ  $0x08, SI
 33333  
 33334  check_limit_unroll:
 33335  	CMPQ SI, $0x08
 33336  	JHS  loop_unroll
 33337  	JMP  check_limit
 33338  
 33339  loop:
 33340  	MOVSS (AX), X1
 33341  	MULSS X0, X1
 33342  	ADDSS (DX), X1
 33343  	MOVSS X1, (DX)
 33344  	DECQ  SI
 33345  	LEAQ  (AX)(CX*4), AX
 33346  	LEAQ  (DX)(BX*4), DX
 33347  
 33348  check_limit:
 33349  	CMPQ SI, $0x00
 33350  	JHI  loop
 33351  	RET
 33352  
 33353  // func AmdAxpyPointerLoopX_V0A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33354  // Requires: SSE
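// Same unroll-by-8 body as the A15U8 variants above; the only difference is
// the padding before loop_unroll (PCALIGN $0x10 with no trailing NOPs),
// presumably to isolate the effect of loop-entry alignment alone.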
 33355  TEXT ·AmdAxpyPointerLoopX_V0A16U8(SB), NOSPLIT, $0-48
 33356  	MOVSS alpha+0(FP), X0
 33357  	MOVQ  xs+8(FP), AX
 33358  	MOVQ  incx+16(FP), CX
 33359  	MOVQ  ys+24(FP), DX
 33360  	MOVQ  incy+32(FP), BX
 33361  	MOVQ  n+40(FP), SI
 33362  	JMP   check_limit_unroll
 33363  	PCALIGN $0x10
 33364  
 33365  loop_unroll:
 33366  	MOVSS (AX), X1
 33367  	MULSS X0, X1
 33368  	ADDSS (DX), X1
 33369  	MOVSS X1, (DX)
 33370  	LEAQ  (AX)(CX*4), AX
 33371  	LEAQ  (DX)(BX*4), DX
 33372  	MOVSS (AX), X1
 33373  	MULSS X0, X1
 33374  	ADDSS (DX), X1
 33375  	MOVSS X1, (DX)
 33376  	LEAQ  (AX)(CX*4), AX
 33377  	LEAQ  (DX)(BX*4), DX
 33378  	MOVSS (AX), X1
 33379  	MULSS X0, X1
 33380  	ADDSS (DX), X1
 33381  	MOVSS X1, (DX)
 33382  	LEAQ  (AX)(CX*4), AX
 33383  	LEAQ  (DX)(BX*4), DX
 33384  	MOVSS (AX), X1
 33385  	MULSS X0, X1
 33386  	ADDSS (DX), X1
 33387  	MOVSS X1, (DX)
 33388  	LEAQ  (AX)(CX*4), AX
 33389  	LEAQ  (DX)(BX*4), DX
 33390  	MOVSS (AX), X1
 33391  	MULSS X0, X1
 33392  	ADDSS (DX), X1
 33393  	MOVSS X1, (DX)
 33394  	LEAQ  (AX)(CX*4), AX
 33395  	LEAQ  (DX)(BX*4), DX
 33396  	MOVSS (AX), X1
 33397  	MULSS X0, X1
 33398  	ADDSS (DX), X1
 33399  	MOVSS X1, (DX)
 33400  	LEAQ  (AX)(CX*4), AX
 33401  	LEAQ  (DX)(BX*4), DX
 33402  	MOVSS (AX), X1
 33403  	MULSS X0, X1
 33404  	ADDSS (DX), X1
 33405  	MOVSS X1, (DX)
 33406  	LEAQ  (AX)(CX*4), AX
 33407  	LEAQ  (DX)(BX*4), DX
 33408  	MOVSS (AX), X1
 33409  	MULSS X0, X1
 33410  	ADDSS (DX), X1
 33411  	MOVSS X1, (DX)
 33412  	LEAQ  (AX)(CX*4), AX
 33413  	LEAQ  (DX)(BX*4), DX
 33414  	SUBQ  $0x08, SI
 33415  
 33416  check_limit_unroll:
 33417  	CMPQ SI, $0x08
 33418  	JHS  loop_unroll
 33419  	JMP  check_limit
 33420  
 33421  loop:
 33422  	MOVSS (AX), X1
 33423  	MULSS X0, X1
 33424  	ADDSS (DX), X1
 33425  	MOVSS X1, (DX)
 33426  	DECQ  SI
 33427  	LEAQ  (AX)(CX*4), AX
 33428  	LEAQ  (DX)(BX*4), DX
 33429  
 33430  check_limit:
 33431  	CMPQ SI, $0x00
 33432  	JHI  loop
 33433  	RET
 33434  
 33435  // func AmdAxpyPointerLoopX_V1A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33436  // Requires: SSE
 33437  TEXT ·AmdAxpyPointerLoopX_V1A16U8(SB), NOSPLIT, $0-48
 33438  	MOVSS alpha+0(FP), X0
 33439  	MOVQ  xs+8(FP), AX
 33440  	MOVQ  incx+16(FP), CX
 33441  	MOVQ  ys+24(FP), DX
 33442  	MOVQ  incy+32(FP), BX
 33443  	MOVQ  n+40(FP), SI
 33444  	JMP   check_limit_unroll
 33445  	PCALIGN $0x10
 33446  
 33447  loop_unroll:
 33448  	MOVSS (AX), X1
 33449  	MULSS X0, X1
 33450  	ADDSS (DX), X1
 33451  	MOVSS X1, (DX)
 33452  	LEAQ  (AX)(CX*4), AX
 33453  	LEAQ  (DX)(BX*4), DX
 33454  	MOVSS (AX), X1
 33455  	MULSS X0, X1
 33456  	ADDSS (DX), X1
 33457  	MOVSS X1, (DX)
 33458  	LEAQ  (AX)(CX*4), AX
 33459  	LEAQ  (DX)(BX*4), DX
 33460  	MOVSS (AX), X1
 33461  	MULSS X0, X1
 33462  	ADDSS (DX), X1
 33463  	MOVSS X1, (DX)
 33464  	LEAQ  (AX)(CX*4), AX
 33465  	LEAQ  (DX)(BX*4), DX
 33466  	MOVSS (AX), X1
 33467  	MULSS X0, X1
 33468  	ADDSS (DX), X1
 33469  	MOVSS X1, (DX)
 33470  	LEAQ  (AX)(CX*4), AX
 33471  	LEAQ  (DX)(BX*4), DX
 33472  	MOVSS (AX), X1
 33473  	MULSS X0, X1
 33474  	ADDSS (DX), X1
 33475  	MOVSS X1, (DX)
 33476  	LEAQ  (AX)(CX*4), AX
 33477  	LEAQ  (DX)(BX*4), DX
 33478  	MOVSS (AX), X1
 33479  	MULSS X0, X1
 33480  	ADDSS (DX), X1
 33481  	MOVSS X1, (DX)
 33482  	LEAQ  (AX)(CX*4), AX
 33483  	LEAQ  (DX)(BX*4), DX
 33484  	MOVSS (AX), X1
 33485  	MULSS X0, X1
 33486  	ADDSS (DX), X1
 33487  	MOVSS X1, (DX)
 33488  	LEAQ  (AX)(CX*4), AX
 33489  	LEAQ  (DX)(BX*4), DX
 33490  	MOVSS (AX), X1
 33491  	MULSS X0, X1
 33492  	ADDSS (DX), X1
 33493  	MOVSS X1, (DX)
 33494  	LEAQ  (AX)(CX*4), AX
 33495  	LEAQ  (DX)(BX*4), DX
 33496  	SUBQ  $0x08, SI
 33497  
 33498  check_limit_unroll:
 33499  	CMPQ SI, $0x08
 33500  	JHS  loop_unroll
 33501  	JMP  check_limit
 33502  
 33503  loop:
 33504  	MOVSS (AX), X1
 33505  	MULSS X0, X1
 33506  	ADDSS (DX), X1
 33507  	MOVSS X1, (DX)
 33508  	DECQ  SI
 33509  	LEAQ  (AX)(CX*4), AX
 33510  	LEAQ  (DX)(BX*4), DX
 33511  
 33512  check_limit:
 33513  	CMPQ SI, $0x00
 33514  	JHI  loop
 33515  	RET
 33516  
 33517  // func AmdAxpyPointerLoopX_V2A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33518  // Requires: SSE
 33519  TEXT ·AmdAxpyPointerLoopX_V2A16U8(SB), NOSPLIT, $0-48
 33520  	MOVSS alpha+0(FP), X0
 33521  	MOVQ  xs+8(FP), AX
 33522  	MOVQ  incx+16(FP), CX
 33523  	MOVQ  ys+24(FP), DX
 33524  	MOVQ  incy+32(FP), BX
 33525  	MOVQ  n+40(FP), SI
 33526  	JMP   check_limit_unroll
 33527  	PCALIGN $0x10
 33528  
 33529  loop_unroll:
 33530  	MOVSS (AX), X1
 33531  	MULSS X0, X1
 33532  	ADDSS (DX), X1
 33533  	MOVSS X1, (DX)
 33534  	LEAQ  (AX)(CX*4), AX
 33535  	LEAQ  (DX)(BX*4), DX
 33536  	MOVSS (AX), X1
 33537  	MULSS X0, X1
 33538  	ADDSS (DX), X1
 33539  	MOVSS X1, (DX)
 33540  	LEAQ  (AX)(CX*4), AX
 33541  	LEAQ  (DX)(BX*4), DX
 33542  	MOVSS (AX), X1
 33543  	MULSS X0, X1
 33544  	ADDSS (DX), X1
 33545  	MOVSS X1, (DX)
 33546  	LEAQ  (AX)(CX*4), AX
 33547  	LEAQ  (DX)(BX*4), DX
 33548  	MOVSS (AX), X1
 33549  	MULSS X0, X1
 33550  	ADDSS (DX), X1
 33551  	MOVSS X1, (DX)
 33552  	LEAQ  (AX)(CX*4), AX
 33553  	LEAQ  (DX)(BX*4), DX
 33554  	MOVSS (AX), X1
 33555  	MULSS X0, X1
 33556  	ADDSS (DX), X1
 33557  	MOVSS X1, (DX)
 33558  	LEAQ  (AX)(CX*4), AX
 33559  	LEAQ  (DX)(BX*4), DX
 33560  	MOVSS (AX), X1
 33561  	MULSS X0, X1
 33562  	ADDSS (DX), X1
 33563  	MOVSS X1, (DX)
 33564  	LEAQ  (AX)(CX*4), AX
 33565  	LEAQ  (DX)(BX*4), DX
 33566  	MOVSS (AX), X1
 33567  	MULSS X0, X1
 33568  	ADDSS (DX), X1
 33569  	MOVSS X1, (DX)
 33570  	LEAQ  (AX)(CX*4), AX
 33571  	LEAQ  (DX)(BX*4), DX
 33572  	MOVSS (AX), X1
 33573  	MULSS X0, X1
 33574  	ADDSS (DX), X1
 33575  	MOVSS X1, (DX)
 33576  	LEAQ  (AX)(CX*4), AX
 33577  	LEAQ  (DX)(BX*4), DX
 33578  	SUBQ  $0x08, SI
 33579  
 33580  check_limit_unroll:
 33581  	CMPQ SI, $0x08
 33582  	JHS  loop_unroll
 33583  	JMP  check_limit
 33584  
 33585  loop:
 33586  	MOVSS (AX), X1
 33587  	MULSS X0, X1
 33588  	ADDSS (DX), X1
 33589  	MOVSS X1, (DX)
 33590  	DECQ  SI
 33591  	LEAQ  (AX)(CX*4), AX
 33592  	LEAQ  (DX)(BX*4), DX
 33593  
 33594  check_limit:
 33595  	CMPQ SI, $0x00
 33596  	JHI  loop
 33597  	RET
 33598  
 33599  // func AmdAxpyPointerLoopX_V3A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33600  // Requires: SSE
 33601  TEXT ·AmdAxpyPointerLoopX_V3A16U8(SB), NOSPLIT, $0-48
 33602  	MOVSS alpha+0(FP), X0
 33603  	MOVQ  xs+8(FP), AX
 33604  	MOVQ  incx+16(FP), CX
 33605  	MOVQ  ys+24(FP), DX
 33606  	MOVQ  incy+32(FP), BX
 33607  	MOVQ  n+40(FP), SI
 33608  	JMP   check_limit_unroll
 33609  	PCALIGN $0x10
 33610  
 33611  loop_unroll:
 33612  	MOVSS (AX), X1
 33613  	MULSS X0, X1
 33614  	ADDSS (DX), X1
 33615  	MOVSS X1, (DX)
 33616  	LEAQ  (AX)(CX*4), AX
 33617  	LEAQ  (DX)(BX*4), DX
 33618  	MOVSS (AX), X1
 33619  	MULSS X0, X1
 33620  	ADDSS (DX), X1
 33621  	MOVSS X1, (DX)
 33622  	LEAQ  (AX)(CX*4), AX
 33623  	LEAQ  (DX)(BX*4), DX
 33624  	MOVSS (AX), X1
 33625  	MULSS X0, X1
 33626  	ADDSS (DX), X1
 33627  	MOVSS X1, (DX)
 33628  	LEAQ  (AX)(CX*4), AX
 33629  	LEAQ  (DX)(BX*4), DX
 33630  	MOVSS (AX), X1
 33631  	MULSS X0, X1
 33632  	ADDSS (DX), X1
 33633  	MOVSS X1, (DX)
 33634  	LEAQ  (AX)(CX*4), AX
 33635  	LEAQ  (DX)(BX*4), DX
 33636  	MOVSS (AX), X1
 33637  	MULSS X0, X1
 33638  	ADDSS (DX), X1
 33639  	MOVSS X1, (DX)
 33640  	LEAQ  (AX)(CX*4), AX
 33641  	LEAQ  (DX)(BX*4), DX
 33642  	MOVSS (AX), X1
 33643  	MULSS X0, X1
 33644  	ADDSS (DX), X1
 33645  	MOVSS X1, (DX)
 33646  	LEAQ  (AX)(CX*4), AX
 33647  	LEAQ  (DX)(BX*4), DX
 33648  	MOVSS (AX), X1
 33649  	MULSS X0, X1
 33650  	ADDSS (DX), X1
 33651  	MOVSS X1, (DX)
 33652  	LEAQ  (AX)(CX*4), AX
 33653  	LEAQ  (DX)(BX*4), DX
 33654  	MOVSS (AX), X1
 33655  	MULSS X0, X1
 33656  	ADDSS (DX), X1
 33657  	MOVSS X1, (DX)
 33658  	LEAQ  (AX)(CX*4), AX
 33659  	LEAQ  (DX)(BX*4), DX
 33660  	SUBQ  $0x08, SI
 33661  
 33662  check_limit_unroll:
 33663  	CMPQ SI, $0x08
 33664  	JHS  loop_unroll
 33665  	JMP  check_limit
 33666  
 33667  loop:
 33668  	MOVSS (AX), X1
 33669  	MULSS X0, X1
 33670  	ADDSS (DX), X1
 33671  	MOVSS X1, (DX)
 33672  	DECQ  SI
 33673  	LEAQ  (AX)(CX*4), AX
 33674  	LEAQ  (DX)(BX*4), DX
 33675  
 33676  check_limit:
 33677  	CMPQ SI, $0x00
 33678  	JHI  loop
 33679  	RET
 33680  
 33681  // func AmdAxpyPointerLoopX_V4A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33682  // Requires: SSE
 33683  TEXT ·AmdAxpyPointerLoopX_V4A16U8(SB), NOSPLIT, $0-48
 33684  	MOVSS alpha+0(FP), X0
 33685  	MOVQ  xs+8(FP), AX
 33686  	MOVQ  incx+16(FP), CX
 33687  	MOVQ  ys+24(FP), DX
 33688  	MOVQ  incy+32(FP), BX
 33689  	MOVQ  n+40(FP), SI
 33690  	JMP   check_limit_unroll
 33691  	PCALIGN $0x10
 33692  
 33693  loop_unroll:
 33694  	MOVSS (AX), X1
 33695  	MULSS X0, X1
 33696  	ADDSS (DX), X1
 33697  	MOVSS X1, (DX)
 33698  	LEAQ  (AX)(CX*4), AX
 33699  	LEAQ  (DX)(BX*4), DX
 33700  	MOVSS (AX), X1
 33701  	MULSS X0, X1
 33702  	ADDSS (DX), X1
 33703  	MOVSS X1, (DX)
 33704  	LEAQ  (AX)(CX*4), AX
 33705  	LEAQ  (DX)(BX*4), DX
 33706  	MOVSS (AX), X1
 33707  	MULSS X0, X1
 33708  	ADDSS (DX), X1
 33709  	MOVSS X1, (DX)
 33710  	LEAQ  (AX)(CX*4), AX
 33711  	LEAQ  (DX)(BX*4), DX
 33712  	MOVSS (AX), X1
 33713  	MULSS X0, X1
 33714  	ADDSS (DX), X1
 33715  	MOVSS X1, (DX)
 33716  	LEAQ  (AX)(CX*4), AX
 33717  	LEAQ  (DX)(BX*4), DX
 33718  	MOVSS (AX), X1
 33719  	MULSS X0, X1
 33720  	ADDSS (DX), X1
 33721  	MOVSS X1, (DX)
 33722  	LEAQ  (AX)(CX*4), AX
 33723  	LEAQ  (DX)(BX*4), DX
 33724  	MOVSS (AX), X1
 33725  	MULSS X0, X1
 33726  	ADDSS (DX), X1
 33727  	MOVSS X1, (DX)
 33728  	LEAQ  (AX)(CX*4), AX
 33729  	LEAQ  (DX)(BX*4), DX
 33730  	MOVSS (AX), X1
 33731  	MULSS X0, X1
 33732  	ADDSS (DX), X1
 33733  	MOVSS X1, (DX)
 33734  	LEAQ  (AX)(CX*4), AX
 33735  	LEAQ  (DX)(BX*4), DX
 33736  	MOVSS (AX), X1
 33737  	MULSS X0, X1
 33738  	ADDSS (DX), X1
 33739  	MOVSS X1, (DX)
 33740  	LEAQ  (AX)(CX*4), AX
 33741  	LEAQ  (DX)(BX*4), DX
 33742  	SUBQ  $0x08, SI
 33743  
 33744  check_limit_unroll:
 33745  	CMPQ SI, $0x08
 33746  	JHS  loop_unroll
 33747  	JMP  check_limit
 33748  
 33749  loop:
 33750  	MOVSS (AX), X1
 33751  	MULSS X0, X1
 33752  	ADDSS (DX), X1
 33753  	MOVSS X1, (DX)
 33754  	DECQ  SI
 33755  	LEAQ  (AX)(CX*4), AX
 33756  	LEAQ  (DX)(BX*4), DX
 33757  
 33758  check_limit:
 33759  	CMPQ SI, $0x00
 33760  	JHI  loop
 33761  	RET
 33762  
 33763  // func AmdAxpyPointerLoopX_V5A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33764  // Requires: SSE
 33765  TEXT ·AmdAxpyPointerLoopX_V5A16U8(SB), NOSPLIT, $0-48
 33766  	MOVSS alpha+0(FP), X0
 33767  	MOVQ  xs+8(FP), AX
 33768  	MOVQ  incx+16(FP), CX
 33769  	MOVQ  ys+24(FP), DX
 33770  	MOVQ  incy+32(FP), BX
 33771  	MOVQ  n+40(FP), SI
 33772  	JMP   check_limit_unroll
 33773  	PCALIGN $0x10
 33774  
 33775  loop_unroll:
 33776  	MOVSS (AX), X1
 33777  	MULSS X0, X1
 33778  	ADDSS (DX), X1
 33779  	MOVSS X1, (DX)
 33780  	LEAQ  (AX)(CX*4), AX
 33781  	LEAQ  (DX)(BX*4), DX
 33782  	MOVSS (AX), X1
 33783  	MULSS X0, X1
 33784  	ADDSS (DX), X1
 33785  	MOVSS X1, (DX)
 33786  	LEAQ  (AX)(CX*4), AX
 33787  	LEAQ  (DX)(BX*4), DX
 33788  	MOVSS (AX), X1
 33789  	MULSS X0, X1
 33790  	ADDSS (DX), X1
 33791  	MOVSS X1, (DX)
 33792  	LEAQ  (AX)(CX*4), AX
 33793  	LEAQ  (DX)(BX*4), DX
 33794  	MOVSS (AX), X1
 33795  	MULSS X0, X1
 33796  	ADDSS (DX), X1
 33797  	MOVSS X1, (DX)
 33798  	LEAQ  (AX)(CX*4), AX
 33799  	LEAQ  (DX)(BX*4), DX
 33800  	MOVSS (AX), X1
 33801  	MULSS X0, X1
 33802  	ADDSS (DX), X1
 33803  	MOVSS X1, (DX)
 33804  	LEAQ  (AX)(CX*4), AX
 33805  	LEAQ  (DX)(BX*4), DX
 33806  	MOVSS (AX), X1
 33807  	MULSS X0, X1
 33808  	ADDSS (DX), X1
 33809  	MOVSS X1, (DX)
 33810  	LEAQ  (AX)(CX*4), AX
 33811  	LEAQ  (DX)(BX*4), DX
 33812  	MOVSS (AX), X1
 33813  	MULSS X0, X1
 33814  	ADDSS (DX), X1
 33815  	MOVSS X1, (DX)
 33816  	LEAQ  (AX)(CX*4), AX
 33817  	LEAQ  (DX)(BX*4), DX
 33818  	MOVSS (AX), X1
 33819  	MULSS X0, X1
 33820  	ADDSS (DX), X1
 33821  	MOVSS X1, (DX)
 33822  	LEAQ  (AX)(CX*4), AX
 33823  	LEAQ  (DX)(BX*4), DX
 33824  	SUBQ  $0x08, SI
 33825  
 33826  check_limit_unroll:
 33827  	CMPQ SI, $0x08
 33828  	JHS  loop_unroll
 33829  	JMP  check_limit
 33830  
 33831  loop:
 33832  	MOVSS (AX), X1
 33833  	MULSS X0, X1
 33834  	ADDSS (DX), X1
 33835  	MOVSS X1, (DX)
 33836  	DECQ  SI
 33837  	LEAQ  (AX)(CX*4), AX
 33838  	LEAQ  (DX)(BX*4), DX
 33839  
 33840  check_limit:
 33841  	CMPQ SI, $0x00
 33842  	JHI  loop
 33843  	RET
 33844  
 33845  // func AmdAxpyPointerLoopXInterleave_V0A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33846  // Requires: SSE
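// Interleaved variant of the unroll-by-4 AXPY: four x elements are loaded
// into X1-X4 first, all four are scaled by alpha, and only then are the four
// y elements read, updated, and stored, which separates the strided loads
// from the dependent read-modify-write of y. Note that the SHLQ results
// placed in DX and SI in the prologue are overwritten by the subsequent ys
// and n loads, so they have no effect on the computation (they look like
// generator leftovers). A rough Go equivalent of the whole routine, as a
// sketch only (variable names illustrative, assumes the unsafe package):
//
//	for ; n > 0; n-- {
//		*y += alpha * *x
//		x = (*float32)(unsafe.Add(unsafe.Pointer(x), incx*4))
//		y = (*float32)(unsafe.Add(unsafe.Pointer(y), incy*4))
//	}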
 33847  TEXT ·AmdAxpyPointerLoopXInterleave_V0A0U4(SB), NOSPLIT, $0-48
 33848  	MOVSS alpha+0(FP), X0
 33849  	MOVQ  xs+8(FP), AX
 33850  	MOVQ  incx+16(FP), CX
 33851  	MOVQ  CX, DX
 33852  	SHLQ  $0x04, DX
 33853  	MOVQ  ys+24(FP), DX
 33854  	MOVQ  incy+32(FP), BX
 33855  	MOVQ  BX, SI
 33856  	SHLQ  $0x04, SI
 33857  	MOVQ  n+40(FP), SI
 33858  	JMP   check_limit_unroll
 33859  
 33860  loop_unroll:
 33861  	MOVSS (AX), X1
 33862  	LEAQ  (AX)(CX*4), AX
 33863  	MOVSS (AX), X2
 33864  	LEAQ  (AX)(CX*4), AX
 33865  	MOVSS (AX), X3
 33866  	LEAQ  (AX)(CX*4), AX
 33867  	MOVSS (AX), X4
 33868  	LEAQ  (AX)(CX*4), AX
 33869  	MULSS X0, X1
 33870  	MULSS X0, X2
 33871  	MULSS X0, X3
 33872  	MULSS X0, X4
 33873  	ADDSS (DX), X1
 33874  	MOVSS X1, (DX)
 33875  	LEAQ  (DX)(BX*4), DX
 33876  	ADDSS (DX), X2
 33877  	MOVSS X2, (DX)
 33878  	LEAQ  (DX)(BX*4), DX
 33879  	ADDSS (DX), X3
 33880  	MOVSS X3, (DX)
 33881  	LEAQ  (DX)(BX*4), DX
 33882  	ADDSS (DX), X4
 33883  	MOVSS X4, (DX)
 33884  	LEAQ  (DX)(BX*4), DX
 33885  	SUBQ  $0x04, SI
 33886  
 33887  check_limit_unroll:
 33888  	CMPQ SI, $0x04
 33889  	JHS  loop_unroll
 33890  	JMP  check_limit
 33891  
 33892  loop:
 33893  	MOVSS (AX), X1
 33894  	MULSS X0, X1
 33895  	ADDSS (DX), X1
 33896  	MOVSS X1, (DX)
 33897  	DECQ  SI
 33898  	LEAQ  (AX)(CX*4), AX
 33899  	LEAQ  (DX)(BX*4), DX
 33900  
 33901  check_limit:
 33902  	CMPQ SI, $0x00
 33903  	JHI  loop
 33904  	RET
 33905  
 33906  // func AmdAxpyPointerLoopXInterleave_V1A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33907  // Requires: SSE
 33908  TEXT ·AmdAxpyPointerLoopXInterleave_V1A0U4(SB), NOSPLIT, $0-48
 33909  	MOVSS alpha+0(FP), X0
 33910  	MOVQ  xs+8(FP), AX
 33911  	MOVQ  incx+16(FP), CX
 33912  	MOVQ  CX, DX
 33913  	SHLQ  $0x04, DX
 33914  	MOVQ  ys+24(FP), DX
 33915  	MOVQ  incy+32(FP), BX
 33916  	MOVQ  BX, SI
 33917  	SHLQ  $0x04, SI
 33918  	MOVQ  n+40(FP), SI
 33919  	JMP   check_limit_unroll
 33920  
 33921  loop_unroll:
 33922  	MOVSS (AX), X1
 33923  	LEAQ  (AX)(CX*4), AX
 33924  	MOVSS (AX), X2
 33925  	LEAQ  (AX)(CX*4), AX
 33926  	MOVSS (AX), X3
 33927  	LEAQ  (AX)(CX*4), AX
 33928  	MOVSS (AX), X4
 33929  	LEAQ  (AX)(CX*4), AX
 33930  	MULSS X0, X1
 33931  	MULSS X0, X2
 33932  	MULSS X0, X3
 33933  	MULSS X0, X4
 33934  	ADDSS (DX), X1
 33935  	MOVSS X1, (DX)
 33936  	LEAQ  (DX)(BX*4), DX
 33937  	ADDSS (DX), X2
 33938  	MOVSS X2, (DX)
 33939  	LEAQ  (DX)(BX*4), DX
 33940  	ADDSS (DX), X3
 33941  	MOVSS X3, (DX)
 33942  	LEAQ  (DX)(BX*4), DX
 33943  	ADDSS (DX), X4
 33944  	MOVSS X4, (DX)
 33945  	LEAQ  (DX)(BX*4), DX
 33946  	SUBQ  $0x04, SI
 33947  
 33948  check_limit_unroll:
 33949  	CMPQ SI, $0x04
 33950  	JHS  loop_unroll
 33951  	JMP  check_limit
 33952  
 33953  loop:
 33954  	MOVSS (AX), X1
 33955  	MULSS X0, X1
 33956  	ADDSS (DX), X1
 33957  	MOVSS X1, (DX)
 33958  	DECQ  SI
 33959  	LEAQ  (AX)(CX*4), AX
 33960  	LEAQ  (DX)(BX*4), DX
 33961  
 33962  check_limit:
 33963  	CMPQ SI, $0x00
 33964  	JHI  loop
 33965  	RET
 33966  
 33967  // func AmdAxpyPointerLoopXInterleave_V2A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 33968  // Requires: SSE
 33969  TEXT ·AmdAxpyPointerLoopXInterleave_V2A0U4(SB), NOSPLIT, $0-48
 33970  	MOVSS alpha+0(FP), X0
 33971  	MOVQ  xs+8(FP), AX
 33972  	MOVQ  incx+16(FP), CX
 33973  	MOVQ  CX, DX
 33974  	SHLQ  $0x04, DX
 33975  	MOVQ  ys+24(FP), DX
 33976  	MOVQ  incy+32(FP), BX
 33977  	MOVQ  BX, SI
 33978  	SHLQ  $0x04, SI
 33979  	MOVQ  n+40(FP), SI
 33980  	JMP   check_limit_unroll
 33981  
 33982  loop_unroll:
 33983  	MOVSS (AX), X1
 33984  	LEAQ  (AX)(CX*4), AX
 33985  	MOVSS (AX), X2
 33986  	LEAQ  (AX)(CX*4), AX
 33987  	MOVSS (AX), X3
 33988  	LEAQ  (AX)(CX*4), AX
 33989  	MOVSS (AX), X4
 33990  	LEAQ  (AX)(CX*4), AX
 33991  	MULSS X0, X1
 33992  	MULSS X0, X2
 33993  	MULSS X0, X3
 33994  	MULSS X0, X4
 33995  	ADDSS (DX), X1
 33996  	MOVSS X1, (DX)
 33997  	LEAQ  (DX)(BX*4), DX
 33998  	ADDSS (DX), X2
 33999  	MOVSS X2, (DX)
 34000  	LEAQ  (DX)(BX*4), DX
 34001  	ADDSS (DX), X3
 34002  	MOVSS X3, (DX)
 34003  	LEAQ  (DX)(BX*4), DX
 34004  	ADDSS (DX), X4
 34005  	MOVSS X4, (DX)
 34006  	LEAQ  (DX)(BX*4), DX
 34007  	SUBQ  $0x04, SI
 34008  
 34009  check_limit_unroll:
 34010  	CMPQ SI, $0x04
 34011  	JHS  loop_unroll
 34012  	JMP  check_limit
 34013  
 34014  loop:
 34015  	MOVSS (AX), X1
 34016  	MULSS X0, X1
 34017  	ADDSS (DX), X1
 34018  	MOVSS X1, (DX)
 34019  	DECQ  SI
 34020  	LEAQ  (AX)(CX*4), AX
 34021  	LEAQ  (DX)(BX*4), DX
 34022  
 34023  check_limit:
 34024  	CMPQ SI, $0x00
 34025  	JHI  loop
 34026  	RET
 34027  
 34028  // func AmdAxpyPointerLoopXInterleave_V3A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34029  // Requires: SSE
 34030  TEXT ·AmdAxpyPointerLoopXInterleave_V3A0U4(SB), NOSPLIT, $0-48
 34031  	MOVSS alpha+0(FP), X0
 34032  	MOVQ  xs+8(FP), AX
 34033  	MOVQ  incx+16(FP), CX
 34034  	MOVQ  CX, DX
 34035  	SHLQ  $0x04, DX
 34036  	MOVQ  ys+24(FP), DX
 34037  	MOVQ  incy+32(FP), BX
 34038  	MOVQ  BX, SI
 34039  	SHLQ  $0x04, SI
 34040  	MOVQ  n+40(FP), SI
 34041  	JMP   check_limit_unroll
 34042  
 34043  loop_unroll:
 34044  	MOVSS (AX), X1
 34045  	LEAQ  (AX)(CX*4), AX
 34046  	MOVSS (AX), X2
 34047  	LEAQ  (AX)(CX*4), AX
 34048  	MOVSS (AX), X3
 34049  	LEAQ  (AX)(CX*4), AX
 34050  	MOVSS (AX), X4
 34051  	LEAQ  (AX)(CX*4), AX
 34052  	MULSS X0, X1
 34053  	MULSS X0, X2
 34054  	MULSS X0, X3
 34055  	MULSS X0, X4
 34056  	ADDSS (DX), X1
 34057  	MOVSS X1, (DX)
 34058  	LEAQ  (DX)(BX*4), DX
 34059  	ADDSS (DX), X2
 34060  	MOVSS X2, (DX)
 34061  	LEAQ  (DX)(BX*4), DX
 34062  	ADDSS (DX), X3
 34063  	MOVSS X3, (DX)
 34064  	LEAQ  (DX)(BX*4), DX
 34065  	ADDSS (DX), X4
 34066  	MOVSS X4, (DX)
 34067  	LEAQ  (DX)(BX*4), DX
 34068  	SUBQ  $0x04, SI
 34069  
 34070  check_limit_unroll:
 34071  	CMPQ SI, $0x04
 34072  	JHS  loop_unroll
 34073  	JMP  check_limit
 34074  
 34075  loop:
 34076  	MOVSS (AX), X1
 34077  	MULSS X0, X1
 34078  	ADDSS (DX), X1
 34079  	MOVSS X1, (DX)
 34080  	DECQ  SI
 34081  	LEAQ  (AX)(CX*4), AX
 34082  	LEAQ  (DX)(BX*4), DX
 34083  
 34084  check_limit:
 34085  	CMPQ SI, $0x00
 34086  	JHI  loop
 34087  	RET
 34088  
 34089  // func AmdAxpyPointerLoopXInterleave_V4A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34090  // Requires: SSE
 34091  TEXT ·AmdAxpyPointerLoopXInterleave_V4A0U4(SB), NOSPLIT, $0-48
 34092  	MOVSS alpha+0(FP), X0
 34093  	MOVQ  xs+8(FP), AX
 34094  	MOVQ  incx+16(FP), CX
 34095  	MOVQ  CX, DX
 34096  	SHLQ  $0x04, DX
 34097  	MOVQ  ys+24(FP), DX
 34098  	MOVQ  incy+32(FP), BX
 34099  	MOVQ  BX, SI
 34100  	SHLQ  $0x04, SI
 34101  	MOVQ  n+40(FP), SI
 34102  	JMP   check_limit_unroll
 34103  
 34104  loop_unroll:
 34105  	MOVSS (AX), X1
 34106  	LEAQ  (AX)(CX*4), AX
 34107  	MOVSS (AX), X2
 34108  	LEAQ  (AX)(CX*4), AX
 34109  	MOVSS (AX), X3
 34110  	LEAQ  (AX)(CX*4), AX
 34111  	MOVSS (AX), X4
 34112  	LEAQ  (AX)(CX*4), AX
 34113  	MULSS X0, X1
 34114  	MULSS X0, X2
 34115  	MULSS X0, X3
 34116  	MULSS X0, X4
 34117  	ADDSS (DX), X1
 34118  	MOVSS X1, (DX)
 34119  	LEAQ  (DX)(BX*4), DX
 34120  	ADDSS (DX), X2
 34121  	MOVSS X2, (DX)
 34122  	LEAQ  (DX)(BX*4), DX
 34123  	ADDSS (DX), X3
 34124  	MOVSS X3, (DX)
 34125  	LEAQ  (DX)(BX*4), DX
 34126  	ADDSS (DX), X4
 34127  	MOVSS X4, (DX)
 34128  	LEAQ  (DX)(BX*4), DX
 34129  	SUBQ  $0x04, SI
 34130  
 34131  check_limit_unroll:
 34132  	CMPQ SI, $0x04
 34133  	JHS  loop_unroll
 34134  	JMP  check_limit
 34135  
 34136  loop:
 34137  	MOVSS (AX), X1
 34138  	MULSS X0, X1
 34139  	ADDSS (DX), X1
 34140  	MOVSS X1, (DX)
 34141  	DECQ  SI
 34142  	LEAQ  (AX)(CX*4), AX
 34143  	LEAQ  (DX)(BX*4), DX
 34144  
 34145  check_limit:
 34146  	CMPQ SI, $0x00
 34147  	JHI  loop
 34148  	RET
 34149  
 34150  // func AmdAxpyPointerLoopXInterleave_V5A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34151  // Requires: SSE
 34152  TEXT ·AmdAxpyPointerLoopXInterleave_V5A0U4(SB), NOSPLIT, $0-48
 34153  	MOVSS alpha+0(FP), X0
 34154  	MOVQ  xs+8(FP), AX
 34155  	MOVQ  incx+16(FP), CX
 34156  	MOVQ  CX, DX
 34157  	SHLQ  $0x04, DX
 34158  	MOVQ  ys+24(FP), DX
 34159  	MOVQ  incy+32(FP), BX
 34160  	MOVQ  BX, SI
 34161  	SHLQ  $0x04, SI
 34162  	MOVQ  n+40(FP), SI
 34163  	JMP   check_limit_unroll
 34164  
 34165  loop_unroll:
 34166  	MOVSS (AX), X1
 34167  	LEAQ  (AX)(CX*4), AX
 34168  	MOVSS (AX), X2
 34169  	LEAQ  (AX)(CX*4), AX
 34170  	MOVSS (AX), X3
 34171  	LEAQ  (AX)(CX*4), AX
 34172  	MOVSS (AX), X4
 34173  	LEAQ  (AX)(CX*4), AX
 34174  	MULSS X0, X1
 34175  	MULSS X0, X2
 34176  	MULSS X0, X3
 34177  	MULSS X0, X4
 34178  	ADDSS (DX), X1
 34179  	MOVSS X1, (DX)
 34180  	LEAQ  (DX)(BX*4), DX
 34181  	ADDSS (DX), X2
 34182  	MOVSS X2, (DX)
 34183  	LEAQ  (DX)(BX*4), DX
 34184  	ADDSS (DX), X3
 34185  	MOVSS X3, (DX)
 34186  	LEAQ  (DX)(BX*4), DX
 34187  	ADDSS (DX), X4
 34188  	MOVSS X4, (DX)
 34189  	LEAQ  (DX)(BX*4), DX
 34190  	SUBQ  $0x04, SI
 34191  
 34192  check_limit_unroll:
 34193  	CMPQ SI, $0x04
 34194  	JHS  loop_unroll
 34195  	JMP  check_limit
 34196  
 34197  loop:
 34198  	MOVSS (AX), X1
 34199  	MULSS X0, X1
 34200  	ADDSS (DX), X1
 34201  	MOVSS X1, (DX)
 34202  	DECQ  SI
 34203  	LEAQ  (AX)(CX*4), AX
 34204  	LEAQ  (DX)(BX*4), DX
 34205  
 34206  check_limit:
 34207  	CMPQ SI, $0x00
 34208  	JHI  loop
 34209  	RET
 34210  
 34211  // func AmdAxpyPointerLoopXInterleave_V0A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34212  // Requires: SSE
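// Same interleaved unroll-by-4 body as the A0U4 variants above, with
// PCALIGN $0x08 emitted before loop_unroll to control the loop-entry
// placement.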
 34213  TEXT ·AmdAxpyPointerLoopXInterleave_V0A8U4(SB), NOSPLIT, $0-48
 34214  	MOVSS alpha+0(FP), X0
 34215  	MOVQ  xs+8(FP), AX
 34216  	MOVQ  incx+16(FP), CX
 34217  	MOVQ  CX, DX
 34218  	SHLQ  $0x04, DX
 34219  	MOVQ  ys+24(FP), DX
 34220  	MOVQ  incy+32(FP), BX
 34221  	MOVQ  BX, SI
 34222  	SHLQ  $0x04, SI
 34223  	MOVQ  n+40(FP), SI
 34224  	JMP   check_limit_unroll
 34225  	PCALIGN $0x08
 34226  
 34227  loop_unroll:
 34228  	MOVSS (AX), X1
 34229  	LEAQ  (AX)(CX*4), AX
 34230  	MOVSS (AX), X2
 34231  	LEAQ  (AX)(CX*4), AX
 34232  	MOVSS (AX), X3
 34233  	LEAQ  (AX)(CX*4), AX
 34234  	MOVSS (AX), X4
 34235  	LEAQ  (AX)(CX*4), AX
 34236  	MULSS X0, X1
 34237  	MULSS X0, X2
 34238  	MULSS X0, X3
 34239  	MULSS X0, X4
 34240  	ADDSS (DX), X1
 34241  	MOVSS X1, (DX)
 34242  	LEAQ  (DX)(BX*4), DX
 34243  	ADDSS (DX), X2
 34244  	MOVSS X2, (DX)
 34245  	LEAQ  (DX)(BX*4), DX
 34246  	ADDSS (DX), X3
 34247  	MOVSS X3, (DX)
 34248  	LEAQ  (DX)(BX*4), DX
 34249  	ADDSS (DX), X4
 34250  	MOVSS X4, (DX)
 34251  	LEAQ  (DX)(BX*4), DX
 34252  	SUBQ  $0x04, SI
 34253  
 34254  check_limit_unroll:
 34255  	CMPQ SI, $0x04
 34256  	JHS  loop_unroll
 34257  	JMP  check_limit
 34258  
 34259  loop:
 34260  	MOVSS (AX), X1
 34261  	MULSS X0, X1
 34262  	ADDSS (DX), X1
 34263  	MOVSS X1, (DX)
 34264  	DECQ  SI
 34265  	LEAQ  (AX)(CX*4), AX
 34266  	LEAQ  (DX)(BX*4), DX
 34267  
 34268  check_limit:
 34269  	CMPQ SI, $0x00
 34270  	JHI  loop
 34271  	RET
 34272  
 34273  // func AmdAxpyPointerLoopXInterleave_V1A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34274  // Requires: SSE
 34275  TEXT ·AmdAxpyPointerLoopXInterleave_V1A8U4(SB), NOSPLIT, $0-48
 34276  	MOVSS alpha+0(FP), X0
 34277  	MOVQ  xs+8(FP), AX
 34278  	MOVQ  incx+16(FP), CX
 34279  	MOVQ  CX, DX
 34280  	SHLQ  $0x04, DX
 34281  	MOVQ  ys+24(FP), DX
 34282  	MOVQ  incy+32(FP), BX
 34283  	MOVQ  BX, SI
 34284  	SHLQ  $0x04, SI
 34285  	MOVQ  n+40(FP), SI
 34286  	JMP   check_limit_unroll
 34287  	PCALIGN $0x08
 34288  
 34289  loop_unroll:
 34290  	MOVSS (AX), X1
 34291  	LEAQ  (AX)(CX*4), AX
 34292  	MOVSS (AX), X2
 34293  	LEAQ  (AX)(CX*4), AX
 34294  	MOVSS (AX), X3
 34295  	LEAQ  (AX)(CX*4), AX
 34296  	MOVSS (AX), X4
 34297  	LEAQ  (AX)(CX*4), AX
 34298  	MULSS X0, X1
 34299  	MULSS X0, X2
 34300  	MULSS X0, X3
 34301  	MULSS X0, X4
 34302  	ADDSS (DX), X1
 34303  	MOVSS X1, (DX)
 34304  	LEAQ  (DX)(BX*4), DX
 34305  	ADDSS (DX), X2
 34306  	MOVSS X2, (DX)
 34307  	LEAQ  (DX)(BX*4), DX
 34308  	ADDSS (DX), X3
 34309  	MOVSS X3, (DX)
 34310  	LEAQ  (DX)(BX*4), DX
 34311  	ADDSS (DX), X4
 34312  	MOVSS X4, (DX)
 34313  	LEAQ  (DX)(BX*4), DX
 34314  	SUBQ  $0x04, SI
 34315  
 34316  check_limit_unroll:
 34317  	CMPQ SI, $0x04
 34318  	JHS  loop_unroll
 34319  	JMP  check_limit
 34320  
 34321  loop:
 34322  	MOVSS (AX), X1
 34323  	MULSS X0, X1
 34324  	ADDSS (DX), X1
 34325  	MOVSS X1, (DX)
 34326  	DECQ  SI
 34327  	LEAQ  (AX)(CX*4), AX
 34328  	LEAQ  (DX)(BX*4), DX
 34329  
 34330  check_limit:
 34331  	CMPQ SI, $0x00
 34332  	JHI  loop
 34333  	RET
 34334  
 34335  // func AmdAxpyPointerLoopXInterleave_V2A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34336  // Requires: SSE
 34337  TEXT ·AmdAxpyPointerLoopXInterleave_V2A8U4(SB), NOSPLIT, $0-48
 34338  	MOVSS alpha+0(FP), X0
 34339  	MOVQ  xs+8(FP), AX
 34340  	MOVQ  incx+16(FP), CX
 34341  	MOVQ  CX, DX
 34342  	SHLQ  $0x04, DX
 34343  	MOVQ  ys+24(FP), DX
 34344  	MOVQ  incy+32(FP), BX
 34345  	MOVQ  BX, SI
 34346  	SHLQ  $0x04, SI
 34347  	MOVQ  n+40(FP), SI
 34348  	JMP   check_limit_unroll
 34349  	PCALIGN $0x08
 34350  
 34351  loop_unroll:
 34352  	MOVSS (AX), X1
 34353  	LEAQ  (AX)(CX*4), AX
 34354  	MOVSS (AX), X2
 34355  	LEAQ  (AX)(CX*4), AX
 34356  	MOVSS (AX), X3
 34357  	LEAQ  (AX)(CX*4), AX
 34358  	MOVSS (AX), X4
 34359  	LEAQ  (AX)(CX*4), AX
 34360  	MULSS X0, X1
 34361  	MULSS X0, X2
 34362  	MULSS X0, X3
 34363  	MULSS X0, X4
 34364  	ADDSS (DX), X1
 34365  	MOVSS X1, (DX)
 34366  	LEAQ  (DX)(BX*4), DX
 34367  	ADDSS (DX), X2
 34368  	MOVSS X2, (DX)
 34369  	LEAQ  (DX)(BX*4), DX
 34370  	ADDSS (DX), X3
 34371  	MOVSS X3, (DX)
 34372  	LEAQ  (DX)(BX*4), DX
 34373  	ADDSS (DX), X4
 34374  	MOVSS X4, (DX)
 34375  	LEAQ  (DX)(BX*4), DX
 34376  	SUBQ  $0x04, SI
 34377  
 34378  check_limit_unroll:
 34379  	CMPQ SI, $0x04
 34380  	JHS  loop_unroll
 34381  	JMP  check_limit
 34382  
 34383  loop:
 34384  	MOVSS (AX), X1
 34385  	MULSS X0, X1
 34386  	ADDSS (DX), X1
 34387  	MOVSS X1, (DX)
 34388  	DECQ  SI
 34389  	LEAQ  (AX)(CX*4), AX
 34390  	LEAQ  (DX)(BX*4), DX
 34391  
 34392  check_limit:
 34393  	CMPQ SI, $0x00
 34394  	JHI  loop
 34395  	RET
 34396  
 34397  // func AmdAxpyPointerLoopXInterleave_V3A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34398  // Requires: SSE
 34399  TEXT ·AmdAxpyPointerLoopXInterleave_V3A8U4(SB), NOSPLIT, $0-48
 34400  	MOVSS alpha+0(FP), X0
 34401  	MOVQ  xs+8(FP), AX
 34402  	MOVQ  incx+16(FP), CX
 34403  	MOVQ  CX, DX
 34404  	SHLQ  $0x04, DX
 34405  	MOVQ  ys+24(FP), DX
 34406  	MOVQ  incy+32(FP), BX
 34407  	MOVQ  BX, SI
 34408  	SHLQ  $0x04, SI
 34409  	MOVQ  n+40(FP), SI
 34410  	JMP   check_limit_unroll
 34411  	PCALIGN $0x08
 34412  
 34413  loop_unroll:
 34414  	MOVSS (AX), X1
 34415  	LEAQ  (AX)(CX*4), AX
 34416  	MOVSS (AX), X2
 34417  	LEAQ  (AX)(CX*4), AX
 34418  	MOVSS (AX), X3
 34419  	LEAQ  (AX)(CX*4), AX
 34420  	MOVSS (AX), X4
 34421  	LEAQ  (AX)(CX*4), AX
 34422  	MULSS X0, X1
 34423  	MULSS X0, X2
 34424  	MULSS X0, X3
 34425  	MULSS X0, X4
 34426  	ADDSS (DX), X1
 34427  	MOVSS X1, (DX)
 34428  	LEAQ  (DX)(BX*4), DX
 34429  	ADDSS (DX), X2
 34430  	MOVSS X2, (DX)
 34431  	LEAQ  (DX)(BX*4), DX
 34432  	ADDSS (DX), X3
 34433  	MOVSS X3, (DX)
 34434  	LEAQ  (DX)(BX*4), DX
 34435  	ADDSS (DX), X4
 34436  	MOVSS X4, (DX)
 34437  	LEAQ  (DX)(BX*4), DX
 34438  	SUBQ  $0x04, SI
 34439  
 34440  check_limit_unroll:
 34441  	CMPQ SI, $0x04
 34442  	JHS  loop_unroll
 34443  	JMP  check_limit
 34444  
 34445  loop:
 34446  	MOVSS (AX), X1
 34447  	MULSS X0, X1
 34448  	ADDSS (DX), X1
 34449  	MOVSS X1, (DX)
 34450  	DECQ  SI
 34451  	LEAQ  (AX)(CX*4), AX
 34452  	LEAQ  (DX)(BX*4), DX
 34453  
 34454  check_limit:
 34455  	CMPQ SI, $0x00
 34456  	JHI  loop
 34457  	RET
 34458  
 34459  // func AmdAxpyPointerLoopXInterleave_V4A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34460  // Requires: SSE
 34461  TEXT ·AmdAxpyPointerLoopXInterleave_V4A8U4(SB), NOSPLIT, $0-48
 34462  	MOVSS alpha+0(FP), X0
 34463  	MOVQ  xs+8(FP), AX
 34464  	MOVQ  incx+16(FP), CX
 34465  	MOVQ  CX, DX
 34466  	SHLQ  $0x04, DX
 34467  	MOVQ  ys+24(FP), DX
 34468  	MOVQ  incy+32(FP), BX
 34469  	MOVQ  BX, SI
 34470  	SHLQ  $0x04, SI
 34471  	MOVQ  n+40(FP), SI
 34472  	JMP   check_limit_unroll
 34473  	PCALIGN $0x08
 34474  
 34475  loop_unroll:
 34476  	MOVSS (AX), X1
 34477  	LEAQ  (AX)(CX*4), AX
 34478  	MOVSS (AX), X2
 34479  	LEAQ  (AX)(CX*4), AX
 34480  	MOVSS (AX), X3
 34481  	LEAQ  (AX)(CX*4), AX
 34482  	MOVSS (AX), X4
 34483  	LEAQ  (AX)(CX*4), AX
 34484  	MULSS X0, X1
 34485  	MULSS X0, X2
 34486  	MULSS X0, X3
 34487  	MULSS X0, X4
 34488  	ADDSS (DX), X1
 34489  	MOVSS X1, (DX)
 34490  	LEAQ  (DX)(BX*4), DX
 34491  	ADDSS (DX), X2
 34492  	MOVSS X2, (DX)
 34493  	LEAQ  (DX)(BX*4), DX
 34494  	ADDSS (DX), X3
 34495  	MOVSS X3, (DX)
 34496  	LEAQ  (DX)(BX*4), DX
 34497  	ADDSS (DX), X4
 34498  	MOVSS X4, (DX)
 34499  	LEAQ  (DX)(BX*4), DX
 34500  	SUBQ  $0x04, SI
 34501  
 34502  check_limit_unroll:
 34503  	CMPQ SI, $0x04
 34504  	JHS  loop_unroll
 34505  	JMP  check_limit
 34506  
 34507  loop:
 34508  	MOVSS (AX), X1
 34509  	MULSS X0, X1
 34510  	ADDSS (DX), X1
 34511  	MOVSS X1, (DX)
 34512  	DECQ  SI
 34513  	LEAQ  (AX)(CX*4), AX
 34514  	LEAQ  (DX)(BX*4), DX
 34515  
 34516  check_limit:
 34517  	CMPQ SI, $0x00
 34518  	JHI  loop
 34519  	RET
 34520  
 34521  // func AmdAxpyPointerLoopXInterleave_V5A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34522  // Requires: SSE
 34523  TEXT ·AmdAxpyPointerLoopXInterleave_V5A8U4(SB), NOSPLIT, $0-48
 34524  	MOVSS alpha+0(FP), X0
 34525  	MOVQ  xs+8(FP), AX
 34526  	MOVQ  incx+16(FP), CX
 34527  	MOVQ  CX, DX
 34528  	SHLQ  $0x04, DX
 34529  	MOVQ  ys+24(FP), DX
 34530  	MOVQ  incy+32(FP), BX
 34531  	MOVQ  BX, SI
 34532  	SHLQ  $0x04, SI
 34533  	MOVQ  n+40(FP), SI
 34534  	JMP   check_limit_unroll
 34535  	PCALIGN $0x08
 34536  
 34537  loop_unroll:
 34538  	MOVSS (AX), X1
 34539  	LEAQ  (AX)(CX*4), AX
 34540  	MOVSS (AX), X2
 34541  	LEAQ  (AX)(CX*4), AX
 34542  	MOVSS (AX), X3
 34543  	LEAQ  (AX)(CX*4), AX
 34544  	MOVSS (AX), X4
 34545  	LEAQ  (AX)(CX*4), AX
 34546  	MULSS X0, X1
 34547  	MULSS X0, X2
 34548  	MULSS X0, X3
 34549  	MULSS X0, X4
 34550  	ADDSS (DX), X1
 34551  	MOVSS X1, (DX)
 34552  	LEAQ  (DX)(BX*4), DX
 34553  	ADDSS (DX), X2
 34554  	MOVSS X2, (DX)
 34555  	LEAQ  (DX)(BX*4), DX
 34556  	ADDSS (DX), X3
 34557  	MOVSS X3, (DX)
 34558  	LEAQ  (DX)(BX*4), DX
 34559  	ADDSS (DX), X4
 34560  	MOVSS X4, (DX)
 34561  	LEAQ  (DX)(BX*4), DX
 34562  	SUBQ  $0x04, SI
 34563  
 34564  check_limit_unroll:
 34565  	CMPQ SI, $0x04
 34566  	JHS  loop_unroll
 34567  	JMP  check_limit
 34568  
 34569  loop:
 34570  	MOVSS (AX), X1
 34571  	MULSS X0, X1
 34572  	ADDSS (DX), X1
 34573  	MOVSS X1, (DX)
 34574  	DECQ  SI
 34575  	LEAQ  (AX)(CX*4), AX
 34576  	LEAQ  (DX)(BX*4), DX
 34577  
 34578  check_limit:
 34579  	CMPQ SI, $0x00
 34580  	JHI  loop
 34581  	RET
 34582  
 34583  // func AmdAxpyPointerLoopXInterleave_V0A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34584  // Requires: SSE
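// Same interleaved body again; the padding here is PCALIGN $0x08 followed by
// one NOP (the A9 placement variant).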
 34585  TEXT ·AmdAxpyPointerLoopXInterleave_V0A9U4(SB), NOSPLIT, $0-48
 34586  	MOVSS alpha+0(FP), X0
 34587  	MOVQ  xs+8(FP), AX
 34588  	MOVQ  incx+16(FP), CX
 34589  	MOVQ  CX, DX
 34590  	SHLQ  $0x04, DX
 34591  	MOVQ  ys+24(FP), DX
 34592  	MOVQ  incy+32(FP), BX
 34593  	MOVQ  BX, SI
 34594  	SHLQ  $0x04, SI
 34595  	MOVQ  n+40(FP), SI
 34596  	JMP   check_limit_unroll
 34597  	PCALIGN $0x08
 34598  	NOP
 34599  
 34600  loop_unroll:
 34601  	MOVSS (AX), X1
 34602  	LEAQ  (AX)(CX*4), AX
 34603  	MOVSS (AX), X2
 34604  	LEAQ  (AX)(CX*4), AX
 34605  	MOVSS (AX), X3
 34606  	LEAQ  (AX)(CX*4), AX
 34607  	MOVSS (AX), X4
 34608  	LEAQ  (AX)(CX*4), AX
 34609  	MULSS X0, X1
 34610  	MULSS X0, X2
 34611  	MULSS X0, X3
 34612  	MULSS X0, X4
 34613  	ADDSS (DX), X1
 34614  	MOVSS X1, (DX)
 34615  	LEAQ  (DX)(BX*4), DX
 34616  	ADDSS (DX), X2
 34617  	MOVSS X2, (DX)
 34618  	LEAQ  (DX)(BX*4), DX
 34619  	ADDSS (DX), X3
 34620  	MOVSS X3, (DX)
 34621  	LEAQ  (DX)(BX*4), DX
 34622  	ADDSS (DX), X4
 34623  	MOVSS X4, (DX)
 34624  	LEAQ  (DX)(BX*4), DX
 34625  	SUBQ  $0x04, SI
 34626  
 34627  check_limit_unroll:
 34628  	CMPQ SI, $0x04
 34629  	JHS  loop_unroll
 34630  	JMP  check_limit
 34631  
 34632  loop:
 34633  	MOVSS (AX), X1
 34634  	MULSS X0, X1
 34635  	ADDSS (DX), X1
 34636  	MOVSS X1, (DX)
 34637  	DECQ  SI
 34638  	LEAQ  (AX)(CX*4), AX
 34639  	LEAQ  (DX)(BX*4), DX
 34640  
 34641  check_limit:
 34642  	CMPQ SI, $0x00
 34643  	JHI  loop
 34644  	RET
 34645  
 34646  // func AmdAxpyPointerLoopXInterleave_V1A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34647  // Requires: SSE
 34648  TEXT ·AmdAxpyPointerLoopXInterleave_V1A9U4(SB), NOSPLIT, $0-48
 34649  	MOVSS alpha+0(FP), X0
 34650  	MOVQ  xs+8(FP), AX
 34651  	MOVQ  incx+16(FP), CX
 34652  	MOVQ  CX, DX
 34653  	SHLQ  $0x04, DX
 34654  	MOVQ  ys+24(FP), DX
 34655  	MOVQ  incy+32(FP), BX
 34656  	MOVQ  BX, SI
 34657  	SHLQ  $0x04, SI
 34658  	MOVQ  n+40(FP), SI
 34659  	JMP   check_limit_unroll
 34660  	PCALIGN $0x08
 34661  	NOP
 34662  
 34663  loop_unroll:
 34664  	MOVSS (AX), X1
 34665  	LEAQ  (AX)(CX*4), AX
 34666  	MOVSS (AX), X2
 34667  	LEAQ  (AX)(CX*4), AX
 34668  	MOVSS (AX), X3
 34669  	LEAQ  (AX)(CX*4), AX
 34670  	MOVSS (AX), X4
 34671  	LEAQ  (AX)(CX*4), AX
 34672  	MULSS X0, X1
 34673  	MULSS X0, X2
 34674  	MULSS X0, X3
 34675  	MULSS X0, X4
 34676  	ADDSS (DX), X1
 34677  	MOVSS X1, (DX)
 34678  	LEAQ  (DX)(BX*4), DX
 34679  	ADDSS (DX), X2
 34680  	MOVSS X2, (DX)
 34681  	LEAQ  (DX)(BX*4), DX
 34682  	ADDSS (DX), X3
 34683  	MOVSS X3, (DX)
 34684  	LEAQ  (DX)(BX*4), DX
 34685  	ADDSS (DX), X4
 34686  	MOVSS X4, (DX)
 34687  	LEAQ  (DX)(BX*4), DX
 34688  	SUBQ  $0x04, SI
 34689  
 34690  check_limit_unroll:
 34691  	CMPQ SI, $0x04
 34692  	JHS  loop_unroll
 34693  	JMP  check_limit
 34694  
 34695  loop:
 34696  	MOVSS (AX), X1
 34697  	MULSS X0, X1
 34698  	ADDSS (DX), X1
 34699  	MOVSS X1, (DX)
 34700  	DECQ  SI
 34701  	LEAQ  (AX)(CX*4), AX
 34702  	LEAQ  (DX)(BX*4), DX
 34703  
 34704  check_limit:
 34705  	CMPQ SI, $0x00
 34706  	JHI  loop
 34707  	RET
 34708  
 34709  // func AmdAxpyPointerLoopXInterleave_V2A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34710  // Requires: SSE
 34711  TEXT ·AmdAxpyPointerLoopXInterleave_V2A9U4(SB), NOSPLIT, $0-48
 34712  	MOVSS alpha+0(FP), X0
 34713  	MOVQ  xs+8(FP), AX
 34714  	MOVQ  incx+16(FP), CX
 34715  	MOVQ  CX, DX
 34716  	SHLQ  $0x04, DX
 34717  	MOVQ  ys+24(FP), DX
 34718  	MOVQ  incy+32(FP), BX
 34719  	MOVQ  BX, SI
 34720  	SHLQ  $0x04, SI
 34721  	MOVQ  n+40(FP), SI
 34722  	JMP   check_limit_unroll
 34723  	PCALIGN $0x08
 34724  	NOP
 34725  
 34726  loop_unroll:
 34727  	MOVSS (AX), X1
 34728  	LEAQ  (AX)(CX*4), AX
 34729  	MOVSS (AX), X2
 34730  	LEAQ  (AX)(CX*4), AX
 34731  	MOVSS (AX), X3
 34732  	LEAQ  (AX)(CX*4), AX
 34733  	MOVSS (AX), X4
 34734  	LEAQ  (AX)(CX*4), AX
 34735  	MULSS X0, X1
 34736  	MULSS X0, X2
 34737  	MULSS X0, X3
 34738  	MULSS X0, X4
 34739  	ADDSS (DX), X1
 34740  	MOVSS X1, (DX)
 34741  	LEAQ  (DX)(BX*4), DX
 34742  	ADDSS (DX), X2
 34743  	MOVSS X2, (DX)
 34744  	LEAQ  (DX)(BX*4), DX
 34745  	ADDSS (DX), X3
 34746  	MOVSS X3, (DX)
 34747  	LEAQ  (DX)(BX*4), DX
 34748  	ADDSS (DX), X4
 34749  	MOVSS X4, (DX)
 34750  	LEAQ  (DX)(BX*4), DX
 34751  	SUBQ  $0x04, SI
 34752  
 34753  check_limit_unroll:
 34754  	CMPQ SI, $0x04
 34755  	JHS  loop_unroll
 34756  	JMP  check_limit
 34757  
 34758  loop:
 34759  	MOVSS (AX), X1
 34760  	MULSS X0, X1
 34761  	ADDSS (DX), X1
 34762  	MOVSS X1, (DX)
 34763  	DECQ  SI
 34764  	LEAQ  (AX)(CX*4), AX
 34765  	LEAQ  (DX)(BX*4), DX
 34766  
 34767  check_limit:
 34768  	CMPQ SI, $0x00
 34769  	JHI  loop
 34770  	RET
 34771  
 34772  // func AmdAxpyPointerLoopXInterleave_V3A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34773  // Requires: SSE
 34774  TEXT ·AmdAxpyPointerLoopXInterleave_V3A9U4(SB), NOSPLIT, $0-48
 34775  	MOVSS alpha+0(FP), X0
 34776  	MOVQ  xs+8(FP), AX
 34777  	MOVQ  incx+16(FP), CX
 34778  	MOVQ  CX, DX
 34779  	SHLQ  $0x04, DX
 34780  	MOVQ  ys+24(FP), DX
 34781  	MOVQ  incy+32(FP), BX
 34782  	MOVQ  BX, SI
 34783  	SHLQ  $0x04, SI
 34784  	MOVQ  n+40(FP), SI
 34785  	JMP   check_limit_unroll
 34786  	PCALIGN $0x08
 34787  	NOP
 34788  
 34789  loop_unroll:
 34790  	MOVSS (AX), X1
 34791  	LEAQ  (AX)(CX*4), AX
 34792  	MOVSS (AX), X2
 34793  	LEAQ  (AX)(CX*4), AX
 34794  	MOVSS (AX), X3
 34795  	LEAQ  (AX)(CX*4), AX
 34796  	MOVSS (AX), X4
 34797  	LEAQ  (AX)(CX*4), AX
 34798  	MULSS X0, X1
 34799  	MULSS X0, X2
 34800  	MULSS X0, X3
 34801  	MULSS X0, X4
 34802  	ADDSS (DX), X1
 34803  	MOVSS X1, (DX)
 34804  	LEAQ  (DX)(BX*4), DX
 34805  	ADDSS (DX), X2
 34806  	MOVSS X2, (DX)
 34807  	LEAQ  (DX)(BX*4), DX
 34808  	ADDSS (DX), X3
 34809  	MOVSS X3, (DX)
 34810  	LEAQ  (DX)(BX*4), DX
 34811  	ADDSS (DX), X4
 34812  	MOVSS X4, (DX)
 34813  	LEAQ  (DX)(BX*4), DX
 34814  	SUBQ  $0x04, SI
 34815  
 34816  check_limit_unroll:
 34817  	CMPQ SI, $0x04
 34818  	JHS  loop_unroll
 34819  	JMP  check_limit
 34820  
 34821  loop:
 34822  	MOVSS (AX), X1
 34823  	MULSS X0, X1
 34824  	ADDSS (DX), X1
 34825  	MOVSS X1, (DX)
 34826  	DECQ  SI
 34827  	LEAQ  (AX)(CX*4), AX
 34828  	LEAQ  (DX)(BX*4), DX
 34829  
 34830  check_limit:
 34831  	CMPQ SI, $0x00
 34832  	JHI  loop
 34833  	RET
 34834  
 34835  // func AmdAxpyPointerLoopXInterleave_V4A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34836  // Requires: SSE
 34837  TEXT ·AmdAxpyPointerLoopXInterleave_V4A9U4(SB), NOSPLIT, $0-48
 34838  	MOVSS alpha+0(FP), X0
 34839  	MOVQ  xs+8(FP), AX
 34840  	MOVQ  incx+16(FP), CX
 34841  	MOVQ  CX, DX
 34842  	SHLQ  $0x04, DX
 34843  	MOVQ  ys+24(FP), DX
 34844  	MOVQ  incy+32(FP), BX
 34845  	MOVQ  BX, SI
 34846  	SHLQ  $0x04, SI
 34847  	MOVQ  n+40(FP), SI
 34848  	JMP   check_limit_unroll
 34849  	PCALIGN $0x08
 34850  	NOP
 34851  
 34852  loop_unroll:
 34853  	MOVSS (AX), X1
 34854  	LEAQ  (AX)(CX*4), AX
 34855  	MOVSS (AX), X2
 34856  	LEAQ  (AX)(CX*4), AX
 34857  	MOVSS (AX), X3
 34858  	LEAQ  (AX)(CX*4), AX
 34859  	MOVSS (AX), X4
 34860  	LEAQ  (AX)(CX*4), AX
 34861  	MULSS X0, X1
 34862  	MULSS X0, X2
 34863  	MULSS X0, X3
 34864  	MULSS X0, X4
 34865  	ADDSS (DX), X1
 34866  	MOVSS X1, (DX)
 34867  	LEAQ  (DX)(BX*4), DX
 34868  	ADDSS (DX), X2
 34869  	MOVSS X2, (DX)
 34870  	LEAQ  (DX)(BX*4), DX
 34871  	ADDSS (DX), X3
 34872  	MOVSS X3, (DX)
 34873  	LEAQ  (DX)(BX*4), DX
 34874  	ADDSS (DX), X4
 34875  	MOVSS X4, (DX)
 34876  	LEAQ  (DX)(BX*4), DX
 34877  	SUBQ  $0x04, SI
 34878  
 34879  check_limit_unroll:
 34880  	CMPQ SI, $0x04
 34881  	JHS  loop_unroll
 34882  	JMP  check_limit
 34883  
 34884  loop:
 34885  	MOVSS (AX), X1
 34886  	MULSS X0, X1
 34887  	ADDSS (DX), X1
 34888  	MOVSS X1, (DX)
 34889  	DECQ  SI
 34890  	LEAQ  (AX)(CX*4), AX
 34891  	LEAQ  (DX)(BX*4), DX
 34892  
 34893  check_limit:
 34894  	CMPQ SI, $0x00
 34895  	JHI  loop
 34896  	RET
 34897  
 34898  // func AmdAxpyPointerLoopXInterleave_V5A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34899  // Requires: SSE
 34900  TEXT ·AmdAxpyPointerLoopXInterleave_V5A9U4(SB), NOSPLIT, $0-48
 34901  	MOVSS alpha+0(FP), X0
 34902  	MOVQ  xs+8(FP), AX
 34903  	MOVQ  incx+16(FP), CX
 34904  	MOVQ  CX, DX
 34905  	SHLQ  $0x04, DX
 34906  	MOVQ  ys+24(FP), DX
 34907  	MOVQ  incy+32(FP), BX
 34908  	MOVQ  BX, SI
 34909  	SHLQ  $0x04, SI
 34910  	MOVQ  n+40(FP), SI
 34911  	JMP   check_limit_unroll
 34912  	PCALIGN $0x08
 34913  	NOP
 34914  
 34915  loop_unroll:
 34916  	MOVSS (AX), X1
 34917  	LEAQ  (AX)(CX*4), AX
 34918  	MOVSS (AX), X2
 34919  	LEAQ  (AX)(CX*4), AX
 34920  	MOVSS (AX), X3
 34921  	LEAQ  (AX)(CX*4), AX
 34922  	MOVSS (AX), X4
 34923  	LEAQ  (AX)(CX*4), AX
 34924  	MULSS X0, X1
 34925  	MULSS X0, X2
 34926  	MULSS X0, X3
 34927  	MULSS X0, X4
 34928  	ADDSS (DX), X1
 34929  	MOVSS X1, (DX)
 34930  	LEAQ  (DX)(BX*4), DX
 34931  	ADDSS (DX), X2
 34932  	MOVSS X2, (DX)
 34933  	LEAQ  (DX)(BX*4), DX
 34934  	ADDSS (DX), X3
 34935  	MOVSS X3, (DX)
 34936  	LEAQ  (DX)(BX*4), DX
 34937  	ADDSS (DX), X4
 34938  	MOVSS X4, (DX)
 34939  	LEAQ  (DX)(BX*4), DX
 34940  	SUBQ  $0x04, SI
 34941  
 34942  check_limit_unroll:
 34943  	CMPQ SI, $0x04
 34944  	JHS  loop_unroll
 34945  	JMP  check_limit
 34946  
 34947  loop:
 34948  	MOVSS (AX), X1
 34949  	MULSS X0, X1
 34950  	ADDSS (DX), X1
 34951  	MOVSS X1, (DX)
 34952  	DECQ  SI
 34953  	LEAQ  (AX)(CX*4), AX
 34954  	LEAQ  (DX)(BX*4), DX
 34955  
 34956  check_limit:
 34957  	CMPQ SI, $0x00
 34958  	JHI  loop
 34959  	RET
 34960  
 34961  // func AmdAxpyPointerLoopXInterleave_V0A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 34962  // Requires: SSE
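// Padding here is PCALIGN $0x08 followed by two NOPs (the A10 placement
// variant); the loop body is unchanged.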
 34963  TEXT ·AmdAxpyPointerLoopXInterleave_V0A10U4(SB), NOSPLIT, $0-48
 34964  	MOVSS alpha+0(FP), X0
 34965  	MOVQ  xs+8(FP), AX
 34966  	MOVQ  incx+16(FP), CX
 34967  	MOVQ  CX, DX
 34968  	SHLQ  $0x04, DX
 34969  	MOVQ  ys+24(FP), DX
 34970  	MOVQ  incy+32(FP), BX
 34971  	MOVQ  BX, SI
 34972  	SHLQ  $0x04, SI
 34973  	MOVQ  n+40(FP), SI
 34974  	JMP   check_limit_unroll
 34975  	PCALIGN $0x08
 34976  	NOP
 34977  	NOP
 34978  
 34979  loop_unroll:
 34980  	MOVSS (AX), X1
 34981  	LEAQ  (AX)(CX*4), AX
 34982  	MOVSS (AX), X2
 34983  	LEAQ  (AX)(CX*4), AX
 34984  	MOVSS (AX), X3
 34985  	LEAQ  (AX)(CX*4), AX
 34986  	MOVSS (AX), X4
 34987  	LEAQ  (AX)(CX*4), AX
 34988  	MULSS X0, X1
 34989  	MULSS X0, X2
 34990  	MULSS X0, X3
 34991  	MULSS X0, X4
 34992  	ADDSS (DX), X1
 34993  	MOVSS X1, (DX)
 34994  	LEAQ  (DX)(BX*4), DX
 34995  	ADDSS (DX), X2
 34996  	MOVSS X2, (DX)
 34997  	LEAQ  (DX)(BX*4), DX
 34998  	ADDSS (DX), X3
 34999  	MOVSS X3, (DX)
 35000  	LEAQ  (DX)(BX*4), DX
 35001  	ADDSS (DX), X4
 35002  	MOVSS X4, (DX)
 35003  	LEAQ  (DX)(BX*4), DX
 35004  	SUBQ  $0x04, SI
 35005  
 35006  check_limit_unroll:
 35007  	CMPQ SI, $0x04
 35008  	JHS  loop_unroll
 35009  	JMP  check_limit
 35010  
 35011  loop:
 35012  	MOVSS (AX), X1
 35013  	MULSS X0, X1
 35014  	ADDSS (DX), X1
 35015  	MOVSS X1, (DX)
 35016  	DECQ  SI
 35017  	LEAQ  (AX)(CX*4), AX
 35018  	LEAQ  (DX)(BX*4), DX
 35019  
 35020  check_limit:
 35021  	CMPQ SI, $0x00
 35022  	JHI  loop
 35023  	RET
 35024  
 35025  // func AmdAxpyPointerLoopXInterleave_V1A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35026  // Requires: SSE
 35027  TEXT ·AmdAxpyPointerLoopXInterleave_V1A10U4(SB), NOSPLIT, $0-48
 35028  	MOVSS alpha+0(FP), X0
 35029  	MOVQ  xs+8(FP), AX
 35030  	MOVQ  incx+16(FP), CX
 35031  	MOVQ  CX, DX
 35032  	SHLQ  $0x04, DX
 35033  	MOVQ  ys+24(FP), DX
 35034  	MOVQ  incy+32(FP), BX
 35035  	MOVQ  BX, SI
 35036  	SHLQ  $0x04, SI
 35037  	MOVQ  n+40(FP), SI
 35038  	JMP   check_limit_unroll
 35039  	PCALIGN $0x08
 35040  	NOP
 35041  	NOP
 35042  
 35043  loop_unroll:
 35044  	MOVSS (AX), X1
 35045  	LEAQ  (AX)(CX*4), AX
 35046  	MOVSS (AX), X2
 35047  	LEAQ  (AX)(CX*4), AX
 35048  	MOVSS (AX), X3
 35049  	LEAQ  (AX)(CX*4), AX
 35050  	MOVSS (AX), X4
 35051  	LEAQ  (AX)(CX*4), AX
 35052  	MULSS X0, X1
 35053  	MULSS X0, X2
 35054  	MULSS X0, X3
 35055  	MULSS X0, X4
 35056  	ADDSS (DX), X1
 35057  	MOVSS X1, (DX)
 35058  	LEAQ  (DX)(BX*4), DX
 35059  	ADDSS (DX), X2
 35060  	MOVSS X2, (DX)
 35061  	LEAQ  (DX)(BX*4), DX
 35062  	ADDSS (DX), X3
 35063  	MOVSS X3, (DX)
 35064  	LEAQ  (DX)(BX*4), DX
 35065  	ADDSS (DX), X4
 35066  	MOVSS X4, (DX)
 35067  	LEAQ  (DX)(BX*4), DX
 35068  	SUBQ  $0x04, SI
 35069  
 35070  check_limit_unroll:
 35071  	CMPQ SI, $0x04
 35072  	JHS  loop_unroll
 35073  	JMP  check_limit
 35074  
 35075  loop:
 35076  	MOVSS (AX), X1
 35077  	MULSS X0, X1
 35078  	ADDSS (DX), X1
 35079  	MOVSS X1, (DX)
 35080  	DECQ  SI
 35081  	LEAQ  (AX)(CX*4), AX
 35082  	LEAQ  (DX)(BX*4), DX
 35083  
 35084  check_limit:
 35085  	CMPQ SI, $0x00
 35086  	JHI  loop
 35087  	RET
 35088  
 35089  // func AmdAxpyPointerLoopXInterleave_V2A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35090  // Requires: SSE
 35091  TEXT ·AmdAxpyPointerLoopXInterleave_V2A10U4(SB), NOSPLIT, $0-48
 35092  	MOVSS alpha+0(FP), X0
 35093  	MOVQ  xs+8(FP), AX
 35094  	MOVQ  incx+16(FP), CX
 35095  	MOVQ  CX, DX
 35096  	SHLQ  $0x04, DX
 35097  	MOVQ  ys+24(FP), DX
 35098  	MOVQ  incy+32(FP), BX
 35099  	MOVQ  BX, SI
 35100  	SHLQ  $0x04, SI
 35101  	MOVQ  n+40(FP), SI
 35102  	JMP   check_limit_unroll
 35103  	PCALIGN $0x08
 35104  	NOP
 35105  	NOP
 35106  
 35107  loop_unroll:
 35108  	MOVSS (AX), X1
 35109  	LEAQ  (AX)(CX*4), AX
 35110  	MOVSS (AX), X2
 35111  	LEAQ  (AX)(CX*4), AX
 35112  	MOVSS (AX), X3
 35113  	LEAQ  (AX)(CX*4), AX
 35114  	MOVSS (AX), X4
 35115  	LEAQ  (AX)(CX*4), AX
 35116  	MULSS X0, X1
 35117  	MULSS X0, X2
 35118  	MULSS X0, X3
 35119  	MULSS X0, X4
 35120  	ADDSS (DX), X1
 35121  	MOVSS X1, (DX)
 35122  	LEAQ  (DX)(BX*4), DX
 35123  	ADDSS (DX), X2
 35124  	MOVSS X2, (DX)
 35125  	LEAQ  (DX)(BX*4), DX
 35126  	ADDSS (DX), X3
 35127  	MOVSS X3, (DX)
 35128  	LEAQ  (DX)(BX*4), DX
 35129  	ADDSS (DX), X4
 35130  	MOVSS X4, (DX)
 35131  	LEAQ  (DX)(BX*4), DX
 35132  	SUBQ  $0x04, SI
 35133  
 35134  check_limit_unroll:
 35135  	CMPQ SI, $0x04
 35136  	JHS  loop_unroll
 35137  	JMP  check_limit
 35138  
 35139  loop:
 35140  	MOVSS (AX), X1
 35141  	MULSS X0, X1
 35142  	ADDSS (DX), X1
 35143  	MOVSS X1, (DX)
 35144  	DECQ  SI
 35145  	LEAQ  (AX)(CX*4), AX
 35146  	LEAQ  (DX)(BX*4), DX
 35147  
 35148  check_limit:
 35149  	CMPQ SI, $0x00
 35150  	JHI  loop
 35151  	RET
 35152  
 35153  // func AmdAxpyPointerLoopXInterleave_V3A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35154  // Requires: SSE
 35155  TEXT ·AmdAxpyPointerLoopXInterleave_V3A10U4(SB), NOSPLIT, $0-48
 35156  	MOVSS alpha+0(FP), X0
 35157  	MOVQ  xs+8(FP), AX
 35158  	MOVQ  incx+16(FP), CX
 35159  	MOVQ  CX, DX
 35160  	SHLQ  $0x04, DX
 35161  	MOVQ  ys+24(FP), DX
 35162  	MOVQ  incy+32(FP), BX
 35163  	MOVQ  BX, SI
 35164  	SHLQ  $0x04, SI
 35165  	MOVQ  n+40(FP), SI
 35166  	JMP   check_limit_unroll
 35167  	PCALIGN $0x08
 35168  	NOP
 35169  	NOP
 35170  
 35171  loop_unroll:
 35172  	MOVSS (AX), X1
 35173  	LEAQ  (AX)(CX*4), AX
 35174  	MOVSS (AX), X2
 35175  	LEAQ  (AX)(CX*4), AX
 35176  	MOVSS (AX), X3
 35177  	LEAQ  (AX)(CX*4), AX
 35178  	MOVSS (AX), X4
 35179  	LEAQ  (AX)(CX*4), AX
 35180  	MULSS X0, X1
 35181  	MULSS X0, X2
 35182  	MULSS X0, X3
 35183  	MULSS X0, X4
 35184  	ADDSS (DX), X1
 35185  	MOVSS X1, (DX)
 35186  	LEAQ  (DX)(BX*4), DX
 35187  	ADDSS (DX), X2
 35188  	MOVSS X2, (DX)
 35189  	LEAQ  (DX)(BX*4), DX
 35190  	ADDSS (DX), X3
 35191  	MOVSS X3, (DX)
 35192  	LEAQ  (DX)(BX*4), DX
 35193  	ADDSS (DX), X4
 35194  	MOVSS X4, (DX)
 35195  	LEAQ  (DX)(BX*4), DX
 35196  	SUBQ  $0x04, SI
 35197  
 35198  check_limit_unroll:
 35199  	CMPQ SI, $0x04
 35200  	JHS  loop_unroll
 35201  	JMP  check_limit
 35202  
 35203  loop:
 35204  	MOVSS (AX), X1
 35205  	MULSS X0, X1
 35206  	ADDSS (DX), X1
 35207  	MOVSS X1, (DX)
 35208  	DECQ  SI
 35209  	LEAQ  (AX)(CX*4), AX
 35210  	LEAQ  (DX)(BX*4), DX
 35211  
 35212  check_limit:
 35213  	CMPQ SI, $0x00
 35214  	JHI  loop
 35215  	RET
 35216  
 35217  // func AmdAxpyPointerLoopXInterleave_V4A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35218  // Requires: SSE
 35219  TEXT ·AmdAxpyPointerLoopXInterleave_V4A10U4(SB), NOSPLIT, $0-48
 35220  	MOVSS alpha+0(FP), X0
 35221  	MOVQ  xs+8(FP), AX
 35222  	MOVQ  incx+16(FP), CX
 35223  	MOVQ  CX, DX
 35224  	SHLQ  $0x04, DX
 35225  	MOVQ  ys+24(FP), DX
 35226  	MOVQ  incy+32(FP), BX
 35227  	MOVQ  BX, SI
 35228  	SHLQ  $0x04, SI
 35229  	MOVQ  n+40(FP), SI
 35230  	JMP   check_limit_unroll
 35231  	PCALIGN $0x08
 35232  	NOP
 35233  	NOP
 35234  
 35235  loop_unroll:
 35236  	MOVSS (AX), X1
 35237  	LEAQ  (AX)(CX*4), AX
 35238  	MOVSS (AX), X2
 35239  	LEAQ  (AX)(CX*4), AX
 35240  	MOVSS (AX), X3
 35241  	LEAQ  (AX)(CX*4), AX
 35242  	MOVSS (AX), X4
 35243  	LEAQ  (AX)(CX*4), AX
 35244  	MULSS X0, X1
 35245  	MULSS X0, X2
 35246  	MULSS X0, X3
 35247  	MULSS X0, X4
 35248  	ADDSS (DX), X1
 35249  	MOVSS X1, (DX)
 35250  	LEAQ  (DX)(BX*4), DX
 35251  	ADDSS (DX), X2
 35252  	MOVSS X2, (DX)
 35253  	LEAQ  (DX)(BX*4), DX
 35254  	ADDSS (DX), X3
 35255  	MOVSS X3, (DX)
 35256  	LEAQ  (DX)(BX*4), DX
 35257  	ADDSS (DX), X4
 35258  	MOVSS X4, (DX)
 35259  	LEAQ  (DX)(BX*4), DX
 35260  	SUBQ  $0x04, SI
 35261  
 35262  check_limit_unroll:
 35263  	CMPQ SI, $0x04
 35264  	JHS  loop_unroll
 35265  	JMP  check_limit
 35266  
 35267  loop:
 35268  	MOVSS (AX), X1
 35269  	MULSS X0, X1
 35270  	ADDSS (DX), X1
 35271  	MOVSS X1, (DX)
 35272  	DECQ  SI
 35273  	LEAQ  (AX)(CX*4), AX
 35274  	LEAQ  (DX)(BX*4), DX
 35275  
 35276  check_limit:
 35277  	CMPQ SI, $0x00
 35278  	JHI  loop
 35279  	RET
 35280  
 35281  // func AmdAxpyPointerLoopXInterleave_V5A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35282  // Requires: SSE
 35283  TEXT ·AmdAxpyPointerLoopXInterleave_V5A10U4(SB), NOSPLIT, $0-48
 35284  	MOVSS alpha+0(FP), X0
 35285  	MOVQ  xs+8(FP), AX
 35286  	MOVQ  incx+16(FP), CX
 35287  	MOVQ  CX, DX
 35288  	SHLQ  $0x04, DX
 35289  	MOVQ  ys+24(FP), DX
 35290  	MOVQ  incy+32(FP), BX
 35291  	MOVQ  BX, SI
 35292  	SHLQ  $0x04, SI
 35293  	MOVQ  n+40(FP), SI
 35294  	JMP   check_limit_unroll
 35295  	PCALIGN $0x08
 35296  	NOP
 35297  	NOP
 35298  
 35299  loop_unroll:
 35300  	MOVSS (AX), X1
 35301  	LEAQ  (AX)(CX*4), AX
 35302  	MOVSS (AX), X2
 35303  	LEAQ  (AX)(CX*4), AX
 35304  	MOVSS (AX), X3
 35305  	LEAQ  (AX)(CX*4), AX
 35306  	MOVSS (AX), X4
 35307  	LEAQ  (AX)(CX*4), AX
 35308  	MULSS X0, X1
 35309  	MULSS X0, X2
 35310  	MULSS X0, X3
 35311  	MULSS X0, X4
 35312  	ADDSS (DX), X1
 35313  	MOVSS X1, (DX)
 35314  	LEAQ  (DX)(BX*4), DX
 35315  	ADDSS (DX), X2
 35316  	MOVSS X2, (DX)
 35317  	LEAQ  (DX)(BX*4), DX
 35318  	ADDSS (DX), X3
 35319  	MOVSS X3, (DX)
 35320  	LEAQ  (DX)(BX*4), DX
 35321  	ADDSS (DX), X4
 35322  	MOVSS X4, (DX)
 35323  	LEAQ  (DX)(BX*4), DX
 35324  	SUBQ  $0x04, SI
 35325  
 35326  check_limit_unroll:
 35327  	CMPQ SI, $0x04
 35328  	JHS  loop_unroll
 35329  	JMP  check_limit
 35330  
 35331  loop:
 35332  	MOVSS (AX), X1
 35333  	MULSS X0, X1
 35334  	ADDSS (DX), X1
 35335  	MOVSS X1, (DX)
 35336  	DECQ  SI
 35337  	LEAQ  (AX)(CX*4), AX
 35338  	LEAQ  (DX)(BX*4), DX
 35339  
 35340  check_limit:
 35341  	CMPQ SI, $0x00
 35342  	JHI  loop
 35343  	RET
 35344  
 35345  // func AmdAxpyPointerLoopXInterleave_V0A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35346  // Requires: SSE
 35347  TEXT ·AmdAxpyPointerLoopXInterleave_V0A11U4(SB), NOSPLIT, $0-48
 35348  	MOVSS alpha+0(FP), X0
 35349  	MOVQ  xs+8(FP), AX
 35350  	MOVQ  incx+16(FP), CX
 35351  	MOVQ  CX, DX
 35352  	SHLQ  $0x04, DX
 35353  	MOVQ  ys+24(FP), DX
 35354  	MOVQ  incy+32(FP), BX
 35355  	MOVQ  BX, SI
 35356  	SHLQ  $0x04, SI
 35357  	MOVQ  n+40(FP), SI
 35358  	JMP   check_limit_unroll
 35359  	PCALIGN $0x08
 35360  	NOP
 35361  	NOP
 35362  	NOP
 35363  
 35364  loop_unroll:
 35365  	MOVSS (AX), X1
 35366  	LEAQ  (AX)(CX*4), AX
 35367  	MOVSS (AX), X2
 35368  	LEAQ  (AX)(CX*4), AX
 35369  	MOVSS (AX), X3
 35370  	LEAQ  (AX)(CX*4), AX
 35371  	MOVSS (AX), X4
 35372  	LEAQ  (AX)(CX*4), AX
 35373  	MULSS X0, X1
 35374  	MULSS X0, X2
 35375  	MULSS X0, X3
 35376  	MULSS X0, X4
 35377  	ADDSS (DX), X1
 35378  	MOVSS X1, (DX)
 35379  	LEAQ  (DX)(BX*4), DX
 35380  	ADDSS (DX), X2
 35381  	MOVSS X2, (DX)
 35382  	LEAQ  (DX)(BX*4), DX
 35383  	ADDSS (DX), X3
 35384  	MOVSS X3, (DX)
 35385  	LEAQ  (DX)(BX*4), DX
 35386  	ADDSS (DX), X4
 35387  	MOVSS X4, (DX)
 35388  	LEAQ  (DX)(BX*4), DX
 35389  	SUBQ  $0x04, SI
 35390  
 35391  check_limit_unroll:
 35392  	CMPQ SI, $0x04
 35393  	JHS  loop_unroll
 35394  	JMP  check_limit
 35395  
 35396  loop:
 35397  	MOVSS (AX), X1
 35398  	MULSS X0, X1
 35399  	ADDSS (DX), X1
 35400  	MOVSS X1, (DX)
 35401  	DECQ  SI
 35402  	LEAQ  (AX)(CX*4), AX
 35403  	LEAQ  (DX)(BX*4), DX
 35404  
 35405  check_limit:
 35406  	CMPQ SI, $0x00
 35407  	JHI  loop
 35408  	RET
 35409  
 35410  // func AmdAxpyPointerLoopXInterleave_V1A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35411  // Requires: SSE
 35412  TEXT ·AmdAxpyPointerLoopXInterleave_V1A11U4(SB), NOSPLIT, $0-48
 35413  	MOVSS alpha+0(FP), X0
 35414  	MOVQ  xs+8(FP), AX
 35415  	MOVQ  incx+16(FP), CX
 35416  	MOVQ  CX, DX
 35417  	SHLQ  $0x04, DX
 35418  	MOVQ  ys+24(FP), DX
 35419  	MOVQ  incy+32(FP), BX
 35420  	MOVQ  BX, SI
 35421  	SHLQ  $0x04, SI
 35422  	MOVQ  n+40(FP), SI
 35423  	JMP   check_limit_unroll
 35424  	PCALIGN $0x08
 35425  	NOP
 35426  	NOP
 35427  	NOP
 35428  
 35429  loop_unroll:
 35430  	MOVSS (AX), X1
 35431  	LEAQ  (AX)(CX*4), AX
 35432  	MOVSS (AX), X2
 35433  	LEAQ  (AX)(CX*4), AX
 35434  	MOVSS (AX), X3
 35435  	LEAQ  (AX)(CX*4), AX
 35436  	MOVSS (AX), X4
 35437  	LEAQ  (AX)(CX*4), AX
 35438  	MULSS X0, X1
 35439  	MULSS X0, X2
 35440  	MULSS X0, X3
 35441  	MULSS X0, X4
 35442  	ADDSS (DX), X1
 35443  	MOVSS X1, (DX)
 35444  	LEAQ  (DX)(BX*4), DX
 35445  	ADDSS (DX), X2
 35446  	MOVSS X2, (DX)
 35447  	LEAQ  (DX)(BX*4), DX
 35448  	ADDSS (DX), X3
 35449  	MOVSS X3, (DX)
 35450  	LEAQ  (DX)(BX*4), DX
 35451  	ADDSS (DX), X4
 35452  	MOVSS X4, (DX)
 35453  	LEAQ  (DX)(BX*4), DX
 35454  	SUBQ  $0x04, SI
 35455  
 35456  check_limit_unroll:
 35457  	CMPQ SI, $0x04
 35458  	JHS  loop_unroll
 35459  	JMP  check_limit
 35460  
 35461  loop:
 35462  	MOVSS (AX), X1
 35463  	MULSS X0, X1
 35464  	ADDSS (DX), X1
 35465  	MOVSS X1, (DX)
 35466  	DECQ  SI
 35467  	LEAQ  (AX)(CX*4), AX
 35468  	LEAQ  (DX)(BX*4), DX
 35469  
 35470  check_limit:
 35471  	CMPQ SI, $0x00
 35472  	JHI  loop
 35473  	RET
 35474  
 35475  // func AmdAxpyPointerLoopXInterleave_V2A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35476  // Requires: SSE
 35477  TEXT ·AmdAxpyPointerLoopXInterleave_V2A11U4(SB), NOSPLIT, $0-48
 35478  	MOVSS alpha+0(FP), X0
 35479  	MOVQ  xs+8(FP), AX
 35480  	MOVQ  incx+16(FP), CX
 35481  	MOVQ  CX, DX
 35482  	SHLQ  $0x04, DX
 35483  	MOVQ  ys+24(FP), DX
 35484  	MOVQ  incy+32(FP), BX
 35485  	MOVQ  BX, SI
 35486  	SHLQ  $0x04, SI
 35487  	MOVQ  n+40(FP), SI
 35488  	JMP   check_limit_unroll
 35489  	PCALIGN $0x08
 35490  	NOP
 35491  	NOP
 35492  	NOP
 35493  
 35494  loop_unroll:
 35495  	MOVSS (AX), X1
 35496  	LEAQ  (AX)(CX*4), AX
 35497  	MOVSS (AX), X2
 35498  	LEAQ  (AX)(CX*4), AX
 35499  	MOVSS (AX), X3
 35500  	LEAQ  (AX)(CX*4), AX
 35501  	MOVSS (AX), X4
 35502  	LEAQ  (AX)(CX*4), AX
 35503  	MULSS X0, X1
 35504  	MULSS X0, X2
 35505  	MULSS X0, X3
 35506  	MULSS X0, X4
 35507  	ADDSS (DX), X1
 35508  	MOVSS X1, (DX)
 35509  	LEAQ  (DX)(BX*4), DX
 35510  	ADDSS (DX), X2
 35511  	MOVSS X2, (DX)
 35512  	LEAQ  (DX)(BX*4), DX
 35513  	ADDSS (DX), X3
 35514  	MOVSS X3, (DX)
 35515  	LEAQ  (DX)(BX*4), DX
 35516  	ADDSS (DX), X4
 35517  	MOVSS X4, (DX)
 35518  	LEAQ  (DX)(BX*4), DX
 35519  	SUBQ  $0x04, SI
 35520  
 35521  check_limit_unroll:
 35522  	CMPQ SI, $0x04
 35523  	JHS  loop_unroll
 35524  	JMP  check_limit
 35525  
 35526  loop:
 35527  	MOVSS (AX), X1
 35528  	MULSS X0, X1
 35529  	ADDSS (DX), X1
 35530  	MOVSS X1, (DX)
 35531  	DECQ  SI
 35532  	LEAQ  (AX)(CX*4), AX
 35533  	LEAQ  (DX)(BX*4), DX
 35534  
 35535  check_limit:
 35536  	CMPQ SI, $0x00
 35537  	JHI  loop
 35538  	RET
 35539  
 35540  // func AmdAxpyPointerLoopXInterleave_V3A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35541  // Requires: SSE
 35542  TEXT ·AmdAxpyPointerLoopXInterleave_V3A11U4(SB), NOSPLIT, $0-48
 35543  	MOVSS alpha+0(FP), X0
 35544  	MOVQ  xs+8(FP), AX
 35545  	MOVQ  incx+16(FP), CX
 35546  	MOVQ  CX, DX
 35547  	SHLQ  $0x04, DX
 35548  	MOVQ  ys+24(FP), DX
 35549  	MOVQ  incy+32(FP), BX
 35550  	MOVQ  BX, SI
 35551  	SHLQ  $0x04, SI
 35552  	MOVQ  n+40(FP), SI
 35553  	JMP   check_limit_unroll
 35554  	PCALIGN $0x08
 35555  	NOP
 35556  	NOP
 35557  	NOP
 35558  
 35559  loop_unroll:
 35560  	MOVSS (AX), X1
 35561  	LEAQ  (AX)(CX*4), AX
 35562  	MOVSS (AX), X2
 35563  	LEAQ  (AX)(CX*4), AX
 35564  	MOVSS (AX), X3
 35565  	LEAQ  (AX)(CX*4), AX
 35566  	MOVSS (AX), X4
 35567  	LEAQ  (AX)(CX*4), AX
 35568  	MULSS X0, X1
 35569  	MULSS X0, X2
 35570  	MULSS X0, X3
 35571  	MULSS X0, X4
 35572  	ADDSS (DX), X1
 35573  	MOVSS X1, (DX)
 35574  	LEAQ  (DX)(BX*4), DX
 35575  	ADDSS (DX), X2
 35576  	MOVSS X2, (DX)
 35577  	LEAQ  (DX)(BX*4), DX
 35578  	ADDSS (DX), X3
 35579  	MOVSS X3, (DX)
 35580  	LEAQ  (DX)(BX*4), DX
 35581  	ADDSS (DX), X4
 35582  	MOVSS X4, (DX)
 35583  	LEAQ  (DX)(BX*4), DX
 35584  	SUBQ  $0x04, SI
 35585  
 35586  check_limit_unroll:
 35587  	CMPQ SI, $0x04
 35588  	JHS  loop_unroll
 35589  	JMP  check_limit
 35590  
 35591  loop:
 35592  	MOVSS (AX), X1
 35593  	MULSS X0, X1
 35594  	ADDSS (DX), X1
 35595  	MOVSS X1, (DX)
 35596  	DECQ  SI
 35597  	LEAQ  (AX)(CX*4), AX
 35598  	LEAQ  (DX)(BX*4), DX
 35599  
 35600  check_limit:
 35601  	CMPQ SI, $0x00
 35602  	JHI  loop
 35603  	RET
 35604  
 35605  // func AmdAxpyPointerLoopXInterleave_V4A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35606  // Requires: SSE
 35607  TEXT ·AmdAxpyPointerLoopXInterleave_V4A11U4(SB), NOSPLIT, $0-48
 35608  	MOVSS alpha+0(FP), X0
 35609  	MOVQ  xs+8(FP), AX
 35610  	MOVQ  incx+16(FP), CX
 35611  	MOVQ  CX, DX
 35612  	SHLQ  $0x04, DX
 35613  	MOVQ  ys+24(FP), DX
 35614  	MOVQ  incy+32(FP), BX
 35615  	MOVQ  BX, SI
 35616  	SHLQ  $0x04, SI
 35617  	MOVQ  n+40(FP), SI
 35618  	JMP   check_limit_unroll
 35619  	PCALIGN $0x08
 35620  	NOP
 35621  	NOP
 35622  	NOP
 35623  
 35624  loop_unroll:
 35625  	MOVSS (AX), X1
 35626  	LEAQ  (AX)(CX*4), AX
 35627  	MOVSS (AX), X2
 35628  	LEAQ  (AX)(CX*4), AX
 35629  	MOVSS (AX), X3
 35630  	LEAQ  (AX)(CX*4), AX
 35631  	MOVSS (AX), X4
 35632  	LEAQ  (AX)(CX*4), AX
 35633  	MULSS X0, X1
 35634  	MULSS X0, X2
 35635  	MULSS X0, X3
 35636  	MULSS X0, X4
 35637  	ADDSS (DX), X1
 35638  	MOVSS X1, (DX)
 35639  	LEAQ  (DX)(BX*4), DX
 35640  	ADDSS (DX), X2
 35641  	MOVSS X2, (DX)
 35642  	LEAQ  (DX)(BX*4), DX
 35643  	ADDSS (DX), X3
 35644  	MOVSS X3, (DX)
 35645  	LEAQ  (DX)(BX*4), DX
 35646  	ADDSS (DX), X4
 35647  	MOVSS X4, (DX)
 35648  	LEAQ  (DX)(BX*4), DX
 35649  	SUBQ  $0x04, SI
 35650  
 35651  check_limit_unroll:
 35652  	CMPQ SI, $0x04
 35653  	JHS  loop_unroll
 35654  	JMP  check_limit
 35655  
 35656  loop:
 35657  	MOVSS (AX), X1
 35658  	MULSS X0, X1
 35659  	ADDSS (DX), X1
 35660  	MOVSS X1, (DX)
 35661  	DECQ  SI
 35662  	LEAQ  (AX)(CX*4), AX
 35663  	LEAQ  (DX)(BX*4), DX
 35664  
 35665  check_limit:
 35666  	CMPQ SI, $0x00
 35667  	JHI  loop
 35668  	RET
 35669  
 35670  // func AmdAxpyPointerLoopXInterleave_V5A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35671  // Requires: SSE
 35672  TEXT ·AmdAxpyPointerLoopXInterleave_V5A11U4(SB), NOSPLIT, $0-48
 35673  	MOVSS alpha+0(FP), X0
 35674  	MOVQ  xs+8(FP), AX
 35675  	MOVQ  incx+16(FP), CX
 35676  	MOVQ  CX, DX
 35677  	SHLQ  $0x04, DX
 35678  	MOVQ  ys+24(FP), DX
 35679  	MOVQ  incy+32(FP), BX
 35680  	MOVQ  BX, SI
 35681  	SHLQ  $0x04, SI
 35682  	MOVQ  n+40(FP), SI
 35683  	JMP   check_limit_unroll
 35684  	PCALIGN $0x08
 35685  	NOP
 35686  	NOP
 35687  	NOP
 35688  
 35689  loop_unroll:
 35690  	MOVSS (AX), X1
 35691  	LEAQ  (AX)(CX*4), AX
 35692  	MOVSS (AX), X2
 35693  	LEAQ  (AX)(CX*4), AX
 35694  	MOVSS (AX), X3
 35695  	LEAQ  (AX)(CX*4), AX
 35696  	MOVSS (AX), X4
 35697  	LEAQ  (AX)(CX*4), AX
 35698  	MULSS X0, X1
 35699  	MULSS X0, X2
 35700  	MULSS X0, X3
 35701  	MULSS X0, X4
 35702  	ADDSS (DX), X1
 35703  	MOVSS X1, (DX)
 35704  	LEAQ  (DX)(BX*4), DX
 35705  	ADDSS (DX), X2
 35706  	MOVSS X2, (DX)
 35707  	LEAQ  (DX)(BX*4), DX
 35708  	ADDSS (DX), X3
 35709  	MOVSS X3, (DX)
 35710  	LEAQ  (DX)(BX*4), DX
 35711  	ADDSS (DX), X4
 35712  	MOVSS X4, (DX)
 35713  	LEAQ  (DX)(BX*4), DX
 35714  	SUBQ  $0x04, SI
 35715  
 35716  check_limit_unroll:
 35717  	CMPQ SI, $0x04
 35718  	JHS  loop_unroll
 35719  	JMP  check_limit
 35720  
 35721  loop:
 35722  	MOVSS (AX), X1
 35723  	MULSS X0, X1
 35724  	ADDSS (DX), X1
 35725  	MOVSS X1, (DX)
 35726  	DECQ  SI
 35727  	LEAQ  (AX)(CX*4), AX
 35728  	LEAQ  (DX)(BX*4), DX
 35729  
 35730  check_limit:
 35731  	CMPQ SI, $0x00
 35732  	JHI  loop
 35733  	RET
 35734  
 35735  // func AmdAxpyPointerLoopXInterleave_V0A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35736  // Requires: SSE
 35737  TEXT ·AmdAxpyPointerLoopXInterleave_V0A12U4(SB), NOSPLIT, $0-48
 35738  	MOVSS alpha+0(FP), X0
 35739  	MOVQ  xs+8(FP), AX
 35740  	MOVQ  incx+16(FP), CX
 35741  	MOVQ  CX, DX
 35742  	SHLQ  $0x04, DX
 35743  	MOVQ  ys+24(FP), DX
 35744  	MOVQ  incy+32(FP), BX
 35745  	MOVQ  BX, SI
 35746  	SHLQ  $0x04, SI
 35747  	MOVQ  n+40(FP), SI
 35748  	JMP   check_limit_unroll
 35749  	PCALIGN $0x08
 35750  	NOP
 35751  	NOP
 35752  	NOP
 35753  	NOP
 35754  
 35755  loop_unroll:
 35756  	MOVSS (AX), X1
 35757  	LEAQ  (AX)(CX*4), AX
 35758  	MOVSS (AX), X2
 35759  	LEAQ  (AX)(CX*4), AX
 35760  	MOVSS (AX), X3
 35761  	LEAQ  (AX)(CX*4), AX
 35762  	MOVSS (AX), X4
 35763  	LEAQ  (AX)(CX*4), AX
 35764  	MULSS X0, X1
 35765  	MULSS X0, X2
 35766  	MULSS X0, X3
 35767  	MULSS X0, X4
 35768  	ADDSS (DX), X1
 35769  	MOVSS X1, (DX)
 35770  	LEAQ  (DX)(BX*4), DX
 35771  	ADDSS (DX), X2
 35772  	MOVSS X2, (DX)
 35773  	LEAQ  (DX)(BX*4), DX
 35774  	ADDSS (DX), X3
 35775  	MOVSS X3, (DX)
 35776  	LEAQ  (DX)(BX*4), DX
 35777  	ADDSS (DX), X4
 35778  	MOVSS X4, (DX)
 35779  	LEAQ  (DX)(BX*4), DX
 35780  	SUBQ  $0x04, SI
 35781  
 35782  check_limit_unroll:
 35783  	CMPQ SI, $0x04
 35784  	JHS  loop_unroll
 35785  	JMP  check_limit
 35786  
 35787  loop:
 35788  	MOVSS (AX), X1
 35789  	MULSS X0, X1
 35790  	ADDSS (DX), X1
 35791  	MOVSS X1, (DX)
 35792  	DECQ  SI
 35793  	LEAQ  (AX)(CX*4), AX
 35794  	LEAQ  (DX)(BX*4), DX
 35795  
 35796  check_limit:
 35797  	CMPQ SI, $0x00
 35798  	JHI  loop
 35799  	RET
 35800  
 35801  // func AmdAxpyPointerLoopXInterleave_V1A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35802  // Requires: SSE
 35803  TEXT ·AmdAxpyPointerLoopXInterleave_V1A12U4(SB), NOSPLIT, $0-48
 35804  	MOVSS alpha+0(FP), X0
 35805  	MOVQ  xs+8(FP), AX
 35806  	MOVQ  incx+16(FP), CX
 35807  	MOVQ  CX, DX
 35808  	SHLQ  $0x04, DX
 35809  	MOVQ  ys+24(FP), DX
 35810  	MOVQ  incy+32(FP), BX
 35811  	MOVQ  BX, SI
 35812  	SHLQ  $0x04, SI
 35813  	MOVQ  n+40(FP), SI
 35814  	JMP   check_limit_unroll
 35815  	PCALIGN $0x08
 35816  	NOP
 35817  	NOP
 35818  	NOP
 35819  	NOP
 35820  
 35821  loop_unroll:
 35822  	MOVSS (AX), X1
 35823  	LEAQ  (AX)(CX*4), AX
 35824  	MOVSS (AX), X2
 35825  	LEAQ  (AX)(CX*4), AX
 35826  	MOVSS (AX), X3
 35827  	LEAQ  (AX)(CX*4), AX
 35828  	MOVSS (AX), X4
 35829  	LEAQ  (AX)(CX*4), AX
 35830  	MULSS X0, X1
 35831  	MULSS X0, X2
 35832  	MULSS X0, X3
 35833  	MULSS X0, X4
 35834  	ADDSS (DX), X1
 35835  	MOVSS X1, (DX)
 35836  	LEAQ  (DX)(BX*4), DX
 35837  	ADDSS (DX), X2
 35838  	MOVSS X2, (DX)
 35839  	LEAQ  (DX)(BX*4), DX
 35840  	ADDSS (DX), X3
 35841  	MOVSS X3, (DX)
 35842  	LEAQ  (DX)(BX*4), DX
 35843  	ADDSS (DX), X4
 35844  	MOVSS X4, (DX)
 35845  	LEAQ  (DX)(BX*4), DX
 35846  	SUBQ  $0x04, SI
 35847  
 35848  check_limit_unroll:
 35849  	CMPQ SI, $0x04
 35850  	JHS  loop_unroll
 35851  	JMP  check_limit
 35852  
 35853  loop:
 35854  	MOVSS (AX), X1
 35855  	MULSS X0, X1
 35856  	ADDSS (DX), X1
 35857  	MOVSS X1, (DX)
 35858  	DECQ  SI
 35859  	LEAQ  (AX)(CX*4), AX
 35860  	LEAQ  (DX)(BX*4), DX
 35861  
 35862  check_limit:
 35863  	CMPQ SI, $0x00
 35864  	JHI  loop
 35865  	RET
 35866  
 35867  // func AmdAxpyPointerLoopXInterleave_V2A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35868  // Requires: SSE
 35869  TEXT ·AmdAxpyPointerLoopXInterleave_V2A12U4(SB), NOSPLIT, $0-48
 35870  	MOVSS alpha+0(FP), X0
 35871  	MOVQ  xs+8(FP), AX
 35872  	MOVQ  incx+16(FP), CX
 35873  	MOVQ  CX, DX
 35874  	SHLQ  $0x04, DX
 35875  	MOVQ  ys+24(FP), DX
 35876  	MOVQ  incy+32(FP), BX
 35877  	MOVQ  BX, SI
 35878  	SHLQ  $0x04, SI
 35879  	MOVQ  n+40(FP), SI
 35880  	JMP   check_limit_unroll
 35881  	PCALIGN $0x08
 35882  	NOP
 35883  	NOP
 35884  	NOP
 35885  	NOP
 35886  
 35887  loop_unroll:
 35888  	MOVSS (AX), X1
 35889  	LEAQ  (AX)(CX*4), AX
 35890  	MOVSS (AX), X2
 35891  	LEAQ  (AX)(CX*4), AX
 35892  	MOVSS (AX), X3
 35893  	LEAQ  (AX)(CX*4), AX
 35894  	MOVSS (AX), X4
 35895  	LEAQ  (AX)(CX*4), AX
 35896  	MULSS X0, X1
 35897  	MULSS X0, X2
 35898  	MULSS X0, X3
 35899  	MULSS X0, X4
 35900  	ADDSS (DX), X1
 35901  	MOVSS X1, (DX)
 35902  	LEAQ  (DX)(BX*4), DX
 35903  	ADDSS (DX), X2
 35904  	MOVSS X2, (DX)
 35905  	LEAQ  (DX)(BX*4), DX
 35906  	ADDSS (DX), X3
 35907  	MOVSS X3, (DX)
 35908  	LEAQ  (DX)(BX*4), DX
 35909  	ADDSS (DX), X4
 35910  	MOVSS X4, (DX)
 35911  	LEAQ  (DX)(BX*4), DX
 35912  	SUBQ  $0x04, SI
 35913  
 35914  check_limit_unroll:
 35915  	CMPQ SI, $0x04
 35916  	JHS  loop_unroll
 35917  	JMP  check_limit
 35918  
 35919  loop:
 35920  	MOVSS (AX), X1
 35921  	MULSS X0, X1
 35922  	ADDSS (DX), X1
 35923  	MOVSS X1, (DX)
 35924  	DECQ  SI
 35925  	LEAQ  (AX)(CX*4), AX
 35926  	LEAQ  (DX)(BX*4), DX
 35927  
 35928  check_limit:
 35929  	CMPQ SI, $0x00
 35930  	JHI  loop
 35931  	RET
 35932  
 35933  // func AmdAxpyPointerLoopXInterleave_V3A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 35934  // Requires: SSE
 35935  TEXT ·AmdAxpyPointerLoopXInterleave_V3A12U4(SB), NOSPLIT, $0-48
 35936  	MOVSS alpha+0(FP), X0
 35937  	MOVQ  xs+8(FP), AX
 35938  	MOVQ  incx+16(FP), CX
 35939  	MOVQ  CX, DX
 35940  	SHLQ  $0x04, DX
 35941  	MOVQ  ys+24(FP), DX
 35942  	MOVQ  incy+32(FP), BX
 35943  	MOVQ  BX, SI
 35944  	SHLQ  $0x04, SI
 35945  	MOVQ  n+40(FP), SI
 35946  	JMP   check_limit_unroll
 35947  	PCALIGN $0x08
 35948  	NOP
 35949  	NOP
 35950  	NOP
 35951  	NOP
 35952  
 35953  loop_unroll:
 35954  	MOVSS (AX), X1
 35955  	LEAQ  (AX)(CX*4), AX
 35956  	MOVSS (AX), X2
 35957  	LEAQ  (AX)(CX*4), AX
 35958  	MOVSS (AX), X3
 35959  	LEAQ  (AX)(CX*4), AX
 35960  	MOVSS (AX), X4
 35961  	LEAQ  (AX)(CX*4), AX
 35962  	MULSS X0, X1
 35963  	MULSS X0, X2
 35964  	MULSS X0, X3
 35965  	MULSS X0, X4
 35966  	ADDSS (DX), X1
 35967  	MOVSS X1, (DX)
 35968  	LEAQ  (DX)(BX*4), DX
 35969  	ADDSS (DX), X2
 35970  	MOVSS X2, (DX)
 35971  	LEAQ  (DX)(BX*4), DX
 35972  	ADDSS (DX), X3
 35973  	MOVSS X3, (DX)
 35974  	LEAQ  (DX)(BX*4), DX
 35975  	ADDSS (DX), X4
 35976  	MOVSS X4, (DX)
 35977  	LEAQ  (DX)(BX*4), DX
 35978  	SUBQ  $0x04, SI
 35979  
 35980  check_limit_unroll:
 35981  	CMPQ SI, $0x04
 35982  	JHS  loop_unroll
 35983  	JMP  check_limit
 35984  
 35985  loop:
 35986  	MOVSS (AX), X1
 35987  	MULSS X0, X1
 35988  	ADDSS (DX), X1
 35989  	MOVSS X1, (DX)
 35990  	DECQ  SI
 35991  	LEAQ  (AX)(CX*4), AX
 35992  	LEAQ  (DX)(BX*4), DX
 35993  
 35994  check_limit:
 35995  	CMPQ SI, $0x00
 35996  	JHI  loop
 35997  	RET
 35998  
 35999  // func AmdAxpyPointerLoopXInterleave_V4A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36000  // Requires: SSE
 36001  TEXT ·AmdAxpyPointerLoopXInterleave_V4A12U4(SB), NOSPLIT, $0-48
 36002  	MOVSS alpha+0(FP), X0
 36003  	MOVQ  xs+8(FP), AX
 36004  	MOVQ  incx+16(FP), CX
 36005  	MOVQ  CX, DX
 36006  	SHLQ  $0x04, DX
 36007  	MOVQ  ys+24(FP), DX
 36008  	MOVQ  incy+32(FP), BX
 36009  	MOVQ  BX, SI
 36010  	SHLQ  $0x04, SI
 36011  	MOVQ  n+40(FP), SI
 36012  	JMP   check_limit_unroll
 36013  	PCALIGN $0x08
 36014  	NOP
 36015  	NOP
 36016  	NOP
 36017  	NOP
 36018  
 36019  loop_unroll:
 36020  	MOVSS (AX), X1
 36021  	LEAQ  (AX)(CX*4), AX
 36022  	MOVSS (AX), X2
 36023  	LEAQ  (AX)(CX*4), AX
 36024  	MOVSS (AX), X3
 36025  	LEAQ  (AX)(CX*4), AX
 36026  	MOVSS (AX), X4
 36027  	LEAQ  (AX)(CX*4), AX
 36028  	MULSS X0, X1
 36029  	MULSS X0, X2
 36030  	MULSS X0, X3
 36031  	MULSS X0, X4
 36032  	ADDSS (DX), X1
 36033  	MOVSS X1, (DX)
 36034  	LEAQ  (DX)(BX*4), DX
 36035  	ADDSS (DX), X2
 36036  	MOVSS X2, (DX)
 36037  	LEAQ  (DX)(BX*4), DX
 36038  	ADDSS (DX), X3
 36039  	MOVSS X3, (DX)
 36040  	LEAQ  (DX)(BX*4), DX
 36041  	ADDSS (DX), X4
 36042  	MOVSS X4, (DX)
 36043  	LEAQ  (DX)(BX*4), DX
 36044  	SUBQ  $0x04, SI
 36045  
 36046  check_limit_unroll:
 36047  	CMPQ SI, $0x04
 36048  	JHS  loop_unroll
 36049  	JMP  check_limit
 36050  
 36051  loop:
 36052  	MOVSS (AX), X1
 36053  	MULSS X0, X1
 36054  	ADDSS (DX), X1
 36055  	MOVSS X1, (DX)
 36056  	DECQ  SI
 36057  	LEAQ  (AX)(CX*4), AX
 36058  	LEAQ  (DX)(BX*4), DX
 36059  
 36060  check_limit:
 36061  	CMPQ SI, $0x00
 36062  	JHI  loop
 36063  	RET
 36064  
 36065  // func AmdAxpyPointerLoopXInterleave_V5A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36066  // Requires: SSE
 36067  TEXT ·AmdAxpyPointerLoopXInterleave_V5A12U4(SB), NOSPLIT, $0-48
 36068  	MOVSS alpha+0(FP), X0
 36069  	MOVQ  xs+8(FP), AX
 36070  	MOVQ  incx+16(FP), CX
 36071  	MOVQ  CX, DX
 36072  	SHLQ  $0x04, DX
 36073  	MOVQ  ys+24(FP), DX
 36074  	MOVQ  incy+32(FP), BX
 36075  	MOVQ  BX, SI
 36076  	SHLQ  $0x04, SI
 36077  	MOVQ  n+40(FP), SI
 36078  	JMP   check_limit_unroll
 36079  	PCALIGN $0x08
 36080  	NOP
 36081  	NOP
 36082  	NOP
 36083  	NOP
 36084  
 36085  loop_unroll:
 36086  	MOVSS (AX), X1
 36087  	LEAQ  (AX)(CX*4), AX
 36088  	MOVSS (AX), X2
 36089  	LEAQ  (AX)(CX*4), AX
 36090  	MOVSS (AX), X3
 36091  	LEAQ  (AX)(CX*4), AX
 36092  	MOVSS (AX), X4
 36093  	LEAQ  (AX)(CX*4), AX
 36094  	MULSS X0, X1
 36095  	MULSS X0, X2
 36096  	MULSS X0, X3
 36097  	MULSS X0, X4
 36098  	ADDSS (DX), X1
 36099  	MOVSS X1, (DX)
 36100  	LEAQ  (DX)(BX*4), DX
 36101  	ADDSS (DX), X2
 36102  	MOVSS X2, (DX)
 36103  	LEAQ  (DX)(BX*4), DX
 36104  	ADDSS (DX), X3
 36105  	MOVSS X3, (DX)
 36106  	LEAQ  (DX)(BX*4), DX
 36107  	ADDSS (DX), X4
 36108  	MOVSS X4, (DX)
 36109  	LEAQ  (DX)(BX*4), DX
 36110  	SUBQ  $0x04, SI
 36111  
 36112  check_limit_unroll:
 36113  	CMPQ SI, $0x04
 36114  	JHS  loop_unroll
 36115  	JMP  check_limit
 36116  
 36117  loop:
 36118  	MOVSS (AX), X1
 36119  	MULSS X0, X1
 36120  	ADDSS (DX), X1
 36121  	MOVSS X1, (DX)
 36122  	DECQ  SI
 36123  	LEAQ  (AX)(CX*4), AX
 36124  	LEAQ  (DX)(BX*4), DX
 36125  
 36126  check_limit:
 36127  	CMPQ SI, $0x00
 36128  	JHI  loop
 36129  	RET
 36130  
 36131  // func AmdAxpyPointerLoopXInterleave_V0A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36132  // Requires: SSE
 36133  TEXT ·AmdAxpyPointerLoopXInterleave_V0A13U4(SB), NOSPLIT, $0-48
 36134  	MOVSS alpha+0(FP), X0
 36135  	MOVQ  xs+8(FP), AX
 36136  	MOVQ  incx+16(FP), CX
 36137  	MOVQ  CX, DX
 36138  	SHLQ  $0x04, DX
 36139  	MOVQ  ys+24(FP), DX
 36140  	MOVQ  incy+32(FP), BX
 36141  	MOVQ  BX, SI
 36142  	SHLQ  $0x04, SI
 36143  	MOVQ  n+40(FP), SI
 36144  	JMP   check_limit_unroll
 36145  	PCALIGN $0x08
 36146  	NOP
 36147  	NOP
 36148  	NOP
 36149  	NOP
 36150  	NOP
 36151  
 36152  loop_unroll:
 36153  	MOVSS (AX), X1
 36154  	LEAQ  (AX)(CX*4), AX
 36155  	MOVSS (AX), X2
 36156  	LEAQ  (AX)(CX*4), AX
 36157  	MOVSS (AX), X3
 36158  	LEAQ  (AX)(CX*4), AX
 36159  	MOVSS (AX), X4
 36160  	LEAQ  (AX)(CX*4), AX
 36161  	MULSS X0, X1
 36162  	MULSS X0, X2
 36163  	MULSS X0, X3
 36164  	MULSS X0, X4
 36165  	ADDSS (DX), X1
 36166  	MOVSS X1, (DX)
 36167  	LEAQ  (DX)(BX*4), DX
 36168  	ADDSS (DX), X2
 36169  	MOVSS X2, (DX)
 36170  	LEAQ  (DX)(BX*4), DX
 36171  	ADDSS (DX), X3
 36172  	MOVSS X3, (DX)
 36173  	LEAQ  (DX)(BX*4), DX
 36174  	ADDSS (DX), X4
 36175  	MOVSS X4, (DX)
 36176  	LEAQ  (DX)(BX*4), DX
 36177  	SUBQ  $0x04, SI
 36178  
 36179  check_limit_unroll:
 36180  	CMPQ SI, $0x04
 36181  	JHS  loop_unroll
 36182  	JMP  check_limit
 36183  
 36184  loop:
 36185  	MOVSS (AX), X1
 36186  	MULSS X0, X1
 36187  	ADDSS (DX), X1
 36188  	MOVSS X1, (DX)
 36189  	DECQ  SI
 36190  	LEAQ  (AX)(CX*4), AX
 36191  	LEAQ  (DX)(BX*4), DX
 36192  
 36193  check_limit:
 36194  	CMPQ SI, $0x00
 36195  	JHI  loop
 36196  	RET
 36197  
 36198  // func AmdAxpyPointerLoopXInterleave_V1A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36199  // Requires: SSE
 36200  TEXT ·AmdAxpyPointerLoopXInterleave_V1A13U4(SB), NOSPLIT, $0-48
 36201  	MOVSS alpha+0(FP), X0
 36202  	MOVQ  xs+8(FP), AX
 36203  	MOVQ  incx+16(FP), CX
 36204  	MOVQ  CX, DX
 36205  	SHLQ  $0x04, DX
 36206  	MOVQ  ys+24(FP), DX
 36207  	MOVQ  incy+32(FP), BX
 36208  	MOVQ  BX, SI
 36209  	SHLQ  $0x04, SI
 36210  	MOVQ  n+40(FP), SI
 36211  	JMP   check_limit_unroll
 36212  	PCALIGN $0x08
 36213  	NOP
 36214  	NOP
 36215  	NOP
 36216  	NOP
 36217  	NOP
 36218  
 36219  loop_unroll:
 36220  	MOVSS (AX), X1
 36221  	LEAQ  (AX)(CX*4), AX
 36222  	MOVSS (AX), X2
 36223  	LEAQ  (AX)(CX*4), AX
 36224  	MOVSS (AX), X3
 36225  	LEAQ  (AX)(CX*4), AX
 36226  	MOVSS (AX), X4
 36227  	LEAQ  (AX)(CX*4), AX
 36228  	MULSS X0, X1
 36229  	MULSS X0, X2
 36230  	MULSS X0, X3
 36231  	MULSS X0, X4
 36232  	ADDSS (DX), X1
 36233  	MOVSS X1, (DX)
 36234  	LEAQ  (DX)(BX*4), DX
 36235  	ADDSS (DX), X2
 36236  	MOVSS X2, (DX)
 36237  	LEAQ  (DX)(BX*4), DX
 36238  	ADDSS (DX), X3
 36239  	MOVSS X3, (DX)
 36240  	LEAQ  (DX)(BX*4), DX
 36241  	ADDSS (DX), X4
 36242  	MOVSS X4, (DX)
 36243  	LEAQ  (DX)(BX*4), DX
 36244  	SUBQ  $0x04, SI
 36245  
 36246  check_limit_unroll:
 36247  	CMPQ SI, $0x04
 36248  	JHS  loop_unroll
 36249  	JMP  check_limit
 36250  
 36251  loop:
 36252  	MOVSS (AX), X1
 36253  	MULSS X0, X1
 36254  	ADDSS (DX), X1
 36255  	MOVSS X1, (DX)
 36256  	DECQ  SI
 36257  	LEAQ  (AX)(CX*4), AX
 36258  	LEAQ  (DX)(BX*4), DX
 36259  
 36260  check_limit:
 36261  	CMPQ SI, $0x00
 36262  	JHI  loop
 36263  	RET
 36264  
 36265  // func AmdAxpyPointerLoopXInterleave_V2A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36266  // Requires: SSE
 36267  TEXT ·AmdAxpyPointerLoopXInterleave_V2A13U4(SB), NOSPLIT, $0-48
 36268  	MOVSS alpha+0(FP), X0
 36269  	MOVQ  xs+8(FP), AX
 36270  	MOVQ  incx+16(FP), CX
 36271  	MOVQ  CX, DX
 36272  	SHLQ  $0x04, DX
 36273  	MOVQ  ys+24(FP), DX
 36274  	MOVQ  incy+32(FP), BX
 36275  	MOVQ  BX, SI
 36276  	SHLQ  $0x04, SI
 36277  	MOVQ  n+40(FP), SI
 36278  	JMP   check_limit_unroll
 36279  	PCALIGN $0x08
 36280  	NOP
 36281  	NOP
 36282  	NOP
 36283  	NOP
 36284  	NOP
 36285  
 36286  loop_unroll:
 36287  	MOVSS (AX), X1
 36288  	LEAQ  (AX)(CX*4), AX
 36289  	MOVSS (AX), X2
 36290  	LEAQ  (AX)(CX*4), AX
 36291  	MOVSS (AX), X3
 36292  	LEAQ  (AX)(CX*4), AX
 36293  	MOVSS (AX), X4
 36294  	LEAQ  (AX)(CX*4), AX
 36295  	MULSS X0, X1
 36296  	MULSS X0, X2
 36297  	MULSS X0, X3
 36298  	MULSS X0, X4
 36299  	ADDSS (DX), X1
 36300  	MOVSS X1, (DX)
 36301  	LEAQ  (DX)(BX*4), DX
 36302  	ADDSS (DX), X2
 36303  	MOVSS X2, (DX)
 36304  	LEAQ  (DX)(BX*4), DX
 36305  	ADDSS (DX), X3
 36306  	MOVSS X3, (DX)
 36307  	LEAQ  (DX)(BX*4), DX
 36308  	ADDSS (DX), X4
 36309  	MOVSS X4, (DX)
 36310  	LEAQ  (DX)(BX*4), DX
 36311  	SUBQ  $0x04, SI
 36312  
 36313  check_limit_unroll:
 36314  	CMPQ SI, $0x04
 36315  	JHS  loop_unroll
 36316  	JMP  check_limit
 36317  
 36318  loop:
 36319  	MOVSS (AX), X1
 36320  	MULSS X0, X1
 36321  	ADDSS (DX), X1
 36322  	MOVSS X1, (DX)
 36323  	DECQ  SI
 36324  	LEAQ  (AX)(CX*4), AX
 36325  	LEAQ  (DX)(BX*4), DX
 36326  
 36327  check_limit:
 36328  	CMPQ SI, $0x00
 36329  	JHI  loop
 36330  	RET
 36331  
 36332  // func AmdAxpyPointerLoopXInterleave_V3A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36333  // Requires: SSE
 36334  TEXT ·AmdAxpyPointerLoopXInterleave_V3A13U4(SB), NOSPLIT, $0-48
 36335  	MOVSS alpha+0(FP), X0
 36336  	MOVQ  xs+8(FP), AX
 36337  	MOVQ  incx+16(FP), CX
 36338  	MOVQ  CX, DX
 36339  	SHLQ  $0x04, DX
 36340  	MOVQ  ys+24(FP), DX
 36341  	MOVQ  incy+32(FP), BX
 36342  	MOVQ  BX, SI
 36343  	SHLQ  $0x04, SI
 36344  	MOVQ  n+40(FP), SI
 36345  	JMP   check_limit_unroll
 36346  	PCALIGN $0x08
 36347  	NOP
 36348  	NOP
 36349  	NOP
 36350  	NOP
 36351  	NOP
 36352  
 36353  loop_unroll:
 36354  	MOVSS (AX), X1
 36355  	LEAQ  (AX)(CX*4), AX
 36356  	MOVSS (AX), X2
 36357  	LEAQ  (AX)(CX*4), AX
 36358  	MOVSS (AX), X3
 36359  	LEAQ  (AX)(CX*4), AX
 36360  	MOVSS (AX), X4
 36361  	LEAQ  (AX)(CX*4), AX
 36362  	MULSS X0, X1
 36363  	MULSS X0, X2
 36364  	MULSS X0, X3
 36365  	MULSS X0, X4
 36366  	ADDSS (DX), X1
 36367  	MOVSS X1, (DX)
 36368  	LEAQ  (DX)(BX*4), DX
 36369  	ADDSS (DX), X2
 36370  	MOVSS X2, (DX)
 36371  	LEAQ  (DX)(BX*4), DX
 36372  	ADDSS (DX), X3
 36373  	MOVSS X3, (DX)
 36374  	LEAQ  (DX)(BX*4), DX
 36375  	ADDSS (DX), X4
 36376  	MOVSS X4, (DX)
 36377  	LEAQ  (DX)(BX*4), DX
 36378  	SUBQ  $0x04, SI
 36379  
 36380  check_limit_unroll:
 36381  	CMPQ SI, $0x04
 36382  	JHS  loop_unroll
 36383  	JMP  check_limit
 36384  
 36385  loop:
 36386  	MOVSS (AX), X1
 36387  	MULSS X0, X1
 36388  	ADDSS (DX), X1
 36389  	MOVSS X1, (DX)
 36390  	DECQ  SI
 36391  	LEAQ  (AX)(CX*4), AX
 36392  	LEAQ  (DX)(BX*4), DX
 36393  
 36394  check_limit:
 36395  	CMPQ SI, $0x00
 36396  	JHI  loop
 36397  	RET
 36398  
 36399  // func AmdAxpyPointerLoopXInterleave_V4A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36400  // Requires: SSE
 36401  TEXT ·AmdAxpyPointerLoopXInterleave_V4A13U4(SB), NOSPLIT, $0-48
 36402  	MOVSS alpha+0(FP), X0
 36403  	MOVQ  xs+8(FP), AX
 36404  	MOVQ  incx+16(FP), CX
 36405  	MOVQ  CX, DX
 36406  	SHLQ  $0x04, DX
 36407  	MOVQ  ys+24(FP), DX
 36408  	MOVQ  incy+32(FP), BX
 36409  	MOVQ  BX, SI
 36410  	SHLQ  $0x04, SI
 36411  	MOVQ  n+40(FP), SI
 36412  	JMP   check_limit_unroll
 36413  	PCALIGN $0x08
 36414  	NOP
 36415  	NOP
 36416  	NOP
 36417  	NOP
 36418  	NOP
 36419  
 36420  loop_unroll:
 36421  	MOVSS (AX), X1
 36422  	LEAQ  (AX)(CX*4), AX
 36423  	MOVSS (AX), X2
 36424  	LEAQ  (AX)(CX*4), AX
 36425  	MOVSS (AX), X3
 36426  	LEAQ  (AX)(CX*4), AX
 36427  	MOVSS (AX), X4
 36428  	LEAQ  (AX)(CX*4), AX
 36429  	MULSS X0, X1
 36430  	MULSS X0, X2
 36431  	MULSS X0, X3
 36432  	MULSS X0, X4
 36433  	ADDSS (DX), X1
 36434  	MOVSS X1, (DX)
 36435  	LEAQ  (DX)(BX*4), DX
 36436  	ADDSS (DX), X2
 36437  	MOVSS X2, (DX)
 36438  	LEAQ  (DX)(BX*4), DX
 36439  	ADDSS (DX), X3
 36440  	MOVSS X3, (DX)
 36441  	LEAQ  (DX)(BX*4), DX
 36442  	ADDSS (DX), X4
 36443  	MOVSS X4, (DX)
 36444  	LEAQ  (DX)(BX*4), DX
 36445  	SUBQ  $0x04, SI
 36446  
 36447  check_limit_unroll:
 36448  	CMPQ SI, $0x04
 36449  	JHS  loop_unroll
 36450  	JMP  check_limit
 36451  
 36452  loop:
 36453  	MOVSS (AX), X1
 36454  	MULSS X0, X1
 36455  	ADDSS (DX), X1
 36456  	MOVSS X1, (DX)
 36457  	DECQ  SI
 36458  	LEAQ  (AX)(CX*4), AX
 36459  	LEAQ  (DX)(BX*4), DX
 36460  
 36461  check_limit:
 36462  	CMPQ SI, $0x00
 36463  	JHI  loop
 36464  	RET
 36465  
 36466  // func AmdAxpyPointerLoopXInterleave_V5A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36467  // Requires: SSE
 36468  TEXT ·AmdAxpyPointerLoopXInterleave_V5A13U4(SB), NOSPLIT, $0-48
 36469  	MOVSS alpha+0(FP), X0
 36470  	MOVQ  xs+8(FP), AX
 36471  	MOVQ  incx+16(FP), CX
 36472  	MOVQ  CX, DX
 36473  	SHLQ  $0x04, DX
 36474  	MOVQ  ys+24(FP), DX
 36475  	MOVQ  incy+32(FP), BX
 36476  	MOVQ  BX, SI
 36477  	SHLQ  $0x04, SI
 36478  	MOVQ  n+40(FP), SI
 36479  	JMP   check_limit_unroll
 36480  	PCALIGN $0x08
 36481  	NOP
 36482  	NOP
 36483  	NOP
 36484  	NOP
 36485  	NOP
 36486  
 36487  loop_unroll:
 36488  	MOVSS (AX), X1
 36489  	LEAQ  (AX)(CX*4), AX
 36490  	MOVSS (AX), X2
 36491  	LEAQ  (AX)(CX*4), AX
 36492  	MOVSS (AX), X3
 36493  	LEAQ  (AX)(CX*4), AX
 36494  	MOVSS (AX), X4
 36495  	LEAQ  (AX)(CX*4), AX
 36496  	MULSS X0, X1
 36497  	MULSS X0, X2
 36498  	MULSS X0, X3
 36499  	MULSS X0, X4
 36500  	ADDSS (DX), X1
 36501  	MOVSS X1, (DX)
 36502  	LEAQ  (DX)(BX*4), DX
 36503  	ADDSS (DX), X2
 36504  	MOVSS X2, (DX)
 36505  	LEAQ  (DX)(BX*4), DX
 36506  	ADDSS (DX), X3
 36507  	MOVSS X3, (DX)
 36508  	LEAQ  (DX)(BX*4), DX
 36509  	ADDSS (DX), X4
 36510  	MOVSS X4, (DX)
 36511  	LEAQ  (DX)(BX*4), DX
 36512  	SUBQ  $0x04, SI
 36513  
 36514  check_limit_unroll:
 36515  	CMPQ SI, $0x04
 36516  	JHS  loop_unroll
 36517  	JMP  check_limit
 36518  
 36519  loop:
 36520  	MOVSS (AX), X1
 36521  	MULSS X0, X1
 36522  	ADDSS (DX), X1
 36523  	MOVSS X1, (DX)
 36524  	DECQ  SI
 36525  	LEAQ  (AX)(CX*4), AX
 36526  	LEAQ  (DX)(BX*4), DX
 36527  
 36528  check_limit:
 36529  	CMPQ SI, $0x00
 36530  	JHI  loop
 36531  	RET
 36532  
 36533  // func AmdAxpyPointerLoopXInterleave_V0A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36534  // Requires: SSE
 36535  TEXT ·AmdAxpyPointerLoopXInterleave_V0A14U4(SB), NOSPLIT, $0-48
 36536  	MOVSS alpha+0(FP), X0
 36537  	MOVQ  xs+8(FP), AX
 36538  	MOVQ  incx+16(FP), CX
 36539  	MOVQ  CX, DX
 36540  	SHLQ  $0x04, DX
 36541  	MOVQ  ys+24(FP), DX
 36542  	MOVQ  incy+32(FP), BX
 36543  	MOVQ  BX, SI
 36544  	SHLQ  $0x04, SI
 36545  	MOVQ  n+40(FP), SI
 36546  	JMP   check_limit_unroll
 36547  	PCALIGN $0x08
 36548  	NOP
 36549  	NOP
 36550  	NOP
 36551  	NOP
 36552  	NOP
 36553  	NOP
 36554  
 36555  loop_unroll:
 36556  	MOVSS (AX), X1
 36557  	LEAQ  (AX)(CX*4), AX
 36558  	MOVSS (AX), X2
 36559  	LEAQ  (AX)(CX*4), AX
 36560  	MOVSS (AX), X3
 36561  	LEAQ  (AX)(CX*4), AX
 36562  	MOVSS (AX), X4
 36563  	LEAQ  (AX)(CX*4), AX
 36564  	MULSS X0, X1
 36565  	MULSS X0, X2
 36566  	MULSS X0, X3
 36567  	MULSS X0, X4
 36568  	ADDSS (DX), X1
 36569  	MOVSS X1, (DX)
 36570  	LEAQ  (DX)(BX*4), DX
 36571  	ADDSS (DX), X2
 36572  	MOVSS X2, (DX)
 36573  	LEAQ  (DX)(BX*4), DX
 36574  	ADDSS (DX), X3
 36575  	MOVSS X3, (DX)
 36576  	LEAQ  (DX)(BX*4), DX
 36577  	ADDSS (DX), X4
 36578  	MOVSS X4, (DX)
 36579  	LEAQ  (DX)(BX*4), DX
 36580  	SUBQ  $0x04, SI
 36581  
 36582  check_limit_unroll:
 36583  	CMPQ SI, $0x04
 36584  	JHS  loop_unroll
 36585  	JMP  check_limit
 36586  
 36587  loop:
 36588  	MOVSS (AX), X1
 36589  	MULSS X0, X1
 36590  	ADDSS (DX), X1
 36591  	MOVSS X1, (DX)
 36592  	DECQ  SI
 36593  	LEAQ  (AX)(CX*4), AX
 36594  	LEAQ  (DX)(BX*4), DX
 36595  
 36596  check_limit:
 36597  	CMPQ SI, $0x00
 36598  	JHI  loop
 36599  	RET
 36600  
 36601  // func AmdAxpyPointerLoopXInterleave_V1A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36602  // Requires: SSE
 36603  TEXT ·AmdAxpyPointerLoopXInterleave_V1A14U4(SB), NOSPLIT, $0-48
 36604  	MOVSS alpha+0(FP), X0
 36605  	MOVQ  xs+8(FP), AX
 36606  	MOVQ  incx+16(FP), CX
 36607  	MOVQ  CX, DX
 36608  	SHLQ  $0x04, DX
 36609  	MOVQ  ys+24(FP), DX
 36610  	MOVQ  incy+32(FP), BX
 36611  	MOVQ  BX, SI
 36612  	SHLQ  $0x04, SI
 36613  	MOVQ  n+40(FP), SI
 36614  	JMP   check_limit_unroll
 36615  	PCALIGN $0x08
 36616  	NOP
 36617  	NOP
 36618  	NOP
 36619  	NOP
 36620  	NOP
 36621  	NOP
 36622  
 36623  loop_unroll:
 36624  	MOVSS (AX), X1
 36625  	LEAQ  (AX)(CX*4), AX
 36626  	MOVSS (AX), X2
 36627  	LEAQ  (AX)(CX*4), AX
 36628  	MOVSS (AX), X3
 36629  	LEAQ  (AX)(CX*4), AX
 36630  	MOVSS (AX), X4
 36631  	LEAQ  (AX)(CX*4), AX
 36632  	MULSS X0, X1
 36633  	MULSS X0, X2
 36634  	MULSS X0, X3
 36635  	MULSS X0, X4
 36636  	ADDSS (DX), X1
 36637  	MOVSS X1, (DX)
 36638  	LEAQ  (DX)(BX*4), DX
 36639  	ADDSS (DX), X2
 36640  	MOVSS X2, (DX)
 36641  	LEAQ  (DX)(BX*4), DX
 36642  	ADDSS (DX), X3
 36643  	MOVSS X3, (DX)
 36644  	LEAQ  (DX)(BX*4), DX
 36645  	ADDSS (DX), X4
 36646  	MOVSS X4, (DX)
 36647  	LEAQ  (DX)(BX*4), DX
 36648  	SUBQ  $0x04, SI
 36649  
 36650  check_limit_unroll:
 36651  	CMPQ SI, $0x04
 36652  	JHS  loop_unroll
 36653  	JMP  check_limit
 36654  
 36655  loop:
 36656  	MOVSS (AX), X1
 36657  	MULSS X0, X1
 36658  	ADDSS (DX), X1
 36659  	MOVSS X1, (DX)
 36660  	DECQ  SI
 36661  	LEAQ  (AX)(CX*4), AX
 36662  	LEAQ  (DX)(BX*4), DX
 36663  
 36664  check_limit:
 36665  	CMPQ SI, $0x00
 36666  	JHI  loop
 36667  	RET
 36668  
 36669  // func AmdAxpyPointerLoopXInterleave_V2A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36670  // Requires: SSE
 36671  TEXT ·AmdAxpyPointerLoopXInterleave_V2A14U4(SB), NOSPLIT, $0-48
 36672  	MOVSS alpha+0(FP), X0
 36673  	MOVQ  xs+8(FP), AX
 36674  	MOVQ  incx+16(FP), CX
 36675  	MOVQ  CX, DX
 36676  	SHLQ  $0x04, DX
 36677  	MOVQ  ys+24(FP), DX
 36678  	MOVQ  incy+32(FP), BX
 36679  	MOVQ  BX, SI
 36680  	SHLQ  $0x04, SI
 36681  	MOVQ  n+40(FP), SI
 36682  	JMP   check_limit_unroll
 36683  	PCALIGN $0x08
 36684  	NOP
 36685  	NOP
 36686  	NOP
 36687  	NOP
 36688  	NOP
 36689  	NOP
 36690  
 36691  loop_unroll:
 36692  	MOVSS (AX), X1
 36693  	LEAQ  (AX)(CX*4), AX
 36694  	MOVSS (AX), X2
 36695  	LEAQ  (AX)(CX*4), AX
 36696  	MOVSS (AX), X3
 36697  	LEAQ  (AX)(CX*4), AX
 36698  	MOVSS (AX), X4
 36699  	LEAQ  (AX)(CX*4), AX
 36700  	MULSS X0, X1
 36701  	MULSS X0, X2
 36702  	MULSS X0, X3
 36703  	MULSS X0, X4
 36704  	ADDSS (DX), X1
 36705  	MOVSS X1, (DX)
 36706  	LEAQ  (DX)(BX*4), DX
 36707  	ADDSS (DX), X2
 36708  	MOVSS X2, (DX)
 36709  	LEAQ  (DX)(BX*4), DX
 36710  	ADDSS (DX), X3
 36711  	MOVSS X3, (DX)
 36712  	LEAQ  (DX)(BX*4), DX
 36713  	ADDSS (DX), X4
 36714  	MOVSS X4, (DX)
 36715  	LEAQ  (DX)(BX*4), DX
 36716  	SUBQ  $0x04, SI
 36717  
 36718  check_limit_unroll:
 36719  	CMPQ SI, $0x04
 36720  	JHS  loop_unroll
 36721  	JMP  check_limit
 36722  
 36723  loop:
 36724  	MOVSS (AX), X1
 36725  	MULSS X0, X1
 36726  	ADDSS (DX), X1
 36727  	MOVSS X1, (DX)
 36728  	DECQ  SI
 36729  	LEAQ  (AX)(CX*4), AX
 36730  	LEAQ  (DX)(BX*4), DX
 36731  
 36732  check_limit:
 36733  	CMPQ SI, $0x00
 36734  	JHI  loop
 36735  	RET
 36736  
 36737  // func AmdAxpyPointerLoopXInterleave_V3A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36738  // Requires: SSE
 36739  TEXT ·AmdAxpyPointerLoopXInterleave_V3A14U4(SB), NOSPLIT, $0-48
 36740  	MOVSS alpha+0(FP), X0
 36741  	MOVQ  xs+8(FP), AX
 36742  	MOVQ  incx+16(FP), CX
 36743  	MOVQ  CX, DX
 36744  	SHLQ  $0x04, DX
 36745  	MOVQ  ys+24(FP), DX
 36746  	MOVQ  incy+32(FP), BX
 36747  	MOVQ  BX, SI
 36748  	SHLQ  $0x04, SI
 36749  	MOVQ  n+40(FP), SI
 36750  	JMP   check_limit_unroll
 36751  	PCALIGN $0x08
 36752  	NOP
 36753  	NOP
 36754  	NOP
 36755  	NOP
 36756  	NOP
 36757  	NOP
 36758  
 36759  loop_unroll:
 36760  	MOVSS (AX), X1
 36761  	LEAQ  (AX)(CX*4), AX
 36762  	MOVSS (AX), X2
 36763  	LEAQ  (AX)(CX*4), AX
 36764  	MOVSS (AX), X3
 36765  	LEAQ  (AX)(CX*4), AX
 36766  	MOVSS (AX), X4
 36767  	LEAQ  (AX)(CX*4), AX
 36768  	MULSS X0, X1
 36769  	MULSS X0, X2
 36770  	MULSS X0, X3
 36771  	MULSS X0, X4
 36772  	ADDSS (DX), X1
 36773  	MOVSS X1, (DX)
 36774  	LEAQ  (DX)(BX*4), DX
 36775  	ADDSS (DX), X2
 36776  	MOVSS X2, (DX)
 36777  	LEAQ  (DX)(BX*4), DX
 36778  	ADDSS (DX), X3
 36779  	MOVSS X3, (DX)
 36780  	LEAQ  (DX)(BX*4), DX
 36781  	ADDSS (DX), X4
 36782  	MOVSS X4, (DX)
 36783  	LEAQ  (DX)(BX*4), DX
 36784  	SUBQ  $0x04, SI
 36785  
 36786  check_limit_unroll:
 36787  	CMPQ SI, $0x04
 36788  	JHS  loop_unroll
 36789  	JMP  check_limit
 36790  
 36791  loop:
 36792  	MOVSS (AX), X1
 36793  	MULSS X0, X1
 36794  	ADDSS (DX), X1
 36795  	MOVSS X1, (DX)
 36796  	DECQ  SI
 36797  	LEAQ  (AX)(CX*4), AX
 36798  	LEAQ  (DX)(BX*4), DX
 36799  
 36800  check_limit:
 36801  	CMPQ SI, $0x00
 36802  	JHI  loop
 36803  	RET
 36804  
 36805  // func AmdAxpyPointerLoopXInterleave_V4A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36806  // Requires: SSE
 36807  TEXT ·AmdAxpyPointerLoopXInterleave_V4A14U4(SB), NOSPLIT, $0-48
 36808  	MOVSS alpha+0(FP), X0
 36809  	MOVQ  xs+8(FP), AX
 36810  	MOVQ  incx+16(FP), CX
 36811  	MOVQ  CX, DX
 36812  	SHLQ  $0x04, DX
 36813  	MOVQ  ys+24(FP), DX
 36814  	MOVQ  incy+32(FP), BX
 36815  	MOVQ  BX, SI
 36816  	SHLQ  $0x04, SI
 36817  	MOVQ  n+40(FP), SI
 36818  	JMP   check_limit_unroll
 36819  	PCALIGN $0x08
 36820  	NOP
 36821  	NOP
 36822  	NOP
 36823  	NOP
 36824  	NOP
 36825  	NOP
 36826  
 36827  loop_unroll:
 36828  	MOVSS (AX), X1
 36829  	LEAQ  (AX)(CX*4), AX
 36830  	MOVSS (AX), X2
 36831  	LEAQ  (AX)(CX*4), AX
 36832  	MOVSS (AX), X3
 36833  	LEAQ  (AX)(CX*4), AX
 36834  	MOVSS (AX), X4
 36835  	LEAQ  (AX)(CX*4), AX
 36836  	MULSS X0, X1
 36837  	MULSS X0, X2
 36838  	MULSS X0, X3
 36839  	MULSS X0, X4
 36840  	ADDSS (DX), X1
 36841  	MOVSS X1, (DX)
 36842  	LEAQ  (DX)(BX*4), DX
 36843  	ADDSS (DX), X2
 36844  	MOVSS X2, (DX)
 36845  	LEAQ  (DX)(BX*4), DX
 36846  	ADDSS (DX), X3
 36847  	MOVSS X3, (DX)
 36848  	LEAQ  (DX)(BX*4), DX
 36849  	ADDSS (DX), X4
 36850  	MOVSS X4, (DX)
 36851  	LEAQ  (DX)(BX*4), DX
 36852  	SUBQ  $0x04, SI
 36853  
 36854  check_limit_unroll:
 36855  	CMPQ SI, $0x04
 36856  	JHS  loop_unroll
 36857  	JMP  check_limit
 36858  
 36859  loop:
 36860  	MOVSS (AX), X1
 36861  	MULSS X0, X1
 36862  	ADDSS (DX), X1
 36863  	MOVSS X1, (DX)
 36864  	DECQ  SI
 36865  	LEAQ  (AX)(CX*4), AX
 36866  	LEAQ  (DX)(BX*4), DX
 36867  
 36868  check_limit:
 36869  	CMPQ SI, $0x00
 36870  	JHI  loop
 36871  	RET
 36872  
 36873  // func AmdAxpyPointerLoopXInterleave_V5A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36874  // Requires: SSE
 36875  TEXT ·AmdAxpyPointerLoopXInterleave_V5A14U4(SB), NOSPLIT, $0-48
 36876  	MOVSS alpha+0(FP), X0
 36877  	MOVQ  xs+8(FP), AX
 36878  	MOVQ  incx+16(FP), CX
 36879  	MOVQ  CX, DX
 36880  	SHLQ  $0x04, DX
 36881  	MOVQ  ys+24(FP), DX
 36882  	MOVQ  incy+32(FP), BX
 36883  	MOVQ  BX, SI
 36884  	SHLQ  $0x04, SI
 36885  	MOVQ  n+40(FP), SI
 36886  	JMP   check_limit_unroll
 36887  	PCALIGN $0x08
 36888  	NOP
 36889  	NOP
 36890  	NOP
 36891  	NOP
 36892  	NOP
 36893  	NOP
 36894  
 36895  loop_unroll:
 36896  	MOVSS (AX), X1
 36897  	LEAQ  (AX)(CX*4), AX
 36898  	MOVSS (AX), X2
 36899  	LEAQ  (AX)(CX*4), AX
 36900  	MOVSS (AX), X3
 36901  	LEAQ  (AX)(CX*4), AX
 36902  	MOVSS (AX), X4
 36903  	LEAQ  (AX)(CX*4), AX
 36904  	MULSS X0, X1
 36905  	MULSS X0, X2
 36906  	MULSS X0, X3
 36907  	MULSS X0, X4
 36908  	ADDSS (DX), X1
 36909  	MOVSS X1, (DX)
 36910  	LEAQ  (DX)(BX*4), DX
 36911  	ADDSS (DX), X2
 36912  	MOVSS X2, (DX)
 36913  	LEAQ  (DX)(BX*4), DX
 36914  	ADDSS (DX), X3
 36915  	MOVSS X3, (DX)
 36916  	LEAQ  (DX)(BX*4), DX
 36917  	ADDSS (DX), X4
 36918  	MOVSS X4, (DX)
 36919  	LEAQ  (DX)(BX*4), DX
 36920  	SUBQ  $0x04, SI
 36921  
 36922  check_limit_unroll:
 36923  	CMPQ SI, $0x04
 36924  	JHS  loop_unroll
 36925  	JMP  check_limit
 36926  
 36927  loop:
 36928  	MOVSS (AX), X1
 36929  	MULSS X0, X1
 36930  	ADDSS (DX), X1
 36931  	MOVSS X1, (DX)
 36932  	DECQ  SI
 36933  	LEAQ  (AX)(CX*4), AX
 36934  	LEAQ  (DX)(BX*4), DX
 36935  
 36936  check_limit:
 36937  	CMPQ SI, $0x00
 36938  	JHI  loop
 36939  	RET
 36940  
 36941  // func AmdAxpyPointerLoopXInterleave_V0A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 36942  // Requires: SSE
 36943  TEXT ·AmdAxpyPointerLoopXInterleave_V0A15U4(SB), NOSPLIT, $0-48
 36944  	MOVSS alpha+0(FP), X0
 36945  	MOVQ  xs+8(FP), AX
 36946  	MOVQ  incx+16(FP), CX
 36947  	MOVQ  CX, DX
 36948  	SHLQ  $0x04, DX
 36949  	MOVQ  ys+24(FP), DX
 36950  	MOVQ  incy+32(FP), BX
 36951  	MOVQ  BX, SI
 36952  	SHLQ  $0x04, SI
 36953  	MOVQ  n+40(FP), SI
 36954  	JMP   check_limit_unroll
 36955  	PCALIGN $0x08
 36956  	NOP
 36957  	NOP
 36958  	NOP
 36959  	NOP
 36960  	NOP
 36961  	NOP
 36962  	NOP
 36963  
 36964  loop_unroll:
 36965  	MOVSS (AX), X1
 36966  	LEAQ  (AX)(CX*4), AX
 36967  	MOVSS (AX), X2
 36968  	LEAQ  (AX)(CX*4), AX
 36969  	MOVSS (AX), X3
 36970  	LEAQ  (AX)(CX*4), AX
 36971  	MOVSS (AX), X4
 36972  	LEAQ  (AX)(CX*4), AX
 36973  	MULSS X0, X1
 36974  	MULSS X0, X2
 36975  	MULSS X0, X3
 36976  	MULSS X0, X4
 36977  	ADDSS (DX), X1
 36978  	MOVSS X1, (DX)
 36979  	LEAQ  (DX)(BX*4), DX
 36980  	ADDSS (DX), X2
 36981  	MOVSS X2, (DX)
 36982  	LEAQ  (DX)(BX*4), DX
 36983  	ADDSS (DX), X3
 36984  	MOVSS X3, (DX)
 36985  	LEAQ  (DX)(BX*4), DX
 36986  	ADDSS (DX), X4
 36987  	MOVSS X4, (DX)
 36988  	LEAQ  (DX)(BX*4), DX
 36989  	SUBQ  $0x04, SI
 36990  
 36991  check_limit_unroll:
 36992  	CMPQ SI, $0x04
 36993  	JHS  loop_unroll
 36994  	JMP  check_limit
 36995  
 36996  loop:
 36997  	MOVSS (AX), X1
 36998  	MULSS X0, X1
 36999  	ADDSS (DX), X1
 37000  	MOVSS X1, (DX)
 37001  	DECQ  SI
 37002  	LEAQ  (AX)(CX*4), AX
 37003  	LEAQ  (DX)(BX*4), DX
 37004  
 37005  check_limit:
 37006  	CMPQ SI, $0x00
 37007  	JHI  loop
 37008  	RET
 37009  
 37010  // func AmdAxpyPointerLoopXInterleave_V1A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37011  // Requires: SSE
 37012  TEXT ·AmdAxpyPointerLoopXInterleave_V1A15U4(SB), NOSPLIT, $0-48
 37013  	MOVSS alpha+0(FP), X0
 37014  	MOVQ  xs+8(FP), AX
 37015  	MOVQ  incx+16(FP), CX
 37016  	MOVQ  CX, DX
 37017  	SHLQ  $0x04, DX
 37018  	MOVQ  ys+24(FP), DX
 37019  	MOVQ  incy+32(FP), BX
 37020  	MOVQ  BX, SI
 37021  	SHLQ  $0x04, SI
 37022  	MOVQ  n+40(FP), SI
 37023  	JMP   check_limit_unroll
 37024  	PCALIGN $0x08
 37025  	NOP
 37026  	NOP
 37027  	NOP
 37028  	NOP
 37029  	NOP
 37030  	NOP
 37031  	NOP
 37032  
 37033  loop_unroll:
 37034  	MOVSS (AX), X1
 37035  	LEAQ  (AX)(CX*4), AX
 37036  	MOVSS (AX), X2
 37037  	LEAQ  (AX)(CX*4), AX
 37038  	MOVSS (AX), X3
 37039  	LEAQ  (AX)(CX*4), AX
 37040  	MOVSS (AX), X4
 37041  	LEAQ  (AX)(CX*4), AX
 37042  	MULSS X0, X1
 37043  	MULSS X0, X2
 37044  	MULSS X0, X3
 37045  	MULSS X0, X4
 37046  	ADDSS (DX), X1
 37047  	MOVSS X1, (DX)
 37048  	LEAQ  (DX)(BX*4), DX
 37049  	ADDSS (DX), X2
 37050  	MOVSS X2, (DX)
 37051  	LEAQ  (DX)(BX*4), DX
 37052  	ADDSS (DX), X3
 37053  	MOVSS X3, (DX)
 37054  	LEAQ  (DX)(BX*4), DX
 37055  	ADDSS (DX), X4
 37056  	MOVSS X4, (DX)
 37057  	LEAQ  (DX)(BX*4), DX
 37058  	SUBQ  $0x04, SI
 37059  
 37060  check_limit_unroll:
 37061  	CMPQ SI, $0x04
 37062  	JHS  loop_unroll
 37063  	JMP  check_limit
 37064  
 37065  loop:
 37066  	MOVSS (AX), X1
 37067  	MULSS X0, X1
 37068  	ADDSS (DX), X1
 37069  	MOVSS X1, (DX)
 37070  	DECQ  SI
 37071  	LEAQ  (AX)(CX*4), AX
 37072  	LEAQ  (DX)(BX*4), DX
 37073  
 37074  check_limit:
 37075  	CMPQ SI, $0x00
 37076  	JHI  loop
 37077  	RET
 37078  
 37079  // func AmdAxpyPointerLoopXInterleave_V2A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37080  // Requires: SSE
 37081  TEXT ·AmdAxpyPointerLoopXInterleave_V2A15U4(SB), NOSPLIT, $0-48
 37082  	MOVSS alpha+0(FP), X0
 37083  	MOVQ  xs+8(FP), AX
 37084  	MOVQ  incx+16(FP), CX
 37085  	MOVQ  CX, DX
 37086  	SHLQ  $0x04, DX
 37087  	MOVQ  ys+24(FP), DX
 37088  	MOVQ  incy+32(FP), BX
 37089  	MOVQ  BX, SI
 37090  	SHLQ  $0x04, SI
 37091  	MOVQ  n+40(FP), SI
 37092  	JMP   check_limit_unroll
 37093  	PCALIGN $0x08
 37094  	NOP
 37095  	NOP
 37096  	NOP
 37097  	NOP
 37098  	NOP
 37099  	NOP
 37100  	NOP
 37101  
 37102  loop_unroll:
 37103  	MOVSS (AX), X1
 37104  	LEAQ  (AX)(CX*4), AX
 37105  	MOVSS (AX), X2
 37106  	LEAQ  (AX)(CX*4), AX
 37107  	MOVSS (AX), X3
 37108  	LEAQ  (AX)(CX*4), AX
 37109  	MOVSS (AX), X4
 37110  	LEAQ  (AX)(CX*4), AX
 37111  	MULSS X0, X1
 37112  	MULSS X0, X2
 37113  	MULSS X0, X3
 37114  	MULSS X0, X4
 37115  	ADDSS (DX), X1
 37116  	MOVSS X1, (DX)
 37117  	LEAQ  (DX)(BX*4), DX
 37118  	ADDSS (DX), X2
 37119  	MOVSS X2, (DX)
 37120  	LEAQ  (DX)(BX*4), DX
 37121  	ADDSS (DX), X3
 37122  	MOVSS X3, (DX)
 37123  	LEAQ  (DX)(BX*4), DX
 37124  	ADDSS (DX), X4
 37125  	MOVSS X4, (DX)
 37126  	LEAQ  (DX)(BX*4), DX
 37127  	SUBQ  $0x04, SI
 37128  
 37129  check_limit_unroll:
 37130  	CMPQ SI, $0x04
 37131  	JHS  loop_unroll
 37132  	JMP  check_limit
 37133  
 37134  loop:
 37135  	MOVSS (AX), X1
 37136  	MULSS X0, X1
 37137  	ADDSS (DX), X1
 37138  	MOVSS X1, (DX)
 37139  	DECQ  SI
 37140  	LEAQ  (AX)(CX*4), AX
 37141  	LEAQ  (DX)(BX*4), DX
 37142  
 37143  check_limit:
 37144  	CMPQ SI, $0x00
 37145  	JHI  loop
 37146  	RET
 37147  
 37148  // func AmdAxpyPointerLoopXInterleave_V3A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37149  // Requires: SSE
 37150  TEXT ·AmdAxpyPointerLoopXInterleave_V3A15U4(SB), NOSPLIT, $0-48
 37151  	MOVSS alpha+0(FP), X0
 37152  	MOVQ  xs+8(FP), AX
 37153  	MOVQ  incx+16(FP), CX
 37154  	MOVQ  CX, DX
 37155  	SHLQ  $0x04, DX
 37156  	MOVQ  ys+24(FP), DX
 37157  	MOVQ  incy+32(FP), BX
 37158  	MOVQ  BX, SI
 37159  	SHLQ  $0x04, SI
 37160  	MOVQ  n+40(FP), SI
 37161  	JMP   check_limit_unroll
 37162  	PCALIGN $0x08
 37163  	NOP
 37164  	NOP
 37165  	NOP
 37166  	NOP
 37167  	NOP
 37168  	NOP
 37169  	NOP
 37170  
 37171  loop_unroll:
 37172  	MOVSS (AX), X1
 37173  	LEAQ  (AX)(CX*4), AX
 37174  	MOVSS (AX), X2
 37175  	LEAQ  (AX)(CX*4), AX
 37176  	MOVSS (AX), X3
 37177  	LEAQ  (AX)(CX*4), AX
 37178  	MOVSS (AX), X4
 37179  	LEAQ  (AX)(CX*4), AX
 37180  	MULSS X0, X1
 37181  	MULSS X0, X2
 37182  	MULSS X0, X3
 37183  	MULSS X0, X4
 37184  	ADDSS (DX), X1
 37185  	MOVSS X1, (DX)
 37186  	LEAQ  (DX)(BX*4), DX
 37187  	ADDSS (DX), X2
 37188  	MOVSS X2, (DX)
 37189  	LEAQ  (DX)(BX*4), DX
 37190  	ADDSS (DX), X3
 37191  	MOVSS X3, (DX)
 37192  	LEAQ  (DX)(BX*4), DX
 37193  	ADDSS (DX), X4
 37194  	MOVSS X4, (DX)
 37195  	LEAQ  (DX)(BX*4), DX
 37196  	SUBQ  $0x04, SI
 37197  
 37198  check_limit_unroll:
 37199  	CMPQ SI, $0x04
 37200  	JHS  loop_unroll
 37201  	JMP  check_limit
 37202  
 37203  loop:
 37204  	MOVSS (AX), X1
 37205  	MULSS X0, X1
 37206  	ADDSS (DX), X1
 37207  	MOVSS X1, (DX)
 37208  	DECQ  SI
 37209  	LEAQ  (AX)(CX*4), AX
 37210  	LEAQ  (DX)(BX*4), DX
 37211  
 37212  check_limit:
 37213  	CMPQ SI, $0x00
 37214  	JHI  loop
 37215  	RET
 37216  
 37217  // func AmdAxpyPointerLoopXInterleave_V4A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37218  // Requires: SSE
 37219  TEXT ·AmdAxpyPointerLoopXInterleave_V4A15U4(SB), NOSPLIT, $0-48
 37220  	MOVSS alpha+0(FP), X0
 37221  	MOVQ  xs+8(FP), AX
 37222  	MOVQ  incx+16(FP), CX
 37223  	MOVQ  CX, DX
 37224  	SHLQ  $0x04, DX
 37225  	MOVQ  ys+24(FP), DX
 37226  	MOVQ  incy+32(FP), BX
 37227  	MOVQ  BX, SI
 37228  	SHLQ  $0x04, SI
 37229  	MOVQ  n+40(FP), SI
 37230  	JMP   check_limit_unroll
 37231  	PCALIGN $0x08
 37232  	NOP
 37233  	NOP
 37234  	NOP
 37235  	NOP
 37236  	NOP
 37237  	NOP
 37238  	NOP
 37239  
 37240  loop_unroll:
 37241  	MOVSS (AX), X1
 37242  	LEAQ  (AX)(CX*4), AX
 37243  	MOVSS (AX), X2
 37244  	LEAQ  (AX)(CX*4), AX
 37245  	MOVSS (AX), X3
 37246  	LEAQ  (AX)(CX*4), AX
 37247  	MOVSS (AX), X4
 37248  	LEAQ  (AX)(CX*4), AX
 37249  	MULSS X0, X1
 37250  	MULSS X0, X2
 37251  	MULSS X0, X3
 37252  	MULSS X0, X4
 37253  	ADDSS (DX), X1
 37254  	MOVSS X1, (DX)
 37255  	LEAQ  (DX)(BX*4), DX
 37256  	ADDSS (DX), X2
 37257  	MOVSS X2, (DX)
 37258  	LEAQ  (DX)(BX*4), DX
 37259  	ADDSS (DX), X3
 37260  	MOVSS X3, (DX)
 37261  	LEAQ  (DX)(BX*4), DX
 37262  	ADDSS (DX), X4
 37263  	MOVSS X4, (DX)
 37264  	LEAQ  (DX)(BX*4), DX
 37265  	SUBQ  $0x04, SI
 37266  
 37267  check_limit_unroll:
 37268  	CMPQ SI, $0x04
 37269  	JHS  loop_unroll
 37270  	JMP  check_limit
 37271  
 37272  loop:
 37273  	MOVSS (AX), X1
 37274  	MULSS X0, X1
 37275  	ADDSS (DX), X1
 37276  	MOVSS X1, (DX)
 37277  	DECQ  SI
 37278  	LEAQ  (AX)(CX*4), AX
 37279  	LEAQ  (DX)(BX*4), DX
 37280  
 37281  check_limit:
 37282  	CMPQ SI, $0x00
 37283  	JHI  loop
 37284  	RET
 37285  
 37286  // func AmdAxpyPointerLoopXInterleave_V5A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37287  // Requires: SSE
 37288  TEXT ·AmdAxpyPointerLoopXInterleave_V5A15U4(SB), NOSPLIT, $0-48
 37289  	MOVSS alpha+0(FP), X0
 37290  	MOVQ  xs+8(FP), AX
 37291  	MOVQ  incx+16(FP), CX
 37292  	MOVQ  CX, DX
 37293  	SHLQ  $0x04, DX
 37294  	MOVQ  ys+24(FP), DX
 37295  	MOVQ  incy+32(FP), BX
 37296  	MOVQ  BX, SI
 37297  	SHLQ  $0x04, SI
 37298  	MOVQ  n+40(FP), SI
 37299  	JMP   check_limit_unroll
 37300  	PCALIGN $0x08
 37301  	NOP
 37302  	NOP
 37303  	NOP
 37304  	NOP
 37305  	NOP
 37306  	NOP
 37307  	NOP
 37308  
 37309  loop_unroll:
 37310  	MOVSS (AX), X1
 37311  	LEAQ  (AX)(CX*4), AX
 37312  	MOVSS (AX), X2
 37313  	LEAQ  (AX)(CX*4), AX
 37314  	MOVSS (AX), X3
 37315  	LEAQ  (AX)(CX*4), AX
 37316  	MOVSS (AX), X4
 37317  	LEAQ  (AX)(CX*4), AX
 37318  	MULSS X0, X1
 37319  	MULSS X0, X2
 37320  	MULSS X0, X3
 37321  	MULSS X0, X4
 37322  	ADDSS (DX), X1
 37323  	MOVSS X1, (DX)
 37324  	LEAQ  (DX)(BX*4), DX
 37325  	ADDSS (DX), X2
 37326  	MOVSS X2, (DX)
 37327  	LEAQ  (DX)(BX*4), DX
 37328  	ADDSS (DX), X3
 37329  	MOVSS X3, (DX)
 37330  	LEAQ  (DX)(BX*4), DX
 37331  	ADDSS (DX), X4
 37332  	MOVSS X4, (DX)
 37333  	LEAQ  (DX)(BX*4), DX
 37334  	SUBQ  $0x04, SI
 37335  
 37336  check_limit_unroll:
 37337  	CMPQ SI, $0x04
 37338  	JHS  loop_unroll
 37339  	JMP  check_limit
 37340  
 37341  loop:
 37342  	MOVSS (AX), X1
 37343  	MULSS X0, X1
 37344  	ADDSS (DX), X1
 37345  	MOVSS X1, (DX)
 37346  	DECQ  SI
 37347  	LEAQ  (AX)(CX*4), AX
 37348  	LEAQ  (DX)(BX*4), DX
 37349  
 37350  check_limit:
 37351  	CMPQ SI, $0x00
 37352  	JHI  loop
 37353  	RET
 37354  
 37355  // func AmdAxpyPointerLoopXInterleave_V0A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37356  // Requires: SSE
 37357  TEXT ·AmdAxpyPointerLoopXInterleave_V0A16U4(SB), NOSPLIT, $0-48
 37358  	MOVSS alpha+0(FP), X0
 37359  	MOVQ  xs+8(FP), AX
 37360  	MOVQ  incx+16(FP), CX
 37361  	MOVQ  CX, DX
 37362  	SHLQ  $0x04, DX
 37363  	MOVQ  ys+24(FP), DX
 37364  	MOVQ  incy+32(FP), BX
 37365  	MOVQ  BX, SI
 37366  	SHLQ  $0x04, SI
 37367  	MOVQ  n+40(FP), SI
 37368  	JMP   check_limit_unroll
 37369  	PCALIGN $0x10
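	// The A16 variants align the loop entry with PCALIGN $0x10 alone, with no trailing NOP padding.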
 37370  
 37371  loop_unroll:
 37372  	MOVSS (AX), X1
 37373  	LEAQ  (AX)(CX*4), AX
 37374  	MOVSS (AX), X2
 37375  	LEAQ  (AX)(CX*4), AX
 37376  	MOVSS (AX), X3
 37377  	LEAQ  (AX)(CX*4), AX
 37378  	MOVSS (AX), X4
 37379  	LEAQ  (AX)(CX*4), AX
 37380  	MULSS X0, X1
 37381  	MULSS X0, X2
 37382  	MULSS X0, X3
 37383  	MULSS X0, X4
 37384  	ADDSS (DX), X1
 37385  	MOVSS X1, (DX)
 37386  	LEAQ  (DX)(BX*4), DX
 37387  	ADDSS (DX), X2
 37388  	MOVSS X2, (DX)
 37389  	LEAQ  (DX)(BX*4), DX
 37390  	ADDSS (DX), X3
 37391  	MOVSS X3, (DX)
 37392  	LEAQ  (DX)(BX*4), DX
 37393  	ADDSS (DX), X4
 37394  	MOVSS X4, (DX)
 37395  	LEAQ  (DX)(BX*4), DX
 37396  	SUBQ  $0x04, SI
 37397  
 37398  check_limit_unroll:
 37399  	CMPQ SI, $0x04
 37400  	JHS  loop_unroll
 37401  	JMP  check_limit
 37402  
 37403  loop:
 37404  	MOVSS (AX), X1
 37405  	MULSS X0, X1
 37406  	ADDSS (DX), X1
 37407  	MOVSS X1, (DX)
 37408  	DECQ  SI
 37409  	LEAQ  (AX)(CX*4), AX
 37410  	LEAQ  (DX)(BX*4), DX
 37411  
 37412  check_limit:
 37413  	CMPQ SI, $0x00
 37414  	JHI  loop
 37415  	RET
 37416  
 37417  // func AmdAxpyPointerLoopXInterleave_V1A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37418  // Requires: SSE
 37419  TEXT ·AmdAxpyPointerLoopXInterleave_V1A16U4(SB), NOSPLIT, $0-48
 37420  	MOVSS alpha+0(FP), X0
 37421  	MOVQ  xs+8(FP), AX
 37422  	MOVQ  incx+16(FP), CX
 37423  	MOVQ  CX, DX
 37424  	SHLQ  $0x04, DX
 37425  	MOVQ  ys+24(FP), DX
 37426  	MOVQ  incy+32(FP), BX
 37427  	MOVQ  BX, SI
 37428  	SHLQ  $0x04, SI
 37429  	MOVQ  n+40(FP), SI
 37430  	JMP   check_limit_unroll
 37431  	PCALIGN $0x10
 37432  
 37433  loop_unroll:
 37434  	MOVSS (AX), X1
 37435  	LEAQ  (AX)(CX*4), AX
 37436  	MOVSS (AX), X2
 37437  	LEAQ  (AX)(CX*4), AX
 37438  	MOVSS (AX), X3
 37439  	LEAQ  (AX)(CX*4), AX
 37440  	MOVSS (AX), X4
 37441  	LEAQ  (AX)(CX*4), AX
 37442  	MULSS X0, X1
 37443  	MULSS X0, X2
 37444  	MULSS X0, X3
 37445  	MULSS X0, X4
 37446  	ADDSS (DX), X1
 37447  	MOVSS X1, (DX)
 37448  	LEAQ  (DX)(BX*4), DX
 37449  	ADDSS (DX), X2
 37450  	MOVSS X2, (DX)
 37451  	LEAQ  (DX)(BX*4), DX
 37452  	ADDSS (DX), X3
 37453  	MOVSS X3, (DX)
 37454  	LEAQ  (DX)(BX*4), DX
 37455  	ADDSS (DX), X4
 37456  	MOVSS X4, (DX)
 37457  	LEAQ  (DX)(BX*4), DX
 37458  	SUBQ  $0x04, SI
 37459  
 37460  check_limit_unroll:
 37461  	CMPQ SI, $0x04
 37462  	JHS  loop_unroll
 37463  	JMP  check_limit
 37464  
 37465  loop:
 37466  	MOVSS (AX), X1
 37467  	MULSS X0, X1
 37468  	ADDSS (DX), X1
 37469  	MOVSS X1, (DX)
 37470  	DECQ  SI
 37471  	LEAQ  (AX)(CX*4), AX
 37472  	LEAQ  (DX)(BX*4), DX
 37473  
 37474  check_limit:
 37475  	CMPQ SI, $0x00
 37476  	JHI  loop
 37477  	RET
 37478  
 37479  // func AmdAxpyPointerLoopXInterleave_V2A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37480  // Requires: SSE
 37481  TEXT ·AmdAxpyPointerLoopXInterleave_V2A16U4(SB), NOSPLIT, $0-48
 37482  	MOVSS alpha+0(FP), X0
 37483  	MOVQ  xs+8(FP), AX
 37484  	MOVQ  incx+16(FP), CX
 37485  	MOVQ  CX, DX
 37486  	SHLQ  $0x04, DX
 37487  	MOVQ  ys+24(FP), DX
 37488  	MOVQ  incy+32(FP), BX
 37489  	MOVQ  BX, SI
 37490  	SHLQ  $0x04, SI
 37491  	MOVQ  n+40(FP), SI
 37492  	JMP   check_limit_unroll
 37493  	PCALIGN $0x10
 37494  
 37495  loop_unroll:
 37496  	MOVSS (AX), X1
 37497  	LEAQ  (AX)(CX*4), AX
 37498  	MOVSS (AX), X2
 37499  	LEAQ  (AX)(CX*4), AX
 37500  	MOVSS (AX), X3
 37501  	LEAQ  (AX)(CX*4), AX
 37502  	MOVSS (AX), X4
 37503  	LEAQ  (AX)(CX*4), AX
 37504  	MULSS X0, X1
 37505  	MULSS X0, X2
 37506  	MULSS X0, X3
 37507  	MULSS X0, X4
 37508  	ADDSS (DX), X1
 37509  	MOVSS X1, (DX)
 37510  	LEAQ  (DX)(BX*4), DX
 37511  	ADDSS (DX), X2
 37512  	MOVSS X2, (DX)
 37513  	LEAQ  (DX)(BX*4), DX
 37514  	ADDSS (DX), X3
 37515  	MOVSS X3, (DX)
 37516  	LEAQ  (DX)(BX*4), DX
 37517  	ADDSS (DX), X4
 37518  	MOVSS X4, (DX)
 37519  	LEAQ  (DX)(BX*4), DX
 37520  	SUBQ  $0x04, SI
 37521  
 37522  check_limit_unroll:
 37523  	CMPQ SI, $0x04
 37524  	JHS  loop_unroll
 37525  	JMP  check_limit
 37526  
 37527  loop:
 37528  	MOVSS (AX), X1
 37529  	MULSS X0, X1
 37530  	ADDSS (DX), X1
 37531  	MOVSS X1, (DX)
 37532  	DECQ  SI
 37533  	LEAQ  (AX)(CX*4), AX
 37534  	LEAQ  (DX)(BX*4), DX
 37535  
 37536  check_limit:
 37537  	CMPQ SI, $0x00
 37538  	JHI  loop
 37539  	RET
 37540  
 37541  // func AmdAxpyPointerLoopXInterleave_V3A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37542  // Requires: SSE
 37543  TEXT ·AmdAxpyPointerLoopXInterleave_V3A16U4(SB), NOSPLIT, $0-48
 37544  	MOVSS alpha+0(FP), X0
 37545  	MOVQ  xs+8(FP), AX
 37546  	MOVQ  incx+16(FP), CX
 37547  	MOVQ  CX, DX
 37548  	SHLQ  $0x04, DX
 37549  	MOVQ  ys+24(FP), DX
 37550  	MOVQ  incy+32(FP), BX
 37551  	MOVQ  BX, SI
 37552  	SHLQ  $0x04, SI
 37553  	MOVQ  n+40(FP), SI
 37554  	JMP   check_limit_unroll
 37555  	PCALIGN $0x10
 37556  
 37557  loop_unroll:
 37558  	MOVSS (AX), X1
 37559  	LEAQ  (AX)(CX*4), AX
 37560  	MOVSS (AX), X2
 37561  	LEAQ  (AX)(CX*4), AX
 37562  	MOVSS (AX), X3
 37563  	LEAQ  (AX)(CX*4), AX
 37564  	MOVSS (AX), X4
 37565  	LEAQ  (AX)(CX*4), AX
 37566  	MULSS X0, X1
 37567  	MULSS X0, X2
 37568  	MULSS X0, X3
 37569  	MULSS X0, X4
 37570  	ADDSS (DX), X1
 37571  	MOVSS X1, (DX)
 37572  	LEAQ  (DX)(BX*4), DX
 37573  	ADDSS (DX), X2
 37574  	MOVSS X2, (DX)
 37575  	LEAQ  (DX)(BX*4), DX
 37576  	ADDSS (DX), X3
 37577  	MOVSS X3, (DX)
 37578  	LEAQ  (DX)(BX*4), DX
 37579  	ADDSS (DX), X4
 37580  	MOVSS X4, (DX)
 37581  	LEAQ  (DX)(BX*4), DX
 37582  	SUBQ  $0x04, SI
 37583  
 37584  check_limit_unroll:
 37585  	CMPQ SI, $0x04
 37586  	JHS  loop_unroll
 37587  	JMP  check_limit
 37588  
 37589  loop:
 37590  	MOVSS (AX), X1
 37591  	MULSS X0, X1
 37592  	ADDSS (DX), X1
 37593  	MOVSS X1, (DX)
 37594  	DECQ  SI
 37595  	LEAQ  (AX)(CX*4), AX
 37596  	LEAQ  (DX)(BX*4), DX
 37597  
 37598  check_limit:
 37599  	CMPQ SI, $0x00
 37600  	JHI  loop
 37601  	RET
 37602  
 37603  // func AmdAxpyPointerLoopXInterleave_V4A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37604  // Requires: SSE
 37605  TEXT ·AmdAxpyPointerLoopXInterleave_V4A16U4(SB), NOSPLIT, $0-48
 37606  	MOVSS alpha+0(FP), X0
 37607  	MOVQ  xs+8(FP), AX
 37608  	MOVQ  incx+16(FP), CX
 37609  	MOVQ  CX, DX
 37610  	SHLQ  $0x04, DX
 37611  	MOVQ  ys+24(FP), DX
 37612  	MOVQ  incy+32(FP), BX
 37613  	MOVQ  BX, SI
 37614  	SHLQ  $0x04, SI
 37615  	MOVQ  n+40(FP), SI
 37616  	JMP   check_limit_unroll
 37617  	PCALIGN $0x10
 37618  
 37619  loop_unroll:
 37620  	MOVSS (AX), X1
 37621  	LEAQ  (AX)(CX*4), AX
 37622  	MOVSS (AX), X2
 37623  	LEAQ  (AX)(CX*4), AX
 37624  	MOVSS (AX), X3
 37625  	LEAQ  (AX)(CX*4), AX
 37626  	MOVSS (AX), X4
 37627  	LEAQ  (AX)(CX*4), AX
 37628  	MULSS X0, X1
 37629  	MULSS X0, X2
 37630  	MULSS X0, X3
 37631  	MULSS X0, X4
 37632  	ADDSS (DX), X1
 37633  	MOVSS X1, (DX)
 37634  	LEAQ  (DX)(BX*4), DX
 37635  	ADDSS (DX), X2
 37636  	MOVSS X2, (DX)
 37637  	LEAQ  (DX)(BX*4), DX
 37638  	ADDSS (DX), X3
 37639  	MOVSS X3, (DX)
 37640  	LEAQ  (DX)(BX*4), DX
 37641  	ADDSS (DX), X4
 37642  	MOVSS X4, (DX)
 37643  	LEAQ  (DX)(BX*4), DX
 37644  	SUBQ  $0x04, SI
 37645  
 37646  check_limit_unroll:
 37647  	CMPQ SI, $0x04
 37648  	JHS  loop_unroll
 37649  	JMP  check_limit
 37650  
 37651  loop:
 37652  	MOVSS (AX), X1
 37653  	MULSS X0, X1
 37654  	ADDSS (DX), X1
 37655  	MOVSS X1, (DX)
 37656  	DECQ  SI
 37657  	LEAQ  (AX)(CX*4), AX
 37658  	LEAQ  (DX)(BX*4), DX
 37659  
 37660  check_limit:
 37661  	CMPQ SI, $0x00
 37662  	JHI  loop
 37663  	RET
 37664  
 37665  // func AmdAxpyPointerLoopXInterleave_V5A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37666  // Requires: SSE
 37667  TEXT ·AmdAxpyPointerLoopXInterleave_V5A16U4(SB), NOSPLIT, $0-48
 37668  	MOVSS alpha+0(FP), X0
 37669  	MOVQ  xs+8(FP), AX
 37670  	MOVQ  incx+16(FP), CX
 37671  	MOVQ  CX, DX
 37672  	SHLQ  $0x04, DX
 37673  	MOVQ  ys+24(FP), DX
 37674  	MOVQ  incy+32(FP), BX
 37675  	MOVQ  BX, SI
 37676  	SHLQ  $0x04, SI
 37677  	MOVQ  n+40(FP), SI
 37678  	JMP   check_limit_unroll
 37679  	PCALIGN $0x10
 37680  
 37681  loop_unroll:
 37682  	MOVSS (AX), X1
 37683  	LEAQ  (AX)(CX*4), AX
 37684  	MOVSS (AX), X2
 37685  	LEAQ  (AX)(CX*4), AX
 37686  	MOVSS (AX), X3
 37687  	LEAQ  (AX)(CX*4), AX
 37688  	MOVSS (AX), X4
 37689  	LEAQ  (AX)(CX*4), AX
 37690  	MULSS X0, X1
 37691  	MULSS X0, X2
 37692  	MULSS X0, X3
 37693  	MULSS X0, X4
 37694  	ADDSS (DX), X1
 37695  	MOVSS X1, (DX)
 37696  	LEAQ  (DX)(BX*4), DX
 37697  	ADDSS (DX), X2
 37698  	MOVSS X2, (DX)
 37699  	LEAQ  (DX)(BX*4), DX
 37700  	ADDSS (DX), X3
 37701  	MOVSS X3, (DX)
 37702  	LEAQ  (DX)(BX*4), DX
 37703  	ADDSS (DX), X4
 37704  	MOVSS X4, (DX)
 37705  	LEAQ  (DX)(BX*4), DX
 37706  	SUBQ  $0x04, SI
 37707  
 37708  check_limit_unroll:
 37709  	CMPQ SI, $0x04
 37710  	JHS  loop_unroll
 37711  	JMP  check_limit
 37712  
 37713  loop:
 37714  	MOVSS (AX), X1
 37715  	MULSS X0, X1
 37716  	ADDSS (DX), X1
 37717  	MOVSS X1, (DX)
 37718  	DECQ  SI
 37719  	LEAQ  (AX)(CX*4), AX
 37720  	LEAQ  (DX)(BX*4), DX
 37721  
 37722  check_limit:
 37723  	CMPQ SI, $0x00
 37724  	JHI  loop
 37725  	RET
 37726  
 37727  // func AmdAxpyPointerLoopXInterleave_V0A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37728  // Requires: SSE
 37729  TEXT ·AmdAxpyPointerLoopXInterleave_V0A0U8(SB), NOSPLIT, $0-48
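	// U8 variant: same setup as the U4 routines above, but the main loop is
	// unrolled eight-wide, staging eight scaled x values in X1-X8 before the
	// eight read-add-store passes over ys. "A0" variants add no alignment
	// padding before loop_unroll.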
 37730  	MOVSS alpha+0(FP), X0
 37731  	MOVQ  xs+8(FP), AX
 37732  	MOVQ  incx+16(FP), CX
 37733  	MOVQ  CX, DX
 37734  	SHLQ  $0x05, DX
 37735  	MOVQ  ys+24(FP), DX
 37736  	MOVQ  incy+32(FP), BX
 37737  	MOVQ  BX, SI
 37738  	SHLQ  $0x05, SI
 37739  	MOVQ  n+40(FP), SI
 37740  	JMP   check_limit_unroll
 37741  
 37742  loop_unroll:
 37743  	MOVSS (AX), X1
 37744  	LEAQ  (AX)(CX*4), AX
 37745  	MOVSS (AX), X2
 37746  	LEAQ  (AX)(CX*4), AX
 37747  	MOVSS (AX), X3
 37748  	LEAQ  (AX)(CX*4), AX
 37749  	MOVSS (AX), X4
 37750  	LEAQ  (AX)(CX*4), AX
 37751  	MOVSS (AX), X5
 37752  	LEAQ  (AX)(CX*4), AX
 37753  	MOVSS (AX), X6
 37754  	LEAQ  (AX)(CX*4), AX
 37755  	MOVSS (AX), X7
 37756  	LEAQ  (AX)(CX*4), AX
 37757  	MOVSS (AX), X8
 37758  	LEAQ  (AX)(CX*4), AX
 37759  	MULSS X0, X1
 37760  	MULSS X0, X2
 37761  	MULSS X0, X3
 37762  	MULSS X0, X4
 37763  	MULSS X0, X5
 37764  	MULSS X0, X6
 37765  	MULSS X0, X7
 37766  	MULSS X0, X8
 37767  	ADDSS (DX), X1
 37768  	MOVSS X1, (DX)
 37769  	LEAQ  (DX)(BX*4), DX
 37770  	ADDSS (DX), X2
 37771  	MOVSS X2, (DX)
 37772  	LEAQ  (DX)(BX*4), DX
 37773  	ADDSS (DX), X3
 37774  	MOVSS X3, (DX)
 37775  	LEAQ  (DX)(BX*4), DX
 37776  	ADDSS (DX), X4
 37777  	MOVSS X4, (DX)
 37778  	LEAQ  (DX)(BX*4), DX
 37779  	ADDSS (DX), X5
 37780  	MOVSS X5, (DX)
 37781  	LEAQ  (DX)(BX*4), DX
 37782  	ADDSS (DX), X6
 37783  	MOVSS X6, (DX)
 37784  	LEAQ  (DX)(BX*4), DX
 37785  	ADDSS (DX), X7
 37786  	MOVSS X7, (DX)
 37787  	LEAQ  (DX)(BX*4), DX
 37788  	ADDSS (DX), X8
 37789  	MOVSS X8, (DX)
 37790  	LEAQ  (DX)(BX*4), DX
 37791  	SUBQ  $0x08, SI
 37792  
 37793  check_limit_unroll:
 37794  	CMPQ SI, $0x08
 37795  	JHS  loop_unroll
 37796  	JMP  check_limit
 37797  
 37798  loop:
 37799  	MOVSS (AX), X1
 37800  	MULSS X0, X1
 37801  	ADDSS (DX), X1
 37802  	MOVSS X1, (DX)
 37803  	DECQ  SI
 37804  	LEAQ  (AX)(CX*4), AX
 37805  	LEAQ  (DX)(BX*4), DX
 37806  
 37807  check_limit:
 37808  	CMPQ SI, $0x00
 37809  	JHI  loop
 37810  	RET
 37811  
 37812  // func AmdAxpyPointerLoopXInterleave_V1A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37813  // Requires: SSE
 37814  TEXT ·AmdAxpyPointerLoopXInterleave_V1A0U8(SB), NOSPLIT, $0-48
 37815  	MOVSS alpha+0(FP), X0
 37816  	MOVQ  xs+8(FP), AX
 37817  	MOVQ  incx+16(FP), CX
 37818  	MOVQ  CX, DX
 37819  	SHLQ  $0x05, DX
 37820  	MOVQ  ys+24(FP), DX
 37821  	MOVQ  incy+32(FP), BX
 37822  	MOVQ  BX, SI
 37823  	SHLQ  $0x05, SI
 37824  	MOVQ  n+40(FP), SI
 37825  	JMP   check_limit_unroll
 37826  
 37827  loop_unroll:
 37828  	MOVSS (AX), X1
 37829  	LEAQ  (AX)(CX*4), AX
 37830  	MOVSS (AX), X2
 37831  	LEAQ  (AX)(CX*4), AX
 37832  	MOVSS (AX), X3
 37833  	LEAQ  (AX)(CX*4), AX
 37834  	MOVSS (AX), X4
 37835  	LEAQ  (AX)(CX*4), AX
 37836  	MOVSS (AX), X5
 37837  	LEAQ  (AX)(CX*4), AX
 37838  	MOVSS (AX), X6
 37839  	LEAQ  (AX)(CX*4), AX
 37840  	MOVSS (AX), X7
 37841  	LEAQ  (AX)(CX*4), AX
 37842  	MOVSS (AX), X8
 37843  	LEAQ  (AX)(CX*4), AX
 37844  	MULSS X0, X1
 37845  	MULSS X0, X2
 37846  	MULSS X0, X3
 37847  	MULSS X0, X4
 37848  	MULSS X0, X5
 37849  	MULSS X0, X6
 37850  	MULSS X0, X7
 37851  	MULSS X0, X8
 37852  	ADDSS (DX), X1
 37853  	MOVSS X1, (DX)
 37854  	LEAQ  (DX)(BX*4), DX
 37855  	ADDSS (DX), X2
 37856  	MOVSS X2, (DX)
 37857  	LEAQ  (DX)(BX*4), DX
 37858  	ADDSS (DX), X3
 37859  	MOVSS X3, (DX)
 37860  	LEAQ  (DX)(BX*4), DX
 37861  	ADDSS (DX), X4
 37862  	MOVSS X4, (DX)
 37863  	LEAQ  (DX)(BX*4), DX
 37864  	ADDSS (DX), X5
 37865  	MOVSS X5, (DX)
 37866  	LEAQ  (DX)(BX*4), DX
 37867  	ADDSS (DX), X6
 37868  	MOVSS X6, (DX)
 37869  	LEAQ  (DX)(BX*4), DX
 37870  	ADDSS (DX), X7
 37871  	MOVSS X7, (DX)
 37872  	LEAQ  (DX)(BX*4), DX
 37873  	ADDSS (DX), X8
 37874  	MOVSS X8, (DX)
 37875  	LEAQ  (DX)(BX*4), DX
 37876  	SUBQ  $0x08, SI
 37877  
 37878  check_limit_unroll:
 37879  	CMPQ SI, $0x08
 37880  	JHS  loop_unroll
 37881  	JMP  check_limit
 37882  
 37883  loop:
 37884  	MOVSS (AX), X1
 37885  	MULSS X0, X1
 37886  	ADDSS (DX), X1
 37887  	MOVSS X1, (DX)
 37888  	DECQ  SI
 37889  	LEAQ  (AX)(CX*4), AX
 37890  	LEAQ  (DX)(BX*4), DX
 37891  
 37892  check_limit:
 37893  	CMPQ SI, $0x00
 37894  	JHI  loop
 37895  	RET
 37896  
 37897  // func AmdAxpyPointerLoopXInterleave_V2A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37898  // Requires: SSE
 37899  TEXT ·AmdAxpyPointerLoopXInterleave_V2A0U8(SB), NOSPLIT, $0-48
 37900  	MOVSS alpha+0(FP), X0
 37901  	MOVQ  xs+8(FP), AX
 37902  	MOVQ  incx+16(FP), CX
 37903  	MOVQ  CX, DX
 37904  	SHLQ  $0x05, DX
 37905  	MOVQ  ys+24(FP), DX
 37906  	MOVQ  incy+32(FP), BX
 37907  	MOVQ  BX, SI
 37908  	SHLQ  $0x05, SI
 37909  	MOVQ  n+40(FP), SI
 37910  	JMP   check_limit_unroll
 37911  
 37912  loop_unroll:
 37913  	MOVSS (AX), X1
 37914  	LEAQ  (AX)(CX*4), AX
 37915  	MOVSS (AX), X2
 37916  	LEAQ  (AX)(CX*4), AX
 37917  	MOVSS (AX), X3
 37918  	LEAQ  (AX)(CX*4), AX
 37919  	MOVSS (AX), X4
 37920  	LEAQ  (AX)(CX*4), AX
 37921  	MOVSS (AX), X5
 37922  	LEAQ  (AX)(CX*4), AX
 37923  	MOVSS (AX), X6
 37924  	LEAQ  (AX)(CX*4), AX
 37925  	MOVSS (AX), X7
 37926  	LEAQ  (AX)(CX*4), AX
 37927  	MOVSS (AX), X8
 37928  	LEAQ  (AX)(CX*4), AX
 37929  	MULSS X0, X1
 37930  	MULSS X0, X2
 37931  	MULSS X0, X3
 37932  	MULSS X0, X4
 37933  	MULSS X0, X5
 37934  	MULSS X0, X6
 37935  	MULSS X0, X7
 37936  	MULSS X0, X8
 37937  	ADDSS (DX), X1
 37938  	MOVSS X1, (DX)
 37939  	LEAQ  (DX)(BX*4), DX
 37940  	ADDSS (DX), X2
 37941  	MOVSS X2, (DX)
 37942  	LEAQ  (DX)(BX*4), DX
 37943  	ADDSS (DX), X3
 37944  	MOVSS X3, (DX)
 37945  	LEAQ  (DX)(BX*4), DX
 37946  	ADDSS (DX), X4
 37947  	MOVSS X4, (DX)
 37948  	LEAQ  (DX)(BX*4), DX
 37949  	ADDSS (DX), X5
 37950  	MOVSS X5, (DX)
 37951  	LEAQ  (DX)(BX*4), DX
 37952  	ADDSS (DX), X6
 37953  	MOVSS X6, (DX)
 37954  	LEAQ  (DX)(BX*4), DX
 37955  	ADDSS (DX), X7
 37956  	MOVSS X7, (DX)
 37957  	LEAQ  (DX)(BX*4), DX
 37958  	ADDSS (DX), X8
 37959  	MOVSS X8, (DX)
 37960  	LEAQ  (DX)(BX*4), DX
 37961  	SUBQ  $0x08, SI
 37962  
 37963  check_limit_unroll:
 37964  	CMPQ SI, $0x08
 37965  	JHS  loop_unroll
 37966  	JMP  check_limit
 37967  
 37968  loop:
 37969  	MOVSS (AX), X1
 37970  	MULSS X0, X1
 37971  	ADDSS (DX), X1
 37972  	MOVSS X1, (DX)
 37973  	DECQ  SI
 37974  	LEAQ  (AX)(CX*4), AX
 37975  	LEAQ  (DX)(BX*4), DX
 37976  
 37977  check_limit:
 37978  	CMPQ SI, $0x00
 37979  	JHI  loop
 37980  	RET
 37981  
 37982  // func AmdAxpyPointerLoopXInterleave_V3A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 37983  // Requires: SSE
 37984  TEXT ·AmdAxpyPointerLoopXInterleave_V3A0U8(SB), NOSPLIT, $0-48
 37985  	MOVSS alpha+0(FP), X0
 37986  	MOVQ  xs+8(FP), AX
 37987  	MOVQ  incx+16(FP), CX
 37988  	MOVQ  CX, DX
 37989  	SHLQ  $0x05, DX
 37990  	MOVQ  ys+24(FP), DX
 37991  	MOVQ  incy+32(FP), BX
 37992  	MOVQ  BX, SI
 37993  	SHLQ  $0x05, SI
 37994  	MOVQ  n+40(FP), SI
 37995  	JMP   check_limit_unroll
 37996  
 37997  loop_unroll:
 37998  	MOVSS (AX), X1
 37999  	LEAQ  (AX)(CX*4), AX
 38000  	MOVSS (AX), X2
 38001  	LEAQ  (AX)(CX*4), AX
 38002  	MOVSS (AX), X3
 38003  	LEAQ  (AX)(CX*4), AX
 38004  	MOVSS (AX), X4
 38005  	LEAQ  (AX)(CX*4), AX
 38006  	MOVSS (AX), X5
 38007  	LEAQ  (AX)(CX*4), AX
 38008  	MOVSS (AX), X6
 38009  	LEAQ  (AX)(CX*4), AX
 38010  	MOVSS (AX), X7
 38011  	LEAQ  (AX)(CX*4), AX
 38012  	MOVSS (AX), X8
 38013  	LEAQ  (AX)(CX*4), AX
 38014  	MULSS X0, X1
 38015  	MULSS X0, X2
 38016  	MULSS X0, X3
 38017  	MULSS X0, X4
 38018  	MULSS X0, X5
 38019  	MULSS X0, X6
 38020  	MULSS X0, X7
 38021  	MULSS X0, X8
 38022  	ADDSS (DX), X1
 38023  	MOVSS X1, (DX)
 38024  	LEAQ  (DX)(BX*4), DX
 38025  	ADDSS (DX), X2
 38026  	MOVSS X2, (DX)
 38027  	LEAQ  (DX)(BX*4), DX
 38028  	ADDSS (DX), X3
 38029  	MOVSS X3, (DX)
 38030  	LEAQ  (DX)(BX*4), DX
 38031  	ADDSS (DX), X4
 38032  	MOVSS X4, (DX)
 38033  	LEAQ  (DX)(BX*4), DX
 38034  	ADDSS (DX), X5
 38035  	MOVSS X5, (DX)
 38036  	LEAQ  (DX)(BX*4), DX
 38037  	ADDSS (DX), X6
 38038  	MOVSS X6, (DX)
 38039  	LEAQ  (DX)(BX*4), DX
 38040  	ADDSS (DX), X7
 38041  	MOVSS X7, (DX)
 38042  	LEAQ  (DX)(BX*4), DX
 38043  	ADDSS (DX), X8
 38044  	MOVSS X8, (DX)
 38045  	LEAQ  (DX)(BX*4), DX
 38046  	SUBQ  $0x08, SI
 38047  
 38048  check_limit_unroll:
 38049  	CMPQ SI, $0x08
 38050  	JHS  loop_unroll
 38051  	JMP  check_limit
 38052  
 38053  loop:
 38054  	MOVSS (AX), X1
 38055  	MULSS X0, X1
 38056  	ADDSS (DX), X1
 38057  	MOVSS X1, (DX)
 38058  	DECQ  SI
 38059  	LEAQ  (AX)(CX*4), AX
 38060  	LEAQ  (DX)(BX*4), DX
 38061  
 38062  check_limit:
 38063  	CMPQ SI, $0x00
 38064  	JHI  loop
 38065  	RET
 38066  
 38067  // func AmdAxpyPointerLoopXInterleave_V4A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38068  // Requires: SSE
 38069  TEXT ·AmdAxpyPointerLoopXInterleave_V4A0U8(SB), NOSPLIT, $0-48
 38070  	MOVSS alpha+0(FP), X0
 38071  	MOVQ  xs+8(FP), AX
 38072  	MOVQ  incx+16(FP), CX
 38073  	MOVQ  CX, DX
 38074  	SHLQ  $0x05, DX
 38075  	MOVQ  ys+24(FP), DX
 38076  	MOVQ  incy+32(FP), BX
 38077  	MOVQ  BX, SI
 38078  	SHLQ  $0x05, SI
 38079  	MOVQ  n+40(FP), SI
 38080  	JMP   check_limit_unroll
 38081  
 38082  loop_unroll:
 38083  	MOVSS (AX), X1
 38084  	LEAQ  (AX)(CX*4), AX
 38085  	MOVSS (AX), X2
 38086  	LEAQ  (AX)(CX*4), AX
 38087  	MOVSS (AX), X3
 38088  	LEAQ  (AX)(CX*4), AX
 38089  	MOVSS (AX), X4
 38090  	LEAQ  (AX)(CX*4), AX
 38091  	MOVSS (AX), X5
 38092  	LEAQ  (AX)(CX*4), AX
 38093  	MOVSS (AX), X6
 38094  	LEAQ  (AX)(CX*4), AX
 38095  	MOVSS (AX), X7
 38096  	LEAQ  (AX)(CX*4), AX
 38097  	MOVSS (AX), X8
 38098  	LEAQ  (AX)(CX*4), AX
 38099  	MULSS X0, X1
 38100  	MULSS X0, X2
 38101  	MULSS X0, X3
 38102  	MULSS X0, X4
 38103  	MULSS X0, X5
 38104  	MULSS X0, X6
 38105  	MULSS X0, X7
 38106  	MULSS X0, X8
 38107  	ADDSS (DX), X1
 38108  	MOVSS X1, (DX)
 38109  	LEAQ  (DX)(BX*4), DX
 38110  	ADDSS (DX), X2
 38111  	MOVSS X2, (DX)
 38112  	LEAQ  (DX)(BX*4), DX
 38113  	ADDSS (DX), X3
 38114  	MOVSS X3, (DX)
 38115  	LEAQ  (DX)(BX*4), DX
 38116  	ADDSS (DX), X4
 38117  	MOVSS X4, (DX)
 38118  	LEAQ  (DX)(BX*4), DX
 38119  	ADDSS (DX), X5
 38120  	MOVSS X5, (DX)
 38121  	LEAQ  (DX)(BX*4), DX
 38122  	ADDSS (DX), X6
 38123  	MOVSS X6, (DX)
 38124  	LEAQ  (DX)(BX*4), DX
 38125  	ADDSS (DX), X7
 38126  	MOVSS X7, (DX)
 38127  	LEAQ  (DX)(BX*4), DX
 38128  	ADDSS (DX), X8
 38129  	MOVSS X8, (DX)
 38130  	LEAQ  (DX)(BX*4), DX
 38131  	SUBQ  $0x08, SI
 38132  
 38133  check_limit_unroll:
 38134  	CMPQ SI, $0x08
 38135  	JHS  loop_unroll
 38136  	JMP  check_limit
 38137  
 38138  loop:
 38139  	MOVSS (AX), X1
 38140  	MULSS X0, X1
 38141  	ADDSS (DX), X1
 38142  	MOVSS X1, (DX)
 38143  	DECQ  SI
 38144  	LEAQ  (AX)(CX*4), AX
 38145  	LEAQ  (DX)(BX*4), DX
 38146  
 38147  check_limit:
 38148  	CMPQ SI, $0x00
 38149  	JHI  loop
 38150  	RET
 38151  
 38152  // func AmdAxpyPointerLoopXInterleave_V5A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38153  // Requires: SSE
 38154  TEXT ·AmdAxpyPointerLoopXInterleave_V5A0U8(SB), NOSPLIT, $0-48
 38155  	MOVSS alpha+0(FP), X0
 38156  	MOVQ  xs+8(FP), AX
 38157  	MOVQ  incx+16(FP), CX
 38158  	MOVQ  CX, DX
 38159  	SHLQ  $0x05, DX
 38160  	MOVQ  ys+24(FP), DX
 38161  	MOVQ  incy+32(FP), BX
 38162  	MOVQ  BX, SI
 38163  	SHLQ  $0x05, SI
 38164  	MOVQ  n+40(FP), SI
 38165  	JMP   check_limit_unroll
 38166  
 38167  loop_unroll:
 38168  	MOVSS (AX), X1
 38169  	LEAQ  (AX)(CX*4), AX
 38170  	MOVSS (AX), X2
 38171  	LEAQ  (AX)(CX*4), AX
 38172  	MOVSS (AX), X3
 38173  	LEAQ  (AX)(CX*4), AX
 38174  	MOVSS (AX), X4
 38175  	LEAQ  (AX)(CX*4), AX
 38176  	MOVSS (AX), X5
 38177  	LEAQ  (AX)(CX*4), AX
 38178  	MOVSS (AX), X6
 38179  	LEAQ  (AX)(CX*4), AX
 38180  	MOVSS (AX), X7
 38181  	LEAQ  (AX)(CX*4), AX
 38182  	MOVSS (AX), X8
 38183  	LEAQ  (AX)(CX*4), AX
 38184  	MULSS X0, X1
 38185  	MULSS X0, X2
 38186  	MULSS X0, X3
 38187  	MULSS X0, X4
 38188  	MULSS X0, X5
 38189  	MULSS X0, X6
 38190  	MULSS X0, X7
 38191  	MULSS X0, X8
 38192  	ADDSS (DX), X1
 38193  	MOVSS X1, (DX)
 38194  	LEAQ  (DX)(BX*4), DX
 38195  	ADDSS (DX), X2
 38196  	MOVSS X2, (DX)
 38197  	LEAQ  (DX)(BX*4), DX
 38198  	ADDSS (DX), X3
 38199  	MOVSS X3, (DX)
 38200  	LEAQ  (DX)(BX*4), DX
 38201  	ADDSS (DX), X4
 38202  	MOVSS X4, (DX)
 38203  	LEAQ  (DX)(BX*4), DX
 38204  	ADDSS (DX), X5
 38205  	MOVSS X5, (DX)
 38206  	LEAQ  (DX)(BX*4), DX
 38207  	ADDSS (DX), X6
 38208  	MOVSS X6, (DX)
 38209  	LEAQ  (DX)(BX*4), DX
 38210  	ADDSS (DX), X7
 38211  	MOVSS X7, (DX)
 38212  	LEAQ  (DX)(BX*4), DX
 38213  	ADDSS (DX), X8
 38214  	MOVSS X8, (DX)
 38215  	LEAQ  (DX)(BX*4), DX
 38216  	SUBQ  $0x08, SI
 38217  
 38218  check_limit_unroll:
 38219  	CMPQ SI, $0x08
 38220  	JHS  loop_unroll
 38221  	JMP  check_limit
 38222  
 38223  loop:
 38224  	MOVSS (AX), X1
 38225  	MULSS X0, X1
 38226  	ADDSS (DX), X1
 38227  	MOVSS X1, (DX)
 38228  	DECQ  SI
 38229  	LEAQ  (AX)(CX*4), AX
 38230  	LEAQ  (DX)(BX*4), DX
 38231  
 38232  check_limit:
 38233  	CMPQ SI, $0x00
 38234  	JHI  loop
 38235  	RET
 38236  
 38237  // func AmdAxpyPointerLoopXInterleave_V0A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38238  // Requires: SSE
 38239  TEXT ·AmdAxpyPointerLoopXInterleave_V0A8U8(SB), NOSPLIT, $0-48
 38240  	MOVSS alpha+0(FP), X0
 38241  	MOVQ  xs+8(FP), AX
 38242  	MOVQ  incx+16(FP), CX
 38243  	MOVQ  CX, DX
 38244  	SHLQ  $0x05, DX
 38245  	MOVQ  ys+24(FP), DX
 38246  	MOVQ  incy+32(FP), BX
 38247  	MOVQ  BX, SI
 38248  	SHLQ  $0x05, SI
 38249  	MOVQ  n+40(FP), SI
 38250  	JMP   check_limit_unroll
 38251  	PCALIGN $0x08
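	// "A8": align the start of loop_unroll to an 8-byte boundary.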
 38252  
 38253  loop_unroll:
 38254  	MOVSS (AX), X1
 38255  	LEAQ  (AX)(CX*4), AX
 38256  	MOVSS (AX), X2
 38257  	LEAQ  (AX)(CX*4), AX
 38258  	MOVSS (AX), X3
 38259  	LEAQ  (AX)(CX*4), AX
 38260  	MOVSS (AX), X4
 38261  	LEAQ  (AX)(CX*4), AX
 38262  	MOVSS (AX), X5
 38263  	LEAQ  (AX)(CX*4), AX
 38264  	MOVSS (AX), X6
 38265  	LEAQ  (AX)(CX*4), AX
 38266  	MOVSS (AX), X7
 38267  	LEAQ  (AX)(CX*4), AX
 38268  	MOVSS (AX), X8
 38269  	LEAQ  (AX)(CX*4), AX
 38270  	MULSS X0, X1
 38271  	MULSS X0, X2
 38272  	MULSS X0, X3
 38273  	MULSS X0, X4
 38274  	MULSS X0, X5
 38275  	MULSS X0, X6
 38276  	MULSS X0, X7
 38277  	MULSS X0, X8
 38278  	ADDSS (DX), X1
 38279  	MOVSS X1, (DX)
 38280  	LEAQ  (DX)(BX*4), DX
 38281  	ADDSS (DX), X2
 38282  	MOVSS X2, (DX)
 38283  	LEAQ  (DX)(BX*4), DX
 38284  	ADDSS (DX), X3
 38285  	MOVSS X3, (DX)
 38286  	LEAQ  (DX)(BX*4), DX
 38287  	ADDSS (DX), X4
 38288  	MOVSS X4, (DX)
 38289  	LEAQ  (DX)(BX*4), DX
 38290  	ADDSS (DX), X5
 38291  	MOVSS X5, (DX)
 38292  	LEAQ  (DX)(BX*4), DX
 38293  	ADDSS (DX), X6
 38294  	MOVSS X6, (DX)
 38295  	LEAQ  (DX)(BX*4), DX
 38296  	ADDSS (DX), X7
 38297  	MOVSS X7, (DX)
 38298  	LEAQ  (DX)(BX*4), DX
 38299  	ADDSS (DX), X8
 38300  	MOVSS X8, (DX)
 38301  	LEAQ  (DX)(BX*4), DX
 38302  	SUBQ  $0x08, SI
 38303  
 38304  check_limit_unroll:
 38305  	CMPQ SI, $0x08
 38306  	JHS  loop_unroll
 38307  	JMP  check_limit
 38308  
 38309  loop:
 38310  	MOVSS (AX), X1
 38311  	MULSS X0, X1
 38312  	ADDSS (DX), X1
 38313  	MOVSS X1, (DX)
 38314  	DECQ  SI
 38315  	LEAQ  (AX)(CX*4), AX
 38316  	LEAQ  (DX)(BX*4), DX
 38317  
 38318  check_limit:
 38319  	CMPQ SI, $0x00
 38320  	JHI  loop
 38321  	RET
 38322  
 38323  // func AmdAxpyPointerLoopXInterleave_V1A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38324  // Requires: SSE
 38325  TEXT ·AmdAxpyPointerLoopXInterleave_V1A8U8(SB), NOSPLIT, $0-48
 38326  	MOVSS alpha+0(FP), X0
 38327  	MOVQ  xs+8(FP), AX
 38328  	MOVQ  incx+16(FP), CX
 38329  	MOVQ  CX, DX
 38330  	SHLQ  $0x05, DX
 38331  	MOVQ  ys+24(FP), DX
 38332  	MOVQ  incy+32(FP), BX
 38333  	MOVQ  BX, SI
 38334  	SHLQ  $0x05, SI
 38335  	MOVQ  n+40(FP), SI
 38336  	JMP   check_limit_unroll
 38337  	PCALIGN $0x08
 38338  
 38339  loop_unroll:
 38340  	MOVSS (AX), X1
 38341  	LEAQ  (AX)(CX*4), AX
 38342  	MOVSS (AX), X2
 38343  	LEAQ  (AX)(CX*4), AX
 38344  	MOVSS (AX), X3
 38345  	LEAQ  (AX)(CX*4), AX
 38346  	MOVSS (AX), X4
 38347  	LEAQ  (AX)(CX*4), AX
 38348  	MOVSS (AX), X5
 38349  	LEAQ  (AX)(CX*4), AX
 38350  	MOVSS (AX), X6
 38351  	LEAQ  (AX)(CX*4), AX
 38352  	MOVSS (AX), X7
 38353  	LEAQ  (AX)(CX*4), AX
 38354  	MOVSS (AX), X8
 38355  	LEAQ  (AX)(CX*4), AX
 38356  	MULSS X0, X1
 38357  	MULSS X0, X2
 38358  	MULSS X0, X3
 38359  	MULSS X0, X4
 38360  	MULSS X0, X5
 38361  	MULSS X0, X6
 38362  	MULSS X0, X7
 38363  	MULSS X0, X8
 38364  	ADDSS (DX), X1
 38365  	MOVSS X1, (DX)
 38366  	LEAQ  (DX)(BX*4), DX
 38367  	ADDSS (DX), X2
 38368  	MOVSS X2, (DX)
 38369  	LEAQ  (DX)(BX*4), DX
 38370  	ADDSS (DX), X3
 38371  	MOVSS X3, (DX)
 38372  	LEAQ  (DX)(BX*4), DX
 38373  	ADDSS (DX), X4
 38374  	MOVSS X4, (DX)
 38375  	LEAQ  (DX)(BX*4), DX
 38376  	ADDSS (DX), X5
 38377  	MOVSS X5, (DX)
 38378  	LEAQ  (DX)(BX*4), DX
 38379  	ADDSS (DX), X6
 38380  	MOVSS X6, (DX)
 38381  	LEAQ  (DX)(BX*4), DX
 38382  	ADDSS (DX), X7
 38383  	MOVSS X7, (DX)
 38384  	LEAQ  (DX)(BX*4), DX
 38385  	ADDSS (DX), X8
 38386  	MOVSS X8, (DX)
 38387  	LEAQ  (DX)(BX*4), DX
 38388  	SUBQ  $0x08, SI
 38389  
 38390  check_limit_unroll:
 38391  	CMPQ SI, $0x08
 38392  	JHS  loop_unroll
 38393  	JMP  check_limit
 38394  
 38395  loop:
 38396  	MOVSS (AX), X1
 38397  	MULSS X0, X1
 38398  	ADDSS (DX), X1
 38399  	MOVSS X1, (DX)
 38400  	DECQ  SI
 38401  	LEAQ  (AX)(CX*4), AX
 38402  	LEAQ  (DX)(BX*4), DX
 38403  
 38404  check_limit:
 38405  	CMPQ SI, $0x00
 38406  	JHI  loop
 38407  	RET
 38408  
 38409  // func AmdAxpyPointerLoopXInterleave_V2A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38410  // Requires: SSE
 38411  TEXT ·AmdAxpyPointerLoopXInterleave_V2A8U8(SB), NOSPLIT, $0-48
 38412  	MOVSS alpha+0(FP), X0
 38413  	MOVQ  xs+8(FP), AX
 38414  	MOVQ  incx+16(FP), CX
 38415  	MOVQ  CX, DX
 38416  	SHLQ  $0x05, DX
 38417  	MOVQ  ys+24(FP), DX
 38418  	MOVQ  incy+32(FP), BX
 38419  	MOVQ  BX, SI
 38420  	SHLQ  $0x05, SI
 38421  	MOVQ  n+40(FP), SI
 38422  	JMP   check_limit_unroll
 38423  	PCALIGN $0x08
 38424  
 38425  loop_unroll:
 38426  	MOVSS (AX), X1
 38427  	LEAQ  (AX)(CX*4), AX
 38428  	MOVSS (AX), X2
 38429  	LEAQ  (AX)(CX*4), AX
 38430  	MOVSS (AX), X3
 38431  	LEAQ  (AX)(CX*4), AX
 38432  	MOVSS (AX), X4
 38433  	LEAQ  (AX)(CX*4), AX
 38434  	MOVSS (AX), X5
 38435  	LEAQ  (AX)(CX*4), AX
 38436  	MOVSS (AX), X6
 38437  	LEAQ  (AX)(CX*4), AX
 38438  	MOVSS (AX), X7
 38439  	LEAQ  (AX)(CX*4), AX
 38440  	MOVSS (AX), X8
 38441  	LEAQ  (AX)(CX*4), AX
 38442  	MULSS X0, X1
 38443  	MULSS X0, X2
 38444  	MULSS X0, X3
 38445  	MULSS X0, X4
 38446  	MULSS X0, X5
 38447  	MULSS X0, X6
 38448  	MULSS X0, X7
 38449  	MULSS X0, X8
 38450  	ADDSS (DX), X1
 38451  	MOVSS X1, (DX)
 38452  	LEAQ  (DX)(BX*4), DX
 38453  	ADDSS (DX), X2
 38454  	MOVSS X2, (DX)
 38455  	LEAQ  (DX)(BX*4), DX
 38456  	ADDSS (DX), X3
 38457  	MOVSS X3, (DX)
 38458  	LEAQ  (DX)(BX*4), DX
 38459  	ADDSS (DX), X4
 38460  	MOVSS X4, (DX)
 38461  	LEAQ  (DX)(BX*4), DX
 38462  	ADDSS (DX), X5
 38463  	MOVSS X5, (DX)
 38464  	LEAQ  (DX)(BX*4), DX
 38465  	ADDSS (DX), X6
 38466  	MOVSS X6, (DX)
 38467  	LEAQ  (DX)(BX*4), DX
 38468  	ADDSS (DX), X7
 38469  	MOVSS X7, (DX)
 38470  	LEAQ  (DX)(BX*4), DX
 38471  	ADDSS (DX), X8
 38472  	MOVSS X8, (DX)
 38473  	LEAQ  (DX)(BX*4), DX
 38474  	SUBQ  $0x08, SI
 38475  
 38476  check_limit_unroll:
 38477  	CMPQ SI, $0x08
 38478  	JHS  loop_unroll
 38479  	JMP  check_limit
 38480  
 38481  loop:
 38482  	MOVSS (AX), X1
 38483  	MULSS X0, X1
 38484  	ADDSS (DX), X1
 38485  	MOVSS X1, (DX)
 38486  	DECQ  SI
 38487  	LEAQ  (AX)(CX*4), AX
 38488  	LEAQ  (DX)(BX*4), DX
 38489  
 38490  check_limit:
 38491  	CMPQ SI, $0x00
 38492  	JHI  loop
 38493  	RET
 38494  
 38495  // func AmdAxpyPointerLoopXInterleave_V3A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38496  // Requires: SSE
 38497  TEXT ·AmdAxpyPointerLoopXInterleave_V3A8U8(SB), NOSPLIT, $0-48
 38498  	MOVSS alpha+0(FP), X0
 38499  	MOVQ  xs+8(FP), AX
 38500  	MOVQ  incx+16(FP), CX
 38501  	MOVQ  CX, DX
 38502  	SHLQ  $0x05, DX
 38503  	MOVQ  ys+24(FP), DX
 38504  	MOVQ  incy+32(FP), BX
 38505  	MOVQ  BX, SI
 38506  	SHLQ  $0x05, SI
 38507  	MOVQ  n+40(FP), SI
 38508  	JMP   check_limit_unroll
 38509  	PCALIGN $0x08
 38510  
 38511  loop_unroll:
 38512  	MOVSS (AX), X1
 38513  	LEAQ  (AX)(CX*4), AX
 38514  	MOVSS (AX), X2
 38515  	LEAQ  (AX)(CX*4), AX
 38516  	MOVSS (AX), X3
 38517  	LEAQ  (AX)(CX*4), AX
 38518  	MOVSS (AX), X4
 38519  	LEAQ  (AX)(CX*4), AX
 38520  	MOVSS (AX), X5
 38521  	LEAQ  (AX)(CX*4), AX
 38522  	MOVSS (AX), X6
 38523  	LEAQ  (AX)(CX*4), AX
 38524  	MOVSS (AX), X7
 38525  	LEAQ  (AX)(CX*4), AX
 38526  	MOVSS (AX), X8
 38527  	LEAQ  (AX)(CX*4), AX
 38528  	MULSS X0, X1
 38529  	MULSS X0, X2
 38530  	MULSS X0, X3
 38531  	MULSS X0, X4
 38532  	MULSS X0, X5
 38533  	MULSS X0, X6
 38534  	MULSS X0, X7
 38535  	MULSS X0, X8
 38536  	ADDSS (DX), X1
 38537  	MOVSS X1, (DX)
 38538  	LEAQ  (DX)(BX*4), DX
 38539  	ADDSS (DX), X2
 38540  	MOVSS X2, (DX)
 38541  	LEAQ  (DX)(BX*4), DX
 38542  	ADDSS (DX), X3
 38543  	MOVSS X3, (DX)
 38544  	LEAQ  (DX)(BX*4), DX
 38545  	ADDSS (DX), X4
 38546  	MOVSS X4, (DX)
 38547  	LEAQ  (DX)(BX*4), DX
 38548  	ADDSS (DX), X5
 38549  	MOVSS X5, (DX)
 38550  	LEAQ  (DX)(BX*4), DX
 38551  	ADDSS (DX), X6
 38552  	MOVSS X6, (DX)
 38553  	LEAQ  (DX)(BX*4), DX
 38554  	ADDSS (DX), X7
 38555  	MOVSS X7, (DX)
 38556  	LEAQ  (DX)(BX*4), DX
 38557  	ADDSS (DX), X8
 38558  	MOVSS X8, (DX)
 38559  	LEAQ  (DX)(BX*4), DX
 38560  	SUBQ  $0x08, SI
 38561  
 38562  check_limit_unroll:
 38563  	CMPQ SI, $0x08
 38564  	JHS  loop_unroll
 38565  	JMP  check_limit
 38566  
 38567  loop:
 38568  	MOVSS (AX), X1
 38569  	MULSS X0, X1
 38570  	ADDSS (DX), X1
 38571  	MOVSS X1, (DX)
 38572  	DECQ  SI
 38573  	LEAQ  (AX)(CX*4), AX
 38574  	LEAQ  (DX)(BX*4), DX
 38575  
 38576  check_limit:
 38577  	CMPQ SI, $0x00
 38578  	JHI  loop
 38579  	RET
 38580  
 38581  // func AmdAxpyPointerLoopXInterleave_V4A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38582  // Requires: SSE
 38583  TEXT ·AmdAxpyPointerLoopXInterleave_V4A8U8(SB), NOSPLIT, $0-48
 38584  	MOVSS alpha+0(FP), X0
 38585  	MOVQ  xs+8(FP), AX
 38586  	MOVQ  incx+16(FP), CX
 38587  	MOVQ  CX, DX
 38588  	SHLQ  $0x05, DX
 38589  	MOVQ  ys+24(FP), DX
 38590  	MOVQ  incy+32(FP), BX
 38591  	MOVQ  BX, SI
 38592  	SHLQ  $0x05, SI
 38593  	MOVQ  n+40(FP), SI
 38594  	JMP   check_limit_unroll
 38595  	PCALIGN $0x08
 38596  
 38597  loop_unroll:
 38598  	MOVSS (AX), X1
 38599  	LEAQ  (AX)(CX*4), AX
 38600  	MOVSS (AX), X2
 38601  	LEAQ  (AX)(CX*4), AX
 38602  	MOVSS (AX), X3
 38603  	LEAQ  (AX)(CX*4), AX
 38604  	MOVSS (AX), X4
 38605  	LEAQ  (AX)(CX*4), AX
 38606  	MOVSS (AX), X5
 38607  	LEAQ  (AX)(CX*4), AX
 38608  	MOVSS (AX), X6
 38609  	LEAQ  (AX)(CX*4), AX
 38610  	MOVSS (AX), X7
 38611  	LEAQ  (AX)(CX*4), AX
 38612  	MOVSS (AX), X8
 38613  	LEAQ  (AX)(CX*4), AX
 38614  	MULSS X0, X1
 38615  	MULSS X0, X2
 38616  	MULSS X0, X3
 38617  	MULSS X0, X4
 38618  	MULSS X0, X5
 38619  	MULSS X0, X6
 38620  	MULSS X0, X7
 38621  	MULSS X0, X8
 38622  	ADDSS (DX), X1
 38623  	MOVSS X1, (DX)
 38624  	LEAQ  (DX)(BX*4), DX
 38625  	ADDSS (DX), X2
 38626  	MOVSS X2, (DX)
 38627  	LEAQ  (DX)(BX*4), DX
 38628  	ADDSS (DX), X3
 38629  	MOVSS X3, (DX)
 38630  	LEAQ  (DX)(BX*4), DX
 38631  	ADDSS (DX), X4
 38632  	MOVSS X4, (DX)
 38633  	LEAQ  (DX)(BX*4), DX
 38634  	ADDSS (DX), X5
 38635  	MOVSS X5, (DX)
 38636  	LEAQ  (DX)(BX*4), DX
 38637  	ADDSS (DX), X6
 38638  	MOVSS X6, (DX)
 38639  	LEAQ  (DX)(BX*4), DX
 38640  	ADDSS (DX), X7
 38641  	MOVSS X7, (DX)
 38642  	LEAQ  (DX)(BX*4), DX
 38643  	ADDSS (DX), X8
 38644  	MOVSS X8, (DX)
 38645  	LEAQ  (DX)(BX*4), DX
 38646  	SUBQ  $0x08, SI
 38647  
 38648  check_limit_unroll:
 38649  	CMPQ SI, $0x08
 38650  	JHS  loop_unroll
 38651  	JMP  check_limit
 38652  
 38653  loop:
 38654  	MOVSS (AX), X1
 38655  	MULSS X0, X1
 38656  	ADDSS (DX), X1
 38657  	MOVSS X1, (DX)
 38658  	DECQ  SI
 38659  	LEAQ  (AX)(CX*4), AX
 38660  	LEAQ  (DX)(BX*4), DX
 38661  
 38662  check_limit:
 38663  	CMPQ SI, $0x00
 38664  	JHI  loop
 38665  	RET
 38666  
 38667  // func AmdAxpyPointerLoopXInterleave_V5A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38668  // Requires: SSE
 38669  TEXT ·AmdAxpyPointerLoopXInterleave_V5A8U8(SB), NOSPLIT, $0-48
 38670  	MOVSS alpha+0(FP), X0
 38671  	MOVQ  xs+8(FP), AX
 38672  	MOVQ  incx+16(FP), CX
 38673  	MOVQ  CX, DX
 38674  	SHLQ  $0x05, DX
 38675  	MOVQ  ys+24(FP), DX
 38676  	MOVQ  incy+32(FP), BX
 38677  	MOVQ  BX, SI
 38678  	SHLQ  $0x05, SI
 38679  	MOVQ  n+40(FP), SI
 38680  	JMP   check_limit_unroll
 38681  	PCALIGN $0x08
 38682  
 38683  loop_unroll:
 38684  	MOVSS (AX), X1
 38685  	LEAQ  (AX)(CX*4), AX
 38686  	MOVSS (AX), X2
 38687  	LEAQ  (AX)(CX*4), AX
 38688  	MOVSS (AX), X3
 38689  	LEAQ  (AX)(CX*4), AX
 38690  	MOVSS (AX), X4
 38691  	LEAQ  (AX)(CX*4), AX
 38692  	MOVSS (AX), X5
 38693  	LEAQ  (AX)(CX*4), AX
 38694  	MOVSS (AX), X6
 38695  	LEAQ  (AX)(CX*4), AX
 38696  	MOVSS (AX), X7
 38697  	LEAQ  (AX)(CX*4), AX
 38698  	MOVSS (AX), X8
 38699  	LEAQ  (AX)(CX*4), AX
 38700  	MULSS X0, X1
 38701  	MULSS X0, X2
 38702  	MULSS X0, X3
 38703  	MULSS X0, X4
 38704  	MULSS X0, X5
 38705  	MULSS X0, X6
 38706  	MULSS X0, X7
 38707  	MULSS X0, X8
 38708  	ADDSS (DX), X1
 38709  	MOVSS X1, (DX)
 38710  	LEAQ  (DX)(BX*4), DX
 38711  	ADDSS (DX), X2
 38712  	MOVSS X2, (DX)
 38713  	LEAQ  (DX)(BX*4), DX
 38714  	ADDSS (DX), X3
 38715  	MOVSS X3, (DX)
 38716  	LEAQ  (DX)(BX*4), DX
 38717  	ADDSS (DX), X4
 38718  	MOVSS X4, (DX)
 38719  	LEAQ  (DX)(BX*4), DX
 38720  	ADDSS (DX), X5
 38721  	MOVSS X5, (DX)
 38722  	LEAQ  (DX)(BX*4), DX
 38723  	ADDSS (DX), X6
 38724  	MOVSS X6, (DX)
 38725  	LEAQ  (DX)(BX*4), DX
 38726  	ADDSS (DX), X7
 38727  	MOVSS X7, (DX)
 38728  	LEAQ  (DX)(BX*4), DX
 38729  	ADDSS (DX), X8
 38730  	MOVSS X8, (DX)
 38731  	LEAQ  (DX)(BX*4), DX
 38732  	SUBQ  $0x08, SI
 38733  
 38734  check_limit_unroll:
 38735  	CMPQ SI, $0x08
 38736  	JHS  loop_unroll
 38737  	JMP  check_limit
 38738  
 38739  loop:
 38740  	MOVSS (AX), X1
 38741  	MULSS X0, X1
 38742  	ADDSS (DX), X1
 38743  	MOVSS X1, (DX)
 38744  	DECQ  SI
 38745  	LEAQ  (AX)(CX*4), AX
 38746  	LEAQ  (DX)(BX*4), DX
 38747  
 38748  check_limit:
 38749  	CMPQ SI, $0x00
 38750  	JHI  loop
 38751  	RET
 38752  
 38753  // func AmdAxpyPointerLoopXInterleave_V0A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38754  // Requires: SSE
 38755  TEXT ·AmdAxpyPointerLoopXInterleave_V0A9U8(SB), NOSPLIT, $0-48
 38756  	MOVSS alpha+0(FP), X0
 38757  	MOVQ  xs+8(FP), AX
 38758  	MOVQ  incx+16(FP), CX
 38759  	MOVQ  CX, DX
 38760  	SHLQ  $0x05, DX
 38761  	MOVQ  ys+24(FP), DX
 38762  	MOVQ  incy+32(FP), BX
 38763  	MOVQ  BX, SI
 38764  	SHLQ  $0x05, SI
 38765  	MOVQ  n+40(FP), SI
 38766  	JMP   check_limit_unroll
 38767  	PCALIGN $0x08
 38768  	NOP
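	// "A9": 8-byte alignment plus one NOP, placing loop_unroll one byte past
	// the boundary, presumably to measure how loop-start placement affects
	// performance.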
 38769  
 38770  loop_unroll:
 38771  	MOVSS (AX), X1
 38772  	LEAQ  (AX)(CX*4), AX
 38773  	MOVSS (AX), X2
 38774  	LEAQ  (AX)(CX*4), AX
 38775  	MOVSS (AX), X3
 38776  	LEAQ  (AX)(CX*4), AX
 38777  	MOVSS (AX), X4
 38778  	LEAQ  (AX)(CX*4), AX
 38779  	MOVSS (AX), X5
 38780  	LEAQ  (AX)(CX*4), AX
 38781  	MOVSS (AX), X6
 38782  	LEAQ  (AX)(CX*4), AX
 38783  	MOVSS (AX), X7
 38784  	LEAQ  (AX)(CX*4), AX
 38785  	MOVSS (AX), X8
 38786  	LEAQ  (AX)(CX*4), AX
 38787  	MULSS X0, X1
 38788  	MULSS X0, X2
 38789  	MULSS X0, X3
 38790  	MULSS X0, X4
 38791  	MULSS X0, X5
 38792  	MULSS X0, X6
 38793  	MULSS X0, X7
 38794  	MULSS X0, X8
 38795  	ADDSS (DX), X1
 38796  	MOVSS X1, (DX)
 38797  	LEAQ  (DX)(BX*4), DX
 38798  	ADDSS (DX), X2
 38799  	MOVSS X2, (DX)
 38800  	LEAQ  (DX)(BX*4), DX
 38801  	ADDSS (DX), X3
 38802  	MOVSS X3, (DX)
 38803  	LEAQ  (DX)(BX*4), DX
 38804  	ADDSS (DX), X4
 38805  	MOVSS X4, (DX)
 38806  	LEAQ  (DX)(BX*4), DX
 38807  	ADDSS (DX), X5
 38808  	MOVSS X5, (DX)
 38809  	LEAQ  (DX)(BX*4), DX
 38810  	ADDSS (DX), X6
 38811  	MOVSS X6, (DX)
 38812  	LEAQ  (DX)(BX*4), DX
 38813  	ADDSS (DX), X7
 38814  	MOVSS X7, (DX)
 38815  	LEAQ  (DX)(BX*4), DX
 38816  	ADDSS (DX), X8
 38817  	MOVSS X8, (DX)
 38818  	LEAQ  (DX)(BX*4), DX
 38819  	SUBQ  $0x08, SI
 38820  
 38821  check_limit_unroll:
 38822  	CMPQ SI, $0x08
 38823  	JHS  loop_unroll
 38824  	JMP  check_limit
 38825  
 38826  loop:
 38827  	MOVSS (AX), X1
 38828  	MULSS X0, X1
 38829  	ADDSS (DX), X1
 38830  	MOVSS X1, (DX)
 38831  	DECQ  SI
 38832  	LEAQ  (AX)(CX*4), AX
 38833  	LEAQ  (DX)(BX*4), DX
 38834  
 38835  check_limit:
 38836  	CMPQ SI, $0x00
 38837  	JHI  loop
 38838  	RET
 38839  
 38840  // func AmdAxpyPointerLoopXInterleave_V1A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38841  // Requires: SSE
 38842  TEXT ·AmdAxpyPointerLoopXInterleave_V1A9U8(SB), NOSPLIT, $0-48
 38843  	MOVSS alpha+0(FP), X0
 38844  	MOVQ  xs+8(FP), AX
 38845  	MOVQ  incx+16(FP), CX
 38846  	MOVQ  CX, DX
 38847  	SHLQ  $0x05, DX
 38848  	MOVQ  ys+24(FP), DX
 38849  	MOVQ  incy+32(FP), BX
 38850  	MOVQ  BX, SI
 38851  	SHLQ  $0x05, SI
 38852  	MOVQ  n+40(FP), SI
 38853  	JMP   check_limit_unroll
 38854  	PCALIGN $0x08
 38855  	NOP
 38856  
 38857  loop_unroll:
 38858  	MOVSS (AX), X1
 38859  	LEAQ  (AX)(CX*4), AX
 38860  	MOVSS (AX), X2
 38861  	LEAQ  (AX)(CX*4), AX
 38862  	MOVSS (AX), X3
 38863  	LEAQ  (AX)(CX*4), AX
 38864  	MOVSS (AX), X4
 38865  	LEAQ  (AX)(CX*4), AX
 38866  	MOVSS (AX), X5
 38867  	LEAQ  (AX)(CX*4), AX
 38868  	MOVSS (AX), X6
 38869  	LEAQ  (AX)(CX*4), AX
 38870  	MOVSS (AX), X7
 38871  	LEAQ  (AX)(CX*4), AX
 38872  	MOVSS (AX), X8
 38873  	LEAQ  (AX)(CX*4), AX
 38874  	MULSS X0, X1
 38875  	MULSS X0, X2
 38876  	MULSS X0, X3
 38877  	MULSS X0, X4
 38878  	MULSS X0, X5
 38879  	MULSS X0, X6
 38880  	MULSS X0, X7
 38881  	MULSS X0, X8
 38882  	ADDSS (DX), X1
 38883  	MOVSS X1, (DX)
 38884  	LEAQ  (DX)(BX*4), DX
 38885  	ADDSS (DX), X2
 38886  	MOVSS X2, (DX)
 38887  	LEAQ  (DX)(BX*4), DX
 38888  	ADDSS (DX), X3
 38889  	MOVSS X3, (DX)
 38890  	LEAQ  (DX)(BX*4), DX
 38891  	ADDSS (DX), X4
 38892  	MOVSS X4, (DX)
 38893  	LEAQ  (DX)(BX*4), DX
 38894  	ADDSS (DX), X5
 38895  	MOVSS X5, (DX)
 38896  	LEAQ  (DX)(BX*4), DX
 38897  	ADDSS (DX), X6
 38898  	MOVSS X6, (DX)
 38899  	LEAQ  (DX)(BX*4), DX
 38900  	ADDSS (DX), X7
 38901  	MOVSS X7, (DX)
 38902  	LEAQ  (DX)(BX*4), DX
 38903  	ADDSS (DX), X8
 38904  	MOVSS X8, (DX)
 38905  	LEAQ  (DX)(BX*4), DX
 38906  	SUBQ  $0x08, SI
 38907  
 38908  check_limit_unroll:
 38909  	CMPQ SI, $0x08
 38910  	JHS  loop_unroll
 38911  	JMP  check_limit
 38912  
 38913  loop:
 38914  	MOVSS (AX), X1
 38915  	MULSS X0, X1
 38916  	ADDSS (DX), X1
 38917  	MOVSS X1, (DX)
 38918  	DECQ  SI
 38919  	LEAQ  (AX)(CX*4), AX
 38920  	LEAQ  (DX)(BX*4), DX
 38921  
 38922  check_limit:
 38923  	CMPQ SI, $0x00
 38924  	JHI  loop
 38925  	RET
 38926  
 38927  // func AmdAxpyPointerLoopXInterleave_V2A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 38928  // Requires: SSE
 38929  TEXT ·AmdAxpyPointerLoopXInterleave_V2A9U8(SB), NOSPLIT, $0-48
 38930  	MOVSS alpha+0(FP), X0
 38931  	MOVQ  xs+8(FP), AX
 38932  	MOVQ  incx+16(FP), CX
 38933  	MOVQ  CX, DX
 38934  	SHLQ  $0x05, DX
 38935  	MOVQ  ys+24(FP), DX
 38936  	MOVQ  incy+32(FP), BX
 38937  	MOVQ  BX, SI
 38938  	SHLQ  $0x05, SI
 38939  	MOVQ  n+40(FP), SI
 38940  	JMP   check_limit_unroll
 38941  	PCALIGN $0x08
 38942  	NOP
 38943  
 38944  loop_unroll:
 38945  	MOVSS (AX), X1
 38946  	LEAQ  (AX)(CX*4), AX
 38947  	MOVSS (AX), X2
 38948  	LEAQ  (AX)(CX*4), AX
 38949  	MOVSS (AX), X3
 38950  	LEAQ  (AX)(CX*4), AX
 38951  	MOVSS (AX), X4
 38952  	LEAQ  (AX)(CX*4), AX
 38953  	MOVSS (AX), X5
 38954  	LEAQ  (AX)(CX*4), AX
 38955  	MOVSS (AX), X6
 38956  	LEAQ  (AX)(CX*4), AX
 38957  	MOVSS (AX), X7
 38958  	LEAQ  (AX)(CX*4), AX
 38959  	MOVSS (AX), X8
 38960  	LEAQ  (AX)(CX*4), AX
 38961  	MULSS X0, X1
 38962  	MULSS X0, X2
 38963  	MULSS X0, X3
 38964  	MULSS X0, X4
 38965  	MULSS X0, X5
 38966  	MULSS X0, X6
 38967  	MULSS X0, X7
 38968  	MULSS X0, X8
 38969  	ADDSS (DX), X1
 38970  	MOVSS X1, (DX)
 38971  	LEAQ  (DX)(BX*4), DX
 38972  	ADDSS (DX), X2
 38973  	MOVSS X2, (DX)
 38974  	LEAQ  (DX)(BX*4), DX
 38975  	ADDSS (DX), X3
 38976  	MOVSS X3, (DX)
 38977  	LEAQ  (DX)(BX*4), DX
 38978  	ADDSS (DX), X4
 38979  	MOVSS X4, (DX)
 38980  	LEAQ  (DX)(BX*4), DX
 38981  	ADDSS (DX), X5
 38982  	MOVSS X5, (DX)
 38983  	LEAQ  (DX)(BX*4), DX
 38984  	ADDSS (DX), X6
 38985  	MOVSS X6, (DX)
 38986  	LEAQ  (DX)(BX*4), DX
 38987  	ADDSS (DX), X7
 38988  	MOVSS X7, (DX)
 38989  	LEAQ  (DX)(BX*4), DX
 38990  	ADDSS (DX), X8
 38991  	MOVSS X8, (DX)
 38992  	LEAQ  (DX)(BX*4), DX
 38993  	SUBQ  $0x08, SI
 38994  
 38995  check_limit_unroll:
 38996  	CMPQ SI, $0x08
 38997  	JHS  loop_unroll
 38998  	JMP  check_limit
 38999  
 39000  loop:
 39001  	MOVSS (AX), X1
 39002  	MULSS X0, X1
 39003  	ADDSS (DX), X1
 39004  	MOVSS X1, (DX)
 39005  	DECQ  SI
 39006  	LEAQ  (AX)(CX*4), AX
 39007  	LEAQ  (DX)(BX*4), DX
 39008  
 39009  check_limit:
 39010  	CMPQ SI, $0x00
 39011  	JHI  loop
 39012  	RET
 39013  
 39014  // func AmdAxpyPointerLoopXInterleave_V3A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39015  // Requires: SSE
 39016  TEXT ·AmdAxpyPointerLoopXInterleave_V3A9U8(SB), NOSPLIT, $0-48
 39017  	MOVSS alpha+0(FP), X0
 39018  	MOVQ  xs+8(FP), AX
 39019  	MOVQ  incx+16(FP), CX
 39020  	MOVQ  CX, DX
 39021  	SHLQ  $0x05, DX
 39022  	MOVQ  ys+24(FP), DX
 39023  	MOVQ  incy+32(FP), BX
 39024  	MOVQ  BX, SI
 39025  	SHLQ  $0x05, SI
 39026  	MOVQ  n+40(FP), SI
 39027  	JMP   check_limit_unroll
 39028  	PCALIGN $0x08
 39029  	NOP
 39030  
 39031  loop_unroll:
 39032  	MOVSS (AX), X1
 39033  	LEAQ  (AX)(CX*4), AX
 39034  	MOVSS (AX), X2
 39035  	LEAQ  (AX)(CX*4), AX
 39036  	MOVSS (AX), X3
 39037  	LEAQ  (AX)(CX*4), AX
 39038  	MOVSS (AX), X4
 39039  	LEAQ  (AX)(CX*4), AX
 39040  	MOVSS (AX), X5
 39041  	LEAQ  (AX)(CX*4), AX
 39042  	MOVSS (AX), X6
 39043  	LEAQ  (AX)(CX*4), AX
 39044  	MOVSS (AX), X7
 39045  	LEAQ  (AX)(CX*4), AX
 39046  	MOVSS (AX), X8
 39047  	LEAQ  (AX)(CX*4), AX
 39048  	MULSS X0, X1
 39049  	MULSS X0, X2
 39050  	MULSS X0, X3
 39051  	MULSS X0, X4
 39052  	MULSS X0, X5
 39053  	MULSS X0, X6
 39054  	MULSS X0, X7
 39055  	MULSS X0, X8
 39056  	ADDSS (DX), X1
 39057  	MOVSS X1, (DX)
 39058  	LEAQ  (DX)(BX*4), DX
 39059  	ADDSS (DX), X2
 39060  	MOVSS X2, (DX)
 39061  	LEAQ  (DX)(BX*4), DX
 39062  	ADDSS (DX), X3
 39063  	MOVSS X3, (DX)
 39064  	LEAQ  (DX)(BX*4), DX
 39065  	ADDSS (DX), X4
 39066  	MOVSS X4, (DX)
 39067  	LEAQ  (DX)(BX*4), DX
 39068  	ADDSS (DX), X5
 39069  	MOVSS X5, (DX)
 39070  	LEAQ  (DX)(BX*4), DX
 39071  	ADDSS (DX), X6
 39072  	MOVSS X6, (DX)
 39073  	LEAQ  (DX)(BX*4), DX
 39074  	ADDSS (DX), X7
 39075  	MOVSS X7, (DX)
 39076  	LEAQ  (DX)(BX*4), DX
 39077  	ADDSS (DX), X8
 39078  	MOVSS X8, (DX)
 39079  	LEAQ  (DX)(BX*4), DX
 39080  	SUBQ  $0x08, SI
 39081  
 39082  check_limit_unroll:
 39083  	CMPQ SI, $0x08
 39084  	JHS  loop_unroll
 39085  	JMP  check_limit
 39086  
 39087  loop:
 39088  	MOVSS (AX), X1
 39089  	MULSS X0, X1
 39090  	ADDSS (DX), X1
 39091  	MOVSS X1, (DX)
 39092  	DECQ  SI
 39093  	LEAQ  (AX)(CX*4), AX
 39094  	LEAQ  (DX)(BX*4), DX
 39095  
 39096  check_limit:
 39097  	CMPQ SI, $0x00
 39098  	JHI  loop
 39099  	RET
 39100  
 39101  // func AmdAxpyPointerLoopXInterleave_V4A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39102  // Requires: SSE
 39103  TEXT ·AmdAxpyPointerLoopXInterleave_V4A9U8(SB), NOSPLIT, $0-48
 39104  	MOVSS alpha+0(FP), X0
 39105  	MOVQ  xs+8(FP), AX
 39106  	MOVQ  incx+16(FP), CX
 39107  	MOVQ  CX, DX
 39108  	SHLQ  $0x05, DX
 39109  	MOVQ  ys+24(FP), DX
 39110  	MOVQ  incy+32(FP), BX
 39111  	MOVQ  BX, SI
 39112  	SHLQ  $0x05, SI
 39113  	MOVQ  n+40(FP), SI
 39114  	JMP   check_limit_unroll
 39115  	PCALIGN $0x08
 39116  	NOP
 39117  
 39118  loop_unroll:
 39119  	MOVSS (AX), X1
 39120  	LEAQ  (AX)(CX*4), AX
 39121  	MOVSS (AX), X2
 39122  	LEAQ  (AX)(CX*4), AX
 39123  	MOVSS (AX), X3
 39124  	LEAQ  (AX)(CX*4), AX
 39125  	MOVSS (AX), X4
 39126  	LEAQ  (AX)(CX*4), AX
 39127  	MOVSS (AX), X5
 39128  	LEAQ  (AX)(CX*4), AX
 39129  	MOVSS (AX), X6
 39130  	LEAQ  (AX)(CX*4), AX
 39131  	MOVSS (AX), X7
 39132  	LEAQ  (AX)(CX*4), AX
 39133  	MOVSS (AX), X8
 39134  	LEAQ  (AX)(CX*4), AX
 39135  	MULSS X0, X1
 39136  	MULSS X0, X2
 39137  	MULSS X0, X3
 39138  	MULSS X0, X4
 39139  	MULSS X0, X5
 39140  	MULSS X0, X6
 39141  	MULSS X0, X7
 39142  	MULSS X0, X8
 39143  	ADDSS (DX), X1
 39144  	MOVSS X1, (DX)
 39145  	LEAQ  (DX)(BX*4), DX
 39146  	ADDSS (DX), X2
 39147  	MOVSS X2, (DX)
 39148  	LEAQ  (DX)(BX*4), DX
 39149  	ADDSS (DX), X3
 39150  	MOVSS X3, (DX)
 39151  	LEAQ  (DX)(BX*4), DX
 39152  	ADDSS (DX), X4
 39153  	MOVSS X4, (DX)
 39154  	LEAQ  (DX)(BX*4), DX
 39155  	ADDSS (DX), X5
 39156  	MOVSS X5, (DX)
 39157  	LEAQ  (DX)(BX*4), DX
 39158  	ADDSS (DX), X6
 39159  	MOVSS X6, (DX)
 39160  	LEAQ  (DX)(BX*4), DX
 39161  	ADDSS (DX), X7
 39162  	MOVSS X7, (DX)
 39163  	LEAQ  (DX)(BX*4), DX
 39164  	ADDSS (DX), X8
 39165  	MOVSS X8, (DX)
 39166  	LEAQ  (DX)(BX*4), DX
 39167  	SUBQ  $0x08, SI
 39168  
 39169  check_limit_unroll:
 39170  	CMPQ SI, $0x08
 39171  	JHS  loop_unroll
 39172  	JMP  check_limit
 39173  
 39174  loop:
 39175  	MOVSS (AX), X1
 39176  	MULSS X0, X1
 39177  	ADDSS (DX), X1
 39178  	MOVSS X1, (DX)
 39179  	DECQ  SI
 39180  	LEAQ  (AX)(CX*4), AX
 39181  	LEAQ  (DX)(BX*4), DX
 39182  
 39183  check_limit:
 39184  	CMPQ SI, $0x00
 39185  	JHI  loop
 39186  	RET
 39187  
 39188  // func AmdAxpyPointerLoopXInterleave_V5A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39189  // Requires: SSE
 39190  TEXT ·AmdAxpyPointerLoopXInterleave_V5A9U8(SB), NOSPLIT, $0-48
 39191  	MOVSS alpha+0(FP), X0
 39192  	MOVQ  xs+8(FP), AX
 39193  	MOVQ  incx+16(FP), CX
 39194  	MOVQ  CX, DX
 39195  	SHLQ  $0x05, DX
 39196  	MOVQ  ys+24(FP), DX
 39197  	MOVQ  incy+32(FP), BX
 39198  	MOVQ  BX, SI
 39199  	SHLQ  $0x05, SI
 39200  	MOVQ  n+40(FP), SI
 39201  	JMP   check_limit_unroll
 39202  	PCALIGN $0x08
 39203  	NOP
 39204  
 39205  loop_unroll:
 39206  	MOVSS (AX), X1
 39207  	LEAQ  (AX)(CX*4), AX
 39208  	MOVSS (AX), X2
 39209  	LEAQ  (AX)(CX*4), AX
 39210  	MOVSS (AX), X3
 39211  	LEAQ  (AX)(CX*4), AX
 39212  	MOVSS (AX), X4
 39213  	LEAQ  (AX)(CX*4), AX
 39214  	MOVSS (AX), X5
 39215  	LEAQ  (AX)(CX*4), AX
 39216  	MOVSS (AX), X6
 39217  	LEAQ  (AX)(CX*4), AX
 39218  	MOVSS (AX), X7
 39219  	LEAQ  (AX)(CX*4), AX
 39220  	MOVSS (AX), X8
 39221  	LEAQ  (AX)(CX*4), AX
 39222  	MULSS X0, X1
 39223  	MULSS X0, X2
 39224  	MULSS X0, X3
 39225  	MULSS X0, X4
 39226  	MULSS X0, X5
 39227  	MULSS X0, X6
 39228  	MULSS X0, X7
 39229  	MULSS X0, X8
 39230  	ADDSS (DX), X1
 39231  	MOVSS X1, (DX)
 39232  	LEAQ  (DX)(BX*4), DX
 39233  	ADDSS (DX), X2
 39234  	MOVSS X2, (DX)
 39235  	LEAQ  (DX)(BX*4), DX
 39236  	ADDSS (DX), X3
 39237  	MOVSS X3, (DX)
 39238  	LEAQ  (DX)(BX*4), DX
 39239  	ADDSS (DX), X4
 39240  	MOVSS X4, (DX)
 39241  	LEAQ  (DX)(BX*4), DX
 39242  	ADDSS (DX), X5
 39243  	MOVSS X5, (DX)
 39244  	LEAQ  (DX)(BX*4), DX
 39245  	ADDSS (DX), X6
 39246  	MOVSS X6, (DX)
 39247  	LEAQ  (DX)(BX*4), DX
 39248  	ADDSS (DX), X7
 39249  	MOVSS X7, (DX)
 39250  	LEAQ  (DX)(BX*4), DX
 39251  	ADDSS (DX), X8
 39252  	MOVSS X8, (DX)
 39253  	LEAQ  (DX)(BX*4), DX
 39254  	SUBQ  $0x08, SI
 39255  
 39256  check_limit_unroll:
 39257  	CMPQ SI, $0x08
 39258  	JHS  loop_unroll
 39259  	JMP  check_limit
 39260  
 39261  loop:
 39262  	MOVSS (AX), X1
 39263  	MULSS X0, X1
 39264  	ADDSS (DX), X1
 39265  	MOVSS X1, (DX)
 39266  	DECQ  SI
 39267  	LEAQ  (AX)(CX*4), AX
 39268  	LEAQ  (DX)(BX*4), DX
 39269  
 39270  check_limit:
 39271  	CMPQ SI, $0x00
 39272  	JHI  loop
 39273  	RET
 39274  
 39275  // func AmdAxpyPointerLoopXInterleave_V0A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39276  // Requires: SSE
 39277  TEXT ·AmdAxpyPointerLoopXInterleave_V0A10U8(SB), NOSPLIT, $0-48
 39278  	MOVSS alpha+0(FP), X0
 39279  	MOVQ  xs+8(FP), AX
 39280  	MOVQ  incx+16(FP), CX
 39281  	MOVQ  CX, DX
 39282  	SHLQ  $0x05, DX
 39283  	MOVQ  ys+24(FP), DX
 39284  	MOVQ  incy+32(FP), BX
 39285  	MOVQ  BX, SI
 39286  	SHLQ  $0x05, SI
 39287  	MOVQ  n+40(FP), SI
 39288  	JMP   check_limit_unroll
 39289  	PCALIGN $0x08
 39290  	NOP
 39291  	NOP
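	// "A10": as above, but with two NOPs, so loop_unroll starts two bytes past
	// the 8-byte boundary.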
 39292  
 39293  loop_unroll:
 39294  	MOVSS (AX), X1
 39295  	LEAQ  (AX)(CX*4), AX
 39296  	MOVSS (AX), X2
 39297  	LEAQ  (AX)(CX*4), AX
 39298  	MOVSS (AX), X3
 39299  	LEAQ  (AX)(CX*4), AX
 39300  	MOVSS (AX), X4
 39301  	LEAQ  (AX)(CX*4), AX
 39302  	MOVSS (AX), X5
 39303  	LEAQ  (AX)(CX*4), AX
 39304  	MOVSS (AX), X6
 39305  	LEAQ  (AX)(CX*4), AX
 39306  	MOVSS (AX), X7
 39307  	LEAQ  (AX)(CX*4), AX
 39308  	MOVSS (AX), X8
 39309  	LEAQ  (AX)(CX*4), AX
 39310  	MULSS X0, X1
 39311  	MULSS X0, X2
 39312  	MULSS X0, X3
 39313  	MULSS X0, X4
 39314  	MULSS X0, X5
 39315  	MULSS X0, X6
 39316  	MULSS X0, X7
 39317  	MULSS X0, X8
 39318  	ADDSS (DX), X1
 39319  	MOVSS X1, (DX)
 39320  	LEAQ  (DX)(BX*4), DX
 39321  	ADDSS (DX), X2
 39322  	MOVSS X2, (DX)
 39323  	LEAQ  (DX)(BX*4), DX
 39324  	ADDSS (DX), X3
 39325  	MOVSS X3, (DX)
 39326  	LEAQ  (DX)(BX*4), DX
 39327  	ADDSS (DX), X4
 39328  	MOVSS X4, (DX)
 39329  	LEAQ  (DX)(BX*4), DX
 39330  	ADDSS (DX), X5
 39331  	MOVSS X5, (DX)
 39332  	LEAQ  (DX)(BX*4), DX
 39333  	ADDSS (DX), X6
 39334  	MOVSS X6, (DX)
 39335  	LEAQ  (DX)(BX*4), DX
 39336  	ADDSS (DX), X7
 39337  	MOVSS X7, (DX)
 39338  	LEAQ  (DX)(BX*4), DX
 39339  	ADDSS (DX), X8
 39340  	MOVSS X8, (DX)
 39341  	LEAQ  (DX)(BX*4), DX
 39342  	SUBQ  $0x08, SI
 39343  
 39344  check_limit_unroll:
 39345  	CMPQ SI, $0x08
 39346  	JHS  loop_unroll
 39347  	JMP  check_limit
 39348  
 39349  loop:
 39350  	MOVSS (AX), X1
 39351  	MULSS X0, X1
 39352  	ADDSS (DX), X1
 39353  	MOVSS X1, (DX)
 39354  	DECQ  SI
 39355  	LEAQ  (AX)(CX*4), AX
 39356  	LEAQ  (DX)(BX*4), DX
 39357  
 39358  check_limit:
 39359  	CMPQ SI, $0x00
 39360  	JHI  loop
 39361  	RET
 39362  
 39363  // func AmdAxpyPointerLoopXInterleave_V1A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39364  // Requires: SSE
 39365  TEXT ·AmdAxpyPointerLoopXInterleave_V1A10U8(SB), NOSPLIT, $0-48
 39366  	MOVSS alpha+0(FP), X0
 39367  	MOVQ  xs+8(FP), AX
 39368  	MOVQ  incx+16(FP), CX
 39369  	MOVQ  CX, DX
 39370  	SHLQ  $0x05, DX
 39371  	MOVQ  ys+24(FP), DX
 39372  	MOVQ  incy+32(FP), BX
 39373  	MOVQ  BX, SI
 39374  	SHLQ  $0x05, SI
 39375  	MOVQ  n+40(FP), SI
 39376  	JMP   check_limit_unroll
 39377  	PCALIGN $0x08
 39378  	NOP
 39379  	NOP
 39380  
 39381  loop_unroll:
 39382  	MOVSS (AX), X1
 39383  	LEAQ  (AX)(CX*4), AX
 39384  	MOVSS (AX), X2
 39385  	LEAQ  (AX)(CX*4), AX
 39386  	MOVSS (AX), X3
 39387  	LEAQ  (AX)(CX*4), AX
 39388  	MOVSS (AX), X4
 39389  	LEAQ  (AX)(CX*4), AX
 39390  	MOVSS (AX), X5
 39391  	LEAQ  (AX)(CX*4), AX
 39392  	MOVSS (AX), X6
 39393  	LEAQ  (AX)(CX*4), AX
 39394  	MOVSS (AX), X7
 39395  	LEAQ  (AX)(CX*4), AX
 39396  	MOVSS (AX), X8
 39397  	LEAQ  (AX)(CX*4), AX
 39398  	MULSS X0, X1
 39399  	MULSS X0, X2
 39400  	MULSS X0, X3
 39401  	MULSS X0, X4
 39402  	MULSS X0, X5
 39403  	MULSS X0, X6
 39404  	MULSS X0, X7
 39405  	MULSS X0, X8
 39406  	ADDSS (DX), X1
 39407  	MOVSS X1, (DX)
 39408  	LEAQ  (DX)(BX*4), DX
 39409  	ADDSS (DX), X2
 39410  	MOVSS X2, (DX)
 39411  	LEAQ  (DX)(BX*4), DX
 39412  	ADDSS (DX), X3
 39413  	MOVSS X3, (DX)
 39414  	LEAQ  (DX)(BX*4), DX
 39415  	ADDSS (DX), X4
 39416  	MOVSS X4, (DX)
 39417  	LEAQ  (DX)(BX*4), DX
 39418  	ADDSS (DX), X5
 39419  	MOVSS X5, (DX)
 39420  	LEAQ  (DX)(BX*4), DX
 39421  	ADDSS (DX), X6
 39422  	MOVSS X6, (DX)
 39423  	LEAQ  (DX)(BX*4), DX
 39424  	ADDSS (DX), X7
 39425  	MOVSS X7, (DX)
 39426  	LEAQ  (DX)(BX*4), DX
 39427  	ADDSS (DX), X8
 39428  	MOVSS X8, (DX)
 39429  	LEAQ  (DX)(BX*4), DX
 39430  	SUBQ  $0x08, SI
 39431  
 39432  check_limit_unroll:
 39433  	CMPQ SI, $0x08
 39434  	JHS  loop_unroll
 39435  	JMP  check_limit
 39436  
 39437  loop:
 39438  	MOVSS (AX), X1
 39439  	MULSS X0, X1
 39440  	ADDSS (DX), X1
 39441  	MOVSS X1, (DX)
 39442  	DECQ  SI
 39443  	LEAQ  (AX)(CX*4), AX
 39444  	LEAQ  (DX)(BX*4), DX
 39445  
 39446  check_limit:
 39447  	CMPQ SI, $0x00
 39448  	JHI  loop
 39449  	RET
 39450  
 39451  // func AmdAxpyPointerLoopXInterleave_V2A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39452  // Requires: SSE
 39453  TEXT ·AmdAxpyPointerLoopXInterleave_V2A10U8(SB), NOSPLIT, $0-48
 39454  	MOVSS alpha+0(FP), X0
 39455  	MOVQ  xs+8(FP), AX
 39456  	MOVQ  incx+16(FP), CX
 39457  	MOVQ  CX, DX
 39458  	SHLQ  $0x05, DX
 39459  	MOVQ  ys+24(FP), DX
 39460  	MOVQ  incy+32(FP), BX
 39461  	MOVQ  BX, SI
 39462  	SHLQ  $0x05, SI
 39463  	MOVQ  n+40(FP), SI
 39464  	JMP   check_limit_unroll
 39465  	PCALIGN $0x08
 39466  	NOP
 39467  	NOP
 39468  
 39469  loop_unroll:
 39470  	MOVSS (AX), X1
 39471  	LEAQ  (AX)(CX*4), AX
 39472  	MOVSS (AX), X2
 39473  	LEAQ  (AX)(CX*4), AX
 39474  	MOVSS (AX), X3
 39475  	LEAQ  (AX)(CX*4), AX
 39476  	MOVSS (AX), X4
 39477  	LEAQ  (AX)(CX*4), AX
 39478  	MOVSS (AX), X5
 39479  	LEAQ  (AX)(CX*4), AX
 39480  	MOVSS (AX), X6
 39481  	LEAQ  (AX)(CX*4), AX
 39482  	MOVSS (AX), X7
 39483  	LEAQ  (AX)(CX*4), AX
 39484  	MOVSS (AX), X8
 39485  	LEAQ  (AX)(CX*4), AX
 39486  	MULSS X0, X1
 39487  	MULSS X0, X2
 39488  	MULSS X0, X3
 39489  	MULSS X0, X4
 39490  	MULSS X0, X5
 39491  	MULSS X0, X6
 39492  	MULSS X0, X7
 39493  	MULSS X0, X8
 39494  	ADDSS (DX), X1
 39495  	MOVSS X1, (DX)
 39496  	LEAQ  (DX)(BX*4), DX
 39497  	ADDSS (DX), X2
 39498  	MOVSS X2, (DX)
 39499  	LEAQ  (DX)(BX*4), DX
 39500  	ADDSS (DX), X3
 39501  	MOVSS X3, (DX)
 39502  	LEAQ  (DX)(BX*4), DX
 39503  	ADDSS (DX), X4
 39504  	MOVSS X4, (DX)
 39505  	LEAQ  (DX)(BX*4), DX
 39506  	ADDSS (DX), X5
 39507  	MOVSS X5, (DX)
 39508  	LEAQ  (DX)(BX*4), DX
 39509  	ADDSS (DX), X6
 39510  	MOVSS X6, (DX)
 39511  	LEAQ  (DX)(BX*4), DX
 39512  	ADDSS (DX), X7
 39513  	MOVSS X7, (DX)
 39514  	LEAQ  (DX)(BX*4), DX
 39515  	ADDSS (DX), X8
 39516  	MOVSS X8, (DX)
 39517  	LEAQ  (DX)(BX*4), DX
 39518  	SUBQ  $0x08, SI
 39519  
 39520  check_limit_unroll:
 39521  	CMPQ SI, $0x08
 39522  	JHS  loop_unroll
 39523  	JMP  check_limit
 39524  
 39525  loop:
 39526  	MOVSS (AX), X1
 39527  	MULSS X0, X1
 39528  	ADDSS (DX), X1
 39529  	MOVSS X1, (DX)
 39530  	DECQ  SI
 39531  	LEAQ  (AX)(CX*4), AX
 39532  	LEAQ  (DX)(BX*4), DX
 39533  
 39534  check_limit:
 39535  	CMPQ SI, $0x00
 39536  	JHI  loop
 39537  	RET
 39538  
 39539  // func AmdAxpyPointerLoopXInterleave_V3A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39540  // Requires: SSE
 39541  TEXT ·AmdAxpyPointerLoopXInterleave_V3A10U8(SB), NOSPLIT, $0-48
 39542  	MOVSS alpha+0(FP), X0
 39543  	MOVQ  xs+8(FP), AX
 39544  	MOVQ  incx+16(FP), CX
 39545  	MOVQ  CX, DX
 39546  	SHLQ  $0x05, DX
 39547  	MOVQ  ys+24(FP), DX
 39548  	MOVQ  incy+32(FP), BX
 39549  	MOVQ  BX, SI
 39550  	SHLQ  $0x05, SI
 39551  	MOVQ  n+40(FP), SI
 39552  	JMP   check_limit_unroll
 39553  	PCALIGN $0x08
 39554  	NOP
 39555  	NOP
 39556  
 39557  loop_unroll:
 39558  	MOVSS (AX), X1
 39559  	LEAQ  (AX)(CX*4), AX
 39560  	MOVSS (AX), X2
 39561  	LEAQ  (AX)(CX*4), AX
 39562  	MOVSS (AX), X3
 39563  	LEAQ  (AX)(CX*4), AX
 39564  	MOVSS (AX), X4
 39565  	LEAQ  (AX)(CX*4), AX
 39566  	MOVSS (AX), X5
 39567  	LEAQ  (AX)(CX*4), AX
 39568  	MOVSS (AX), X6
 39569  	LEAQ  (AX)(CX*4), AX
 39570  	MOVSS (AX), X7
 39571  	LEAQ  (AX)(CX*4), AX
 39572  	MOVSS (AX), X8
 39573  	LEAQ  (AX)(CX*4), AX
 39574  	MULSS X0, X1
 39575  	MULSS X0, X2
 39576  	MULSS X0, X3
 39577  	MULSS X0, X4
 39578  	MULSS X0, X5
 39579  	MULSS X0, X6
 39580  	MULSS X0, X7
 39581  	MULSS X0, X8
 39582  	ADDSS (DX), X1
 39583  	MOVSS X1, (DX)
 39584  	LEAQ  (DX)(BX*4), DX
 39585  	ADDSS (DX), X2
 39586  	MOVSS X2, (DX)
 39587  	LEAQ  (DX)(BX*4), DX
 39588  	ADDSS (DX), X3
 39589  	MOVSS X3, (DX)
 39590  	LEAQ  (DX)(BX*4), DX
 39591  	ADDSS (DX), X4
 39592  	MOVSS X4, (DX)
 39593  	LEAQ  (DX)(BX*4), DX
 39594  	ADDSS (DX), X5
 39595  	MOVSS X5, (DX)
 39596  	LEAQ  (DX)(BX*4), DX
 39597  	ADDSS (DX), X6
 39598  	MOVSS X6, (DX)
 39599  	LEAQ  (DX)(BX*4), DX
 39600  	ADDSS (DX), X7
 39601  	MOVSS X7, (DX)
 39602  	LEAQ  (DX)(BX*4), DX
 39603  	ADDSS (DX), X8
 39604  	MOVSS X8, (DX)
 39605  	LEAQ  (DX)(BX*4), DX
 39606  	SUBQ  $0x08, SI
 39607  
 39608  check_limit_unroll:
 39609  	CMPQ SI, $0x08
 39610  	JHS  loop_unroll
 39611  	JMP  check_limit
 39612  
 39613  loop:
 39614  	MOVSS (AX), X1
 39615  	MULSS X0, X1
 39616  	ADDSS (DX), X1
 39617  	MOVSS X1, (DX)
 39618  	DECQ  SI
 39619  	LEAQ  (AX)(CX*4), AX
 39620  	LEAQ  (DX)(BX*4), DX
 39621  
 39622  check_limit:
 39623  	CMPQ SI, $0x00
 39624  	JHI  loop
 39625  	RET
 39626  
 39627  // func AmdAxpyPointerLoopXInterleave_V4A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39628  // Requires: SSE
 39629  TEXT ·AmdAxpyPointerLoopXInterleave_V4A10U8(SB), NOSPLIT, $0-48
 39630  	MOVSS alpha+0(FP), X0
 39631  	MOVQ  xs+8(FP), AX
 39632  	MOVQ  incx+16(FP), CX
 39633  	MOVQ  CX, DX
 39634  	SHLQ  $0x05, DX
 39635  	MOVQ  ys+24(FP), DX
 39636  	MOVQ  incy+32(FP), BX
 39637  	MOVQ  BX, SI
 39638  	SHLQ  $0x05, SI
 39639  	MOVQ  n+40(FP), SI
 39640  	JMP   check_limit_unroll
 39641  	PCALIGN $0x08
 39642  	NOP
 39643  	NOP
 39644  
 39645  loop_unroll:
 39646  	MOVSS (AX), X1
 39647  	LEAQ  (AX)(CX*4), AX
 39648  	MOVSS (AX), X2
 39649  	LEAQ  (AX)(CX*4), AX
 39650  	MOVSS (AX), X3
 39651  	LEAQ  (AX)(CX*4), AX
 39652  	MOVSS (AX), X4
 39653  	LEAQ  (AX)(CX*4), AX
 39654  	MOVSS (AX), X5
 39655  	LEAQ  (AX)(CX*4), AX
 39656  	MOVSS (AX), X6
 39657  	LEAQ  (AX)(CX*4), AX
 39658  	MOVSS (AX), X7
 39659  	LEAQ  (AX)(CX*4), AX
 39660  	MOVSS (AX), X8
 39661  	LEAQ  (AX)(CX*4), AX
 39662  	MULSS X0, X1
 39663  	MULSS X0, X2
 39664  	MULSS X0, X3
 39665  	MULSS X0, X4
 39666  	MULSS X0, X5
 39667  	MULSS X0, X6
 39668  	MULSS X0, X7
 39669  	MULSS X0, X8
 39670  	ADDSS (DX), X1
 39671  	MOVSS X1, (DX)
 39672  	LEAQ  (DX)(BX*4), DX
 39673  	ADDSS (DX), X2
 39674  	MOVSS X2, (DX)
 39675  	LEAQ  (DX)(BX*4), DX
 39676  	ADDSS (DX), X3
 39677  	MOVSS X3, (DX)
 39678  	LEAQ  (DX)(BX*4), DX
 39679  	ADDSS (DX), X4
 39680  	MOVSS X4, (DX)
 39681  	LEAQ  (DX)(BX*4), DX
 39682  	ADDSS (DX), X5
 39683  	MOVSS X5, (DX)
 39684  	LEAQ  (DX)(BX*4), DX
 39685  	ADDSS (DX), X6
 39686  	MOVSS X6, (DX)
 39687  	LEAQ  (DX)(BX*4), DX
 39688  	ADDSS (DX), X7
 39689  	MOVSS X7, (DX)
 39690  	LEAQ  (DX)(BX*4), DX
 39691  	ADDSS (DX), X8
 39692  	MOVSS X8, (DX)
 39693  	LEAQ  (DX)(BX*4), DX
 39694  	SUBQ  $0x08, SI
 39695  
 39696  check_limit_unroll:
 39697  	CMPQ SI, $0x08
 39698  	JHS  loop_unroll
 39699  	JMP  check_limit
 39700  
 39701  loop:
 39702  	MOVSS (AX), X1
 39703  	MULSS X0, X1
 39704  	ADDSS (DX), X1
 39705  	MOVSS X1, (DX)
 39706  	DECQ  SI
 39707  	LEAQ  (AX)(CX*4), AX
 39708  	LEAQ  (DX)(BX*4), DX
 39709  
 39710  check_limit:
 39711  	CMPQ SI, $0x00
 39712  	JHI  loop
 39713  	RET
 39714  
 39715  // func AmdAxpyPointerLoopXInterleave_V5A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39716  // Requires: SSE
 39717  TEXT ·AmdAxpyPointerLoopXInterleave_V5A10U8(SB), NOSPLIT, $0-48
 39718  	MOVSS alpha+0(FP), X0
 39719  	MOVQ  xs+8(FP), AX
 39720  	MOVQ  incx+16(FP), CX
 39721  	MOVQ  CX, DX
 39722  	SHLQ  $0x05, DX
 39723  	MOVQ  ys+24(FP), DX
 39724  	MOVQ  incy+32(FP), BX
 39725  	MOVQ  BX, SI
 39726  	SHLQ  $0x05, SI
 39727  	MOVQ  n+40(FP), SI
 39728  	JMP   check_limit_unroll
 39729  	PCALIGN $0x08
 39730  	NOP
 39731  	NOP
 39732  
 39733  loop_unroll:
 39734  	MOVSS (AX), X1
 39735  	LEAQ  (AX)(CX*4), AX
 39736  	MOVSS (AX), X2
 39737  	LEAQ  (AX)(CX*4), AX
 39738  	MOVSS (AX), X3
 39739  	LEAQ  (AX)(CX*4), AX
 39740  	MOVSS (AX), X4
 39741  	LEAQ  (AX)(CX*4), AX
 39742  	MOVSS (AX), X5
 39743  	LEAQ  (AX)(CX*4), AX
 39744  	MOVSS (AX), X6
 39745  	LEAQ  (AX)(CX*4), AX
 39746  	MOVSS (AX), X7
 39747  	LEAQ  (AX)(CX*4), AX
 39748  	MOVSS (AX), X8
 39749  	LEAQ  (AX)(CX*4), AX
 39750  	MULSS X0, X1
 39751  	MULSS X0, X2
 39752  	MULSS X0, X3
 39753  	MULSS X0, X4
 39754  	MULSS X0, X5
 39755  	MULSS X0, X6
 39756  	MULSS X0, X7
 39757  	MULSS X0, X8
 39758  	ADDSS (DX), X1
 39759  	MOVSS X1, (DX)
 39760  	LEAQ  (DX)(BX*4), DX
 39761  	ADDSS (DX), X2
 39762  	MOVSS X2, (DX)
 39763  	LEAQ  (DX)(BX*4), DX
 39764  	ADDSS (DX), X3
 39765  	MOVSS X3, (DX)
 39766  	LEAQ  (DX)(BX*4), DX
 39767  	ADDSS (DX), X4
 39768  	MOVSS X4, (DX)
 39769  	LEAQ  (DX)(BX*4), DX
 39770  	ADDSS (DX), X5
 39771  	MOVSS X5, (DX)
 39772  	LEAQ  (DX)(BX*4), DX
 39773  	ADDSS (DX), X6
 39774  	MOVSS X6, (DX)
 39775  	LEAQ  (DX)(BX*4), DX
 39776  	ADDSS (DX), X7
 39777  	MOVSS X7, (DX)
 39778  	LEAQ  (DX)(BX*4), DX
 39779  	ADDSS (DX), X8
 39780  	MOVSS X8, (DX)
 39781  	LEAQ  (DX)(BX*4), DX
 39782  	SUBQ  $0x08, SI
 39783  
 39784  check_limit_unroll:
 39785  	CMPQ SI, $0x08
 39786  	JHS  loop_unroll
 39787  	JMP  check_limit
 39788  
 39789  loop:
 39790  	MOVSS (AX), X1
 39791  	MULSS X0, X1
 39792  	ADDSS (DX), X1
 39793  	MOVSS X1, (DX)
 39794  	DECQ  SI
 39795  	LEAQ  (AX)(CX*4), AX
 39796  	LEAQ  (DX)(BX*4), DX
 39797  
 39798  check_limit:
 39799  	CMPQ SI, $0x00
 39800  	JHI  loop
 39801  	RET
 39802  
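// The PCALIGN $0x08 directive plus the trailing NOPs pad the bytes in front of
// loop_unroll; the A10/A11/... component of the symbol name appears to select
// how much of this padding is emitted (the NOP count grows by one from one Axx
// group to the next), so the groups differ only in where the hot loop starts
// relative to instruction-fetch boundaries.
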
 39803  // func AmdAxpyPointerLoopXInterleave_V0A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39804  // Requires: SSE
 39805  TEXT ·AmdAxpyPointerLoopXInterleave_V0A11U8(SB), NOSPLIT, $0-48
 39806  	MOVSS alpha+0(FP), X0
 39807  	MOVQ  xs+8(FP), AX
 39808  	MOVQ  incx+16(FP), CX
 39809  	MOVQ  CX, DX
 39810  	SHLQ  $0x05, DX
 39811  	MOVQ  ys+24(FP), DX
 39812  	MOVQ  incy+32(FP), BX
 39813  	MOVQ  BX, SI
 39814  	SHLQ  $0x05, SI
 39815  	MOVQ  n+40(FP), SI
 39816  	JMP   check_limit_unroll
 39817  	PCALIGN $0x08
 39818  	NOP
 39819  	NOP
 39820  	NOP
 39821  
 39822  loop_unroll:
 39823  	MOVSS (AX), X1
 39824  	LEAQ  (AX)(CX*4), AX
 39825  	MOVSS (AX), X2
 39826  	LEAQ  (AX)(CX*4), AX
 39827  	MOVSS (AX), X3
 39828  	LEAQ  (AX)(CX*4), AX
 39829  	MOVSS (AX), X4
 39830  	LEAQ  (AX)(CX*4), AX
 39831  	MOVSS (AX), X5
 39832  	LEAQ  (AX)(CX*4), AX
 39833  	MOVSS (AX), X6
 39834  	LEAQ  (AX)(CX*4), AX
 39835  	MOVSS (AX), X7
 39836  	LEAQ  (AX)(CX*4), AX
 39837  	MOVSS (AX), X8
 39838  	LEAQ  (AX)(CX*4), AX
 39839  	MULSS X0, X1
 39840  	MULSS X0, X2
 39841  	MULSS X0, X3
 39842  	MULSS X0, X4
 39843  	MULSS X0, X5
 39844  	MULSS X0, X6
 39845  	MULSS X0, X7
 39846  	MULSS X0, X8
 39847  	ADDSS (DX), X1
 39848  	MOVSS X1, (DX)
 39849  	LEAQ  (DX)(BX*4), DX
 39850  	ADDSS (DX), X2
 39851  	MOVSS X2, (DX)
 39852  	LEAQ  (DX)(BX*4), DX
 39853  	ADDSS (DX), X3
 39854  	MOVSS X3, (DX)
 39855  	LEAQ  (DX)(BX*4), DX
 39856  	ADDSS (DX), X4
 39857  	MOVSS X4, (DX)
 39858  	LEAQ  (DX)(BX*4), DX
 39859  	ADDSS (DX), X5
 39860  	MOVSS X5, (DX)
 39861  	LEAQ  (DX)(BX*4), DX
 39862  	ADDSS (DX), X6
 39863  	MOVSS X6, (DX)
 39864  	LEAQ  (DX)(BX*4), DX
 39865  	ADDSS (DX), X7
 39866  	MOVSS X7, (DX)
 39867  	LEAQ  (DX)(BX*4), DX
 39868  	ADDSS (DX), X8
 39869  	MOVSS X8, (DX)
 39870  	LEAQ  (DX)(BX*4), DX
 39871  	SUBQ  $0x08, SI
 39872  
 39873  check_limit_unroll:
 39874  	CMPQ SI, $0x08
 39875  	JHS  loop_unroll
 39876  	JMP  check_limit
 39877  
 39878  loop:
 39879  	MOVSS (AX), X1
 39880  	MULSS X0, X1
 39881  	ADDSS (DX), X1
 39882  	MOVSS X1, (DX)
 39883  	DECQ  SI
 39884  	LEAQ  (AX)(CX*4), AX
 39885  	LEAQ  (DX)(BX*4), DX
 39886  
 39887  check_limit:
 39888  	CMPQ SI, $0x00
 39889  	JHI  loop
 39890  	RET
 39891  
 39892  // func AmdAxpyPointerLoopXInterleave_V1A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39893  // Requires: SSE
 39894  TEXT ·AmdAxpyPointerLoopXInterleave_V1A11U8(SB), NOSPLIT, $0-48
 39895  	MOVSS alpha+0(FP), X0
 39896  	MOVQ  xs+8(FP), AX
 39897  	MOVQ  incx+16(FP), CX
 39898  	MOVQ  CX, DX
 39899  	SHLQ  $0x05, DX
 39900  	MOVQ  ys+24(FP), DX
 39901  	MOVQ  incy+32(FP), BX
 39902  	MOVQ  BX, SI
 39903  	SHLQ  $0x05, SI
 39904  	MOVQ  n+40(FP), SI
 39905  	JMP   check_limit_unroll
 39906  	PCALIGN $0x08
 39907  	NOP
 39908  	NOP
 39909  	NOP
 39910  
 39911  loop_unroll:
 39912  	MOVSS (AX), X1
 39913  	LEAQ  (AX)(CX*4), AX
 39914  	MOVSS (AX), X2
 39915  	LEAQ  (AX)(CX*4), AX
 39916  	MOVSS (AX), X3
 39917  	LEAQ  (AX)(CX*4), AX
 39918  	MOVSS (AX), X4
 39919  	LEAQ  (AX)(CX*4), AX
 39920  	MOVSS (AX), X5
 39921  	LEAQ  (AX)(CX*4), AX
 39922  	MOVSS (AX), X6
 39923  	LEAQ  (AX)(CX*4), AX
 39924  	MOVSS (AX), X7
 39925  	LEAQ  (AX)(CX*4), AX
 39926  	MOVSS (AX), X8
 39927  	LEAQ  (AX)(CX*4), AX
 39928  	MULSS X0, X1
 39929  	MULSS X0, X2
 39930  	MULSS X0, X3
 39931  	MULSS X0, X4
 39932  	MULSS X0, X5
 39933  	MULSS X0, X6
 39934  	MULSS X0, X7
 39935  	MULSS X0, X8
 39936  	ADDSS (DX), X1
 39937  	MOVSS X1, (DX)
 39938  	LEAQ  (DX)(BX*4), DX
 39939  	ADDSS (DX), X2
 39940  	MOVSS X2, (DX)
 39941  	LEAQ  (DX)(BX*4), DX
 39942  	ADDSS (DX), X3
 39943  	MOVSS X3, (DX)
 39944  	LEAQ  (DX)(BX*4), DX
 39945  	ADDSS (DX), X4
 39946  	MOVSS X4, (DX)
 39947  	LEAQ  (DX)(BX*4), DX
 39948  	ADDSS (DX), X5
 39949  	MOVSS X5, (DX)
 39950  	LEAQ  (DX)(BX*4), DX
 39951  	ADDSS (DX), X6
 39952  	MOVSS X6, (DX)
 39953  	LEAQ  (DX)(BX*4), DX
 39954  	ADDSS (DX), X7
 39955  	MOVSS X7, (DX)
 39956  	LEAQ  (DX)(BX*4), DX
 39957  	ADDSS (DX), X8
 39958  	MOVSS X8, (DX)
 39959  	LEAQ  (DX)(BX*4), DX
 39960  	SUBQ  $0x08, SI
 39961  
 39962  check_limit_unroll:
 39963  	CMPQ SI, $0x08
 39964  	JHS  loop_unroll
 39965  	JMP  check_limit
 39966  
 39967  loop:
 39968  	MOVSS (AX), X1
 39969  	MULSS X0, X1
 39970  	ADDSS (DX), X1
 39971  	MOVSS X1, (DX)
 39972  	DECQ  SI
 39973  	LEAQ  (AX)(CX*4), AX
 39974  	LEAQ  (DX)(BX*4), DX
 39975  
 39976  check_limit:
 39977  	CMPQ SI, $0x00
 39978  	JHI  loop
 39979  	RET
 39980  
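// In the prologue of these kernels the incx<<5 value built in DX and the
// incy<<5 value built in SI are overwritten by ys and n before they are ever
// read, so those MOVQ/SHLQ pairs appear to be unused leftovers of the code
// generator; they do not affect the result.
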
 39981  // func AmdAxpyPointerLoopXInterleave_V2A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 39982  // Requires: SSE
 39983  TEXT ·AmdAxpyPointerLoopXInterleave_V2A11U8(SB), NOSPLIT, $0-48
 39984  	MOVSS alpha+0(FP), X0
 39985  	MOVQ  xs+8(FP), AX
 39986  	MOVQ  incx+16(FP), CX
 39987  	MOVQ  CX, DX
 39988  	SHLQ  $0x05, DX
 39989  	MOVQ  ys+24(FP), DX
 39990  	MOVQ  incy+32(FP), BX
 39991  	MOVQ  BX, SI
 39992  	SHLQ  $0x05, SI
 39993  	MOVQ  n+40(FP), SI
 39994  	JMP   check_limit_unroll
 39995  	PCALIGN $0x08
 39996  	NOP
 39997  	NOP
 39998  	NOP
 39999  
 40000  loop_unroll:
 40001  	MOVSS (AX), X1
 40002  	LEAQ  (AX)(CX*4), AX
 40003  	MOVSS (AX), X2
 40004  	LEAQ  (AX)(CX*4), AX
 40005  	MOVSS (AX), X3
 40006  	LEAQ  (AX)(CX*4), AX
 40007  	MOVSS (AX), X4
 40008  	LEAQ  (AX)(CX*4), AX
 40009  	MOVSS (AX), X5
 40010  	LEAQ  (AX)(CX*4), AX
 40011  	MOVSS (AX), X6
 40012  	LEAQ  (AX)(CX*4), AX
 40013  	MOVSS (AX), X7
 40014  	LEAQ  (AX)(CX*4), AX
 40015  	MOVSS (AX), X8
 40016  	LEAQ  (AX)(CX*4), AX
 40017  	MULSS X0, X1
 40018  	MULSS X0, X2
 40019  	MULSS X0, X3
 40020  	MULSS X0, X4
 40021  	MULSS X0, X5
 40022  	MULSS X0, X6
 40023  	MULSS X0, X7
 40024  	MULSS X0, X8
 40025  	ADDSS (DX), X1
 40026  	MOVSS X1, (DX)
 40027  	LEAQ  (DX)(BX*4), DX
 40028  	ADDSS (DX), X2
 40029  	MOVSS X2, (DX)
 40030  	LEAQ  (DX)(BX*4), DX
 40031  	ADDSS (DX), X3
 40032  	MOVSS X3, (DX)
 40033  	LEAQ  (DX)(BX*4), DX
 40034  	ADDSS (DX), X4
 40035  	MOVSS X4, (DX)
 40036  	LEAQ  (DX)(BX*4), DX
 40037  	ADDSS (DX), X5
 40038  	MOVSS X5, (DX)
 40039  	LEAQ  (DX)(BX*4), DX
 40040  	ADDSS (DX), X6
 40041  	MOVSS X6, (DX)
 40042  	LEAQ  (DX)(BX*4), DX
 40043  	ADDSS (DX), X7
 40044  	MOVSS X7, (DX)
 40045  	LEAQ  (DX)(BX*4), DX
 40046  	ADDSS (DX), X8
 40047  	MOVSS X8, (DX)
 40048  	LEAQ  (DX)(BX*4), DX
 40049  	SUBQ  $0x08, SI
 40050  
 40051  check_limit_unroll:
 40052  	CMPQ SI, $0x08
 40053  	JHS  loop_unroll
 40054  	JMP  check_limit
 40055  
 40056  loop:
 40057  	MOVSS (AX), X1
 40058  	MULSS X0, X1
 40059  	ADDSS (DX), X1
 40060  	MOVSS X1, (DX)
 40061  	DECQ  SI
 40062  	LEAQ  (AX)(CX*4), AX
 40063  	LEAQ  (DX)(BX*4), DX
 40064  
 40065  check_limit:
 40066  	CMPQ SI, $0x00
 40067  	JHI  loop
 40068  	RET
 40069  
 40070  // func AmdAxpyPointerLoopXInterleave_V3A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40071  // Requires: SSE
 40072  TEXT ·AmdAxpyPointerLoopXInterleave_V3A11U8(SB), NOSPLIT, $0-48
 40073  	MOVSS alpha+0(FP), X0
 40074  	MOVQ  xs+8(FP), AX
 40075  	MOVQ  incx+16(FP), CX
 40076  	MOVQ  CX, DX
 40077  	SHLQ  $0x05, DX
 40078  	MOVQ  ys+24(FP), DX
 40079  	MOVQ  incy+32(FP), BX
 40080  	MOVQ  BX, SI
 40081  	SHLQ  $0x05, SI
 40082  	MOVQ  n+40(FP), SI
 40083  	JMP   check_limit_unroll
 40084  	PCALIGN $0x08
 40085  	NOP
 40086  	NOP
 40087  	NOP
 40088  
 40089  loop_unroll:
 40090  	MOVSS (AX), X1
 40091  	LEAQ  (AX)(CX*4), AX
 40092  	MOVSS (AX), X2
 40093  	LEAQ  (AX)(CX*4), AX
 40094  	MOVSS (AX), X3
 40095  	LEAQ  (AX)(CX*4), AX
 40096  	MOVSS (AX), X4
 40097  	LEAQ  (AX)(CX*4), AX
 40098  	MOVSS (AX), X5
 40099  	LEAQ  (AX)(CX*4), AX
 40100  	MOVSS (AX), X6
 40101  	LEAQ  (AX)(CX*4), AX
 40102  	MOVSS (AX), X7
 40103  	LEAQ  (AX)(CX*4), AX
 40104  	MOVSS (AX), X8
 40105  	LEAQ  (AX)(CX*4), AX
 40106  	MULSS X0, X1
 40107  	MULSS X0, X2
 40108  	MULSS X0, X3
 40109  	MULSS X0, X4
 40110  	MULSS X0, X5
 40111  	MULSS X0, X6
 40112  	MULSS X0, X7
 40113  	MULSS X0, X8
 40114  	ADDSS (DX), X1
 40115  	MOVSS X1, (DX)
 40116  	LEAQ  (DX)(BX*4), DX
 40117  	ADDSS (DX), X2
 40118  	MOVSS X2, (DX)
 40119  	LEAQ  (DX)(BX*4), DX
 40120  	ADDSS (DX), X3
 40121  	MOVSS X3, (DX)
 40122  	LEAQ  (DX)(BX*4), DX
 40123  	ADDSS (DX), X4
 40124  	MOVSS X4, (DX)
 40125  	LEAQ  (DX)(BX*4), DX
 40126  	ADDSS (DX), X5
 40127  	MOVSS X5, (DX)
 40128  	LEAQ  (DX)(BX*4), DX
 40129  	ADDSS (DX), X6
 40130  	MOVSS X6, (DX)
 40131  	LEAQ  (DX)(BX*4), DX
 40132  	ADDSS (DX), X7
 40133  	MOVSS X7, (DX)
 40134  	LEAQ  (DX)(BX*4), DX
 40135  	ADDSS (DX), X8
 40136  	MOVSS X8, (DX)
 40137  	LEAQ  (DX)(BX*4), DX
 40138  	SUBQ  $0x08, SI
 40139  
 40140  check_limit_unroll:
 40141  	CMPQ SI, $0x08
 40142  	JHS  loop_unroll
 40143  	JMP  check_limit
 40144  
 40145  loop:
 40146  	MOVSS (AX), X1
 40147  	MULSS X0, X1
 40148  	ADDSS (DX), X1
 40149  	MOVSS X1, (DX)
 40150  	DECQ  SI
 40151  	LEAQ  (AX)(CX*4), AX
 40152  	LEAQ  (DX)(BX*4), DX
 40153  
 40154  check_limit:
 40155  	CMPQ SI, $0x00
 40156  	JHI  loop
 40157  	RET
 40158  
 40159  // func AmdAxpyPointerLoopXInterleave_V4A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40160  // Requires: SSE
 40161  TEXT ·AmdAxpyPointerLoopXInterleave_V4A11U8(SB), NOSPLIT, $0-48
 40162  	MOVSS alpha+0(FP), X0
 40163  	MOVQ  xs+8(FP), AX
 40164  	MOVQ  incx+16(FP), CX
 40165  	MOVQ  CX, DX
 40166  	SHLQ  $0x05, DX
 40167  	MOVQ  ys+24(FP), DX
 40168  	MOVQ  incy+32(FP), BX
 40169  	MOVQ  BX, SI
 40170  	SHLQ  $0x05, SI
 40171  	MOVQ  n+40(FP), SI
 40172  	JMP   check_limit_unroll
 40173  	PCALIGN $0x08
 40174  	NOP
 40175  	NOP
 40176  	NOP
 40177  
 40178  loop_unroll:
 40179  	MOVSS (AX), X1
 40180  	LEAQ  (AX)(CX*4), AX
 40181  	MOVSS (AX), X2
 40182  	LEAQ  (AX)(CX*4), AX
 40183  	MOVSS (AX), X3
 40184  	LEAQ  (AX)(CX*4), AX
 40185  	MOVSS (AX), X4
 40186  	LEAQ  (AX)(CX*4), AX
 40187  	MOVSS (AX), X5
 40188  	LEAQ  (AX)(CX*4), AX
 40189  	MOVSS (AX), X6
 40190  	LEAQ  (AX)(CX*4), AX
 40191  	MOVSS (AX), X7
 40192  	LEAQ  (AX)(CX*4), AX
 40193  	MOVSS (AX), X8
 40194  	LEAQ  (AX)(CX*4), AX
 40195  	MULSS X0, X1
 40196  	MULSS X0, X2
 40197  	MULSS X0, X3
 40198  	MULSS X0, X4
 40199  	MULSS X0, X5
 40200  	MULSS X0, X6
 40201  	MULSS X0, X7
 40202  	MULSS X0, X8
 40203  	ADDSS (DX), X1
 40204  	MOVSS X1, (DX)
 40205  	LEAQ  (DX)(BX*4), DX
 40206  	ADDSS (DX), X2
 40207  	MOVSS X2, (DX)
 40208  	LEAQ  (DX)(BX*4), DX
 40209  	ADDSS (DX), X3
 40210  	MOVSS X3, (DX)
 40211  	LEAQ  (DX)(BX*4), DX
 40212  	ADDSS (DX), X4
 40213  	MOVSS X4, (DX)
 40214  	LEAQ  (DX)(BX*4), DX
 40215  	ADDSS (DX), X5
 40216  	MOVSS X5, (DX)
 40217  	LEAQ  (DX)(BX*4), DX
 40218  	ADDSS (DX), X6
 40219  	MOVSS X6, (DX)
 40220  	LEAQ  (DX)(BX*4), DX
 40221  	ADDSS (DX), X7
 40222  	MOVSS X7, (DX)
 40223  	LEAQ  (DX)(BX*4), DX
 40224  	ADDSS (DX), X8
 40225  	MOVSS X8, (DX)
 40226  	LEAQ  (DX)(BX*4), DX
 40227  	SUBQ  $0x08, SI
 40228  
 40229  check_limit_unroll:
 40230  	CMPQ SI, $0x08
 40231  	JHS  loop_unroll
 40232  	JMP  check_limit
 40233  
 40234  loop:
 40235  	MOVSS (AX), X1
 40236  	MULSS X0, X1
 40237  	ADDSS (DX), X1
 40238  	MOVSS X1, (DX)
 40239  	DECQ  SI
 40240  	LEAQ  (AX)(CX*4), AX
 40241  	LEAQ  (DX)(BX*4), DX
 40242  
 40243  check_limit:
 40244  	CMPQ SI, $0x00
 40245  	JHI  loop
 40246  	RET
 40247  
 40248  // func AmdAxpyPointerLoopXInterleave_V5A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40249  // Requires: SSE
 40250  TEXT ·AmdAxpyPointerLoopXInterleave_V5A11U8(SB), NOSPLIT, $0-48
 40251  	MOVSS alpha+0(FP), X0
 40252  	MOVQ  xs+8(FP), AX
 40253  	MOVQ  incx+16(FP), CX
 40254  	MOVQ  CX, DX
 40255  	SHLQ  $0x05, DX
 40256  	MOVQ  ys+24(FP), DX
 40257  	MOVQ  incy+32(FP), BX
 40258  	MOVQ  BX, SI
 40259  	SHLQ  $0x05, SI
 40260  	MOVQ  n+40(FP), SI
 40261  	JMP   check_limit_unroll
 40262  	PCALIGN $0x08
 40263  	NOP
 40264  	NOP
 40265  	NOP
 40266  
 40267  loop_unroll:
 40268  	MOVSS (AX), X1
 40269  	LEAQ  (AX)(CX*4), AX
 40270  	MOVSS (AX), X2
 40271  	LEAQ  (AX)(CX*4), AX
 40272  	MOVSS (AX), X3
 40273  	LEAQ  (AX)(CX*4), AX
 40274  	MOVSS (AX), X4
 40275  	LEAQ  (AX)(CX*4), AX
 40276  	MOVSS (AX), X5
 40277  	LEAQ  (AX)(CX*4), AX
 40278  	MOVSS (AX), X6
 40279  	LEAQ  (AX)(CX*4), AX
 40280  	MOVSS (AX), X7
 40281  	LEAQ  (AX)(CX*4), AX
 40282  	MOVSS (AX), X8
 40283  	LEAQ  (AX)(CX*4), AX
 40284  	MULSS X0, X1
 40285  	MULSS X0, X2
 40286  	MULSS X0, X3
 40287  	MULSS X0, X4
 40288  	MULSS X0, X5
 40289  	MULSS X0, X6
 40290  	MULSS X0, X7
 40291  	MULSS X0, X8
 40292  	ADDSS (DX), X1
 40293  	MOVSS X1, (DX)
 40294  	LEAQ  (DX)(BX*4), DX
 40295  	ADDSS (DX), X2
 40296  	MOVSS X2, (DX)
 40297  	LEAQ  (DX)(BX*4), DX
 40298  	ADDSS (DX), X3
 40299  	MOVSS X3, (DX)
 40300  	LEAQ  (DX)(BX*4), DX
 40301  	ADDSS (DX), X4
 40302  	MOVSS X4, (DX)
 40303  	LEAQ  (DX)(BX*4), DX
 40304  	ADDSS (DX), X5
 40305  	MOVSS X5, (DX)
 40306  	LEAQ  (DX)(BX*4), DX
 40307  	ADDSS (DX), X6
 40308  	MOVSS X6, (DX)
 40309  	LEAQ  (DX)(BX*4), DX
 40310  	ADDSS (DX), X7
 40311  	MOVSS X7, (DX)
 40312  	LEAQ  (DX)(BX*4), DX
 40313  	ADDSS (DX), X8
 40314  	MOVSS X8, (DX)
 40315  	LEAQ  (DX)(BX*4), DX
 40316  	SUBQ  $0x08, SI
 40317  
 40318  check_limit_unroll:
 40319  	CMPQ SI, $0x08
 40320  	JHS  loop_unroll
 40321  	JMP  check_limit
 40322  
 40323  loop:
 40324  	MOVSS (AX), X1
 40325  	MULSS X0, X1
 40326  	ADDSS (DX), X1
 40327  	MOVSS X1, (DX)
 40328  	DECQ  SI
 40329  	LEAQ  (AX)(CX*4), AX
 40330  	LEAQ  (DX)(BX*4), DX
 40331  
 40332  check_limit:
 40333  	CMPQ SI, $0x00
 40334  	JHI  loop
 40335  	RET
 40336  
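// The unrolled body works in three passes: eight strided loads from xs into
// X1..X8, eight MULSS multiplies by alpha, and then eight read-modify-write
// steps on ys. Grouping the xs loads ahead of the ys updates is presumably
// what the "XInterleave" in the name refers to: the load stream and the
// store stream are kept apart instead of alternating element by element.
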
 40337  // func AmdAxpyPointerLoopXInterleave_V0A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40338  // Requires: SSE
 40339  TEXT ·AmdAxpyPointerLoopXInterleave_V0A12U8(SB), NOSPLIT, $0-48
 40340  	MOVSS alpha+0(FP), X0
 40341  	MOVQ  xs+8(FP), AX
 40342  	MOVQ  incx+16(FP), CX
 40343  	MOVQ  CX, DX
 40344  	SHLQ  $0x05, DX
 40345  	MOVQ  ys+24(FP), DX
 40346  	MOVQ  incy+32(FP), BX
 40347  	MOVQ  BX, SI
 40348  	SHLQ  $0x05, SI
 40349  	MOVQ  n+40(FP), SI
 40350  	JMP   check_limit_unroll
 40351  	PCALIGN $0x08
 40352  	NOP
 40353  	NOP
 40354  	NOP
 40355  	NOP
 40356  
 40357  loop_unroll:
 40358  	MOVSS (AX), X1
 40359  	LEAQ  (AX)(CX*4), AX
 40360  	MOVSS (AX), X2
 40361  	LEAQ  (AX)(CX*4), AX
 40362  	MOVSS (AX), X3
 40363  	LEAQ  (AX)(CX*4), AX
 40364  	MOVSS (AX), X4
 40365  	LEAQ  (AX)(CX*4), AX
 40366  	MOVSS (AX), X5
 40367  	LEAQ  (AX)(CX*4), AX
 40368  	MOVSS (AX), X6
 40369  	LEAQ  (AX)(CX*4), AX
 40370  	MOVSS (AX), X7
 40371  	LEAQ  (AX)(CX*4), AX
 40372  	MOVSS (AX), X8
 40373  	LEAQ  (AX)(CX*4), AX
 40374  	MULSS X0, X1
 40375  	MULSS X0, X2
 40376  	MULSS X0, X3
 40377  	MULSS X0, X4
 40378  	MULSS X0, X5
 40379  	MULSS X0, X6
 40380  	MULSS X0, X7
 40381  	MULSS X0, X8
 40382  	ADDSS (DX), X1
 40383  	MOVSS X1, (DX)
 40384  	LEAQ  (DX)(BX*4), DX
 40385  	ADDSS (DX), X2
 40386  	MOVSS X2, (DX)
 40387  	LEAQ  (DX)(BX*4), DX
 40388  	ADDSS (DX), X3
 40389  	MOVSS X3, (DX)
 40390  	LEAQ  (DX)(BX*4), DX
 40391  	ADDSS (DX), X4
 40392  	MOVSS X4, (DX)
 40393  	LEAQ  (DX)(BX*4), DX
 40394  	ADDSS (DX), X5
 40395  	MOVSS X5, (DX)
 40396  	LEAQ  (DX)(BX*4), DX
 40397  	ADDSS (DX), X6
 40398  	MOVSS X6, (DX)
 40399  	LEAQ  (DX)(BX*4), DX
 40400  	ADDSS (DX), X7
 40401  	MOVSS X7, (DX)
 40402  	LEAQ  (DX)(BX*4), DX
 40403  	ADDSS (DX), X8
 40404  	MOVSS X8, (DX)
 40405  	LEAQ  (DX)(BX*4), DX
 40406  	SUBQ  $0x08, SI
 40407  
 40408  check_limit_unroll:
 40409  	CMPQ SI, $0x08
 40410  	JHS  loop_unroll
 40411  	JMP  check_limit
 40412  
 40413  loop:
 40414  	MOVSS (AX), X1
 40415  	MULSS X0, X1
 40416  	ADDSS (DX), X1
 40417  	MOVSS X1, (DX)
 40418  	DECQ  SI
 40419  	LEAQ  (AX)(CX*4), AX
 40420  	LEAQ  (DX)(BX*4), DX
 40421  
 40422  check_limit:
 40423  	CMPQ SI, $0x00
 40424  	JHI  loop
 40425  	RET
 40426  
 40427  // func AmdAxpyPointerLoopXInterleave_V1A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40428  // Requires: SSE
 40429  TEXT ·AmdAxpyPointerLoopXInterleave_V1A12U8(SB), NOSPLIT, $0-48
 40430  	MOVSS alpha+0(FP), X0
 40431  	MOVQ  xs+8(FP), AX
 40432  	MOVQ  incx+16(FP), CX
 40433  	MOVQ  CX, DX
 40434  	SHLQ  $0x05, DX
 40435  	MOVQ  ys+24(FP), DX
 40436  	MOVQ  incy+32(FP), BX
 40437  	MOVQ  BX, SI
 40438  	SHLQ  $0x05, SI
 40439  	MOVQ  n+40(FP), SI
 40440  	JMP   check_limit_unroll
 40441  	PCALIGN $0x08
 40442  	NOP
 40443  	NOP
 40444  	NOP
 40445  	NOP
 40446  
 40447  loop_unroll:
 40448  	MOVSS (AX), X1
 40449  	LEAQ  (AX)(CX*4), AX
 40450  	MOVSS (AX), X2
 40451  	LEAQ  (AX)(CX*4), AX
 40452  	MOVSS (AX), X3
 40453  	LEAQ  (AX)(CX*4), AX
 40454  	MOVSS (AX), X4
 40455  	LEAQ  (AX)(CX*4), AX
 40456  	MOVSS (AX), X5
 40457  	LEAQ  (AX)(CX*4), AX
 40458  	MOVSS (AX), X6
 40459  	LEAQ  (AX)(CX*4), AX
 40460  	MOVSS (AX), X7
 40461  	LEAQ  (AX)(CX*4), AX
 40462  	MOVSS (AX), X8
 40463  	LEAQ  (AX)(CX*4), AX
 40464  	MULSS X0, X1
 40465  	MULSS X0, X2
 40466  	MULSS X0, X3
 40467  	MULSS X0, X4
 40468  	MULSS X0, X5
 40469  	MULSS X0, X6
 40470  	MULSS X0, X7
 40471  	MULSS X0, X8
 40472  	ADDSS (DX), X1
 40473  	MOVSS X1, (DX)
 40474  	LEAQ  (DX)(BX*4), DX
 40475  	ADDSS (DX), X2
 40476  	MOVSS X2, (DX)
 40477  	LEAQ  (DX)(BX*4), DX
 40478  	ADDSS (DX), X3
 40479  	MOVSS X3, (DX)
 40480  	LEAQ  (DX)(BX*4), DX
 40481  	ADDSS (DX), X4
 40482  	MOVSS X4, (DX)
 40483  	LEAQ  (DX)(BX*4), DX
 40484  	ADDSS (DX), X5
 40485  	MOVSS X5, (DX)
 40486  	LEAQ  (DX)(BX*4), DX
 40487  	ADDSS (DX), X6
 40488  	MOVSS X6, (DX)
 40489  	LEAQ  (DX)(BX*4), DX
 40490  	ADDSS (DX), X7
 40491  	MOVSS X7, (DX)
 40492  	LEAQ  (DX)(BX*4), DX
 40493  	ADDSS (DX), X8
 40494  	MOVSS X8, (DX)
 40495  	LEAQ  (DX)(BX*4), DX
 40496  	SUBQ  $0x08, SI
 40497  
 40498  check_limit_unroll:
 40499  	CMPQ SI, $0x08
 40500  	JHS  loop_unroll
 40501  	JMP  check_limit
 40502  
 40503  loop:
 40504  	MOVSS (AX), X1
 40505  	MULSS X0, X1
 40506  	ADDSS (DX), X1
 40507  	MOVSS X1, (DX)
 40508  	DECQ  SI
 40509  	LEAQ  (AX)(CX*4), AX
 40510  	LEAQ  (DX)(BX*4), DX
 40511  
 40512  check_limit:
 40513  	CMPQ SI, $0x00
 40514  	JHI  loop
 40515  	RET
 40516  
 40517  // func AmdAxpyPointerLoopXInterleave_V2A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40518  // Requires: SSE
 40519  TEXT ·AmdAxpyPointerLoopXInterleave_V2A12U8(SB), NOSPLIT, $0-48
 40520  	MOVSS alpha+0(FP), X0
 40521  	MOVQ  xs+8(FP), AX
 40522  	MOVQ  incx+16(FP), CX
 40523  	MOVQ  CX, DX
 40524  	SHLQ  $0x05, DX
 40525  	MOVQ  ys+24(FP), DX
 40526  	MOVQ  incy+32(FP), BX
 40527  	MOVQ  BX, SI
 40528  	SHLQ  $0x05, SI
 40529  	MOVQ  n+40(FP), SI
 40530  	JMP   check_limit_unroll
 40531  	PCALIGN $0x08
 40532  	NOP
 40533  	NOP
 40534  	NOP
 40535  	NOP
 40536  
 40537  loop_unroll:
 40538  	MOVSS (AX), X1
 40539  	LEAQ  (AX)(CX*4), AX
 40540  	MOVSS (AX), X2
 40541  	LEAQ  (AX)(CX*4), AX
 40542  	MOVSS (AX), X3
 40543  	LEAQ  (AX)(CX*4), AX
 40544  	MOVSS (AX), X4
 40545  	LEAQ  (AX)(CX*4), AX
 40546  	MOVSS (AX), X5
 40547  	LEAQ  (AX)(CX*4), AX
 40548  	MOVSS (AX), X6
 40549  	LEAQ  (AX)(CX*4), AX
 40550  	MOVSS (AX), X7
 40551  	LEAQ  (AX)(CX*4), AX
 40552  	MOVSS (AX), X8
 40553  	LEAQ  (AX)(CX*4), AX
 40554  	MULSS X0, X1
 40555  	MULSS X0, X2
 40556  	MULSS X0, X3
 40557  	MULSS X0, X4
 40558  	MULSS X0, X5
 40559  	MULSS X0, X6
 40560  	MULSS X0, X7
 40561  	MULSS X0, X8
 40562  	ADDSS (DX), X1
 40563  	MOVSS X1, (DX)
 40564  	LEAQ  (DX)(BX*4), DX
 40565  	ADDSS (DX), X2
 40566  	MOVSS X2, (DX)
 40567  	LEAQ  (DX)(BX*4), DX
 40568  	ADDSS (DX), X3
 40569  	MOVSS X3, (DX)
 40570  	LEAQ  (DX)(BX*4), DX
 40571  	ADDSS (DX), X4
 40572  	MOVSS X4, (DX)
 40573  	LEAQ  (DX)(BX*4), DX
 40574  	ADDSS (DX), X5
 40575  	MOVSS X5, (DX)
 40576  	LEAQ  (DX)(BX*4), DX
 40577  	ADDSS (DX), X6
 40578  	MOVSS X6, (DX)
 40579  	LEAQ  (DX)(BX*4), DX
 40580  	ADDSS (DX), X7
 40581  	MOVSS X7, (DX)
 40582  	LEAQ  (DX)(BX*4), DX
 40583  	ADDSS (DX), X8
 40584  	MOVSS X8, (DX)
 40585  	LEAQ  (DX)(BX*4), DX
 40586  	SUBQ  $0x08, SI
 40587  
 40588  check_limit_unroll:
 40589  	CMPQ SI, $0x08
 40590  	JHS  loop_unroll
 40591  	JMP  check_limit
 40592  
 40593  loop:
 40594  	MOVSS (AX), X1
 40595  	MULSS X0, X1
 40596  	ADDSS (DX), X1
 40597  	MOVSS X1, (DX)
 40598  	DECQ  SI
 40599  	LEAQ  (AX)(CX*4), AX
 40600  	LEAQ  (DX)(BX*4), DX
 40601  
 40602  check_limit:
 40603  	CMPQ SI, $0x00
 40604  	JHI  loop
 40605  	RET
 40606  
 40607  // func AmdAxpyPointerLoopXInterleave_V3A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40608  // Requires: SSE
 40609  TEXT ·AmdAxpyPointerLoopXInterleave_V3A12U8(SB), NOSPLIT, $0-48
 40610  	MOVSS alpha+0(FP), X0
 40611  	MOVQ  xs+8(FP), AX
 40612  	MOVQ  incx+16(FP), CX
 40613  	MOVQ  CX, DX
 40614  	SHLQ  $0x05, DX
 40615  	MOVQ  ys+24(FP), DX
 40616  	MOVQ  incy+32(FP), BX
 40617  	MOVQ  BX, SI
 40618  	SHLQ  $0x05, SI
 40619  	MOVQ  n+40(FP), SI
 40620  	JMP   check_limit_unroll
 40621  	PCALIGN $0x08
 40622  	NOP
 40623  	NOP
 40624  	NOP
 40625  	NOP
 40626  
 40627  loop_unroll:
 40628  	MOVSS (AX), X1
 40629  	LEAQ  (AX)(CX*4), AX
 40630  	MOVSS (AX), X2
 40631  	LEAQ  (AX)(CX*4), AX
 40632  	MOVSS (AX), X3
 40633  	LEAQ  (AX)(CX*4), AX
 40634  	MOVSS (AX), X4
 40635  	LEAQ  (AX)(CX*4), AX
 40636  	MOVSS (AX), X5
 40637  	LEAQ  (AX)(CX*4), AX
 40638  	MOVSS (AX), X6
 40639  	LEAQ  (AX)(CX*4), AX
 40640  	MOVSS (AX), X7
 40641  	LEAQ  (AX)(CX*4), AX
 40642  	MOVSS (AX), X8
 40643  	LEAQ  (AX)(CX*4), AX
 40644  	MULSS X0, X1
 40645  	MULSS X0, X2
 40646  	MULSS X0, X3
 40647  	MULSS X0, X4
 40648  	MULSS X0, X5
 40649  	MULSS X0, X6
 40650  	MULSS X0, X7
 40651  	MULSS X0, X8
 40652  	ADDSS (DX), X1
 40653  	MOVSS X1, (DX)
 40654  	LEAQ  (DX)(BX*4), DX
 40655  	ADDSS (DX), X2
 40656  	MOVSS X2, (DX)
 40657  	LEAQ  (DX)(BX*4), DX
 40658  	ADDSS (DX), X3
 40659  	MOVSS X3, (DX)
 40660  	LEAQ  (DX)(BX*4), DX
 40661  	ADDSS (DX), X4
 40662  	MOVSS X4, (DX)
 40663  	LEAQ  (DX)(BX*4), DX
 40664  	ADDSS (DX), X5
 40665  	MOVSS X5, (DX)
 40666  	LEAQ  (DX)(BX*4), DX
 40667  	ADDSS (DX), X6
 40668  	MOVSS X6, (DX)
 40669  	LEAQ  (DX)(BX*4), DX
 40670  	ADDSS (DX), X7
 40671  	MOVSS X7, (DX)
 40672  	LEAQ  (DX)(BX*4), DX
 40673  	ADDSS (DX), X8
 40674  	MOVSS X8, (DX)
 40675  	LEAQ  (DX)(BX*4), DX
 40676  	SUBQ  $0x08, SI
 40677  
 40678  check_limit_unroll:
 40679  	CMPQ SI, $0x08
 40680  	JHS  loop_unroll
 40681  	JMP  check_limit
 40682  
 40683  loop:
 40684  	MOVSS (AX), X1
 40685  	MULSS X0, X1
 40686  	ADDSS (DX), X1
 40687  	MOVSS X1, (DX)
 40688  	DECQ  SI
 40689  	LEAQ  (AX)(CX*4), AX
 40690  	LEAQ  (DX)(BX*4), DX
 40691  
 40692  check_limit:
 40693  	CMPQ SI, $0x00
 40694  	JHI  loop
 40695  	RET
 40696  
 40697  // func AmdAxpyPointerLoopXInterleave_V4A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40698  // Requires: SSE
 40699  TEXT ·AmdAxpyPointerLoopXInterleave_V4A12U8(SB), NOSPLIT, $0-48
 40700  	MOVSS alpha+0(FP), X0
 40701  	MOVQ  xs+8(FP), AX
 40702  	MOVQ  incx+16(FP), CX
 40703  	MOVQ  CX, DX
 40704  	SHLQ  $0x05, DX
 40705  	MOVQ  ys+24(FP), DX
 40706  	MOVQ  incy+32(FP), BX
 40707  	MOVQ  BX, SI
 40708  	SHLQ  $0x05, SI
 40709  	MOVQ  n+40(FP), SI
 40710  	JMP   check_limit_unroll
 40711  	PCALIGN $0x08
 40712  	NOP
 40713  	NOP
 40714  	NOP
 40715  	NOP
 40716  
 40717  loop_unroll:
 40718  	MOVSS (AX), X1
 40719  	LEAQ  (AX)(CX*4), AX
 40720  	MOVSS (AX), X2
 40721  	LEAQ  (AX)(CX*4), AX
 40722  	MOVSS (AX), X3
 40723  	LEAQ  (AX)(CX*4), AX
 40724  	MOVSS (AX), X4
 40725  	LEAQ  (AX)(CX*4), AX
 40726  	MOVSS (AX), X5
 40727  	LEAQ  (AX)(CX*4), AX
 40728  	MOVSS (AX), X6
 40729  	LEAQ  (AX)(CX*4), AX
 40730  	MOVSS (AX), X7
 40731  	LEAQ  (AX)(CX*4), AX
 40732  	MOVSS (AX), X8
 40733  	LEAQ  (AX)(CX*4), AX
 40734  	MULSS X0, X1
 40735  	MULSS X0, X2
 40736  	MULSS X0, X3
 40737  	MULSS X0, X4
 40738  	MULSS X0, X5
 40739  	MULSS X0, X6
 40740  	MULSS X0, X7
 40741  	MULSS X0, X8
 40742  	ADDSS (DX), X1
 40743  	MOVSS X1, (DX)
 40744  	LEAQ  (DX)(BX*4), DX
 40745  	ADDSS (DX), X2
 40746  	MOVSS X2, (DX)
 40747  	LEAQ  (DX)(BX*4), DX
 40748  	ADDSS (DX), X3
 40749  	MOVSS X3, (DX)
 40750  	LEAQ  (DX)(BX*4), DX
 40751  	ADDSS (DX), X4
 40752  	MOVSS X4, (DX)
 40753  	LEAQ  (DX)(BX*4), DX
 40754  	ADDSS (DX), X5
 40755  	MOVSS X5, (DX)
 40756  	LEAQ  (DX)(BX*4), DX
 40757  	ADDSS (DX), X6
 40758  	MOVSS X6, (DX)
 40759  	LEAQ  (DX)(BX*4), DX
 40760  	ADDSS (DX), X7
 40761  	MOVSS X7, (DX)
 40762  	LEAQ  (DX)(BX*4), DX
 40763  	ADDSS (DX), X8
 40764  	MOVSS X8, (DX)
 40765  	LEAQ  (DX)(BX*4), DX
 40766  	SUBQ  $0x08, SI
 40767  
 40768  check_limit_unroll:
 40769  	CMPQ SI, $0x08
 40770  	JHS  loop_unroll
 40771  	JMP  check_limit
 40772  
 40773  loop:
 40774  	MOVSS (AX), X1
 40775  	MULSS X0, X1
 40776  	ADDSS (DX), X1
 40777  	MOVSS X1, (DX)
 40778  	DECQ  SI
 40779  	LEAQ  (AX)(CX*4), AX
 40780  	LEAQ  (DX)(BX*4), DX
 40781  
 40782  check_limit:
 40783  	CMPQ SI, $0x00
 40784  	JHI  loop
 40785  	RET
 40786  
 40787  // func AmdAxpyPointerLoopXInterleave_V5A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40788  // Requires: SSE
 40789  TEXT ·AmdAxpyPointerLoopXInterleave_V5A12U8(SB), NOSPLIT, $0-48
 40790  	MOVSS alpha+0(FP), X0
 40791  	MOVQ  xs+8(FP), AX
 40792  	MOVQ  incx+16(FP), CX
 40793  	MOVQ  CX, DX
 40794  	SHLQ  $0x05, DX
 40795  	MOVQ  ys+24(FP), DX
 40796  	MOVQ  incy+32(FP), BX
 40797  	MOVQ  BX, SI
 40798  	SHLQ  $0x05, SI
 40799  	MOVQ  n+40(FP), SI
 40800  	JMP   check_limit_unroll
 40801  	PCALIGN $0x08
 40802  	NOP
 40803  	NOP
 40804  	NOP
 40805  	NOP
 40806  
 40807  loop_unroll:
 40808  	MOVSS (AX), X1
 40809  	LEAQ  (AX)(CX*4), AX
 40810  	MOVSS (AX), X2
 40811  	LEAQ  (AX)(CX*4), AX
 40812  	MOVSS (AX), X3
 40813  	LEAQ  (AX)(CX*4), AX
 40814  	MOVSS (AX), X4
 40815  	LEAQ  (AX)(CX*4), AX
 40816  	MOVSS (AX), X5
 40817  	LEAQ  (AX)(CX*4), AX
 40818  	MOVSS (AX), X6
 40819  	LEAQ  (AX)(CX*4), AX
 40820  	MOVSS (AX), X7
 40821  	LEAQ  (AX)(CX*4), AX
 40822  	MOVSS (AX), X8
 40823  	LEAQ  (AX)(CX*4), AX
 40824  	MULSS X0, X1
 40825  	MULSS X0, X2
 40826  	MULSS X0, X3
 40827  	MULSS X0, X4
 40828  	MULSS X0, X5
 40829  	MULSS X0, X6
 40830  	MULSS X0, X7
 40831  	MULSS X0, X8
 40832  	ADDSS (DX), X1
 40833  	MOVSS X1, (DX)
 40834  	LEAQ  (DX)(BX*4), DX
 40835  	ADDSS (DX), X2
 40836  	MOVSS X2, (DX)
 40837  	LEAQ  (DX)(BX*4), DX
 40838  	ADDSS (DX), X3
 40839  	MOVSS X3, (DX)
 40840  	LEAQ  (DX)(BX*4), DX
 40841  	ADDSS (DX), X4
 40842  	MOVSS X4, (DX)
 40843  	LEAQ  (DX)(BX*4), DX
 40844  	ADDSS (DX), X5
 40845  	MOVSS X5, (DX)
 40846  	LEAQ  (DX)(BX*4), DX
 40847  	ADDSS (DX), X6
 40848  	MOVSS X6, (DX)
 40849  	LEAQ  (DX)(BX*4), DX
 40850  	ADDSS (DX), X7
 40851  	MOVSS X7, (DX)
 40852  	LEAQ  (DX)(BX*4), DX
 40853  	ADDSS (DX), X8
 40854  	MOVSS X8, (DX)
 40855  	LEAQ  (DX)(BX*4), DX
 40856  	SUBQ  $0x08, SI
 40857  
 40858  check_limit_unroll:
 40859  	CMPQ SI, $0x08
 40860  	JHS  loop_unroll
 40861  	JMP  check_limit
 40862  
 40863  loop:
 40864  	MOVSS (AX), X1
 40865  	MULSS X0, X1
 40866  	ADDSS (DX), X1
 40867  	MOVSS X1, (DX)
 40868  	DECQ  SI
 40869  	LEAQ  (AX)(CX*4), AX
 40870  	LEAQ  (DX)(BX*4), DX
 40871  
 40872  check_limit:
 40873  	CMPQ SI, $0x00
 40874  	JHI  loop
 40875  	RET
 40876  
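// Loop control: check_limit_unroll keeps taking the unrolled path while at
// least eight elements remain (unsigned CMPQ/JHS against 8, with SI reduced by
// 8 per unrolled iteration), then falls through to the scalar loop, which
// drains the remaining 0..7 elements one at a time (DECQ SI and unsigned
// CMPQ/JHI against 0).
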
 40877  // func AmdAxpyPointerLoopXInterleave_V0A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40878  // Requires: SSE
 40879  TEXT ·AmdAxpyPointerLoopXInterleave_V0A13U8(SB), NOSPLIT, $0-48
 40880  	MOVSS alpha+0(FP), X0
 40881  	MOVQ  xs+8(FP), AX
 40882  	MOVQ  incx+16(FP), CX
 40883  	MOVQ  CX, DX
 40884  	SHLQ  $0x05, DX
 40885  	MOVQ  ys+24(FP), DX
 40886  	MOVQ  incy+32(FP), BX
 40887  	MOVQ  BX, SI
 40888  	SHLQ  $0x05, SI
 40889  	MOVQ  n+40(FP), SI
 40890  	JMP   check_limit_unroll
 40891  	PCALIGN $0x08
 40892  	NOP
 40893  	NOP
 40894  	NOP
 40895  	NOP
 40896  	NOP
 40897  
 40898  loop_unroll:
 40899  	MOVSS (AX), X1
 40900  	LEAQ  (AX)(CX*4), AX
 40901  	MOVSS (AX), X2
 40902  	LEAQ  (AX)(CX*4), AX
 40903  	MOVSS (AX), X3
 40904  	LEAQ  (AX)(CX*4), AX
 40905  	MOVSS (AX), X4
 40906  	LEAQ  (AX)(CX*4), AX
 40907  	MOVSS (AX), X5
 40908  	LEAQ  (AX)(CX*4), AX
 40909  	MOVSS (AX), X6
 40910  	LEAQ  (AX)(CX*4), AX
 40911  	MOVSS (AX), X7
 40912  	LEAQ  (AX)(CX*4), AX
 40913  	MOVSS (AX), X8
 40914  	LEAQ  (AX)(CX*4), AX
 40915  	MULSS X0, X1
 40916  	MULSS X0, X2
 40917  	MULSS X0, X3
 40918  	MULSS X0, X4
 40919  	MULSS X0, X5
 40920  	MULSS X0, X6
 40921  	MULSS X0, X7
 40922  	MULSS X0, X8
 40923  	ADDSS (DX), X1
 40924  	MOVSS X1, (DX)
 40925  	LEAQ  (DX)(BX*4), DX
 40926  	ADDSS (DX), X2
 40927  	MOVSS X2, (DX)
 40928  	LEAQ  (DX)(BX*4), DX
 40929  	ADDSS (DX), X3
 40930  	MOVSS X3, (DX)
 40931  	LEAQ  (DX)(BX*4), DX
 40932  	ADDSS (DX), X4
 40933  	MOVSS X4, (DX)
 40934  	LEAQ  (DX)(BX*4), DX
 40935  	ADDSS (DX), X5
 40936  	MOVSS X5, (DX)
 40937  	LEAQ  (DX)(BX*4), DX
 40938  	ADDSS (DX), X6
 40939  	MOVSS X6, (DX)
 40940  	LEAQ  (DX)(BX*4), DX
 40941  	ADDSS (DX), X7
 40942  	MOVSS X7, (DX)
 40943  	LEAQ  (DX)(BX*4), DX
 40944  	ADDSS (DX), X8
 40945  	MOVSS X8, (DX)
 40946  	LEAQ  (DX)(BX*4), DX
 40947  	SUBQ  $0x08, SI
 40948  
 40949  check_limit_unroll:
 40950  	CMPQ SI, $0x08
 40951  	JHS  loop_unroll
 40952  	JMP  check_limit
 40953  
 40954  loop:
 40955  	MOVSS (AX), X1
 40956  	MULSS X0, X1
 40957  	ADDSS (DX), X1
 40958  	MOVSS X1, (DX)
 40959  	DECQ  SI
 40960  	LEAQ  (AX)(CX*4), AX
 40961  	LEAQ  (DX)(BX*4), DX
 40962  
 40963  check_limit:
 40964  	CMPQ SI, $0x00
 40965  	JHI  loop
 40966  	RET
 40967  
 40968  // func AmdAxpyPointerLoopXInterleave_V1A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 40969  // Requires: SSE
 40970  TEXT ·AmdAxpyPointerLoopXInterleave_V1A13U8(SB), NOSPLIT, $0-48
 40971  	MOVSS alpha+0(FP), X0
 40972  	MOVQ  xs+8(FP), AX
 40973  	MOVQ  incx+16(FP), CX
 40974  	MOVQ  CX, DX
 40975  	SHLQ  $0x05, DX
 40976  	MOVQ  ys+24(FP), DX
 40977  	MOVQ  incy+32(FP), BX
 40978  	MOVQ  BX, SI
 40979  	SHLQ  $0x05, SI
 40980  	MOVQ  n+40(FP), SI
 40981  	JMP   check_limit_unroll
 40982  	PCALIGN $0x08
 40983  	NOP
 40984  	NOP
 40985  	NOP
 40986  	NOP
 40987  	NOP
 40988  
 40989  loop_unroll:
 40990  	MOVSS (AX), X1
 40991  	LEAQ  (AX)(CX*4), AX
 40992  	MOVSS (AX), X2
 40993  	LEAQ  (AX)(CX*4), AX
 40994  	MOVSS (AX), X3
 40995  	LEAQ  (AX)(CX*4), AX
 40996  	MOVSS (AX), X4
 40997  	LEAQ  (AX)(CX*4), AX
 40998  	MOVSS (AX), X5
 40999  	LEAQ  (AX)(CX*4), AX
 41000  	MOVSS (AX), X6
 41001  	LEAQ  (AX)(CX*4), AX
 41002  	MOVSS (AX), X7
 41003  	LEAQ  (AX)(CX*4), AX
 41004  	MOVSS (AX), X8
 41005  	LEAQ  (AX)(CX*4), AX
 41006  	MULSS X0, X1
 41007  	MULSS X0, X2
 41008  	MULSS X0, X3
 41009  	MULSS X0, X4
 41010  	MULSS X0, X5
 41011  	MULSS X0, X6
 41012  	MULSS X0, X7
 41013  	MULSS X0, X8
 41014  	ADDSS (DX), X1
 41015  	MOVSS X1, (DX)
 41016  	LEAQ  (DX)(BX*4), DX
 41017  	ADDSS (DX), X2
 41018  	MOVSS X2, (DX)
 41019  	LEAQ  (DX)(BX*4), DX
 41020  	ADDSS (DX), X3
 41021  	MOVSS X3, (DX)
 41022  	LEAQ  (DX)(BX*4), DX
 41023  	ADDSS (DX), X4
 41024  	MOVSS X4, (DX)
 41025  	LEAQ  (DX)(BX*4), DX
 41026  	ADDSS (DX), X5
 41027  	MOVSS X5, (DX)
 41028  	LEAQ  (DX)(BX*4), DX
 41029  	ADDSS (DX), X6
 41030  	MOVSS X6, (DX)
 41031  	LEAQ  (DX)(BX*4), DX
 41032  	ADDSS (DX), X7
 41033  	MOVSS X7, (DX)
 41034  	LEAQ  (DX)(BX*4), DX
 41035  	ADDSS (DX), X8
 41036  	MOVSS X8, (DX)
 41037  	LEAQ  (DX)(BX*4), DX
 41038  	SUBQ  $0x08, SI
 41039  
 41040  check_limit_unroll:
 41041  	CMPQ SI, $0x08
 41042  	JHS  loop_unroll
 41043  	JMP  check_limit
 41044  
 41045  loop:
 41046  	MOVSS (AX), X1
 41047  	MULSS X0, X1
 41048  	ADDSS (DX), X1
 41049  	MOVSS X1, (DX)
 41050  	DECQ  SI
 41051  	LEAQ  (AX)(CX*4), AX
 41052  	LEAQ  (DX)(BX*4), DX
 41053  
 41054  check_limit:
 41055  	CMPQ SI, $0x00
 41056  	JHI  loop
 41057  	RET
 41058  
 41059  // func AmdAxpyPointerLoopXInterleave_V2A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41060  // Requires: SSE
 41061  TEXT ·AmdAxpyPointerLoopXInterleave_V2A13U8(SB), NOSPLIT, $0-48
 41062  	MOVSS alpha+0(FP), X0
 41063  	MOVQ  xs+8(FP), AX
 41064  	MOVQ  incx+16(FP), CX
 41065  	MOVQ  CX, DX
 41066  	SHLQ  $0x05, DX
 41067  	MOVQ  ys+24(FP), DX
 41068  	MOVQ  incy+32(FP), BX
 41069  	MOVQ  BX, SI
 41070  	SHLQ  $0x05, SI
 41071  	MOVQ  n+40(FP), SI
 41072  	JMP   check_limit_unroll
 41073  	PCALIGN $0x08
 41074  	NOP
 41075  	NOP
 41076  	NOP
 41077  	NOP
 41078  	NOP
 41079  
 41080  loop_unroll:
 41081  	MOVSS (AX), X1
 41082  	LEAQ  (AX)(CX*4), AX
 41083  	MOVSS (AX), X2
 41084  	LEAQ  (AX)(CX*4), AX
 41085  	MOVSS (AX), X3
 41086  	LEAQ  (AX)(CX*4), AX
 41087  	MOVSS (AX), X4
 41088  	LEAQ  (AX)(CX*4), AX
 41089  	MOVSS (AX), X5
 41090  	LEAQ  (AX)(CX*4), AX
 41091  	MOVSS (AX), X6
 41092  	LEAQ  (AX)(CX*4), AX
 41093  	MOVSS (AX), X7
 41094  	LEAQ  (AX)(CX*4), AX
 41095  	MOVSS (AX), X8
 41096  	LEAQ  (AX)(CX*4), AX
 41097  	MULSS X0, X1
 41098  	MULSS X0, X2
 41099  	MULSS X0, X3
 41100  	MULSS X0, X4
 41101  	MULSS X0, X5
 41102  	MULSS X0, X6
 41103  	MULSS X0, X7
 41104  	MULSS X0, X8
 41105  	ADDSS (DX), X1
 41106  	MOVSS X1, (DX)
 41107  	LEAQ  (DX)(BX*4), DX
 41108  	ADDSS (DX), X2
 41109  	MOVSS X2, (DX)
 41110  	LEAQ  (DX)(BX*4), DX
 41111  	ADDSS (DX), X3
 41112  	MOVSS X3, (DX)
 41113  	LEAQ  (DX)(BX*4), DX
 41114  	ADDSS (DX), X4
 41115  	MOVSS X4, (DX)
 41116  	LEAQ  (DX)(BX*4), DX
 41117  	ADDSS (DX), X5
 41118  	MOVSS X5, (DX)
 41119  	LEAQ  (DX)(BX*4), DX
 41120  	ADDSS (DX), X6
 41121  	MOVSS X6, (DX)
 41122  	LEAQ  (DX)(BX*4), DX
 41123  	ADDSS (DX), X7
 41124  	MOVSS X7, (DX)
 41125  	LEAQ  (DX)(BX*4), DX
 41126  	ADDSS (DX), X8
 41127  	MOVSS X8, (DX)
 41128  	LEAQ  (DX)(BX*4), DX
 41129  	SUBQ  $0x08, SI
 41130  
 41131  check_limit_unroll:
 41132  	CMPQ SI, $0x08
 41133  	JHS  loop_unroll
 41134  	JMP  check_limit
 41135  
 41136  loop:
 41137  	MOVSS (AX), X1
 41138  	MULSS X0, X1
 41139  	ADDSS (DX), X1
 41140  	MOVSS X1, (DX)
 41141  	DECQ  SI
 41142  	LEAQ  (AX)(CX*4), AX
 41143  	LEAQ  (DX)(BX*4), DX
 41144  
 41145  check_limit:
 41146  	CMPQ SI, $0x00
 41147  	JHI  loop
 41148  	RET
 41149  
 41150  // func AmdAxpyPointerLoopXInterleave_V3A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41151  // Requires: SSE
 41152  TEXT ·AmdAxpyPointerLoopXInterleave_V3A13U8(SB), NOSPLIT, $0-48
 41153  	MOVSS alpha+0(FP), X0
 41154  	MOVQ  xs+8(FP), AX
 41155  	MOVQ  incx+16(FP), CX
 41156  	MOVQ  CX, DX
 41157  	SHLQ  $0x05, DX
 41158  	MOVQ  ys+24(FP), DX
 41159  	MOVQ  incy+32(FP), BX
 41160  	MOVQ  BX, SI
 41161  	SHLQ  $0x05, SI
 41162  	MOVQ  n+40(FP), SI
 41163  	JMP   check_limit_unroll
 41164  	PCALIGN $0x08
 41165  	NOP
 41166  	NOP
 41167  	NOP
 41168  	NOP
 41169  	NOP
 41170  
 41171  loop_unroll:
 41172  	MOVSS (AX), X1
 41173  	LEAQ  (AX)(CX*4), AX
 41174  	MOVSS (AX), X2
 41175  	LEAQ  (AX)(CX*4), AX
 41176  	MOVSS (AX), X3
 41177  	LEAQ  (AX)(CX*4), AX
 41178  	MOVSS (AX), X4
 41179  	LEAQ  (AX)(CX*4), AX
 41180  	MOVSS (AX), X5
 41181  	LEAQ  (AX)(CX*4), AX
 41182  	MOVSS (AX), X6
 41183  	LEAQ  (AX)(CX*4), AX
 41184  	MOVSS (AX), X7
 41185  	LEAQ  (AX)(CX*4), AX
 41186  	MOVSS (AX), X8
 41187  	LEAQ  (AX)(CX*4), AX
 41188  	MULSS X0, X1
 41189  	MULSS X0, X2
 41190  	MULSS X0, X3
 41191  	MULSS X0, X4
 41192  	MULSS X0, X5
 41193  	MULSS X0, X6
 41194  	MULSS X0, X7
 41195  	MULSS X0, X8
 41196  	ADDSS (DX), X1
 41197  	MOVSS X1, (DX)
 41198  	LEAQ  (DX)(BX*4), DX
 41199  	ADDSS (DX), X2
 41200  	MOVSS X2, (DX)
 41201  	LEAQ  (DX)(BX*4), DX
 41202  	ADDSS (DX), X3
 41203  	MOVSS X3, (DX)
 41204  	LEAQ  (DX)(BX*4), DX
 41205  	ADDSS (DX), X4
 41206  	MOVSS X4, (DX)
 41207  	LEAQ  (DX)(BX*4), DX
 41208  	ADDSS (DX), X5
 41209  	MOVSS X5, (DX)
 41210  	LEAQ  (DX)(BX*4), DX
 41211  	ADDSS (DX), X6
 41212  	MOVSS X6, (DX)
 41213  	LEAQ  (DX)(BX*4), DX
 41214  	ADDSS (DX), X7
 41215  	MOVSS X7, (DX)
 41216  	LEAQ  (DX)(BX*4), DX
 41217  	ADDSS (DX), X8
 41218  	MOVSS X8, (DX)
 41219  	LEAQ  (DX)(BX*4), DX
 41220  	SUBQ  $0x08, SI
 41221  
 41222  check_limit_unroll:
 41223  	CMPQ SI, $0x08
 41224  	JHS  loop_unroll
 41225  	JMP  check_limit
 41226  
 41227  loop:
 41228  	MOVSS (AX), X1
 41229  	MULSS X0, X1
 41230  	ADDSS (DX), X1
 41231  	MOVSS X1, (DX)
 41232  	DECQ  SI
 41233  	LEAQ  (AX)(CX*4), AX
 41234  	LEAQ  (DX)(BX*4), DX
 41235  
 41236  check_limit:
 41237  	CMPQ SI, $0x00
 41238  	JHI  loop
 41239  	RET
 41240  
 41241  // func AmdAxpyPointerLoopXInterleave_V4A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41242  // Requires: SSE
 41243  TEXT ·AmdAxpyPointerLoopXInterleave_V4A13U8(SB), NOSPLIT, $0-48
 41244  	MOVSS alpha+0(FP), X0
 41245  	MOVQ  xs+8(FP), AX
 41246  	MOVQ  incx+16(FP), CX
 41247  	MOVQ  CX, DX
 41248  	SHLQ  $0x05, DX
 41249  	MOVQ  ys+24(FP), DX
 41250  	MOVQ  incy+32(FP), BX
 41251  	MOVQ  BX, SI
 41252  	SHLQ  $0x05, SI
 41253  	MOVQ  n+40(FP), SI
 41254  	JMP   check_limit_unroll
 41255  	PCALIGN $0x08
 41256  	NOP
 41257  	NOP
 41258  	NOP
 41259  	NOP
 41260  	NOP
 41261  
 41262  loop_unroll:
 41263  	MOVSS (AX), X1
 41264  	LEAQ  (AX)(CX*4), AX
 41265  	MOVSS (AX), X2
 41266  	LEAQ  (AX)(CX*4), AX
 41267  	MOVSS (AX), X3
 41268  	LEAQ  (AX)(CX*4), AX
 41269  	MOVSS (AX), X4
 41270  	LEAQ  (AX)(CX*4), AX
 41271  	MOVSS (AX), X5
 41272  	LEAQ  (AX)(CX*4), AX
 41273  	MOVSS (AX), X6
 41274  	LEAQ  (AX)(CX*4), AX
 41275  	MOVSS (AX), X7
 41276  	LEAQ  (AX)(CX*4), AX
 41277  	MOVSS (AX), X8
 41278  	LEAQ  (AX)(CX*4), AX
 41279  	MULSS X0, X1
 41280  	MULSS X0, X2
 41281  	MULSS X0, X3
 41282  	MULSS X0, X4
 41283  	MULSS X0, X5
 41284  	MULSS X0, X6
 41285  	MULSS X0, X7
 41286  	MULSS X0, X8
 41287  	ADDSS (DX), X1
 41288  	MOVSS X1, (DX)
 41289  	LEAQ  (DX)(BX*4), DX
 41290  	ADDSS (DX), X2
 41291  	MOVSS X2, (DX)
 41292  	LEAQ  (DX)(BX*4), DX
 41293  	ADDSS (DX), X3
 41294  	MOVSS X3, (DX)
 41295  	LEAQ  (DX)(BX*4), DX
 41296  	ADDSS (DX), X4
 41297  	MOVSS X4, (DX)
 41298  	LEAQ  (DX)(BX*4), DX
 41299  	ADDSS (DX), X5
 41300  	MOVSS X5, (DX)
 41301  	LEAQ  (DX)(BX*4), DX
 41302  	ADDSS (DX), X6
 41303  	MOVSS X6, (DX)
 41304  	LEAQ  (DX)(BX*4), DX
 41305  	ADDSS (DX), X7
 41306  	MOVSS X7, (DX)
 41307  	LEAQ  (DX)(BX*4), DX
 41308  	ADDSS (DX), X8
 41309  	MOVSS X8, (DX)
 41310  	LEAQ  (DX)(BX*4), DX
 41311  	SUBQ  $0x08, SI
 41312  
 41313  check_limit_unroll:
 41314  	CMPQ SI, $0x08
 41315  	JHS  loop_unroll
 41316  	JMP  check_limit
 41317  
 41318  loop:
 41319  	MOVSS (AX), X1
 41320  	MULSS X0, X1
 41321  	ADDSS (DX), X1
 41322  	MOVSS X1, (DX)
 41323  	DECQ  SI
 41324  	LEAQ  (AX)(CX*4), AX
 41325  	LEAQ  (DX)(BX*4), DX
 41326  
 41327  check_limit:
 41328  	CMPQ SI, $0x00
 41329  	JHI  loop
 41330  	RET
 41331  
 41332  // func AmdAxpyPointerLoopXInterleave_V5A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41333  // Requires: SSE
 41334  TEXT ·AmdAxpyPointerLoopXInterleave_V5A13U8(SB), NOSPLIT, $0-48
 41335  	MOVSS alpha+0(FP), X0
 41336  	MOVQ  xs+8(FP), AX
 41337  	MOVQ  incx+16(FP), CX
 41338  	MOVQ  CX, DX
 41339  	SHLQ  $0x05, DX
 41340  	MOVQ  ys+24(FP), DX
 41341  	MOVQ  incy+32(FP), BX
 41342  	MOVQ  BX, SI
 41343  	SHLQ  $0x05, SI
 41344  	MOVQ  n+40(FP), SI
 41345  	JMP   check_limit_unroll
 41346  	PCALIGN $0x08
 41347  	NOP
 41348  	NOP
 41349  	NOP
 41350  	NOP
 41351  	NOP
 41352  
 41353  loop_unroll:
 41354  	MOVSS (AX), X1
 41355  	LEAQ  (AX)(CX*4), AX
 41356  	MOVSS (AX), X2
 41357  	LEAQ  (AX)(CX*4), AX
 41358  	MOVSS (AX), X3
 41359  	LEAQ  (AX)(CX*4), AX
 41360  	MOVSS (AX), X4
 41361  	LEAQ  (AX)(CX*4), AX
 41362  	MOVSS (AX), X5
 41363  	LEAQ  (AX)(CX*4), AX
 41364  	MOVSS (AX), X6
 41365  	LEAQ  (AX)(CX*4), AX
 41366  	MOVSS (AX), X7
 41367  	LEAQ  (AX)(CX*4), AX
 41368  	MOVSS (AX), X8
 41369  	LEAQ  (AX)(CX*4), AX
 41370  	MULSS X0, X1
 41371  	MULSS X0, X2
 41372  	MULSS X0, X3
 41373  	MULSS X0, X4
 41374  	MULSS X0, X5
 41375  	MULSS X0, X6
 41376  	MULSS X0, X7
 41377  	MULSS X0, X8
 41378  	ADDSS (DX), X1
 41379  	MOVSS X1, (DX)
 41380  	LEAQ  (DX)(BX*4), DX
 41381  	ADDSS (DX), X2
 41382  	MOVSS X2, (DX)
 41383  	LEAQ  (DX)(BX*4), DX
 41384  	ADDSS (DX), X3
 41385  	MOVSS X3, (DX)
 41386  	LEAQ  (DX)(BX*4), DX
 41387  	ADDSS (DX), X4
 41388  	MOVSS X4, (DX)
 41389  	LEAQ  (DX)(BX*4), DX
 41390  	ADDSS (DX), X5
 41391  	MOVSS X5, (DX)
 41392  	LEAQ  (DX)(BX*4), DX
 41393  	ADDSS (DX), X6
 41394  	MOVSS X6, (DX)
 41395  	LEAQ  (DX)(BX*4), DX
 41396  	ADDSS (DX), X7
 41397  	MOVSS X7, (DX)
 41398  	LEAQ  (DX)(BX*4), DX
 41399  	ADDSS (DX), X8
 41400  	MOVSS X8, (DX)
 41401  	LEAQ  (DX)(BX*4), DX
 41402  	SUBQ  $0x08, SI
 41403  
 41404  check_limit_unroll:
 41405  	CMPQ SI, $0x08
 41406  	JHS  loop_unroll
 41407  	JMP  check_limit
 41408  
 41409  loop:
 41410  	MOVSS (AX), X1
 41411  	MULSS X0, X1
 41412  	ADDSS (DX), X1
 41413  	MOVSS X1, (DX)
 41414  	DECQ  SI
 41415  	LEAQ  (AX)(CX*4), AX
 41416  	LEAQ  (DX)(BX*4), DX
 41417  
 41418  check_limit:
 41419  	CMPQ SI, $0x00
 41420  	JHI  loop
 41421  	RET
 41422  
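// Every variant is declared NOSPLIT with a $0-48 frame: no local stack space
// and 48 bytes of arguments, matching the Go signature in the comment above
// each TEXT (4 bytes of alpha padded to 8, then five 8-byte words for xs,
// incx, ys, incy and n), which is why the FP offsets run 0, 8, 16, 24, 32, 40.
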
 41423  // func AmdAxpyPointerLoopXInterleave_V0A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41424  // Requires: SSE
 41425  TEXT ·AmdAxpyPointerLoopXInterleave_V0A14U8(SB), NOSPLIT, $0-48
 41426  	MOVSS alpha+0(FP), X0
 41427  	MOVQ  xs+8(FP), AX
 41428  	MOVQ  incx+16(FP), CX
 41429  	MOVQ  CX, DX
 41430  	SHLQ  $0x05, DX
 41431  	MOVQ  ys+24(FP), DX
 41432  	MOVQ  incy+32(FP), BX
 41433  	MOVQ  BX, SI
 41434  	SHLQ  $0x05, SI
 41435  	MOVQ  n+40(FP), SI
 41436  	JMP   check_limit_unroll
 41437  	PCALIGN $0x08
 41438  	NOP
 41439  	NOP
 41440  	NOP
 41441  	NOP
 41442  	NOP
 41443  	NOP
 41444  
 41445  loop_unroll:
 41446  	MOVSS (AX), X1
 41447  	LEAQ  (AX)(CX*4), AX
 41448  	MOVSS (AX), X2
 41449  	LEAQ  (AX)(CX*4), AX
 41450  	MOVSS (AX), X3
 41451  	LEAQ  (AX)(CX*4), AX
 41452  	MOVSS (AX), X4
 41453  	LEAQ  (AX)(CX*4), AX
 41454  	MOVSS (AX), X5
 41455  	LEAQ  (AX)(CX*4), AX
 41456  	MOVSS (AX), X6
 41457  	LEAQ  (AX)(CX*4), AX
 41458  	MOVSS (AX), X7
 41459  	LEAQ  (AX)(CX*4), AX
 41460  	MOVSS (AX), X8
 41461  	LEAQ  (AX)(CX*4), AX
 41462  	MULSS X0, X1
 41463  	MULSS X0, X2
 41464  	MULSS X0, X3
 41465  	MULSS X0, X4
 41466  	MULSS X0, X5
 41467  	MULSS X0, X6
 41468  	MULSS X0, X7
 41469  	MULSS X0, X8
 41470  	ADDSS (DX), X1
 41471  	MOVSS X1, (DX)
 41472  	LEAQ  (DX)(BX*4), DX
 41473  	ADDSS (DX), X2
 41474  	MOVSS X2, (DX)
 41475  	LEAQ  (DX)(BX*4), DX
 41476  	ADDSS (DX), X3
 41477  	MOVSS X3, (DX)
 41478  	LEAQ  (DX)(BX*4), DX
 41479  	ADDSS (DX), X4
 41480  	MOVSS X4, (DX)
 41481  	LEAQ  (DX)(BX*4), DX
 41482  	ADDSS (DX), X5
 41483  	MOVSS X5, (DX)
 41484  	LEAQ  (DX)(BX*4), DX
 41485  	ADDSS (DX), X6
 41486  	MOVSS X6, (DX)
 41487  	LEAQ  (DX)(BX*4), DX
 41488  	ADDSS (DX), X7
 41489  	MOVSS X7, (DX)
 41490  	LEAQ  (DX)(BX*4), DX
 41491  	ADDSS (DX), X8
 41492  	MOVSS X8, (DX)
 41493  	LEAQ  (DX)(BX*4), DX
 41494  	SUBQ  $0x08, SI
 41495  
 41496  check_limit_unroll:
 41497  	CMPQ SI, $0x08
 41498  	JHS  loop_unroll
 41499  	JMP  check_limit
 41500  
 41501  loop:
 41502  	MOVSS (AX), X1
 41503  	MULSS X0, X1
 41504  	ADDSS (DX), X1
 41505  	MOVSS X1, (DX)
 41506  	DECQ  SI
 41507  	LEAQ  (AX)(CX*4), AX
 41508  	LEAQ  (DX)(BX*4), DX
 41509  
 41510  check_limit:
 41511  	CMPQ SI, $0x00
 41512  	JHI  loop
 41513  	RET
 41514  
 41515  // func AmdAxpyPointerLoopXInterleave_V1A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41516  // Requires: SSE
 41517  TEXT ·AmdAxpyPointerLoopXInterleave_V1A14U8(SB), NOSPLIT, $0-48
 41518  	MOVSS alpha+0(FP), X0
 41519  	MOVQ  xs+8(FP), AX
 41520  	MOVQ  incx+16(FP), CX
 41521  	MOVQ  CX, DX
 41522  	SHLQ  $0x05, DX
 41523  	MOVQ  ys+24(FP), DX
 41524  	MOVQ  incy+32(FP), BX
 41525  	MOVQ  BX, SI
 41526  	SHLQ  $0x05, SI
 41527  	MOVQ  n+40(FP), SI
 41528  	JMP   check_limit_unroll
 41529  	PCALIGN $0x08
 41530  	NOP
 41531  	NOP
 41532  	NOP
 41533  	NOP
 41534  	NOP
 41535  	NOP
 41536  
 41537  loop_unroll:
 41538  	MOVSS (AX), X1
 41539  	LEAQ  (AX)(CX*4), AX
 41540  	MOVSS (AX), X2
 41541  	LEAQ  (AX)(CX*4), AX
 41542  	MOVSS (AX), X3
 41543  	LEAQ  (AX)(CX*4), AX
 41544  	MOVSS (AX), X4
 41545  	LEAQ  (AX)(CX*4), AX
 41546  	MOVSS (AX), X5
 41547  	LEAQ  (AX)(CX*4), AX
 41548  	MOVSS (AX), X6
 41549  	LEAQ  (AX)(CX*4), AX
 41550  	MOVSS (AX), X7
 41551  	LEAQ  (AX)(CX*4), AX
 41552  	MOVSS (AX), X8
 41553  	LEAQ  (AX)(CX*4), AX
 41554  	MULSS X0, X1
 41555  	MULSS X0, X2
 41556  	MULSS X0, X3
 41557  	MULSS X0, X4
 41558  	MULSS X0, X5
 41559  	MULSS X0, X6
 41560  	MULSS X0, X7
 41561  	MULSS X0, X8
 41562  	ADDSS (DX), X1
 41563  	MOVSS X1, (DX)
 41564  	LEAQ  (DX)(BX*4), DX
 41565  	ADDSS (DX), X2
 41566  	MOVSS X2, (DX)
 41567  	LEAQ  (DX)(BX*4), DX
 41568  	ADDSS (DX), X3
 41569  	MOVSS X3, (DX)
 41570  	LEAQ  (DX)(BX*4), DX
 41571  	ADDSS (DX), X4
 41572  	MOVSS X4, (DX)
 41573  	LEAQ  (DX)(BX*4), DX
 41574  	ADDSS (DX), X5
 41575  	MOVSS X5, (DX)
 41576  	LEAQ  (DX)(BX*4), DX
 41577  	ADDSS (DX), X6
 41578  	MOVSS X6, (DX)
 41579  	LEAQ  (DX)(BX*4), DX
 41580  	ADDSS (DX), X7
 41581  	MOVSS X7, (DX)
 41582  	LEAQ  (DX)(BX*4), DX
 41583  	ADDSS (DX), X8
 41584  	MOVSS X8, (DX)
 41585  	LEAQ  (DX)(BX*4), DX
 41586  	SUBQ  $0x08, SI
 41587  
 41588  check_limit_unroll:
 41589  	CMPQ SI, $0x08
 41590  	JHS  loop_unroll
 41591  	JMP  check_limit
 41592  
 41593  loop:
 41594  	MOVSS (AX), X1
 41595  	MULSS X0, X1
 41596  	ADDSS (DX), X1
 41597  	MOVSS X1, (DX)
 41598  	DECQ  SI
 41599  	LEAQ  (AX)(CX*4), AX
 41600  	LEAQ  (DX)(BX*4), DX
 41601  
 41602  check_limit:
 41603  	CMPQ SI, $0x00
 41604  	JHI  loop
 41605  	RET
 41606  
 41607  // func AmdAxpyPointerLoopXInterleave_V2A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41608  // Requires: SSE
 41609  TEXT ·AmdAxpyPointerLoopXInterleave_V2A14U8(SB), NOSPLIT, $0-48
 41610  	MOVSS alpha+0(FP), X0
 41611  	MOVQ  xs+8(FP), AX
 41612  	MOVQ  incx+16(FP), CX
 41613  	MOVQ  CX, DX
 41614  	SHLQ  $0x05, DX
 41615  	MOVQ  ys+24(FP), DX
 41616  	MOVQ  incy+32(FP), BX
 41617  	MOVQ  BX, SI
 41618  	SHLQ  $0x05, SI
 41619  	MOVQ  n+40(FP), SI
 41620  	JMP   check_limit_unroll
 41621  	PCALIGN $0x08
 41622  	NOP
 41623  	NOP
 41624  	NOP
 41625  	NOP
 41626  	NOP
 41627  	NOP
 41628  
 41629  loop_unroll:
 41630  	MOVSS (AX), X1
 41631  	LEAQ  (AX)(CX*4), AX
 41632  	MOVSS (AX), X2
 41633  	LEAQ  (AX)(CX*4), AX
 41634  	MOVSS (AX), X3
 41635  	LEAQ  (AX)(CX*4), AX
 41636  	MOVSS (AX), X4
 41637  	LEAQ  (AX)(CX*4), AX
 41638  	MOVSS (AX), X5
 41639  	LEAQ  (AX)(CX*4), AX
 41640  	MOVSS (AX), X6
 41641  	LEAQ  (AX)(CX*4), AX
 41642  	MOVSS (AX), X7
 41643  	LEAQ  (AX)(CX*4), AX
 41644  	MOVSS (AX), X8
 41645  	LEAQ  (AX)(CX*4), AX
 41646  	MULSS X0, X1
 41647  	MULSS X0, X2
 41648  	MULSS X0, X3
 41649  	MULSS X0, X4
 41650  	MULSS X0, X5
 41651  	MULSS X0, X6
 41652  	MULSS X0, X7
 41653  	MULSS X0, X8
 41654  	ADDSS (DX), X1
 41655  	MOVSS X1, (DX)
 41656  	LEAQ  (DX)(BX*4), DX
 41657  	ADDSS (DX), X2
 41658  	MOVSS X2, (DX)
 41659  	LEAQ  (DX)(BX*4), DX
 41660  	ADDSS (DX), X3
 41661  	MOVSS X3, (DX)
 41662  	LEAQ  (DX)(BX*4), DX
 41663  	ADDSS (DX), X4
 41664  	MOVSS X4, (DX)
 41665  	LEAQ  (DX)(BX*4), DX
 41666  	ADDSS (DX), X5
 41667  	MOVSS X5, (DX)
 41668  	LEAQ  (DX)(BX*4), DX
 41669  	ADDSS (DX), X6
 41670  	MOVSS X6, (DX)
 41671  	LEAQ  (DX)(BX*4), DX
 41672  	ADDSS (DX), X7
 41673  	MOVSS X7, (DX)
 41674  	LEAQ  (DX)(BX*4), DX
 41675  	ADDSS (DX), X8
 41676  	MOVSS X8, (DX)
 41677  	LEAQ  (DX)(BX*4), DX
 41678  	SUBQ  $0x08, SI
 41679  
 41680  check_limit_unroll:
 41681  	CMPQ SI, $0x08
 41682  	JHS  loop_unroll
 41683  	JMP  check_limit
 41684  
 41685  loop:
 41686  	MOVSS (AX), X1
 41687  	MULSS X0, X1
 41688  	ADDSS (DX), X1
 41689  	MOVSS X1, (DX)
 41690  	DECQ  SI
 41691  	LEAQ  (AX)(CX*4), AX
 41692  	LEAQ  (DX)(BX*4), DX
 41693  
 41694  check_limit:
 41695  	CMPQ SI, $0x00
 41696  	JHI  loop
 41697  	RET
 41698  
 41699  // func AmdAxpyPointerLoopXInterleave_V3A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41700  // Requires: SSE
 41701  TEXT ·AmdAxpyPointerLoopXInterleave_V3A14U8(SB), NOSPLIT, $0-48
 41702  	MOVSS alpha+0(FP), X0
 41703  	MOVQ  xs+8(FP), AX
 41704  	MOVQ  incx+16(FP), CX
 41705  	MOVQ  CX, DX
 41706  	SHLQ  $0x05, DX
 41707  	MOVQ  ys+24(FP), DX
 41708  	MOVQ  incy+32(FP), BX
 41709  	MOVQ  BX, SI
 41710  	SHLQ  $0x05, SI
 41711  	MOVQ  n+40(FP), SI
 41712  	JMP   check_limit_unroll
 41713  	PCALIGN $0x08
 41714  	NOP
 41715  	NOP
 41716  	NOP
 41717  	NOP
 41718  	NOP
 41719  	NOP
 41720  
 41721  loop_unroll:
 41722  	MOVSS (AX), X1
 41723  	LEAQ  (AX)(CX*4), AX
 41724  	MOVSS (AX), X2
 41725  	LEAQ  (AX)(CX*4), AX
 41726  	MOVSS (AX), X3
 41727  	LEAQ  (AX)(CX*4), AX
 41728  	MOVSS (AX), X4
 41729  	LEAQ  (AX)(CX*4), AX
 41730  	MOVSS (AX), X5
 41731  	LEAQ  (AX)(CX*4), AX
 41732  	MOVSS (AX), X6
 41733  	LEAQ  (AX)(CX*4), AX
 41734  	MOVSS (AX), X7
 41735  	LEAQ  (AX)(CX*4), AX
 41736  	MOVSS (AX), X8
 41737  	LEAQ  (AX)(CX*4), AX
 41738  	MULSS X0, X1
 41739  	MULSS X0, X2
 41740  	MULSS X0, X3
 41741  	MULSS X0, X4
 41742  	MULSS X0, X5
 41743  	MULSS X0, X6
 41744  	MULSS X0, X7
 41745  	MULSS X0, X8
 41746  	ADDSS (DX), X1
 41747  	MOVSS X1, (DX)
 41748  	LEAQ  (DX)(BX*4), DX
 41749  	ADDSS (DX), X2
 41750  	MOVSS X2, (DX)
 41751  	LEAQ  (DX)(BX*4), DX
 41752  	ADDSS (DX), X3
 41753  	MOVSS X3, (DX)
 41754  	LEAQ  (DX)(BX*4), DX
 41755  	ADDSS (DX), X4
 41756  	MOVSS X4, (DX)
 41757  	LEAQ  (DX)(BX*4), DX
 41758  	ADDSS (DX), X5
 41759  	MOVSS X5, (DX)
 41760  	LEAQ  (DX)(BX*4), DX
 41761  	ADDSS (DX), X6
 41762  	MOVSS X6, (DX)
 41763  	LEAQ  (DX)(BX*4), DX
 41764  	ADDSS (DX), X7
 41765  	MOVSS X7, (DX)
 41766  	LEAQ  (DX)(BX*4), DX
 41767  	ADDSS (DX), X8
 41768  	MOVSS X8, (DX)
 41769  	LEAQ  (DX)(BX*4), DX
 41770  	SUBQ  $0x08, SI
 41771  
 41772  check_limit_unroll:
 41773  	CMPQ SI, $0x08
 41774  	JHS  loop_unroll
 41775  	JMP  check_limit
 41776  
 41777  loop:
 41778  	MOVSS (AX), X1
 41779  	MULSS X0, X1
 41780  	ADDSS (DX), X1
 41781  	MOVSS X1, (DX)
 41782  	DECQ  SI
 41783  	LEAQ  (AX)(CX*4), AX
 41784  	LEAQ  (DX)(BX*4), DX
 41785  
 41786  check_limit:
 41787  	CMPQ SI, $0x00
 41788  	JHI  loop
 41789  	RET
 41790  
 41791  // func AmdAxpyPointerLoopXInterleave_V4A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41792  // Requires: SSE
 41793  TEXT ·AmdAxpyPointerLoopXInterleave_V4A14U8(SB), NOSPLIT, $0-48
 41794  	MOVSS alpha+0(FP), X0
 41795  	MOVQ  xs+8(FP), AX
 41796  	MOVQ  incx+16(FP), CX
 41797  	MOVQ  CX, DX
 41798  	SHLQ  $0x05, DX
 41799  	MOVQ  ys+24(FP), DX
 41800  	MOVQ  incy+32(FP), BX
 41801  	MOVQ  BX, SI
 41802  	SHLQ  $0x05, SI
 41803  	MOVQ  n+40(FP), SI
 41804  	JMP   check_limit_unroll
 41805  	PCALIGN $0x08
 41806  	NOP
 41807  	NOP
 41808  	NOP
 41809  	NOP
 41810  	NOP
 41811  	NOP
 41812  
 41813  loop_unroll:
 41814  	MOVSS (AX), X1
 41815  	LEAQ  (AX)(CX*4), AX
 41816  	MOVSS (AX), X2
 41817  	LEAQ  (AX)(CX*4), AX
 41818  	MOVSS (AX), X3
 41819  	LEAQ  (AX)(CX*4), AX
 41820  	MOVSS (AX), X4
 41821  	LEAQ  (AX)(CX*4), AX
 41822  	MOVSS (AX), X5
 41823  	LEAQ  (AX)(CX*4), AX
 41824  	MOVSS (AX), X6
 41825  	LEAQ  (AX)(CX*4), AX
 41826  	MOVSS (AX), X7
 41827  	LEAQ  (AX)(CX*4), AX
 41828  	MOVSS (AX), X8
 41829  	LEAQ  (AX)(CX*4), AX
 41830  	MULSS X0, X1
 41831  	MULSS X0, X2
 41832  	MULSS X0, X3
 41833  	MULSS X0, X4
 41834  	MULSS X0, X5
 41835  	MULSS X0, X6
 41836  	MULSS X0, X7
 41837  	MULSS X0, X8
 41838  	ADDSS (DX), X1
 41839  	MOVSS X1, (DX)
 41840  	LEAQ  (DX)(BX*4), DX
 41841  	ADDSS (DX), X2
 41842  	MOVSS X2, (DX)
 41843  	LEAQ  (DX)(BX*4), DX
 41844  	ADDSS (DX), X3
 41845  	MOVSS X3, (DX)
 41846  	LEAQ  (DX)(BX*4), DX
 41847  	ADDSS (DX), X4
 41848  	MOVSS X4, (DX)
 41849  	LEAQ  (DX)(BX*4), DX
 41850  	ADDSS (DX), X5
 41851  	MOVSS X5, (DX)
 41852  	LEAQ  (DX)(BX*4), DX
 41853  	ADDSS (DX), X6
 41854  	MOVSS X6, (DX)
 41855  	LEAQ  (DX)(BX*4), DX
 41856  	ADDSS (DX), X7
 41857  	MOVSS X7, (DX)
 41858  	LEAQ  (DX)(BX*4), DX
 41859  	ADDSS (DX), X8
 41860  	MOVSS X8, (DX)
 41861  	LEAQ  (DX)(BX*4), DX
 41862  	SUBQ  $0x08, SI
 41863  
 41864  check_limit_unroll:
 41865  	CMPQ SI, $0x08
 41866  	JHS  loop_unroll
 41867  	JMP  check_limit
 41868  
 41869  loop:
 41870  	MOVSS (AX), X1
 41871  	MULSS X0, X1
 41872  	ADDSS (DX), X1
 41873  	MOVSS X1, (DX)
 41874  	DECQ  SI
 41875  	LEAQ  (AX)(CX*4), AX
 41876  	LEAQ  (DX)(BX*4), DX
 41877  
 41878  check_limit:
 41879  	CMPQ SI, $0x00
 41880  	JHI  loop
 41881  	RET
 41882  
 41883  // func AmdAxpyPointerLoopXInterleave_V5A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41884  // Requires: SSE
 41885  TEXT ·AmdAxpyPointerLoopXInterleave_V5A14U8(SB), NOSPLIT, $0-48
 41886  	MOVSS alpha+0(FP), X0
 41887  	MOVQ  xs+8(FP), AX
 41888  	MOVQ  incx+16(FP), CX
 41889  	MOVQ  CX, DX
 41890  	SHLQ  $0x05, DX
 41891  	MOVQ  ys+24(FP), DX
 41892  	MOVQ  incy+32(FP), BX
 41893  	MOVQ  BX, SI
 41894  	SHLQ  $0x05, SI
 41895  	MOVQ  n+40(FP), SI
 41896  	JMP   check_limit_unroll
 41897  	PCALIGN $0x08
 41898  	NOP
 41899  	NOP
 41900  	NOP
 41901  	NOP
 41902  	NOP
 41903  	NOP
 41904  
 41905  loop_unroll:
 41906  	MOVSS (AX), X1
 41907  	LEAQ  (AX)(CX*4), AX
 41908  	MOVSS (AX), X2
 41909  	LEAQ  (AX)(CX*4), AX
 41910  	MOVSS (AX), X3
 41911  	LEAQ  (AX)(CX*4), AX
 41912  	MOVSS (AX), X4
 41913  	LEAQ  (AX)(CX*4), AX
 41914  	MOVSS (AX), X5
 41915  	LEAQ  (AX)(CX*4), AX
 41916  	MOVSS (AX), X6
 41917  	LEAQ  (AX)(CX*4), AX
 41918  	MOVSS (AX), X7
 41919  	LEAQ  (AX)(CX*4), AX
 41920  	MOVSS (AX), X8
 41921  	LEAQ  (AX)(CX*4), AX
 41922  	MULSS X0, X1
 41923  	MULSS X0, X2
 41924  	MULSS X0, X3
 41925  	MULSS X0, X4
 41926  	MULSS X0, X5
 41927  	MULSS X0, X6
 41928  	MULSS X0, X7
 41929  	MULSS X0, X8
 41930  	ADDSS (DX), X1
 41931  	MOVSS X1, (DX)
 41932  	LEAQ  (DX)(BX*4), DX
 41933  	ADDSS (DX), X2
 41934  	MOVSS X2, (DX)
 41935  	LEAQ  (DX)(BX*4), DX
 41936  	ADDSS (DX), X3
 41937  	MOVSS X3, (DX)
 41938  	LEAQ  (DX)(BX*4), DX
 41939  	ADDSS (DX), X4
 41940  	MOVSS X4, (DX)
 41941  	LEAQ  (DX)(BX*4), DX
 41942  	ADDSS (DX), X5
 41943  	MOVSS X5, (DX)
 41944  	LEAQ  (DX)(BX*4), DX
 41945  	ADDSS (DX), X6
 41946  	MOVSS X6, (DX)
 41947  	LEAQ  (DX)(BX*4), DX
 41948  	ADDSS (DX), X7
 41949  	MOVSS X7, (DX)
 41950  	LEAQ  (DX)(BX*4), DX
 41951  	ADDSS (DX), X8
 41952  	MOVSS X8, (DX)
 41953  	LEAQ  (DX)(BX*4), DX
 41954  	SUBQ  $0x08, SI
 41955  
 41956  check_limit_unroll:
 41957  	CMPQ SI, $0x08
 41958  	JHS  loop_unroll
 41959  	JMP  check_limit
 41960  
 41961  loop:
 41962  	MOVSS (AX), X1
 41963  	MULSS X0, X1
 41964  	ADDSS (DX), X1
 41965  	MOVSS X1, (DX)
 41966  	DECQ  SI
 41967  	LEAQ  (AX)(CX*4), AX
 41968  	LEAQ  (DX)(BX*4), DX
 41969  
 41970  check_limit:
 41971  	CMPQ SI, $0x00
 41972  	JHI  loop
 41973  	RET
 41974  
 41975  // func AmdAxpyPointerLoopXInterleave_V0A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 41976  // Requires: SSE
 41977  TEXT ·AmdAxpyPointerLoopXInterleave_V0A15U8(SB), NOSPLIT, $0-48
 41978  	MOVSS alpha+0(FP), X0
 41979  	MOVQ  xs+8(FP), AX
 41980  	MOVQ  incx+16(FP), CX
 41981  	MOVQ  CX, DX
 41982  	SHLQ  $0x05, DX
 41983  	MOVQ  ys+24(FP), DX
 41984  	MOVQ  incy+32(FP), BX
 41985  	MOVQ  BX, SI
 41986  	SHLQ  $0x05, SI
 41987  	MOVQ  n+40(FP), SI
 41988  	JMP   check_limit_unroll
 41989  	PCALIGN $0x08
 41990  	NOP
 41991  	NOP
 41992  	NOP
 41993  	NOP
 41994  	NOP
 41995  	NOP
 41996  	NOP
 41997  
 41998  loop_unroll:
 41999  	MOVSS (AX), X1
 42000  	LEAQ  (AX)(CX*4), AX
 42001  	MOVSS (AX), X2
 42002  	LEAQ  (AX)(CX*4), AX
 42003  	MOVSS (AX), X3
 42004  	LEAQ  (AX)(CX*4), AX
 42005  	MOVSS (AX), X4
 42006  	LEAQ  (AX)(CX*4), AX
 42007  	MOVSS (AX), X5
 42008  	LEAQ  (AX)(CX*4), AX
 42009  	MOVSS (AX), X6
 42010  	LEAQ  (AX)(CX*4), AX
 42011  	MOVSS (AX), X7
 42012  	LEAQ  (AX)(CX*4), AX
 42013  	MOVSS (AX), X8
 42014  	LEAQ  (AX)(CX*4), AX
 42015  	MULSS X0, X1
 42016  	MULSS X0, X2
 42017  	MULSS X0, X3
 42018  	MULSS X0, X4
 42019  	MULSS X0, X5
 42020  	MULSS X0, X6
 42021  	MULSS X0, X7
 42022  	MULSS X0, X8
 42023  	ADDSS (DX), X1
 42024  	MOVSS X1, (DX)
 42025  	LEAQ  (DX)(BX*4), DX
 42026  	ADDSS (DX), X2
 42027  	MOVSS X2, (DX)
 42028  	LEAQ  (DX)(BX*4), DX
 42029  	ADDSS (DX), X3
 42030  	MOVSS X3, (DX)
 42031  	LEAQ  (DX)(BX*4), DX
 42032  	ADDSS (DX), X4
 42033  	MOVSS X4, (DX)
 42034  	LEAQ  (DX)(BX*4), DX
 42035  	ADDSS (DX), X5
 42036  	MOVSS X5, (DX)
 42037  	LEAQ  (DX)(BX*4), DX
 42038  	ADDSS (DX), X6
 42039  	MOVSS X6, (DX)
 42040  	LEAQ  (DX)(BX*4), DX
 42041  	ADDSS (DX), X7
 42042  	MOVSS X7, (DX)
 42043  	LEAQ  (DX)(BX*4), DX
 42044  	ADDSS (DX), X8
 42045  	MOVSS X8, (DX)
 42046  	LEAQ  (DX)(BX*4), DX
 42047  	SUBQ  $0x08, SI
 42048  
 42049  check_limit_unroll:
 42050  	CMPQ SI, $0x08
 42051  	JHS  loop_unroll
 42052  	JMP  check_limit
 42053  
 42054  loop:
 42055  	MOVSS (AX), X1
 42056  	MULSS X0, X1
 42057  	ADDSS (DX), X1
 42058  	MOVSS X1, (DX)
 42059  	DECQ  SI
 42060  	LEAQ  (AX)(CX*4), AX
 42061  	LEAQ  (DX)(BX*4), DX
 42062  
 42063  check_limit:
 42064  	CMPQ SI, $0x00
 42065  	JHI  loop
 42066  	RET
 42067  
 42068  // func AmdAxpyPointerLoopXInterleave_V1A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42069  // Requires: SSE
 42070  TEXT ·AmdAxpyPointerLoopXInterleave_V1A15U8(SB), NOSPLIT, $0-48
 42071  	MOVSS alpha+0(FP), X0
 42072  	MOVQ  xs+8(FP), AX
 42073  	MOVQ  incx+16(FP), CX
 42074  	MOVQ  CX, DX
 42075  	SHLQ  $0x05, DX
 42076  	MOVQ  ys+24(FP), DX
 42077  	MOVQ  incy+32(FP), BX
 42078  	MOVQ  BX, SI
 42079  	SHLQ  $0x05, SI
 42080  	MOVQ  n+40(FP), SI
 42081  	JMP   check_limit_unroll
 42082  	PCALIGN $0x08
 42083  	NOP
 42084  	NOP
 42085  	NOP
 42086  	NOP
 42087  	NOP
 42088  	NOP
 42089  	NOP
 42090  
 42091  loop_unroll:
 42092  	MOVSS (AX), X1
 42093  	LEAQ  (AX)(CX*4), AX
 42094  	MOVSS (AX), X2
 42095  	LEAQ  (AX)(CX*4), AX
 42096  	MOVSS (AX), X3
 42097  	LEAQ  (AX)(CX*4), AX
 42098  	MOVSS (AX), X4
 42099  	LEAQ  (AX)(CX*4), AX
 42100  	MOVSS (AX), X5
 42101  	LEAQ  (AX)(CX*4), AX
 42102  	MOVSS (AX), X6
 42103  	LEAQ  (AX)(CX*4), AX
 42104  	MOVSS (AX), X7
 42105  	LEAQ  (AX)(CX*4), AX
 42106  	MOVSS (AX), X8
 42107  	LEAQ  (AX)(CX*4), AX
 42108  	MULSS X0, X1
 42109  	MULSS X0, X2
 42110  	MULSS X0, X3
 42111  	MULSS X0, X4
 42112  	MULSS X0, X5
 42113  	MULSS X0, X6
 42114  	MULSS X0, X7
 42115  	MULSS X0, X8
 42116  	ADDSS (DX), X1
 42117  	MOVSS X1, (DX)
 42118  	LEAQ  (DX)(BX*4), DX
 42119  	ADDSS (DX), X2
 42120  	MOVSS X2, (DX)
 42121  	LEAQ  (DX)(BX*4), DX
 42122  	ADDSS (DX), X3
 42123  	MOVSS X3, (DX)
 42124  	LEAQ  (DX)(BX*4), DX
 42125  	ADDSS (DX), X4
 42126  	MOVSS X4, (DX)
 42127  	LEAQ  (DX)(BX*4), DX
 42128  	ADDSS (DX), X5
 42129  	MOVSS X5, (DX)
 42130  	LEAQ  (DX)(BX*4), DX
 42131  	ADDSS (DX), X6
 42132  	MOVSS X6, (DX)
 42133  	LEAQ  (DX)(BX*4), DX
 42134  	ADDSS (DX), X7
 42135  	MOVSS X7, (DX)
 42136  	LEAQ  (DX)(BX*4), DX
 42137  	ADDSS (DX), X8
 42138  	MOVSS X8, (DX)
 42139  	LEAQ  (DX)(BX*4), DX
 42140  	SUBQ  $0x08, SI
 42141  
 42142  check_limit_unroll:
 42143  	CMPQ SI, $0x08
 42144  	JHS  loop_unroll
 42145  	JMP  check_limit
 42146  
 42147  loop:
 42148  	MOVSS (AX), X1
 42149  	MULSS X0, X1
 42150  	ADDSS (DX), X1
 42151  	MOVSS X1, (DX)
 42152  	DECQ  SI
 42153  	LEAQ  (AX)(CX*4), AX
 42154  	LEAQ  (DX)(BX*4), DX
 42155  
 42156  check_limit:
 42157  	CMPQ SI, $0x00
 42158  	JHI  loop
 42159  	RET
 42160  
 42161  // func AmdAxpyPointerLoopXInterleave_V2A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42162  // Requires: SSE
 42163  TEXT ·AmdAxpyPointerLoopXInterleave_V2A15U8(SB), NOSPLIT, $0-48
 42164  	MOVSS alpha+0(FP), X0
 42165  	MOVQ  xs+8(FP), AX
 42166  	MOVQ  incx+16(FP), CX
 42167  	MOVQ  CX, DX
 42168  	SHLQ  $0x05, DX
 42169  	MOVQ  ys+24(FP), DX
 42170  	MOVQ  incy+32(FP), BX
 42171  	MOVQ  BX, SI
 42172  	SHLQ  $0x05, SI
 42173  	MOVQ  n+40(FP), SI
 42174  	JMP   check_limit_unroll
 42175  	PCALIGN $0x08
 42176  	NOP
 42177  	NOP
 42178  	NOP
 42179  	NOP
 42180  	NOP
 42181  	NOP
 42182  	NOP
 42183  
 42184  loop_unroll:
 42185  	MOVSS (AX), X1
 42186  	LEAQ  (AX)(CX*4), AX
 42187  	MOVSS (AX), X2
 42188  	LEAQ  (AX)(CX*4), AX
 42189  	MOVSS (AX), X3
 42190  	LEAQ  (AX)(CX*4), AX
 42191  	MOVSS (AX), X4
 42192  	LEAQ  (AX)(CX*4), AX
 42193  	MOVSS (AX), X5
 42194  	LEAQ  (AX)(CX*4), AX
 42195  	MOVSS (AX), X6
 42196  	LEAQ  (AX)(CX*4), AX
 42197  	MOVSS (AX), X7
 42198  	LEAQ  (AX)(CX*4), AX
 42199  	MOVSS (AX), X8
 42200  	LEAQ  (AX)(CX*4), AX
 42201  	MULSS X0, X1
 42202  	MULSS X0, X2
 42203  	MULSS X0, X3
 42204  	MULSS X0, X4
 42205  	MULSS X0, X5
 42206  	MULSS X0, X6
 42207  	MULSS X0, X7
 42208  	MULSS X0, X8
 42209  	ADDSS (DX), X1
 42210  	MOVSS X1, (DX)
 42211  	LEAQ  (DX)(BX*4), DX
 42212  	ADDSS (DX), X2
 42213  	MOVSS X2, (DX)
 42214  	LEAQ  (DX)(BX*4), DX
 42215  	ADDSS (DX), X3
 42216  	MOVSS X3, (DX)
 42217  	LEAQ  (DX)(BX*4), DX
 42218  	ADDSS (DX), X4
 42219  	MOVSS X4, (DX)
 42220  	LEAQ  (DX)(BX*4), DX
 42221  	ADDSS (DX), X5
 42222  	MOVSS X5, (DX)
 42223  	LEAQ  (DX)(BX*4), DX
 42224  	ADDSS (DX), X6
 42225  	MOVSS X6, (DX)
 42226  	LEAQ  (DX)(BX*4), DX
 42227  	ADDSS (DX), X7
 42228  	MOVSS X7, (DX)
 42229  	LEAQ  (DX)(BX*4), DX
 42230  	ADDSS (DX), X8
 42231  	MOVSS X8, (DX)
 42232  	LEAQ  (DX)(BX*4), DX
 42233  	SUBQ  $0x08, SI
 42234  
 42235  check_limit_unroll:
 42236  	CMPQ SI, $0x08
 42237  	JHS  loop_unroll
 42238  	JMP  check_limit
 42239  
 42240  loop:
 42241  	MOVSS (AX), X1
 42242  	MULSS X0, X1
 42243  	ADDSS (DX), X1
 42244  	MOVSS X1, (DX)
 42245  	DECQ  SI
 42246  	LEAQ  (AX)(CX*4), AX
 42247  	LEAQ  (DX)(BX*4), DX
 42248  
 42249  check_limit:
 42250  	CMPQ SI, $0x00
 42251  	JHI  loop
 42252  	RET
 42253  
 42254  // func AmdAxpyPointerLoopXInterleave_V3A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42255  // Requires: SSE
 42256  TEXT ·AmdAxpyPointerLoopXInterleave_V3A15U8(SB), NOSPLIT, $0-48
 42257  	MOVSS alpha+0(FP), X0
 42258  	MOVQ  xs+8(FP), AX
 42259  	MOVQ  incx+16(FP), CX
 42260  	MOVQ  CX, DX
 42261  	SHLQ  $0x05, DX
 42262  	MOVQ  ys+24(FP), DX
 42263  	MOVQ  incy+32(FP), BX
 42264  	MOVQ  BX, SI
 42265  	SHLQ  $0x05, SI
 42266  	MOVQ  n+40(FP), SI
 42267  	JMP   check_limit_unroll
 42268  	PCALIGN $0x08
 42269  	NOP
 42270  	NOP
 42271  	NOP
 42272  	NOP
 42273  	NOP
 42274  	NOP
 42275  	NOP
 42276  
 42277  loop_unroll:
 42278  	MOVSS (AX), X1
 42279  	LEAQ  (AX)(CX*4), AX
 42280  	MOVSS (AX), X2
 42281  	LEAQ  (AX)(CX*4), AX
 42282  	MOVSS (AX), X3
 42283  	LEAQ  (AX)(CX*4), AX
 42284  	MOVSS (AX), X4
 42285  	LEAQ  (AX)(CX*4), AX
 42286  	MOVSS (AX), X5
 42287  	LEAQ  (AX)(CX*4), AX
 42288  	MOVSS (AX), X6
 42289  	LEAQ  (AX)(CX*4), AX
 42290  	MOVSS (AX), X7
 42291  	LEAQ  (AX)(CX*4), AX
 42292  	MOVSS (AX), X8
 42293  	LEAQ  (AX)(CX*4), AX
 42294  	MULSS X0, X1
 42295  	MULSS X0, X2
 42296  	MULSS X0, X3
 42297  	MULSS X0, X4
 42298  	MULSS X0, X5
 42299  	MULSS X0, X6
 42300  	MULSS X0, X7
 42301  	MULSS X0, X8
 42302  	ADDSS (DX), X1
 42303  	MOVSS X1, (DX)
 42304  	LEAQ  (DX)(BX*4), DX
 42305  	ADDSS (DX), X2
 42306  	MOVSS X2, (DX)
 42307  	LEAQ  (DX)(BX*4), DX
 42308  	ADDSS (DX), X3
 42309  	MOVSS X3, (DX)
 42310  	LEAQ  (DX)(BX*4), DX
 42311  	ADDSS (DX), X4
 42312  	MOVSS X4, (DX)
 42313  	LEAQ  (DX)(BX*4), DX
 42314  	ADDSS (DX), X5
 42315  	MOVSS X5, (DX)
 42316  	LEAQ  (DX)(BX*4), DX
 42317  	ADDSS (DX), X6
 42318  	MOVSS X6, (DX)
 42319  	LEAQ  (DX)(BX*4), DX
 42320  	ADDSS (DX), X7
 42321  	MOVSS X7, (DX)
 42322  	LEAQ  (DX)(BX*4), DX
 42323  	ADDSS (DX), X8
 42324  	MOVSS X8, (DX)
 42325  	LEAQ  (DX)(BX*4), DX
 42326  	SUBQ  $0x08, SI
 42327  
 42328  check_limit_unroll:
 42329  	CMPQ SI, $0x08
 42330  	JHS  loop_unroll
 42331  	JMP  check_limit
 42332  
 42333  loop:
 42334  	MOVSS (AX), X1
 42335  	MULSS X0, X1
 42336  	ADDSS (DX), X1
 42337  	MOVSS X1, (DX)
 42338  	DECQ  SI
 42339  	LEAQ  (AX)(CX*4), AX
 42340  	LEAQ  (DX)(BX*4), DX
 42341  
 42342  check_limit:
 42343  	CMPQ SI, $0x00
 42344  	JHI  loop
 42345  	RET
 42346  
 42347  // func AmdAxpyPointerLoopXInterleave_V4A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42348  // Requires: SSE
 42349  TEXT ·AmdAxpyPointerLoopXInterleave_V4A15U8(SB), NOSPLIT, $0-48
 42350  	MOVSS alpha+0(FP), X0
 42351  	MOVQ  xs+8(FP), AX
 42352  	MOVQ  incx+16(FP), CX
 42353  	MOVQ  CX, DX
 42354  	SHLQ  $0x05, DX
 42355  	MOVQ  ys+24(FP), DX
 42356  	MOVQ  incy+32(FP), BX
 42357  	MOVQ  BX, SI
 42358  	SHLQ  $0x05, SI
 42359  	MOVQ  n+40(FP), SI
 42360  	JMP   check_limit_unroll
 42361  	PCALIGN $0x08
 42362  	NOP
 42363  	NOP
 42364  	NOP
 42365  	NOP
 42366  	NOP
 42367  	NOP
 42368  	NOP
 42369  
 42370  loop_unroll:
 42371  	MOVSS (AX), X1
 42372  	LEAQ  (AX)(CX*4), AX
 42373  	MOVSS (AX), X2
 42374  	LEAQ  (AX)(CX*4), AX
 42375  	MOVSS (AX), X3
 42376  	LEAQ  (AX)(CX*4), AX
 42377  	MOVSS (AX), X4
 42378  	LEAQ  (AX)(CX*4), AX
 42379  	MOVSS (AX), X5
 42380  	LEAQ  (AX)(CX*4), AX
 42381  	MOVSS (AX), X6
 42382  	LEAQ  (AX)(CX*4), AX
 42383  	MOVSS (AX), X7
 42384  	LEAQ  (AX)(CX*4), AX
 42385  	MOVSS (AX), X8
 42386  	LEAQ  (AX)(CX*4), AX
 42387  	MULSS X0, X1
 42388  	MULSS X0, X2
 42389  	MULSS X0, X3
 42390  	MULSS X0, X4
 42391  	MULSS X0, X5
 42392  	MULSS X0, X6
 42393  	MULSS X0, X7
 42394  	MULSS X0, X8
 42395  	ADDSS (DX), X1
 42396  	MOVSS X1, (DX)
 42397  	LEAQ  (DX)(BX*4), DX
 42398  	ADDSS (DX), X2
 42399  	MOVSS X2, (DX)
 42400  	LEAQ  (DX)(BX*4), DX
 42401  	ADDSS (DX), X3
 42402  	MOVSS X3, (DX)
 42403  	LEAQ  (DX)(BX*4), DX
 42404  	ADDSS (DX), X4
 42405  	MOVSS X4, (DX)
 42406  	LEAQ  (DX)(BX*4), DX
 42407  	ADDSS (DX), X5
 42408  	MOVSS X5, (DX)
 42409  	LEAQ  (DX)(BX*4), DX
 42410  	ADDSS (DX), X6
 42411  	MOVSS X6, (DX)
 42412  	LEAQ  (DX)(BX*4), DX
 42413  	ADDSS (DX), X7
 42414  	MOVSS X7, (DX)
 42415  	LEAQ  (DX)(BX*4), DX
 42416  	ADDSS (DX), X8
 42417  	MOVSS X8, (DX)
 42418  	LEAQ  (DX)(BX*4), DX
 42419  	SUBQ  $0x08, SI
 42420  
 42421  check_limit_unroll:
 42422  	CMPQ SI, $0x08
 42423  	JHS  loop_unroll
 42424  	JMP  check_limit
 42425  
 42426  loop:
 42427  	MOVSS (AX), X1
 42428  	MULSS X0, X1
 42429  	ADDSS (DX), X1
 42430  	MOVSS X1, (DX)
 42431  	DECQ  SI
 42432  	LEAQ  (AX)(CX*4), AX
 42433  	LEAQ  (DX)(BX*4), DX
 42434  
 42435  check_limit:
 42436  	CMPQ SI, $0x00
 42437  	JHI  loop
 42438  	RET
 42439  
 42440  // func AmdAxpyPointerLoopXInterleave_V5A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42441  // Requires: SSE
 42442  TEXT ·AmdAxpyPointerLoopXInterleave_V5A15U8(SB), NOSPLIT, $0-48
 42443  	MOVSS alpha+0(FP), X0
 42444  	MOVQ  xs+8(FP), AX
 42445  	MOVQ  incx+16(FP), CX
 42446  	MOVQ  CX, DX
 42447  	SHLQ  $0x05, DX
 42448  	MOVQ  ys+24(FP), DX
 42449  	MOVQ  incy+32(FP), BX
 42450  	MOVQ  BX, SI
 42451  	SHLQ  $0x05, SI
 42452  	MOVQ  n+40(FP), SI
 42453  	JMP   check_limit_unroll
 42454  	PCALIGN $0x08
 42455  	NOP
 42456  	NOP
 42457  	NOP
 42458  	NOP
 42459  	NOP
 42460  	NOP
 42461  	NOP
 42462  
 42463  loop_unroll:
 42464  	MOVSS (AX), X1
 42465  	LEAQ  (AX)(CX*4), AX
 42466  	MOVSS (AX), X2
 42467  	LEAQ  (AX)(CX*4), AX
 42468  	MOVSS (AX), X3
 42469  	LEAQ  (AX)(CX*4), AX
 42470  	MOVSS (AX), X4
 42471  	LEAQ  (AX)(CX*4), AX
 42472  	MOVSS (AX), X5
 42473  	LEAQ  (AX)(CX*4), AX
 42474  	MOVSS (AX), X6
 42475  	LEAQ  (AX)(CX*4), AX
 42476  	MOVSS (AX), X7
 42477  	LEAQ  (AX)(CX*4), AX
 42478  	MOVSS (AX), X8
 42479  	LEAQ  (AX)(CX*4), AX
 42480  	MULSS X0, X1
 42481  	MULSS X0, X2
 42482  	MULSS X0, X3
 42483  	MULSS X0, X4
 42484  	MULSS X0, X5
 42485  	MULSS X0, X6
 42486  	MULSS X0, X7
 42487  	MULSS X0, X8
 42488  	ADDSS (DX), X1
 42489  	MOVSS X1, (DX)
 42490  	LEAQ  (DX)(BX*4), DX
 42491  	ADDSS (DX), X2
 42492  	MOVSS X2, (DX)
 42493  	LEAQ  (DX)(BX*4), DX
 42494  	ADDSS (DX), X3
 42495  	MOVSS X3, (DX)
 42496  	LEAQ  (DX)(BX*4), DX
 42497  	ADDSS (DX), X4
 42498  	MOVSS X4, (DX)
 42499  	LEAQ  (DX)(BX*4), DX
 42500  	ADDSS (DX), X5
 42501  	MOVSS X5, (DX)
 42502  	LEAQ  (DX)(BX*4), DX
 42503  	ADDSS (DX), X6
 42504  	MOVSS X6, (DX)
 42505  	LEAQ  (DX)(BX*4), DX
 42506  	ADDSS (DX), X7
 42507  	MOVSS X7, (DX)
 42508  	LEAQ  (DX)(BX*4), DX
 42509  	ADDSS (DX), X8
 42510  	MOVSS X8, (DX)
 42511  	LEAQ  (DX)(BX*4), DX
 42512  	SUBQ  $0x08, SI
 42513  
 42514  check_limit_unroll:
 42515  	CMPQ SI, $0x08
 42516  	JHS  loop_unroll
 42517  	JMP  check_limit
 42518  
 42519  loop:
 42520  	MOVSS (AX), X1
 42521  	MULSS X0, X1
 42522  	ADDSS (DX), X1
 42523  	MOVSS X1, (DX)
 42524  	DECQ  SI
 42525  	LEAQ  (AX)(CX*4), AX
 42526  	LEAQ  (DX)(BX*4), DX
 42527  
 42528  check_limit:
 42529  	CMPQ SI, $0x00
 42530  	JHI  loop
 42531  	RET
 42532  
 42533  // func AmdAxpyPointerLoopXInterleave_V0A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42534  // Requires: SSE
 42535  TEXT ·AmdAxpyPointerLoopXInterleave_V0A16U8(SB), NOSPLIT, $0-48
 42536  	MOVSS alpha+0(FP), X0
 42537  	MOVQ  xs+8(FP), AX
 42538  	MOVQ  incx+16(FP), CX
 42539  	MOVQ  CX, DX
 42540  	SHLQ  $0x05, DX
 42541  	MOVQ  ys+24(FP), DX
 42542  	MOVQ  incy+32(FP), BX
 42543  	MOVQ  BX, SI
 42544  	SHLQ  $0x05, SI
 42545  	MOVQ  n+40(FP), SI
 42546  	JMP   check_limit_unroll
 42547  	PCALIGN $0x10
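	// The A16 variants request a 16-byte boundary directly via PCALIGN $0x10,
	// so no explicit NOP padding is emitted before loop_unroll.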
 42548  
 42549  loop_unroll:
 42550  	MOVSS (AX), X1
 42551  	LEAQ  (AX)(CX*4), AX
 42552  	MOVSS (AX), X2
 42553  	LEAQ  (AX)(CX*4), AX
 42554  	MOVSS (AX), X3
 42555  	LEAQ  (AX)(CX*4), AX
 42556  	MOVSS (AX), X4
 42557  	LEAQ  (AX)(CX*4), AX
 42558  	MOVSS (AX), X5
 42559  	LEAQ  (AX)(CX*4), AX
 42560  	MOVSS (AX), X6
 42561  	LEAQ  (AX)(CX*4), AX
 42562  	MOVSS (AX), X7
 42563  	LEAQ  (AX)(CX*4), AX
 42564  	MOVSS (AX), X8
 42565  	LEAQ  (AX)(CX*4), AX
 42566  	MULSS X0, X1
 42567  	MULSS X0, X2
 42568  	MULSS X0, X3
 42569  	MULSS X0, X4
 42570  	MULSS X0, X5
 42571  	MULSS X0, X6
 42572  	MULSS X0, X7
 42573  	MULSS X0, X8
 42574  	ADDSS (DX), X1
 42575  	MOVSS X1, (DX)
 42576  	LEAQ  (DX)(BX*4), DX
 42577  	ADDSS (DX), X2
 42578  	MOVSS X2, (DX)
 42579  	LEAQ  (DX)(BX*4), DX
 42580  	ADDSS (DX), X3
 42581  	MOVSS X3, (DX)
 42582  	LEAQ  (DX)(BX*4), DX
 42583  	ADDSS (DX), X4
 42584  	MOVSS X4, (DX)
 42585  	LEAQ  (DX)(BX*4), DX
 42586  	ADDSS (DX), X5
 42587  	MOVSS X5, (DX)
 42588  	LEAQ  (DX)(BX*4), DX
 42589  	ADDSS (DX), X6
 42590  	MOVSS X6, (DX)
 42591  	LEAQ  (DX)(BX*4), DX
 42592  	ADDSS (DX), X7
 42593  	MOVSS X7, (DX)
 42594  	LEAQ  (DX)(BX*4), DX
 42595  	ADDSS (DX), X8
 42596  	MOVSS X8, (DX)
 42597  	LEAQ  (DX)(BX*4), DX
 42598  	SUBQ  $0x08, SI
 42599  
 42600  check_limit_unroll:
 42601  	CMPQ SI, $0x08
 42602  	JHS  loop_unroll
 42603  	JMP  check_limit
 42604  
 42605  loop:
 42606  	MOVSS (AX), X1
 42607  	MULSS X0, X1
 42608  	ADDSS (DX), X1
 42609  	MOVSS X1, (DX)
 42610  	DECQ  SI
 42611  	LEAQ  (AX)(CX*4), AX
 42612  	LEAQ  (DX)(BX*4), DX
 42613  
 42614  check_limit:
 42615  	CMPQ SI, $0x00
 42616  	JHI  loop
 42617  	RET
 42618  
 42619  // func AmdAxpyPointerLoopXInterleave_V1A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42620  // Requires: SSE
 42621  TEXT ·AmdAxpyPointerLoopXInterleave_V1A16U8(SB), NOSPLIT, $0-48
 42622  	MOVSS alpha+0(FP), X0
 42623  	MOVQ  xs+8(FP), AX
 42624  	MOVQ  incx+16(FP), CX
 42625  	MOVQ  CX, DX
 42626  	SHLQ  $0x05, DX
 42627  	MOVQ  ys+24(FP), DX
 42628  	MOVQ  incy+32(FP), BX
 42629  	MOVQ  BX, SI
 42630  	SHLQ  $0x05, SI
 42631  	MOVQ  n+40(FP), SI
 42632  	JMP   check_limit_unroll
 42633  	PCALIGN $0x10
 42634  
 42635  loop_unroll:
 42636  	MOVSS (AX), X1
 42637  	LEAQ  (AX)(CX*4), AX
 42638  	MOVSS (AX), X2
 42639  	LEAQ  (AX)(CX*4), AX
 42640  	MOVSS (AX), X3
 42641  	LEAQ  (AX)(CX*4), AX
 42642  	MOVSS (AX), X4
 42643  	LEAQ  (AX)(CX*4), AX
 42644  	MOVSS (AX), X5
 42645  	LEAQ  (AX)(CX*4), AX
 42646  	MOVSS (AX), X6
 42647  	LEAQ  (AX)(CX*4), AX
 42648  	MOVSS (AX), X7
 42649  	LEAQ  (AX)(CX*4), AX
 42650  	MOVSS (AX), X8
 42651  	LEAQ  (AX)(CX*4), AX
 42652  	MULSS X0, X1
 42653  	MULSS X0, X2
 42654  	MULSS X0, X3
 42655  	MULSS X0, X4
 42656  	MULSS X0, X5
 42657  	MULSS X0, X6
 42658  	MULSS X0, X7
 42659  	MULSS X0, X8
 42660  	ADDSS (DX), X1
 42661  	MOVSS X1, (DX)
 42662  	LEAQ  (DX)(BX*4), DX
 42663  	ADDSS (DX), X2
 42664  	MOVSS X2, (DX)
 42665  	LEAQ  (DX)(BX*4), DX
 42666  	ADDSS (DX), X3
 42667  	MOVSS X3, (DX)
 42668  	LEAQ  (DX)(BX*4), DX
 42669  	ADDSS (DX), X4
 42670  	MOVSS X4, (DX)
 42671  	LEAQ  (DX)(BX*4), DX
 42672  	ADDSS (DX), X5
 42673  	MOVSS X5, (DX)
 42674  	LEAQ  (DX)(BX*4), DX
 42675  	ADDSS (DX), X6
 42676  	MOVSS X6, (DX)
 42677  	LEAQ  (DX)(BX*4), DX
 42678  	ADDSS (DX), X7
 42679  	MOVSS X7, (DX)
 42680  	LEAQ  (DX)(BX*4), DX
 42681  	ADDSS (DX), X8
 42682  	MOVSS X8, (DX)
 42683  	LEAQ  (DX)(BX*4), DX
 42684  	SUBQ  $0x08, SI
 42685  
 42686  check_limit_unroll:
 42687  	CMPQ SI, $0x08
 42688  	JHS  loop_unroll
 42689  	JMP  check_limit
 42690  
 42691  loop:
 42692  	MOVSS (AX), X1
 42693  	MULSS X0, X1
 42694  	ADDSS (DX), X1
 42695  	MOVSS X1, (DX)
 42696  	DECQ  SI
 42697  	LEAQ  (AX)(CX*4), AX
 42698  	LEAQ  (DX)(BX*4), DX
 42699  
 42700  check_limit:
 42701  	CMPQ SI, $0x00
 42702  	JHI  loop
 42703  	RET
 42704  
 42705  // func AmdAxpyPointerLoopXInterleave_V2A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42706  // Requires: SSE
 42707  TEXT ·AmdAxpyPointerLoopXInterleave_V2A16U8(SB), NOSPLIT, $0-48
 42708  	MOVSS alpha+0(FP), X0
 42709  	MOVQ  xs+8(FP), AX
 42710  	MOVQ  incx+16(FP), CX
 42711  	MOVQ  CX, DX
 42712  	SHLQ  $0x05, DX
 42713  	MOVQ  ys+24(FP), DX
 42714  	MOVQ  incy+32(FP), BX
 42715  	MOVQ  BX, SI
 42716  	SHLQ  $0x05, SI
 42717  	MOVQ  n+40(FP), SI
 42718  	JMP   check_limit_unroll
 42719  	PCALIGN $0x10
 42720  
 42721  loop_unroll:
 42722  	MOVSS (AX), X1
 42723  	LEAQ  (AX)(CX*4), AX
 42724  	MOVSS (AX), X2
 42725  	LEAQ  (AX)(CX*4), AX
 42726  	MOVSS (AX), X3
 42727  	LEAQ  (AX)(CX*4), AX
 42728  	MOVSS (AX), X4
 42729  	LEAQ  (AX)(CX*4), AX
 42730  	MOVSS (AX), X5
 42731  	LEAQ  (AX)(CX*4), AX
 42732  	MOVSS (AX), X6
 42733  	LEAQ  (AX)(CX*4), AX
 42734  	MOVSS (AX), X7
 42735  	LEAQ  (AX)(CX*4), AX
 42736  	MOVSS (AX), X8
 42737  	LEAQ  (AX)(CX*4), AX
 42738  	MULSS X0, X1
 42739  	MULSS X0, X2
 42740  	MULSS X0, X3
 42741  	MULSS X0, X4
 42742  	MULSS X0, X5
 42743  	MULSS X0, X6
 42744  	MULSS X0, X7
 42745  	MULSS X0, X8
 42746  	ADDSS (DX), X1
 42747  	MOVSS X1, (DX)
 42748  	LEAQ  (DX)(BX*4), DX
 42749  	ADDSS (DX), X2
 42750  	MOVSS X2, (DX)
 42751  	LEAQ  (DX)(BX*4), DX
 42752  	ADDSS (DX), X3
 42753  	MOVSS X3, (DX)
 42754  	LEAQ  (DX)(BX*4), DX
 42755  	ADDSS (DX), X4
 42756  	MOVSS X4, (DX)
 42757  	LEAQ  (DX)(BX*4), DX
 42758  	ADDSS (DX), X5
 42759  	MOVSS X5, (DX)
 42760  	LEAQ  (DX)(BX*4), DX
 42761  	ADDSS (DX), X6
 42762  	MOVSS X6, (DX)
 42763  	LEAQ  (DX)(BX*4), DX
 42764  	ADDSS (DX), X7
 42765  	MOVSS X7, (DX)
 42766  	LEAQ  (DX)(BX*4), DX
 42767  	ADDSS (DX), X8
 42768  	MOVSS X8, (DX)
 42769  	LEAQ  (DX)(BX*4), DX
 42770  	SUBQ  $0x08, SI
 42771  
 42772  check_limit_unroll:
 42773  	CMPQ SI, $0x08
 42774  	JHS  loop_unroll
 42775  	JMP  check_limit
 42776  
 42777  loop:
 42778  	MOVSS (AX), X1
 42779  	MULSS X0, X1
 42780  	ADDSS (DX), X1
 42781  	MOVSS X1, (DX)
 42782  	DECQ  SI
 42783  	LEAQ  (AX)(CX*4), AX
 42784  	LEAQ  (DX)(BX*4), DX
 42785  
 42786  check_limit:
 42787  	CMPQ SI, $0x00
 42788  	JHI  loop
 42789  	RET
 42790  
 42791  // func AmdAxpyPointerLoopXInterleave_V3A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42792  // Requires: SSE
 42793  TEXT ·AmdAxpyPointerLoopXInterleave_V3A16U8(SB), NOSPLIT, $0-48
 42794  	MOVSS alpha+0(FP), X0
 42795  	MOVQ  xs+8(FP), AX
 42796  	MOVQ  incx+16(FP), CX
 42797  	MOVQ  CX, DX
 42798  	SHLQ  $0x05, DX
 42799  	MOVQ  ys+24(FP), DX
 42800  	MOVQ  incy+32(FP), BX
 42801  	MOVQ  BX, SI
 42802  	SHLQ  $0x05, SI
 42803  	MOVQ  n+40(FP), SI
 42804  	JMP   check_limit_unroll
 42805  	PCALIGN $0x10
 42806  
 42807  loop_unroll:
 42808  	MOVSS (AX), X1
 42809  	LEAQ  (AX)(CX*4), AX
 42810  	MOVSS (AX), X2
 42811  	LEAQ  (AX)(CX*4), AX
 42812  	MOVSS (AX), X3
 42813  	LEAQ  (AX)(CX*4), AX
 42814  	MOVSS (AX), X4
 42815  	LEAQ  (AX)(CX*4), AX
 42816  	MOVSS (AX), X5
 42817  	LEAQ  (AX)(CX*4), AX
 42818  	MOVSS (AX), X6
 42819  	LEAQ  (AX)(CX*4), AX
 42820  	MOVSS (AX), X7
 42821  	LEAQ  (AX)(CX*4), AX
 42822  	MOVSS (AX), X8
 42823  	LEAQ  (AX)(CX*4), AX
 42824  	MULSS X0, X1
 42825  	MULSS X0, X2
 42826  	MULSS X0, X3
 42827  	MULSS X0, X4
 42828  	MULSS X0, X5
 42829  	MULSS X0, X6
 42830  	MULSS X0, X7
 42831  	MULSS X0, X8
 42832  	ADDSS (DX), X1
 42833  	MOVSS X1, (DX)
 42834  	LEAQ  (DX)(BX*4), DX
 42835  	ADDSS (DX), X2
 42836  	MOVSS X2, (DX)
 42837  	LEAQ  (DX)(BX*4), DX
 42838  	ADDSS (DX), X3
 42839  	MOVSS X3, (DX)
 42840  	LEAQ  (DX)(BX*4), DX
 42841  	ADDSS (DX), X4
 42842  	MOVSS X4, (DX)
 42843  	LEAQ  (DX)(BX*4), DX
 42844  	ADDSS (DX), X5
 42845  	MOVSS X5, (DX)
 42846  	LEAQ  (DX)(BX*4), DX
 42847  	ADDSS (DX), X6
 42848  	MOVSS X6, (DX)
 42849  	LEAQ  (DX)(BX*4), DX
 42850  	ADDSS (DX), X7
 42851  	MOVSS X7, (DX)
 42852  	LEAQ  (DX)(BX*4), DX
 42853  	ADDSS (DX), X8
 42854  	MOVSS X8, (DX)
 42855  	LEAQ  (DX)(BX*4), DX
 42856  	SUBQ  $0x08, SI
 42857  
 42858  check_limit_unroll:
 42859  	CMPQ SI, $0x08
 42860  	JHS  loop_unroll
 42861  	JMP  check_limit
 42862  
 42863  loop:
 42864  	MOVSS (AX), X1
 42865  	MULSS X0, X1
 42866  	ADDSS (DX), X1
 42867  	MOVSS X1, (DX)
 42868  	DECQ  SI
 42869  	LEAQ  (AX)(CX*4), AX
 42870  	LEAQ  (DX)(BX*4), DX
 42871  
 42872  check_limit:
 42873  	CMPQ SI, $0x00
 42874  	JHI  loop
 42875  	RET
 42876  
 42877  // func AmdAxpyPointerLoopXInterleave_V4A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42878  // Requires: SSE
 42879  TEXT ·AmdAxpyPointerLoopXInterleave_V4A16U8(SB), NOSPLIT, $0-48
 42880  	MOVSS alpha+0(FP), X0
 42881  	MOVQ  xs+8(FP), AX
 42882  	MOVQ  incx+16(FP), CX
 42883  	MOVQ  CX, DX
 42884  	SHLQ  $0x05, DX
 42885  	MOVQ  ys+24(FP), DX
 42886  	MOVQ  incy+32(FP), BX
 42887  	MOVQ  BX, SI
 42888  	SHLQ  $0x05, SI
 42889  	MOVQ  n+40(FP), SI
 42890  	JMP   check_limit_unroll
 42891  	PCALIGN $0x10
 42892  
 42893  loop_unroll:
 42894  	MOVSS (AX), X1
 42895  	LEAQ  (AX)(CX*4), AX
 42896  	MOVSS (AX), X2
 42897  	LEAQ  (AX)(CX*4), AX
 42898  	MOVSS (AX), X3
 42899  	LEAQ  (AX)(CX*4), AX
 42900  	MOVSS (AX), X4
 42901  	LEAQ  (AX)(CX*4), AX
 42902  	MOVSS (AX), X5
 42903  	LEAQ  (AX)(CX*4), AX
 42904  	MOVSS (AX), X6
 42905  	LEAQ  (AX)(CX*4), AX
 42906  	MOVSS (AX), X7
 42907  	LEAQ  (AX)(CX*4), AX
 42908  	MOVSS (AX), X8
 42909  	LEAQ  (AX)(CX*4), AX
 42910  	MULSS X0, X1
 42911  	MULSS X0, X2
 42912  	MULSS X0, X3
 42913  	MULSS X0, X4
 42914  	MULSS X0, X5
 42915  	MULSS X0, X6
 42916  	MULSS X0, X7
 42917  	MULSS X0, X8
 42918  	ADDSS (DX), X1
 42919  	MOVSS X1, (DX)
 42920  	LEAQ  (DX)(BX*4), DX
 42921  	ADDSS (DX), X2
 42922  	MOVSS X2, (DX)
 42923  	LEAQ  (DX)(BX*4), DX
 42924  	ADDSS (DX), X3
 42925  	MOVSS X3, (DX)
 42926  	LEAQ  (DX)(BX*4), DX
 42927  	ADDSS (DX), X4
 42928  	MOVSS X4, (DX)
 42929  	LEAQ  (DX)(BX*4), DX
 42930  	ADDSS (DX), X5
 42931  	MOVSS X5, (DX)
 42932  	LEAQ  (DX)(BX*4), DX
 42933  	ADDSS (DX), X6
 42934  	MOVSS X6, (DX)
 42935  	LEAQ  (DX)(BX*4), DX
 42936  	ADDSS (DX), X7
 42937  	MOVSS X7, (DX)
 42938  	LEAQ  (DX)(BX*4), DX
 42939  	ADDSS (DX), X8
 42940  	MOVSS X8, (DX)
 42941  	LEAQ  (DX)(BX*4), DX
 42942  	SUBQ  $0x08, SI
 42943  
 42944  check_limit_unroll:
 42945  	CMPQ SI, $0x08
 42946  	JHS  loop_unroll
 42947  	JMP  check_limit
 42948  
 42949  loop:
 42950  	MOVSS (AX), X1
 42951  	MULSS X0, X1
 42952  	ADDSS (DX), X1
 42953  	MOVSS X1, (DX)
 42954  	DECQ  SI
 42955  	LEAQ  (AX)(CX*4), AX
 42956  	LEAQ  (DX)(BX*4), DX
 42957  
 42958  check_limit:
 42959  	CMPQ SI, $0x00
 42960  	JHI  loop
 42961  	RET
 42962  
 42963  // func AmdAxpyPointerLoopXInterleave_V5A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
 42964  // Requires: SSE
 42965  TEXT ·AmdAxpyPointerLoopXInterleave_V5A16U8(SB), NOSPLIT, $0-48
 42966  	MOVSS alpha+0(FP), X0
 42967  	MOVQ  xs+8(FP), AX
 42968  	MOVQ  incx+16(FP), CX
 42969  	MOVQ  CX, DX
 42970  	SHLQ  $0x05, DX
 42971  	MOVQ  ys+24(FP), DX
 42972  	MOVQ  incy+32(FP), BX
 42973  	MOVQ  BX, SI
 42974  	SHLQ  $0x05, SI
 42975  	MOVQ  n+40(FP), SI
 42976  	JMP   check_limit_unroll
 42977  	PCALIGN $0x10
 42978  
 42979  loop_unroll:
 42980  	MOVSS (AX), X1
 42981  	LEAQ  (AX)(CX*4), AX
 42982  	MOVSS (AX), X2
 42983  	LEAQ  (AX)(CX*4), AX
 42984  	MOVSS (AX), X3
 42985  	LEAQ  (AX)(CX*4), AX
 42986  	MOVSS (AX), X4
 42987  	LEAQ  (AX)(CX*4), AX
 42988  	MOVSS (AX), X5
 42989  	LEAQ  (AX)(CX*4), AX
 42990  	MOVSS (AX), X6
 42991  	LEAQ  (AX)(CX*4), AX
 42992  	MOVSS (AX), X7
 42993  	LEAQ  (AX)(CX*4), AX
 42994  	MOVSS (AX), X8
 42995  	LEAQ  (AX)(CX*4), AX
 42996  	MULSS X0, X1
 42997  	MULSS X0, X2
 42998  	MULSS X0, X3
 42999  	MULSS X0, X4
 43000  	MULSS X0, X5
 43001  	MULSS X0, X6
 43002  	MULSS X0, X7
 43003  	MULSS X0, X8
 43004  	ADDSS (DX), X1
 43005  	MOVSS X1, (DX)
 43006  	LEAQ  (DX)(BX*4), DX
 43007  	ADDSS (DX), X2
 43008  	MOVSS X2, (DX)
 43009  	LEAQ  (DX)(BX*4), DX
 43010  	ADDSS (DX), X3
 43011  	MOVSS X3, (DX)
 43012  	LEAQ  (DX)(BX*4), DX
 43013  	ADDSS (DX), X4
 43014  	MOVSS X4, (DX)
 43015  	LEAQ  (DX)(BX*4), DX
 43016  	ADDSS (DX), X5
 43017  	MOVSS X5, (DX)
 43018  	LEAQ  (DX)(BX*4), DX
 43019  	ADDSS (DX), X6
 43020  	MOVSS X6, (DX)
 43021  	LEAQ  (DX)(BX*4), DX
 43022  	ADDSS (DX), X7
 43023  	MOVSS X7, (DX)
 43024  	LEAQ  (DX)(BX*4), DX
 43025  	ADDSS (DX), X8
 43026  	MOVSS X8, (DX)
 43027  	LEAQ  (DX)(BX*4), DX
 43028  	SUBQ  $0x08, SI
 43029  
 43030  check_limit_unroll:
 43031  	CMPQ SI, $0x08
 43032  	JHS  loop_unroll
 43033  	JMP  check_limit
 43034  
 43035  loop:
 43036  	MOVSS (AX), X1
 43037  	MULSS X0, X1
 43038  	ADDSS (DX), X1
 43039  	MOVSS X1, (DX)
 43040  	DECQ  SI
 43041  	LEAQ  (AX)(CX*4), AX
 43042  	LEAQ  (DX)(BX*4), DX
 43043  
 43044  check_limit:
 43045  	CMPQ SI, $0x00
 43046  	JHI  loop
 43047  	RET