github.com/qiaogw/arrgo@v0.0.8/internal/arithmetic_amd64.s

     1  // +build !noasm !appengine
     2  
     3  #define NOSPLIT 7 // NOPROF|DUPOK|NOSPLIT (1|2|4 from textflag.h)
     4  
     5  // func initasm(): detects CPU features and sets ·Sse3Supt, ·FmaSupt, ·AvxSupt, ·Avx2Supt
     6  // pulled from runtime/asm_amd64.s
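        // Rough Go-side picture this file assumes (illustrative only; the real
        // declarations live in the Go source of this package):
        //
        //	// feature flags written by initasm and read by the routines below
        //	var Sse3Supt, FmaSupt, AvxSupt, Avx2Supt bool
        //
        //	func initasm() // implemented below; expected to run before the other routines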
     7  TEXT ·initasm(SB), NOSPLIT, $0
     8  	MOVQ $1, R15
     9  	XORQ R9, R9 // R9 must start at zero; CMOVQEQ below only writes it on equality
    10  	MOVQ $1, AX
    11  	CPUID
    12  
    13  	ANDL $0x1, CX
    14  	CMPL CX, $0x1
    15  	CMOVQEQ R15, R9
    16  	MOVB R9, ·Sse3Supt(SB)
    17  	XORQ R9, R9
    18  
    19  	MOVQ $1, AX
    20  	CPUID
    21  	ANDL $0x18001000, CX
    22  	CMPL CX, $0x18001000
    23  	CMOVQEQ R15, R9
    24  	MOVB R9, ·FmaSupt(SB) // set numgo·fmaSupt
    25  	XORQ R9, R9
    26  
    27  	ANDL $0x18000000, CX 
    28  	CMPL CX, $0x18000000
    29  	JNE  noavx
    30  
    31  	// For XGETBV, OSXSAVE bit is required and sufficient
    32  	MOVQ $0, CX
    33  
    34  	// Check OS support for XMM/YMM state (read XCR0)
    35  	// XGETBV
    36  	BYTE $0x0F; BYTE $0x01; BYTE $0xD0
    37  	
    38  	ANDL $6, AX
    39  	CMPL AX, $6                        // Check for OS support of YMM registers
    40  	JNE  noavx
    41  	MOVB $1, ·AvxSupt(SB)              // set numgo·avxSupt
    42  
    43  	// Check for AVX2 capability
    44  	MOVL $7, AX
    45  	XORQ CX, CX
    46  	CPUID
    47  	ANDL $0x20, BX         // check for AVX2 bit
    48  	CMPL BX, $0x20
    49  	CMOVQEQ R15, R9
    50  	MOVB R9, ·Avx2Supt(SB) // set numgo·avx2Supt
    51  	XORQ R9, R9
    52  	RET
    53  
    54  noavx:
    55  	MOVB $0, ·FmaSupt(SB) // set numgo·fmaSupt
    56  	MOVB $0, ·AvxSupt(SB) // set numgo·avxSupt
    57  	MOVB $0, ·Avx2Supt(SB) // set numgo·avx2Supt
    58  	RET
    59  
    60  // func AddC(c float64, d []float64)
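        // Illustrative Go sketch of what AddC is assumed to compute (the assembly
        // below is the real implementation, unrolled and AVX-dispatched):
        //
        //	func addC(c float64, d []float64) {
        //		for i := range d {
        //			d[i] += c
        //		}
        //	}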
    61  TEXT ·AddC(SB), NOSPLIT, $0
    62  	// Data ptr
    63  	MOVQ d+8(FP), R10
    64  
    65  	// n = Data len
    66  	MOVQ d_len+16(FP), SI
    67  
    68  	// zero len return
    69  	CMPQ SI, $0
    70  	JE   ACEND
    71  
    72  	// check tail
    73  	SUBQ $4, SI
    74  	JL   ACTAIL
    75  
    76  	// avx support test
    77  	LEAQ c+0(FP), R9
    78  	CMPB ·AvxSupt(SB), $1
    79  	JE   AVX_AC
    80  	CMPB ·Avx2Supt(SB), $1
    81  	JE  AVX2_AC
    82  
    83  	// broadcast the constant c (addend)
    84  	MOVSD  (R9), X0
    85  	SHUFPD $0, X0, X0
    86  
    87  ACLOOP:  // Unrolled x2 d[i]|d[i+1] += c
    88  	MOVUPD 0(R10), X1
    89  	MOVUPD 16(R10), X2
    90  	ADDPD  X0, X1
    91  	ADDPD  X0, X2
    92  	MOVUPD X1, 0(R10)
    93  	MOVUPD X2, 16(R10)
    94  	ADDQ   $32, R10
    95  	SUBQ   $4, SI
    96  	JGE    ACLOOP
    97  	JMP    ACTAIL
    98  
    99  // The AVX instructions below are byte-encoded (no assembler mnemonics when this was written)
   100  AVX2_AC: // no dedicated AVX2 path yet; falls through to the AVX loop
   101  AVX_AC:
   102  	// VBROADCASTSD (R9), Y0
   103  	BYTE $0xC4; BYTE $0xC2; BYTE $0x7D; BYTE $0x19; BYTE $0x01
   104  
   105  AVX_ACLOOP:
   106  	// VADDPD (R10),Y0,Y1
   107  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x58; BYTE $0x0A
   108  
   109  	// VMOVDQA Y1, (R10)
   110  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x7F; BYTE $0x0A
   111  	
   112  	ADDQ $32, R10
   113  	SUBQ $4, SI
   114  	JGE  AVX_ACLOOP
   115  	//VZEROUPPER
   116  	BYTE $0xC5; BYTE $0xF8; BYTE $0x77
   117  
   118  ACTAIL:  // Catch len % 4 == 0
   119  	ADDQ $4, SI
   120  	JE   ACEND
   121  
   122  ACTL:  // Calc the last values individually d[i] += c
   123  	MOVSD 0(R10), X1
   124  	ADDSD X0, X1
   125  	MOVSD X1, 0(R10)
   126  	ADDQ  $8, R10
   127  	SUBQ  $1, SI
   128  	JG    ACTL
   129  
   130  ACEND:
   131  	RET
   132  
   133  // func SubtrC(c float64, d []float64)
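        // Illustrative Go sketch of the assumed semantics:
        //
        //	func subtrC(c float64, d []float64) {
        //		for i := range d {
        //			d[i] -= c
        //		}
        //	}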
   134  TEXT ·SubtrC(SB), NOSPLIT, $0
   135  	// Data ptr
   136  	MOVQ d+8(FP), R10
   137  
   138  	// n = Data len
   139  	MOVQ d_len+16(FP), SI
   140  
   141  	// zero len return
   142  	CMPQ SI, $0
   143  	JE   SCEND
   144  
   145  	// check tail
   146  	SUBQ $4, SI
   147  	JL   SCTAIL
   148  
   149  	// broadcast the constant c (subtrahend)
   150  	MOVSD  c+0(FP), X0
   151  	SHUFPD $0, X0, X0
   152  
   153  SCLOOP:  // load d[i] | d[i+1]
   154  	MOVUPD 0(R10), X1
   155  	MOVUPD 16(R10), X2
   156  	SUBPD  X0, X1
   157  	SUBPD  X0, X2
   158  	MOVUPD X1, 0(R10)
   159  	MOVUPD X2, 16(R10)
   160  	ADDQ   $32, R10
   161  	SUBQ   $4, SI
   162  	JGE    SCLOOP
   163  
   164  SCTAIL:
   165  	ADDQ $4, SI
   166  	JE   SCEND
   167  
   168  SCTL:
   169  	MOVSD 0(R10), X1
   170  	SUBSD X0, X1
   171  	MOVSD X1, 0(R10)
   172  	ADDQ  $8, R10
   173  	SUBQ  $1, SI
   174  	JG    SCTL
   175  
   176  SCEND:
   177  	RET
   178  
   179  // func MultC(c float64, d []float64)
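        // Illustrative Go sketch of the assumed semantics:
        //
        //	func multC(c float64, d []float64) {
        //		for i := range d {
        //			d[i] *= c
        //		}
        //	}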
   180  TEXT ·MultC(SB), NOSPLIT, $0
   181  	MOVQ d_base+8(FP), R10
   182  	MOVQ d_len+16(FP), SI
   183  
   184  	// zero len return
   185  	CMPQ SI, $0
   186  	JE   MCEND
   187  	SUBQ $4, SI
   188  	JL   MCTAIL
   189  
   190  	// load multiplier
   191  	MOVSD  c+0(FP), X0
   192  	SHUFPD $0, X0, X0
   193  
   194  MCLOOP:  // load d[i] | d[i+1]
   195  	MOVUPD 0(R10), X1
   196  	MOVUPD 16(R10), X2
   197  	MULPD  X0, X1
   198  	MULPD  X0, X2
   199  	MOVUPD X1, 0(R10)
   200  	MOVUPD X2, 16(R10)
   201  	ADDQ   $32, R10
   202  	SUBQ   $4, SI
   203  	JGE    MCLOOP
   204  
   205  MCTAIL:
   206  	ADDQ $4, SI
   207  	JE   MCEND
   208  
   209  MCTL:
   210  	MOVSD 0(R10), X1
   211  	MULSD X0, X1
   212  	MOVSD X1, 0(R10)
   213  	ADDQ  $8, R10
   214  	SUBQ  $1, SI
   215  	JG    MCTL
   216  
   217  MCEND:
   218  	RET
   219  
   220  // func DivC(c float64, d []float64)
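        // Illustrative Go sketch of the assumed semantics:
        //
        //	func divC(c float64, d []float64) {
        //		for i := range d {
        //			d[i] /= c
        //		}
        //	}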
   221  TEXT ·DivC(SB), NOSPLIT, $0
   222  	// Data ptr
   223  	MOVQ d+8(FP), R10
   224  
   225  	// n = Data len
   226  	MOVQ d_len+16(FP), SI
   227  
   228  	// zero len return
   229  	CMPQ SI, $0
   230  	JE   DCEND
   231  
   232  	// check tail
   233  	SUBQ $4, SI
   234  	JL   DCTAIL
   235  
    236  	// broadcast the constant c (divisor)
   237  	MOVSD  c+0(FP), X0
   238  	SHUFPD $0, X0, X0
   239  
   240  DCLOOP:  // load d[i] | d[i+1]
   241  	MOVUPD 0(R10), X1
   242  	MOVUPD 16(R10), X2
   243  	DIVPD  X0, X1
   244  	DIVPD  X0, X2
   245  	MOVUPD X1, 0(R10)
   246  	MOVUPD X2, 16(R10)
   247  	ADDQ   $32, R10
   248  	SUBQ   $4, SI
   249  	JGE    DCLOOP 
   250  
   251  DCTAIL:
   252  	ADDQ $4, SI
   253  	JE   DCEND
   254  
   255  DCTL:
   256  	MOVSD 0(R10), X1
   257  	DIVSD X0, X1
   258  	MOVSD X1, 0(R10)
   259  	ADDQ  $8, R10
   260  	SUBQ  $1, SI
   261  	JG    DCTL
   262  
   263  DCEND:
   264  	RET
   265  
   266  // func Add(a, b []float64)
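        // Illustrative Go sketch of the assumed semantics: b is recycled when it is
        // shorter than a (the scalar tail below does not wrap b, so callers are
        // presumably expected to keep len(a) a multiple of len(b)):
        //
        //	func add(a, b []float64) {
        //		for i := range a {
        //			a[i] += b[i%len(b)]
        //		}
        //	}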
   267  TEXT ·Add(SB), NOSPLIT, $0
   268  	// a Data ptr
   269  	MOVQ a_base+0(FP), R8
   270  
   271  	// a len
   272  	MOVQ a_len+8(FP), SI
   273  
   274  	// b Data ptr
   275  	MOVQ b_base+24(FP), R9
   276  	MOVQ R9, R10
   277  
   278  	// b len
   279  	MOVQ b_len+32(FP), DI
   280  	MOVQ DI, R11
   281  
   282  	// zero len return
   283  	CMPQ SI, $0
   284  	JE   AEND
   285  
   286  	// check tail
   287  	SUBQ $2, SI
   288  	JL   ATAIL
   289  
   290  ALD:
   291  	CMPQ DI, $1
   292  	JE   ALT
   293  	SUBQ $2, DI
   294  	JGE  ALO
   295  	MOVQ R10, R9
   296  	MOVQ R11, DI
   297  	SUBQ $2, DI
   298  
   299  ALO:
   300  	MOVUPD (R9), X1
   301  	ADDQ   $16, R9
   302  	JMP    ALOOP
   303  
   304  ALT:
   305  	MOVLPD (R9), X1
   306  	MOVQ   R10, R9
   307  	MOVQ   R11, DI
   308  	MOVHPD (R9), X1
   309  	SUBQ   $1, DI
   310  	ADDQ   $8, R9
   311  
   312  ALOOP:
   313  	MOVUPD (R8), X0
   314  	ADDPD  X1, X0
   315  	MOVUPD X0, (R8)
   316  	ADDQ   $16, R8
   317  	SUBQ   $2, SI
   318  	JGE    ALD
   319  
   320  ATAIL:
   321  	ADDQ $2, SI
   322  	JE   AEND
   323  
   324  ATL:
   325  	MOVSD (R8), X0
   326  	MOVSD (R9), X1
   327  	ADDSD X1, X0
   328  	MOVSD X0, (R8)
   329  	ADDQ  $8, R8
   330  	ADDQ  $8, R9
   331  	SUBQ  $1, SI
   332  	JG    ATL
   333  
   334  AEND:
   335  	RET
   336  
   337  // func Vadd(a, b []float64)
   338  // req:  len(a) == len(b)
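        // Illustrative Go sketch of the stated semantics (element-wise, equal lengths):
        //
        //	func vadd(a, b []float64) {
        //		for i := range a {
        //			a[i] += b[i]
        //		}
        //	}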
   339  TEXT ·Vadd(SB), NOSPLIT, $0
   340  	// a Data ptr
   341  	MOVQ a_base+0(FP), R8
   342  
   343  	// a len
   344  	MOVQ a_len+8(FP), SI
   345  
   346  	// b Data ptr
   347  	MOVQ b_base+24(FP), R9
   348  
   349  	// zero len return
   350  	CMPQ SI, $0
   351  	JE   vadd_exit
   352  
   353  	// check tail
   354  	SUBQ $8, SI
   355  	JL   vadd_tail
   356  
   357  	// AVX vs SSE
   358  	CMPB ·AvxSupt(SB), $1
   359  	JE   vadd_avx_loop
   360  
   361  vadd_loop:
   362  	MOVUPD (R9), X1
   363  	MOVUPD 16(R9), X3
   364  	MOVUPD 32(R9), X5
   365  	MOVUPD 48(R9), X7
   366  	
   367  	MOVUPD (R8), X0
   368  	ADDPD  X1, X0
   369  	MOVUPD 16(R8), X2
   370  	ADDPD  X3, X2
   371  	MOVUPD 32(R8), X4
   372  	ADDPD  X5, X4
   373  	MOVUPD 48(R8), X6
   374  	ADDPD  X7, X6
   375  	
   376  	MOVUPD X0, (R8)
   377  	MOVUPD X2, 16(R8)
   378  	MOVUPD X4, 32(R8)
   379  	MOVUPD X6, 48(R8)
   380  	ADDQ   $64, R8
   381  	ADDQ   $64, R9
   382  	SUBQ   $8, SI
   383  	JGE    vadd_loop
   384  
   385  vadd_tail:
   386  	ADDQ $8, SI
   387  	JE   vadd_exit
   388  
   389  vadd_tail_loop:
   390  	MOVSD (R8), X15
   391  	MOVSD (R9), X14
   392  	ADDSD X14, X15
   393  	MOVSD X15, (R8)
   394  	ADDQ  $8, R8
   395  	ADDQ  $8, R9
   396  	SUBQ  $1, SI
   397  	JG    vadd_tail_loop // JG, not JGE: SI holds the remaining element count, so stop at zero
   398  	JMP   vadd_exit
   399  	
   400  vadd_avx_loop:
   401  	//VMOVDQU (R9), Y0
   402  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x6F; BYTE $0x01
   403  	//VMOVDQU 32(R9), Y1
   404  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x6F; BYTE $0x49; BYTE $0x20
   405  
   406  	// VADDPD (R8),Y0,Y0
   407  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x58; BYTE $0x00
   408  	// VADDPD 32(R8),Y1,Y1
   409  	BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0x58; BYTE $0x48; BYTE $0x20
   410  
   411  	//VMOVDQU Y0, (R8)
   412  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x7F; BYTE $0x00
   413  	//VMOVDQU Y1, 32(R8)
   414  	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x7F; BYTE $0x48; BYTE $0x20
   415  
   416  	
   417  	ADDQ $64, R8
   418  	ADDQ $64, R9
   419  	SUBQ $8, SI
   420  	JGE  vadd_avx_loop
   421  	//VZEROUPPER
   422  	BYTE $0xC5; BYTE $0xF8; BYTE $0x77
   423  	ADDQ $8, SI
   424  	JE   vadd_exit
   425  	JMP  vadd_tail_loop
   426  
   427  vadd_exit:
   428  	RET
   429  
   430  // func Hadd(st uint64, a []float64)
   431  // req:  st >= 1 and len(a) % st == 0
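        // Illustrative Go sketch of the assumed semantics: each consecutive group of
        // st elements is summed and the sums are packed at the front of a
        // (st == 1 returns without touching a):
        //
        //	func hadd(st uint64, a []float64) {
        //		n := int(st)
        //		for j := 0; j < len(a)/n; j++ {
        //			s := 0.0
        //			for k := 0; k < n; k++ {
        //				s += a[j*n+k]
        //			}
        //			a[j] = s
        //		}
        //	}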
   432  TEXT ·Hadd(SB), NOSPLIT, $0
   433  	// a Data ptr
   434  	MOVQ a_base+8(FP), R8
   435  	MOVQ R8, R9
   436  
   437  	// a len
   438  	MOVQ a_len+16(FP), SI
   439  	MOVQ st+0(FP), CX
   440  	MOVQ CX,  DI
   441  	ANDQ $1, DI
   442  	
   443  
   444  	CMPQ CX, $1
   445  	JE hadd_exit
   446  	CMPQ SI, $0
   447  	JE   hadd_exit
   448  	CMPQ CX, $8
   449  	JG hadd_big_stride
   450  	CMPB ·Sse3Supt(SB), $1
   451  	JE hadd_sse3_head
   452  
   453  hadd_big_stride:
   454  	// AVX vs SSE3 dispatch; no AVX path is implemented yet, so this compare is currently unused
   455  	CMPB ·AvxSupt(SB), $1
   456  	//JE   hadd_avx_head
   457  	CMPB ·Sse3Supt(SB), $1
   458  	JE hadd_sse3_head
   459  hadd_head:
   460  	PXOR X0, X0
   461  	MOVQ CX, DI
   462  	SUBQ $1, DI
   463  hadd_loop:
   464  	ADDPD (R8), X0
   465  	ADDQ $16, R8
   466  	SUBQ $2, DI
   467  	JG hadd_loop
   468  	JZ hadd_tail
   469  	MOVAPD X0, X1
   470  	UNPCKHPD X1, X0
   471  	ADDPD X1,X0
   472  	MOVQ X0, (R9)
   473  	ADDQ $8, R9
   474  	SUBQ CX, SI
   475  	JG hadd_head
   476  	JMP hadd_exit
   477  hadd_tail:
   478  	ADDSD (R8), X0
   479  	MOVAPD X0, X1
   480  	UNPCKHPD X1, X0
   481  	ADDPD X1,X0
   482  	MOVQ X0, (R9)
   483  	ADDQ $8, R9
   484  	SUBQ CX, SI
   485  	JZ hadd_exit
   486  	MOVQ 8(R8), X0
   487  	MOVQ CX, DI
   488  	SUBQ $2, DI
   489  	ADDQ $16, R8
   490  	JMP hadd_loop
   491  hadd_sse3_head:
   492  	PXOR X0, X0
   493  	MOVQ CX, DI
   494  	SUBQ $1, DI
   495  hadd_sse3_loop:
   496  	ADDPD (R8), X0
   497  	ADDQ $16, R8
   498  	SUBQ $2, DI
   499  	JG hadd_sse3_loop
   500  	JZ hadd_sse3_tail
   501  	BYTE $0x66; BYTE $0x0F; BYTE $0x7C; BYTE $0xC0
   502  	// HADDPD X0, X0  // mnemonic supported since Go 1.6; byte-encoded above
   503  	MOVQ X0, (R9)
   504  	ADDQ $8, R9
   505  	SUBQ CX, SI
   506  	JG hadd_sse3_head
   507  	JMP hadd_exit
   508  hadd_sse3_tail:
   509  	ADDSD (R8), X0
   510  	BYTE $0x66; BYTE $0x0F; BYTE $0x7C; BYTE $0xC0
   511  	// HADDPD X0, X0  // mnemonic supported since Go 1.6; byte-encoded above
   512  	MOVQ X0, (R9)
   513  	ADDQ $8, R9
   514  	SUBQ CX, SI
   515  	JZ hadd_exit
   516  	MOVQ 8(R8), X0
   517  	MOVQ CX, DI
   518  	SUBQ $2, DI
   519  	ADDQ $16, R8
   520  	JMP hadd_sse3_loop
   521  hadd_exit:	
   522  	RET
   523  
   524  	
   525  // func Subtr(a, b []float64)
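        // Illustrative Go sketch of the assumed semantics (b recycled as in Add):
        //
        //	func subtr(a, b []float64) {
        //		for i := range a {
        //			a[i] -= b[i%len(b)]
        //		}
        //	}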
   526  TEXT ·Subtr(SB), NOSPLIT, $0
   527  	// a Data ptr
   528  	MOVQ a_base+0(FP), R8
   529  
   530  	// a len
   531  	MOVQ a_len+8(FP), SI
   532  
   533  	// b Data ptr
   534  	MOVQ b_base+24(FP), R9
   535  	MOVQ R9, R10
   536  
   537  	// b len
   538  	MOVQ b_len+32(FP), DI
   539  	MOVQ DI, R11
   540  
   541  	// zero len return
   542  	MOVQ $0, AX
   543  	CMPQ AX, SI
   544  	JE   SEND
   545  
   546  	// check tail
   547  	SUBQ $2, SI
   548  	JL   STAIL
   549  
   550  SLD:
   551  	SUBQ $1, DI
   552  	JE   SLT
   553  	SUBQ $1, DI
   554  	JGE  SLO
   555  	MOVQ R10, R9
   556  	MOVQ R11, DI
   557  	SUBQ $2, DI
   558  
   559  SLO:
   560  	MOVUPD 0(R9), X1
   561  	ADDQ   $16, R9
   562  	JMP    SLOOP
   563  
   564  SLT:
   565  	MOVLPD 0(R9), X1
   566  	MOVQ   R10, R9
   567  	MOVQ   R11, DI
   568  	MOVHPD 0(R9), X1
   569  	SUBQ   $1, DI
   570  	ADDQ   $8, R9
   571  
   572  SLOOP:
   573  	MOVUPD 0(R8), X0
   574  	SUBPD  X1, X0
   575  	MOVUPD X0, 0(R8)
   576  	ADDQ   $16, R8
   577  	SUBQ   $2, SI
   578  	JGE    SLD
   579  
   580  STAIL:
   581  	ADDQ $2, SI
   582  	JE   SEND
   583  
   584  STL:
   585  	MOVSD 0(R8), X0
   586  	MOVSD 0(R9), X1
   587  	SUBSD X1, X0
   588  	MOVSD X0, 0(R8)
   589  	ADDQ  $8, R8
   590  	ADDQ  $8, R9
   591  	SUBQ  $1, SI
   592  	JG    STL
   593  
   594  SEND:
   595  	RET
   596  
   597  // func Mult(a, b []float64)
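        // Illustrative Go sketch of the assumed semantics (b recycled as in Add):
        //
        //	func mult(a, b []float64) {
        //		for i := range a {
        //			a[i] *= b[i%len(b)]
        //		}
        //	}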
   598  TEXT ·Mult(SB), NOSPLIT, $0
   599  	// a Data ptr
   600  	MOVQ a_base+0(FP), R8
   601  
   602  	// a len
   603  	MOVQ a_len+8(FP), SI
   604  
   605  	// b Data ptr
   606  	MOVQ b_base+24(FP), R9
   607  	MOVQ R9, R10
   608  
   609  	// b len
   610  	MOVQ b_len+32(FP), DI
   611  	MOVQ DI, R11
   612  
   613  	// zero len return
   614  	MOVQ $0, AX
   615  	CMPQ AX, SI
   616  	JE   MEND
   617  
   618  	// check tail
   619  	SUBQ $2, SI
   620  	JL   MTAIL
   621  
   622  MLD:
   623  	SUBQ $1, DI
   624  	JE   MLT
   625  	SUBQ $1, DI
   626  	JGE  MLO
   627  	MOVQ R10, R9
   628  	MOVQ R11, DI
   629  	SUBQ $2, DI
   630  
   631  MLO:
   632  	MOVUPD 0(R9), X1
   633  	ADDQ   $16, R9
   634  	JMP    MLOOP
   635  
   636  MLT:
   637  	MOVLPD 0(R9), X1
   638  	MOVQ   R10, R9
   639  	MOVQ   R11, DI
   640  	MOVHPD 0(R9), X1
   641  	SUBQ   $1, DI
   642  	ADDQ   $8, R9
   643  
   644  MLOOP:
   645  	MOVUPD 0(R8), X0
   646  	MULPD  X1, X0
   647  	MOVUPD X0, 0(R8)
   648  	ADDQ   $16, R8
   649  	SUBQ   $2, SI
   650  	JGE    MLD
   651  
   652  MTAIL:
   653  	ADDQ $2, SI
   654  	JE   MEND
   655  
   656  MTL:
   657  	MOVSD 0(R8), X0
   658  	MOVSD 0(R9), X1
   659  	MULSD X1, X0
   660  	MOVSD X0, 0(R8)
   661  	ADDQ  $8, R8
   662  	ADDQ  $8, R9
   663  	SUBQ  $1, SI
   664  	JG    MTL
   665  
   666  MEND:
   667  	RET
   668  
   669  // func Div(a, b []float64)
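        // Illustrative Go sketch of the assumed semantics (b recycled as in Add):
        //
        //	func div(a, b []float64) {
        //		for i := range a {
        //			a[i] /= b[i%len(b)]
        //		}
        //	}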
   670  TEXT ·Div(SB), NOSPLIT, $0
   671  	// a Data ptr
   672  	MOVQ a_base+0(FP), R8
   673  
   674  	// a len
   675  	MOVQ a_len+8(FP), SI
   676  
   677  	// b Data ptr
   678  	MOVQ b_base+24(FP), R9
   679  	MOVQ R9, R10
   680  
   681  	// b len
   682  	MOVQ b_len+32(FP), DI
   683  	MOVQ DI, R11
   684  
   685  	// zero len return
   686  	MOVQ $0, AX
   687  	CMPQ AX, SI
   688  	JE   DEND
   689  
   690  	// check tail
   691  	SUBQ $2, SI
   692  	JL   DTAIL
   693  
   694  DLD:
   695  	SUBQ $1, DI
   696  	JE   DLT
   697  	SUBQ $1, DI
   698  	JGE  DLO
   699  	MOVQ R10, R9
   700  	MOVQ R11, DI
   701  	SUBQ $2, DI
   702  
   703  DLO:
   704  	MOVUPD 0(R9), X1
   705  	ADDQ   $16, R9
   706  	JMP    DLOOP
   707  DLT:
   708  	MOVLPD 0(R9), X1
   709  	MOVQ   R10, R9
   710  	MOVQ   R11, DI
   711  	MOVHPD 0(R9), X1
   712  	SUBQ   $1, DI
   713  	ADDQ   $8, R9
   714  
   715  DLOOP:
   716  	MOVUPD 0(R8), X0
   717  	DIVPD  X1, X0
   718  	MOVUPD X0, 0(R8)
   719  	ADDQ   $16, R8
   720  	SUBQ   $2, SI
   721  	JGE    DLD
   722  
   723  DTAIL:
   724  	ADDQ $2, SI
   725  	JE   DEND
   726  DTL:
   727  	MOVSD 0(R8), X0
   728  	MOVSD 0(R9), X1
   729  	DIVSD X1, X0
   730  	MOVSD X0, 0(R8)
   731  	ADDQ  $8, R8
   732  	ADDQ  $8, R9
   733  	SUBQ  $1, SI
   734  	JG    DTL
   735  
   736  DEND:
   737  	RET
   738  
   739  // func Fma12(a float64, x, b []float64)
   740  // x[i] = a*x[i]+b[i]
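        // Illustrative Go sketch of the formula above, with b recycled as in Add.
        // Note the byte-encoded FMA path further down is not reached by any branch,
        // so the SSE2 multiply-then-add loop is what actually runs:
        //
        //	func fma12(a float64, x, b []float64) {
        //		for i := range x {
        //			x[i] = a*x[i] + b[i%len(b)]
        //		}
        //	}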
   741  TEXT ·Fma12(SB), NOSPLIT, $0
   742  	// a ptr
   743  	MOVSD  a+0(FP), X2
   744  	SHUFPD $0, X2, X2
   745  
   746  	// x Data ptr
   747  	MOVQ x_base+8(FP), R8
   748  
   749  	// x len
   750  	MOVQ x_len+16(FP), SI
   751  
   752  	// b Data ptr
   753  	MOVQ b_base+32(FP), R9
   754  	MOVQ R9, R10
   755  
   756  	// b len
   757  	MOVQ b_len+40(FP), DI
   758  	MOVQ DI, R11
   759  
   760  	// zero len return
   761  	CMPQ SI, $0
   762  	JE   F12END
   763  
   764  	// check tail
   765  	SUBQ $2, SI
   766  	JL   F12TAIL
   767  
   768  F12LD:
   769  	CMPQ DI, $1
   770  	JE   F12LT
   771  	SUBQ $2, DI
   772  	JGE  F12LO
   773  	MOVQ R10, R9
   774  	MOVQ R11, DI
   775  	SUBQ $2, DI
   776  
   777  F12LO:
   778  	MOVUPD (R9), X1
   779  	ADDQ   $16, R9
   780  	JMP    F12LOOP
   781  
   782  F12LT:
   783  	MOVLPD (R9), X1
   784  	MOVQ   R10, R9
   785  	MOVQ   R11, DI
   786  	MOVHPD (R9), X1
   787  	SUBQ   $1, DI
   788  	ADDQ   $8, R9
   789  
   790  F12LOOP:
   791  	MOVUPD (R8), X0
   792  	MULPD  X2, X0
   793  	ADDPD  X1, X0
   794  	MOVUPD X0, (R8)
   795  	ADDQ   $16, R8
   796  	SUBQ   $2, SI
   797  	JGE    F12LD
   798  	JMP    F12TAIL
   799  
   800  F12LDF: // FMA (VFMADD132PD) variant; no branch currently targets this path
   801  	CMPQ DI, $1
   802  	JE   F12LTF
   803  	SUBQ $2, DI
   804  	JGE  F12LOF
   805  	MOVQ R10, R9
   806  	MOVQ R11, DI
   807  	SUBQ $2, DI
   808  
   809  F12LOF:
   810  	MOVUPD (R9), X1
   811  	ADDQ   $16, R9
   812  	JMP    F12LOOPF
   813  
   814  F12LTF:
   815  	MOVLPD (R9), X1
   816  	MOVQ   R10, R9
   817  	MOVQ   R11, DI
   818  	MOVHPD (R9), X1
   819  	SUBQ   $1, DI
   820  	ADDQ   $8, R9
   821  
   822  F12LOOPF:
   823  	MOVUPD (R8), X0
   824  
   825  	// VFMADD132PD X2, X1, X0  (X0 = X0*X2 + X1)
   826  	BYTE   $0xC4; BYTE $0xE2; BYTE $0xF1; BYTE $0x98; BYTE $0xC2
   827  	MOVUPD X0, (R8)
   828  	ADDQ   $16, R8
   829  	SUBQ   $2, SI
   830  	JGE    F12LDF
   831  
   832  F12TAIL:
   833  	ADDQ $2, SI
   834  	JE   F12END
   835  
   836  F12TL:
   837  	MOVSD (R8), X0
   838  	MOVSD (R9), X1
   839  	MULPD X2, X0
   840  	ADDPD X1, X0
   841  	MOVSD X0, (R8)
   842  	ADDQ  $8, R8
   843  	ADDQ  $8, R9
   844  	SUBQ  $1, SI
   845  	JG    F12TL
   846  
   847  F12END:
   848  	RET
   849  
   850  // func Fma21(a float64, x, b []float64)
   851  // x[i] = x[i]*b[i]+a
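        // Illustrative Go sketch of the formula above (b recycled as in Add; the
        // byte-encoded FMA path below is likewise unreached):
        //
        //	func fma21(a float64, x, b []float64) {
        //		for i := range x {
        //			x[i] = x[i]*b[i%len(b)] + a
        //		}
        //	}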
   852  TEXT ·Fma21(SB), NOSPLIT, $0
   853  	// a ptr
   854  	MOVSD  a+0(FP), X2
   855  	SHUFPD $0, X2, X2
   856  
   857  	// x Data ptr
   858  	MOVQ x_base+8(FP), R8
   859  
   860  	// x len
   861  	MOVQ x_len+16(FP), SI
   862  
   863  	// b Data ptr
   864  	MOVQ b_base+32(FP), R9
   865  	MOVQ R9, R10
   866  
   867  	// b len
   868  	MOVQ b_len+40(FP), DI
   869  	MOVQ DI, R11
   870  
   871  	// zero len return
   872  	CMPQ SI, $0
   873  	JE   F21END
   874  
   875  	// check tail
   876  	SUBQ $2, SI
   877  	JL   F21TAIL
   878  
   879  F21LD:
   880  	CMPQ DI, $1
   881  	JE   F21LT
   882  	SUBQ $2, DI
   883  	JGE  F21LO
   884  	MOVQ R10, R9
   885  	MOVQ R11, DI
   886  	SUBQ $2, DI
   887  
   888  F21LO:
   889  	MOVUPD (R9), X1
   890  	ADDQ   $16, R9
   891  	JMP    F21LOOP
   892  
   893  F21LT:
   894  	MOVLPD (R9), X1
   895  	MOVQ   R10, R9
   896  	MOVQ   R11, DI
   897  	MOVHPD (R9), X1
   898  	SUBQ   $1, DI
   899  	ADDQ   $8, R9
   900  
   901  F21LOOP:
   902  	MOVUPD (R8), X0
   903  	MULPD  X1, X0
   904  	ADDPD  X2, X0
   905  	MOVUPD X0, (R8)
   906  	ADDQ   $16, R8
   907  	SUBQ   $2, SI
   908  	JGE    F21LD
   909  	JMP    F21TAIL
   910  
   911  F21LDF: // FMA (VFMADD213PD) variant; no branch currently targets this path
   912  	CMPQ DI, $1
   913  	JE   F21LTF
   914  	SUBQ $2, DI
   915  	JGE  F21LOF
   916  	MOVQ R10, R9
   917  	MOVQ R11, DI
   918  	SUBQ $2, DI
   919  
   920  F21LOF:
   921  	MOVUPD (R9), X1
   922  	ADDQ   $16, R9
   923  	JMP    F21LOOPF
   924  
   925  F21LTF:
   926  	MOVLPD (R9), X1
   927  	MOVQ   R10, R9
   928  	MOVQ   R11, DI
   929  	MOVHPD (R9), X1
   930  	SUBQ   $1, DI
   931  	ADDQ   $8, R9
   932  
   933  F21LOOPF:
   934  	MOVUPD (R8), X0
   935  
   936  	// VFMADD213PD X2, X1, X0  (X0 = X1*X0 + X2)
   937  	BYTE   $0xC4; BYTE $0xE2; BYTE $0xF1; BYTE $0xA8; BYTE $0xC2
   938  	MOVUPD X0, (R8)
   939  	ADDQ   $16, R8
   940  	SUBQ   $2, SI
   941  	JGE    F21LDF
   942  
   943  F21TAIL:
   944  	ADDQ $2, SI
   945  	JE   F21END
   946  
   947  F21TL:
   948  	MOVSD (R8), X0
   949  	MOVSD (R9), X1
   950  	MULPD X1, X0
   951  	ADDPD X2, X0
   952  	MOVSD X0, (R8)
   953  	ADDQ  $8, R8
   954  	ADDQ  $8, R9
   955  	SUBQ  $1, SI
   956  	JG    F21TL
   957  
   958  F21END:
   959  	RET