gonum.org/v1/gonum@v0.14.0/internal/asm/f64/ger_amd64.s (about)

     1  // Copyright ©2017 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
// Register and frame-argument aliases used throughout the kernels.
//   SIZE         size of a float64 in bytes.
//   M, N         loop counters; M_DIM/N_DIM are the original m, n stack args.
//   X_PTR, Y_PTR current positions in the x and y vectors; Y is y's base arg.
//   A_ROW        start of the current row band of a; A_PTR current column.
//   INC_X/INC_Y  byte strides of x and y; INC3_X/INC3_Y = 3x stride,
//                used to reach element 3 of a 4-wide group in one LEA-style mode.
//   LDA, LDA3    byte stride between rows of a, and 3x that stride.
//   ALPHA        alpha broadcast into both lanes of X0 (X0 is reused as
//                scratch by STORE_4x4, so ALPHA is reloaded after 4x4 loops).
     9  #define SIZE 8
    10  
    11  #define M_DIM m+0(FP)
    12  #define M CX
    13  #define N_DIM n+8(FP)
    14  #define N BX
    15  
    16  #define TMP1 R14
    17  #define TMP2 R15
    18  
    19  #define X_PTR SI
    20  #define Y y_base+56(FP)
    21  #define Y_PTR DX
    22  #define A_ROW AX
    23  #define A_PTR DI
    24  
    25  #define INC_X R8
    26  #define INC3_X R9
    27  
    28  #define INC_Y R10
    29  #define INC3_Y R11
    30  
    31  #define LDA R12
    32  #define LDA3 R13
    33  
    34  #define ALPHA X0
    35  
// LOAD4 broadcasts four strided elements of x into both lanes of X1..X4 and
// pre-scales each by alpha, so each Xi holds {alpha*x[i], alpha*x[i]}.
// The prefetch pulls the next stretch of x into cache non-temporally.
    36  #define LOAD4 \
    37  	PREFETCHNTA (X_PTR )(INC_X*8)     \
    38  	MOVDDUP     (X_PTR), X1           \
    39  	MOVDDUP     (X_PTR)(INC_X*1), X2  \
    40  	MOVDDUP     (X_PTR)(INC_X*2), X3  \
    41  	MOVDDUP     (X_PTR)(INC3_X*1), X4 \
    42  	MULPD       ALPHA, X1             \
    43  	MULPD       ALPHA, X2             \
    44  	MULPD       ALPHA, X3             \
    45  	MULPD       ALPHA, X4
    46  
// LOAD2 is the two-row variant: X1, X2 = {alpha*x[i], alpha*x[i]}.
    47  #define LOAD2 \
    48  	MOVDDUP (X_PTR), X1          \
    49  	MOVDDUP (X_PTR)(INC_X*1), X2 \
    50  	MULPD   ALPHA, X1            \
    51  	MULPD   ALPHA, X2
    52  
// LOAD1 is the single-row variant: X1 = {alpha*x[i], alpha*x[i]}.
    53  #define LOAD1 \
    54  	MOVDDUP (X_PTR), X1 \
    55  	MULPD   ALPHA, X1
    56  
// KERNEL_LOAD4 loads four contiguous y values into X5:X6 (unit-stride fast path).
    57  #define KERNEL_LOAD4 \
    58  	MOVUPS (Y_PTR), X5       \
    59  	MOVUPS 2*SIZE(Y_PTR), X6
    60  
// KERNEL_LOAD4_INC gathers four strided y values into X5:X6 using split
// low/high 64-bit loads (MOVLPD/MOVHPD), for the incY != 1 path.
    61  #define KERNEL_LOAD4_INC \
    62  	MOVLPD (Y_PTR), X5           \
    63  	MOVHPD (Y_PTR)(INC_Y*1), X5  \
    64  	MOVLPD (Y_PTR)(INC_Y*2), X6  \
    65  	MOVHPD (Y_PTR)(INC3_Y*1), X6
    66  
// KERNEL_LOAD2 loads two contiguous y values into X5.
    67  #define KERNEL_LOAD2 \
    68  	MOVUPS (Y_PTR), X5
    69  
// KERNEL_LOAD2_INC gathers two strided y values into X5.
    70  #define KERNEL_LOAD2_INC \
    71  	MOVLPD (Y_PTR), X5          \
    72  	MOVHPD (Y_PTR)(INC_Y*1), X5
    73  
// KERNEL_4x4: with y[j..j+3] in X5:X6 and alpha*x[i..i+3] broadcast in X1..X4,
// form the 4x4 outer-product tile: row r of the tile ends up in the pair
// (X5,X6)/(X7,X8)/(X9,X10)/(X11,X12) = Xr * y.
    74  #define KERNEL_4x4 \
    75  	MOVUPS X5, X7  \
    76  	MOVUPS X6, X8  \
    77  	MOVUPS X5, X9  \
    78  	MOVUPS X6, X10 \
    79  	MOVUPS X5, X11 \
    80  	MOVUPS X6, X12 \
    81  	MULPD  X1, X5  \
    82  	MULPD  X1, X6  \
    83  	MULPD  X2, X7  \
    84  	MULPD  X2, X8  \
    85  	MULPD  X3, X9  \
    86  	MULPD  X3, X10 \
    87  	MULPD  X4, X11 \
    88  	MULPD  X4, X12
    89  
// STORE_4x4 accumulates the 4x4 tile into four rows of A (rows addressed via
// LDA/LDA3) and advances A_PTR by four columns. It uses X0 (= ALPHA) as
// scratch, so ALPHA must be reloaded after the last 4x4 iteration of a band.
    90  #define STORE_4x4 \
    91  	MOVUPS (A_PTR), X13               \
    92  	ADDPD  X13, X5                    \
    93  	MOVUPS 2*SIZE(A_PTR), X14         \
    94  	ADDPD  X14, X6                    \
    95  	MOVUPS (A_PTR)(LDA*1), X15        \
    96  	ADDPD  X15, X7                    \
    97  	MOVUPS 2*SIZE(A_PTR)(LDA*1), X0   \
    98  	ADDPD  X0, X8                     \
    99  	MOVUPS (A_PTR)(LDA*2), X13        \
   100  	ADDPD  X13, X9                    \
   101  	MOVUPS 2*SIZE(A_PTR)(LDA*2), X14  \
   102  	ADDPD  X14, X10                   \
   103  	MOVUPS (A_PTR)(LDA3*1), X15       \
   104  	ADDPD  X15, X11                   \
   105  	MOVUPS 2*SIZE(A_PTR)(LDA3*1), X0  \
   106  	ADDPD  X0, X12                    \
   107  	MOVUPS X5, (A_PTR)                \
   108  	MOVUPS X6, 2*SIZE(A_PTR)          \
   109  	MOVUPS X7, (A_PTR)(LDA*1)         \
   110  	MOVUPS X8, 2*SIZE(A_PTR)(LDA*1)   \
   111  	MOVUPS X9, (A_PTR)(LDA*2)         \
   112  	MOVUPS X10, 2*SIZE(A_PTR)(LDA*2)  \
   113  	MOVUPS X11, (A_PTR)(LDA3*1)       \
   114  	MOVUPS X12, 2*SIZE(A_PTR)(LDA3*1) \
   115  	ADDQ   $4*SIZE, A_PTR
   116  
// KERNEL_4x2: 4 rows x 2 columns tile; y[j..j+1] in X5 is multiplied by each
// of the four scaled x broadcasts, leaving the tile rows in X5..X8.
   117  #define KERNEL_4x2 \
   118  	MOVUPS X5, X6 \
   119  	MOVUPS X5, X7 \
   120  	MOVUPS X5, X8 \
   121  	MULPD  X1, X5 \
   122  	MULPD  X2, X6 \
   123  	MULPD  X3, X7 \
   124  	MULPD  X4, X8
   125  
// STORE_4x2 accumulates X5..X8 into two columns of four rows of A and
// advances A_PTR by two columns.
   126  #define STORE_4x2 \
   127  	MOVUPS (A_PTR), X9          \
   128  	ADDPD  X9, X5               \
   129  	MOVUPS (A_PTR)(LDA*1), X10  \
   130  	ADDPD  X10, X6              \
   131  	MOVUPS (A_PTR)(LDA*2), X11  \
   132  	ADDPD  X11, X7              \
   133  	MOVUPS (A_PTR)(LDA3*1), X12 \
   134  	ADDPD  X12, X8              \
   135  	MOVUPS X5, (A_PTR)          \
   136  	MOVUPS X6, (A_PTR)(LDA*1)   \
   137  	MOVUPS X7, (A_PTR)(LDA*2)   \
   138  	MOVUPS X8, (A_PTR)(LDA3*1)  \
   139  	ADDQ   $2*SIZE, A_PTR
   140  
// KERNEL_4x1: scalar column tail for a 4-row band; loads the single y value
// itself and multiplies by the low lane of each scaled x broadcast.
   141  #define KERNEL_4x1 \
   142  	MOVSD (Y_PTR), X5 \
   143  	MOVSD X5, X6      \
   144  	MOVSD X5, X7      \
   145  	MOVSD X5, X8      \
   146  	MULSD X1, X5      \
   147  	MULSD X2, X6      \
   148  	MULSD X3, X7      \
   149  	MULSD X4, X8
   150  
// STORE_4x1 accumulates the four scalars into one column of four rows of A
// and advances A_PTR by one column.
   151  #define STORE_4x1 \
   152  	ADDSD (A_PTR), X5         \
   153  	ADDSD (A_PTR)(LDA*1), X6  \
   154  	ADDSD (A_PTR)(LDA*2), X7  \
   155  	ADDSD (A_PTR)(LDA3*1), X8 \
   156  	MOVSD X5, (A_PTR)         \
   157  	MOVSD X6, (A_PTR)(LDA*1)  \
   158  	MOVSD X7, (A_PTR)(LDA*2)  \
   159  	MOVSD X8, (A_PTR)(LDA3*1) \
   160  	ADDQ  $SIZE, A_PTR
   161  
// KERNEL_2x4: 2 rows x 4 columns; y[j..j+3] in X5:X6 times the two scaled x
// broadcasts X1, X2, leaving row 0 in (X5,X6) and row 1 in (X7,X8).
   162  #define KERNEL_2x4 \
   163  	MOVUPS X5, X7 \
   164  	MOVUPS X6, X8 \
   165  	MULPD  X1, X5 \
   166  	MULPD  X1, X6 \
   167  	MULPD  X2, X7 \
   168  	MULPD  X2, X8
   169  
// STORE_2x4 accumulates the 2x4 tile into two rows of A and advances A_PTR
// by four columns.
   170  #define STORE_2x4 \
   171  	MOVUPS (A_PTR), X9               \
   172  	ADDPD  X9, X5                    \
   173  	MOVUPS 2*SIZE(A_PTR), X10        \
   174  	ADDPD  X10, X6                   \
   175  	MOVUPS (A_PTR)(LDA*1), X11       \
   176  	ADDPD  X11, X7                   \
   177  	MOVUPS 2*SIZE(A_PTR)(LDA*1), X12 \
   178  	ADDPD  X12, X8                   \
   179  	MOVUPS X5, (A_PTR)               \
   180  	MOVUPS X6, 2*SIZE(A_PTR)         \
   181  	MOVUPS X7, (A_PTR)(LDA*1)        \
   182  	MOVUPS X8, 2*SIZE(A_PTR)(LDA*1)  \
   183  	ADDQ   $4*SIZE, A_PTR
   184  
// KERNEL_2x2: 2 rows x 2 columns; y[j..j+1] in X5 times X1, X2.
   185  #define KERNEL_2x2 \
   186  	MOVUPS X5, X6 \
   187  	MULPD  X1, X5 \
   188  	MULPD  X2, X6
   189  
// STORE_2x2 accumulates X5, X6 into two rows of A, advancing two columns.
   190  #define STORE_2x2 \
   191  	MOVUPS (A_PTR), X7        \
   192  	ADDPD  X7, X5             \
   193  	MOVUPS (A_PTR)(LDA*1), X8 \
   194  	ADDPD  X8, X6             \
   195  	MOVUPS X5, (A_PTR)        \
   196  	MOVUPS X6, (A_PTR)(LDA*1) \
   197  	ADDQ   $2*SIZE, A_PTR
   198  
// KERNEL_2x1: scalar column tail for a 2-row band (loads y itself).
   199  #define KERNEL_2x1 \
   200  	MOVSD (Y_PTR), X5 \
   201  	MOVSD X5, X6      \
   202  	MULSD X1, X5      \
   203  	MULSD X2, X6
   204  
// STORE_2x1 accumulates the two scalars into one column of two rows of A.
   205  #define STORE_2x1 \
   206  	ADDSD (A_PTR), X5        \
   207  	ADDSD (A_PTR)(LDA*1), X6 \
   208  	MOVSD X5, (A_PTR)        \
   209  	MOVSD X6, (A_PTR)(LDA*1) \
   210  	ADDQ  $SIZE, A_PTR
   211  
// KERNEL_1x4: single row x 4 columns; scales y[j..j+3] (already in X5:X6)
// by the one broadcast alpha*x[i] in X1.
   212  #define KERNEL_1x4 \
   213  	MULPD X1, X5 \
   214  	MULPD X1, X6
   215  
// STORE_1x4 accumulates X5:X6 into one row of A, advancing four columns.
   216  #define STORE_1x4 \
   217  	MOVUPS (A_PTR), X7       \
   218  	ADDPD  X7, X5            \
   219  	MOVUPS 2*SIZE(A_PTR), X8 \
   220  	ADDPD  X8, X6            \
   221  	MOVUPS X5, (A_PTR)       \
   222  	MOVUPS X6, 2*SIZE(A_PTR) \
   223  	ADDQ   $4*SIZE, A_PTR
   224  
// KERNEL_1x2: single row x 2 columns.
   225  #define KERNEL_1x2 \
   226  	MULPD X1, X5
   227  
// STORE_1x2 accumulates X5 into one row of A, advancing two columns.
   228  #define STORE_1x2 \
   229  	MOVUPS (A_PTR), X6    \
   230  	ADDPD  X6, X5         \
   231  	MOVUPS X5, (A_PTR)    \
   232  	ADDQ   $2*SIZE, A_PTR
   233  
// KERNEL_1x1: scalar tail of the scalar band (loads y itself).
   234  #define KERNEL_1x1 \
   235  	MOVSD (Y_PTR), X5 \
   236  	MULSD X1, X5
   237  
// STORE_1x1 accumulates the single product into A[i][j].
   238  #define STORE_1x1 \
   239  	ADDSD (A_PTR), X5  \
   240  	MOVSD X5, (A_PTR)  \
   241  	ADDQ  $SIZE, A_PTR
   242  
   243  // func Ger(m, n uintptr, alpha float64,
   244  //	x []float64, incX uintptr,
   245  //	y []float64, incY uintptr,
   246  //	a []float64, lda uintptr)
//
// Ger performs the rank-one update A += alpha * x * y^T on an m×n row-major
// matrix with row stride lda. Rows are processed in bands of 4, then 2, then
// 1; within a band, columns of y are consumed 4, 2, then 1 at a time.
// There are two column paths: the fall-through path below assumes incY == 1
// (contiguous loads), while the "inc" path uses split strided loads.
   247  TEXT ·Ger(SB), NOSPLIT, $0
   248  	MOVQ M_DIM, M
   249  	MOVQ N_DIM, N
	// Quick return when either dimension is zero.
   250  	CMPQ M, $0
   251  	JE   end
   252  	CMPQ N, $0
   253  	JE   end
   254  
	// Broadcast alpha into both lanes of ALPHA (X0).
   255  	MOVDDUP alpha+16(FP), ALPHA
   256  
   257  	MOVQ x_base+24(FP), X_PTR
   258  	MOVQ y_base+56(FP), Y_PTR
   259  	MOVQ a_base+88(FP), A_ROW
   260  	MOVQ incX+48(FP), INC_X       // INC_X = incX * sizeof(float64)
   261  	SHLQ $3, INC_X
   262  	MOVQ lda+112(FP), LDA         // LDA = LDA * sizeof(float64)
   263  	SHLQ $3, LDA
   264  	LEAQ (LDA)(LDA*2), LDA3       // LDA3 = LDA * 3
   265  	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
   266  	MOVQ A_ROW, A_PTR
   267  
	// For negative incX, offset X_PTR to the far end of x so stepping by the
	// (negative) stride walks the vector in BLAS order; TMP2 stays 0 otherwise.
	// NOTE(review): INC_X is already a byte stride at this point, and TMP2 is
	// scaled by SIZE again in the LEAQ below — verify against tests that
	// exercise negative incX.
   268  	XORQ    TMP2, TMP2
   269  	MOVQ    M, TMP1
   270  	SUBQ    $1, TMP1
   271  	IMULQ   INC_X, TMP1
   272  	NEGQ    TMP1
   273  	CMPQ    INC_X, $0
   274  	CMOVQLT TMP1, TMP2
   275  	LEAQ    (X_PTR)(TMP2*SIZE), X_PTR
   276  
	// incY == 1 falls through to the contiguous-Y kernels; incY > 1 jumps to
	// the strided path at inc; incY < 1 (signed compare) returns untouched.
   277  	CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
   278  	JG   inc
   279  	JL   end
   280  
	// M / 4 bands of four rows; remainder handled at r2/r1.
   281  	SHRQ $2, M
   282  	JZ   r2
   283  
   284  r4:
   285  	// LOAD 4
   286  	LOAD4
   287  
   288  	MOVQ N_DIM, N
   289  	SHRQ $2, N
   290  	JZ   r4c2
   291  
   292  r4c4:
   293  	// 4x4 KERNEL
   294  	KERNEL_LOAD4
   295  	KERNEL_4x4
   296  	STORE_4x4
   297  
   298  	ADDQ $4*SIZE, Y_PTR
   299  
   300  	DECQ N
   301  	JNZ  r4c4
   302  
   303  	// Reload ALPHA after it's clobbered by STORE_4x4
   304  	MOVDDUP alpha+16(FP), ALPHA
   305  
   306  r4c2:
   307  	TESTQ $2, N_DIM
   308  	JZ    r4c1
   309  
   310  	// 4x2 KERNEL
   311  	KERNEL_LOAD2
   312  	KERNEL_4x2
   313  	STORE_4x2
   314  
   315  	ADDQ $2*SIZE, Y_PTR
   316  
   317  r4c1:
   318  	TESTQ $1, N_DIM
   319  	JZ    r4end
   320  
   321  	// 4x1 KERNEL
   322  	KERNEL_4x1
   323  	STORE_4x1
   324  
   325  	ADDQ $SIZE, Y_PTR
   326  
   327  r4end:
	// Advance x by four elements, rewind Y to its base, and move A down
	// four rows for the next band.
   328  	LEAQ (X_PTR)(INC_X*4), X_PTR
   329  	MOVQ Y, Y_PTR
   330  	LEAQ (A_ROW)(LDA*4), A_ROW
   331  	MOVQ A_ROW, A_PTR
   332  
   333  	DECQ M
   334  	JNZ  r4
   335  
   336  r2:
	// Two-row remainder band (bit 1 of m).
   337  	TESTQ $2, M_DIM
   338  	JZ    r1
   339  
   340  	// LOAD 2
   341  	LOAD2
   342  
   343  	MOVQ N_DIM, N
   344  	SHRQ $2, N
   345  	JZ   r2c2
   346  
   347  r2c4:
   348  	// 2x4 KERNEL
   349  	KERNEL_LOAD4
   350  	KERNEL_2x4
   351  	STORE_2x4
   352  
   353  	ADDQ $4*SIZE, Y_PTR
   354  
   355  	DECQ N
   356  	JNZ  r2c4
   357  
   358  r2c2:
   359  	TESTQ $2, N_DIM
   360  	JZ    r2c1
   361  
   362  	// 2x2 KERNEL
   363  	KERNEL_LOAD2
   364  	KERNEL_2x2
   365  	STORE_2x2
   366  
   367  	ADDQ $2*SIZE, Y_PTR
   368  
   369  r2c1:
   370  	TESTQ $1, N_DIM
   371  	JZ    r2end
   372  
   373  	// 2x1 KERNEL
   374  	KERNEL_2x1
   375  	STORE_2x1
   376  
   377  	ADDQ $SIZE, Y_PTR
   378  
   379  r2end:
   380  	LEAQ (X_PTR)(INC_X*2), X_PTR
   381  	MOVQ Y, Y_PTR
   382  	LEAQ (A_ROW)(LDA*2), A_ROW
   383  	MOVQ A_ROW, A_PTR
   384  
   385  r1:
	// One-row remainder band (bit 0 of m).
   386  	TESTQ $1, M_DIM
   387  	JZ    end
   388  
   389  	// LOAD 1
   390  	LOAD1
   391  
   392  	MOVQ N_DIM, N
   393  	SHRQ $2, N
   394  	JZ   r1c2
   395  
   396  r1c4:
   397  	// 1x4 KERNEL
   398  	KERNEL_LOAD4
   399  	KERNEL_1x4
   400  	STORE_1x4
   401  
   402  	ADDQ $4*SIZE, Y_PTR
   403  
   404  	DECQ N
   405  	JNZ  r1c4
   406  
   407  r1c2:
   408  	TESTQ $2, N_DIM
   409  	JZ    r1c1
   410  
   411  	// 1x2 KERNEL
   412  	KERNEL_LOAD2
   413  	KERNEL_1x2
   414  	STORE_1x2
   415  
   416  	ADDQ $2*SIZE, Y_PTR
   417  
   418  r1c1:
   419  	TESTQ $1, N_DIM
   420  	JZ    end
   421  
   422  	// 1x1 KERNEL
   423  	KERNEL_1x1
   424  	STORE_1x1
   425  
   426  	ADDQ $SIZE, Y_PTR
   427  
   428  end:
   429  	RET
   430  
   431  inc:  // Algorithm for incY != 1 ( split loads in kernel )
   432  
   433  	MOVQ incY+80(FP), INC_Y       // INC_Y = incY * sizeof(float64)
   434  	SHLQ $3, INC_Y
   435  	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3
   436  
	// Mirror of the negative-incX adjustment above, applied to Y.
	// NOTE(review): only reached when incY > 1 per the signed compare at the
	// top, so the CMOVQLT below appears unreachable for negative incY; the
	// same SIZE re-scaling concern applies — confirm with strided-Y tests.
   437  	XORQ    TMP2, TMP2
   438  	MOVQ    N, TMP1
   439  	SUBQ    $1, TMP1
   440  	IMULQ   INC_Y, TMP1
   441  	NEGQ    TMP1
   442  	CMPQ    INC_Y, $0
   443  	CMOVQLT TMP1, TMP2
   444  	LEAQ    (Y_PTR)(TMP2*SIZE), Y_PTR
   445  
   446  	SHRQ $2, M
   447  	JZ   inc_r2
   448  
   449  inc_r4:
   450  	// LOAD 4
   451  	LOAD4
   452  
   453  	MOVQ N_DIM, N
   454  	SHRQ $2, N
   455  	JZ   inc_r4c2
   456  
   457  inc_r4c4:
   458  	// 4x4 KERNEL
   459  	KERNEL_LOAD4_INC
   460  	KERNEL_4x4
   461  	STORE_4x4
   462  
   463  	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
   464  	DECQ N
   465  	JNZ  inc_r4c4
   466  
   467  	// Reload ALPHA after it's clobbered by STORE_4x4
   468  	MOVDDUP alpha+16(FP), ALPHA
   469  
   470  inc_r4c2:
   471  	TESTQ $2, N_DIM
   472  	JZ    inc_r4c1
   473  
   474  	// 4x2 KERNEL
   475  	KERNEL_LOAD2_INC
   476  	KERNEL_4x2
   477  	STORE_4x2
   478  
   479  	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
   480  
   481  inc_r4c1:
   482  	TESTQ $1, N_DIM
   483  	JZ    inc_r4end
   484  
   485  	// 4x1 KERNEL
   486  	KERNEL_4x1
   487  	STORE_4x1
   488  
   489  	ADDQ INC_Y, Y_PTR
   490  
   491  inc_r4end:
	// Next band of four rows: advance x, rewind Y, step A down four rows.
   492  	LEAQ (X_PTR)(INC_X*4), X_PTR
   493  	MOVQ Y, Y_PTR
   494  	LEAQ (A_ROW)(LDA*4), A_ROW
   495  	MOVQ A_ROW, A_PTR
   496  
   497  	DECQ M
   498  	JNZ  inc_r4
   499  
   500  inc_r2:
   501  	TESTQ $2, M_DIM
   502  	JZ    inc_r1
   503  
   504  	// LOAD 2
   505  	LOAD2
   506  
   507  	MOVQ N_DIM, N
   508  	SHRQ $2, N
   509  	JZ   inc_r2c2
   510  
   511  inc_r2c4:
   512  	// 2x4 KERNEL
   513  	KERNEL_LOAD4_INC
   514  	KERNEL_2x4
   515  	STORE_2x4
   516  
   517  	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
   518  	DECQ N
   519  	JNZ  inc_r2c4
   520  
   521  inc_r2c2:
   522  	TESTQ $2, N_DIM
   523  	JZ    inc_r2c1
   524  
   525  	// 2x2 KERNEL
   526  	KERNEL_LOAD2_INC
   527  	KERNEL_2x2
   528  	STORE_2x2
   529  
   530  	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
   531  
   532  inc_r2c1:
   533  	TESTQ $1, N_DIM
   534  	JZ    inc_r2end
   535  
   536  	// 2x1 KERNEL
   537  	KERNEL_2x1
   538  	STORE_2x1
   539  
   540  	ADDQ INC_Y, Y_PTR
   541  
   542  inc_r2end:
   543  	LEAQ (X_PTR)(INC_X*2), X_PTR
   544  	MOVQ Y, Y_PTR
   545  	LEAQ (A_ROW)(LDA*2), A_ROW
   546  	MOVQ A_ROW, A_PTR
   547  
   548  inc_r1:
   549  	TESTQ $1, M_DIM
   550  	JZ    end
   551  
   552  	// LOAD 1
   553  	LOAD1
   554  
   555  	MOVQ N_DIM, N
   556  	SHRQ $2, N
   557  	JZ   inc_r1c2
   558  
   559  inc_r1c4:
   560  	// 1x4 KERNEL
   561  	KERNEL_LOAD4_INC
   562  	KERNEL_1x4
   563  	STORE_1x4
   564  
   565  	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
   566  	DECQ N
   567  	JNZ  inc_r1c4
   568  
   569  inc_r1c2:
   570  	TESTQ $2, N_DIM
   571  	JZ    inc_r1c1
   572  
   573  	// 1x2 KERNEL
   574  	KERNEL_LOAD2_INC
   575  	KERNEL_1x2
   576  	STORE_1x2
   577  
   578  	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
   579  
   580  inc_r1c1:
   581  	TESTQ $1, N_DIM
   582  	JZ    end
   583  
   584  	// 1x1 KERNEL
   585  	KERNEL_1x1
   586  	STORE_1x1
   587  
   588  	ADDQ INC_Y, Y_PTR
   589  
   590  inc_end:
   591  	RET