gonum.org/v1/gonum@v0.14.0/internal/asm/f32/ddotunitary_amd64.s (about)

     1  // Copyright ©2017 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  #define HADDPD_SUM_SUM    LONG $0xC07C0F66 // @ HADDPD X0, X0 (emitted as raw bytes 66 0F 7C C0)
    10  
    11  #define X_PTR SI // base address of x's data
    12  #define Y_PTR DI // base address of y's data
    13  #define LEN CX // elements still to process; reused as the iteration counter of each loop
    14  #define TAIL BX // LEN % 8: elements left over after the 8x unrolled loop
    15  #define IDX AX // element index into both x and y (scaled by 4 when addressing)
    16  #define SUM X0 // float64 accumulator (two lanes); must stay X0 because HADDPD_SUM_SUM hard-codes X0
    17  #define P_SUM X1 // second float64 accumulator used inside the unrolled loop
    18  
    19  // func DdotUnitary(x, y []float32) (sum float64)
    20  TEXT ·DdotUnitary(SB), NOSPLIT, $0 // sum of float64(x[i])*float64(y[i]) for i < min(len(x), len(y))
    21  	MOVQ    x_base+0(FP), X_PTR  // X_PTR = data pointer of x
    22  	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = data pointer of y
    23  	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
    24  	CMPQ    y_len+32(FP), LEN    // signed compare of len(y) against len(x)
    25  	CMOVQLE y_len+32(FP), LEN    // if len(y) <= LEN { LEN = len(y) }
    26  	PXOR    SUM, SUM             // SUM = 0 (both float64 lanes)
    27  	CMPQ    LEN, $0
    28  	JE      dot_end              // if LEN == 0 { return 0 }
    29  
    30  	XORQ IDX, IDX    // IDX = 0
    31  	MOVQ Y_PTR, DX
    32  	ANDQ $0xF, DX    // DX = Y_PTR % 16 (only y's alignment is examined, not x's)
    33  	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }
    34  
    35  	SUBQ $16, DX     // DX = -(bytes from Y_PTR up to the next 16-byte boundary)
    36  
    37  dot_align: // Scalar-process leading elements until y[IDX] sits on a 16-byte boundary  do {
    38  	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
    39  	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
    40  	MULSD    X3, X2             // X2 *= X3
    41  	ADDSD    X2, SUM            // SUM += X2 (low lane only)
    42  	INCQ     IDX                // IDX++
    43  	DECQ     LEN                // LEN--
    44  	JZ       dot_end            // if LEN == 0 { return }
    45  	ADDQ     $4, DX             // DX += 4 (one float32 consumed)
    46  	JNZ      dot_align          // } while DX != 0
    47  
    48  dot_no_trim: // LEN > 0 here: both paths into this label have already excluded LEN == 0
    49  	PXOR P_SUM, P_SUM   // P_SUM = 0  for pipelining
    50  	MOVQ LEN, TAIL
    51  	ANDQ $0x7, TAIL     // TAIL = LEN % 8
    52  	SHRQ $3, LEN        // LEN = floor( LEN / 8 )
    53  	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }
    54  
    55  dot_loop: // Loop unrolled 8x  do {
    56  	CVTPS2PD (X_PTR)(IDX*4), X2   // X2..X5 = float64(x[i : i+8]), two elements per register
    57  	CVTPS2PD 8(X_PTR)(IDX*4), X3
    58  	CVTPS2PD 16(X_PTR)(IDX*4), X4
    59  	CVTPS2PD 24(X_PTR)(IDX*4), X5
    60  
    61  	CVTPS2PD (Y_PTR)(IDX*4), X6   // X6..X9 = float64(y[i : i+8]), two elements per register
    62  	CVTPS2PD 8(Y_PTR)(IDX*4), X7
    63  	CVTPS2PD 16(Y_PTR)(IDX*4), X8
    64  	CVTPS2PD 24(Y_PTR)(IDX*4), X9
    65  
    66  	MULPD X6, X2 // X2..X5 *= X6..X9 (element-wise products)
    67  	MULPD X7, X3
    68  	MULPD X8, X4
    69  	MULPD X9, X5
    70  
    71  	ADDPD X2, SUM   // products alternate between the two accumulators SUM and P_SUM
    72  	ADDPD X3, P_SUM
    73  	ADDPD X4, SUM
    74  	ADDPD X5, P_SUM
    75  
    76  	ADDQ $8, IDX  // IDX += 8
    77  	DECQ LEN
    78  	JNZ  dot_loop // } while --LEN > 0
    79  
    80  	ADDPD P_SUM, SUM // SUM += P_SUM
    81  	CMPQ  TAIL, $0   // if TAIL == 0 { return }
    82  	JE    dot_end
    83  
    84  dot_tail_start: // 1 <= TAIL <= 7 elements remain
    85  	MOVQ TAIL, LEN
    86  	SHRQ $1, LEN        // LEN = floor( TAIL / 2 )
    87  	JZ   dot_tail_one   // if LEN == 0 { goto dot_tail_one }  (TAIL == 1)
    88  
    89  dot_tail_two: // Two elements per iteration  do {
    90  	CVTPS2PD (X_PTR)(IDX*4), X2 // X2 = float64(x[i : i+2])
    91  	CVTPS2PD (Y_PTR)(IDX*4), X6 // X6 = float64(y[i : i+2])
    92  	MULPD    X6, X2             // X2 *= X6
    93  	ADDPD    X2, SUM            // SUM += X2
    94  	ADDQ     $2, IDX            // IDX += 2
    95  	DECQ     LEN
    96  	JNZ      dot_tail_two       // } while --LEN > 0
    97  
    98  	ANDQ $1, TAIL // if TAIL % 2 == 0 { return }
    99  	JZ   dot_end
   100  
   101  dot_tail_one: // Exactly one element remains
   102  	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
   103  	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
   104  	MULSD    X3, X2             // X2 *= X3
   105  	ADDSD    X2, SUM            // SUM += X2 (low lane only)
   106  
   107  dot_end:
   108  	HADDPD_SUM_SUM        // SUM = SUM[0] + SUM[1]  (horizontal add of the two float64 lanes)
   109  	MOVSD SUM, sum+48(FP) // return the low float64 lane of SUM
   110  	RET