github.com/gopherd/gonum@v0.0.4/internal/asm/f64/sum_amd64.s (about)

     1  // Copyright ©2018 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  // Register roles used by Sum.
    10  #define X_PTR SI  // address of x[0]
    11  #define IDX AX    // index of the next element of x to read
    12  #define LEN CX    // elements left to read, then 16-element loop passes left
    13  #define TAIL BX   // alignment scratch, then the count whose low four bits select tail steps
    14  #define SUM X0    // accumulator 0; its low lane carries the result
    15  #define SUM_1 X1  // accumulator 1
    16  #define SUM_2 X2  // accumulator 2
    17  #define SUM_3 X3  // accumulator 3
    18  
    19  // func Sum(x []float64) float64
    20  //
    21  // Sum returns the sum of the elements of x, or 0 when len(x) == 0.
    22  //
    23  // Before the packed adds run, x[0] is added on its own when &x[0] is not a
    24  // multiple of 16, so that the packed reads start on a 16-byte boundary.
    25  // That single step is enough only for an 8-byte aligned base, which is
    26  // assumed here for a []float64.
    27  //
    28  // The main loop consumes 16 elements per pass using four two-lane
    29  // accumulators. The remaining count (< 16) is then handled by testing its
    30  // 8, 4, 2 and 1 bits in turn. The grouping of the additions differs between
    31  // the aligned and the peeled path, so the rounding of the result can depend
    32  // on the alignment of x.
    33  TEXT ·Sum(SB), NOSPLIT, $0-32
    34  	MOVQ  x_base+0(FP), X_PTR // X_PTR = &x[0]
    35  	MOVQ  x_len+8(FP), LEN    // LEN = len(x)
    36  	XORQ  IDX, IDX            // i = 0
    37  	PXOR  SUM, SUM            // sum = 0
    38  	TESTQ LEN, LEN            // if LEN == 0 { return 0 }
    39  	JZ    sum_end
    40  
    41  	PXOR SUM_1, SUM_1
    42  	PXOR SUM_2, SUM_2
    43  	PXOR SUM_3, SUM_3
    44  
    45  	MOVQ X_PTR, TAIL // Check memory alignment
    46  	ANDQ $15, TAIL   // TAIL = &x[0] % 16
    47  	JZ   no_trim     // if TAIL == 0 { goto no_trim }
    48  
    49  	// Align on 16-byte boundary by consuming x[0] alone.
    50  	ADDSD (X_PTR), SUM // sum[0] += x[0]
    51  	INCQ  IDX          // i++
    52  	DECQ  LEN          // LEN--
    53  	JZ    sum_end      // if LEN == 0 { return sum[0] }
    54  
    55  no_trim:
    56  	MOVQ LEN, TAIL // TAIL = elements left; bits 8, 4, 2, 1 are tested below
    57  	SHRQ $4, LEN   // LEN = floor( n / 16 )
    58  	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }
    59  
    60  sum_loop: // sum 16x wide do {
    61  	ADDPD (X_PTR)(IDX*8), SUM      // sum   += x[i:i+2]
    62  	ADDPD 16(X_PTR)(IDX*8), SUM_1  // sum_1 += x[i+2:i+4]
    63  	ADDPD 32(X_PTR)(IDX*8), SUM_2  // sum_2 += x[i+4:i+6]
    64  	ADDPD 48(X_PTR)(IDX*8), SUM_3  // sum_3 += x[i+6:i+8]
    65  	ADDPD 64(X_PTR)(IDX*8), SUM    // sum   += x[i+8:i+10]
    66  	ADDPD 80(X_PTR)(IDX*8), SUM_1  // sum_1 += x[i+10:i+12]
    67  	ADDPD 96(X_PTR)(IDX*8), SUM_2  // sum_2 += x[i+12:i+14]
    68  	ADDPD 112(X_PTR)(IDX*8), SUM_3 // sum_3 += x[i+14:i+16]
    69  	ADDQ  $16, IDX                 // i += 16
    70  	DECQ  LEN
    71  	JNZ   sum_loop                 // } while --LEN > 0
    72  
    73  sum_tail8: // if TAIL & 8 != 0 { sum 8 more elements }
    74  	TESTQ $8, TAIL
    75  	JZ    sum_tail4
    76  
    77  	ADDPD (X_PTR)(IDX*8), SUM     // sum   += x[i:i+2]
    78  	ADDPD 16(X_PTR)(IDX*8), SUM_1 // sum_1 += x[i+2:i+4]
    79  	ADDPD 32(X_PTR)(IDX*8), SUM_2 // sum_2 += x[i+4:i+6]
    80  	ADDPD 48(X_PTR)(IDX*8), SUM_3 // sum_3 += x[i+6:i+8]
    81  	ADDQ  $8, IDX                 // i += 8
    82  
    83  sum_tail4: // fold four accumulators into two; if TAIL & 4 != 0 { sum 4 more }
    84  	ADDPD SUM_3, SUM   // sum   += sum_3
    85  	ADDPD SUM_2, SUM_1 // sum_1 += sum_2
    86  
    87  	TESTQ $4, TAIL
    88  	JZ    sum_tail2
    89  
    90  	ADDPD (X_PTR)(IDX*8), SUM     // sum   += x[i:i+2]
    91  	ADDPD 16(X_PTR)(IDX*8), SUM_1 // sum_1 += x[i+2:i+4]
    92  	ADDQ  $4, IDX                 // i += 4
    93  
    94  sum_tail2: // fold two accumulators into one; if TAIL & 2 != 0 { sum 2 more }
    95  	ADDPD SUM_1, SUM // sum += sum_1
    96  
    97  	TESTQ $2, TAIL
    98  	JZ    sum_tail1
    99  
   100  	ADDPD (X_PTR)(IDX*8), SUM // sum += x[i:i+2]
   101  	ADDQ  $2, IDX             // i += 2
   102  
   103  sum_tail1: // combine both lanes; if TAIL & 1 != 0 { add the last element }
   104  	HADDPD SUM, SUM // sum[0] = sum[0] + sum[1]
   105  
   106  	TESTQ $1, TAIL
   107  	JZ    sum_end
   108  
   109  	ADDSD (X_PTR)(IDX*8), SUM // sum[0] += x[i]
   110  
   111  sum_end: // return sum[0]
   112  	MOVSD SUM, ret+24(FP)
   113  	RET