github.com/gopherd/gonum@v0.0.4/internal/asm/f32/sum_amd64.s (about)

     1  // Copyright ©2021 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  #define X_PTR SI
    10  #define IDX AX
    11  #define LEN CX
    12  #define TAIL BX
    13  #define SUM X0
    14  #define SUM_1 X1
    15  #define SUM_2 X2
    16  #define SUM_3 X3
    17  
    18  // func Sum(x []float32) float32
    19  TEXT ·Sum(SB), NOSPLIT, $0
    20  	MOVQ x_base+0(FP), X_PTR // X_PTR = &x
    21  	MOVQ x_len+8(FP), LEN    // LEN = len(x)
    22  	XORQ IDX, IDX            // i = 0
    23  	PXOR SUM, SUM            // p_sum_i = 0
    24  	CMPQ LEN, $0             // if LEN == 0 { return 0 }
    25  	JE   sum_end
    26  
    27  	PXOR SUM_1, SUM_1
    28  	PXOR SUM_2, SUM_2
    29  	PXOR SUM_3, SUM_3
    30  
    31  	MOVQ X_PTR, TAIL // Check memory alignment
    32  	ANDQ $15, TAIL   // TAIL = &x % 16
    33  	JZ   no_trim     // if TAIL == 0 { goto no_trim }
    34  	SUBQ $16, TAIL   // TAIL -= 16
    35  
    36  sum_align: // Align on 16-byte boundary do {
    37  	ADDSS (X_PTR)(IDX*4), SUM // SUM += x[0]
    38  	INCQ  IDX                 // i++
    39  	DECQ  LEN                 // LEN--
    40  	JZ    sum_end             // if LEN == 0 { return }
    41  	ADDQ  $4, TAIL            // TAIL += 4
    42  	JNZ   sum_align           // } while TAIL < 0
    43  
    44  no_trim:
    45  	MOVQ LEN, TAIL
    46  	SHRQ $4, LEN   // LEN = floor( n / 16 )
    47  	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }
    48  
    49  
    50  sum_loop: // sum 16x wide do {
    51  	ADDPS (X_PTR)(IDX*4), SUM     // sum_i += x[i:i+4]
    52  	ADDPS 16(X_PTR)(IDX*4), SUM_1
    53  	ADDPS 32(X_PTR)(IDX*4), SUM_2
    54  	ADDPS 48(X_PTR)(IDX*4), SUM_3
    55  
    56  	ADDQ  $16, IDX                // i += 16
    57  	DECQ  LEN
    58  	JNZ   sum_loop                // } while --LEN > 0
    59  
    60  sum_tail8:
    61  	ADDPS SUM_3, SUM
    62  	ADDPS SUM_2, SUM_1
    63  
    64  	TESTQ $8, TAIL
    65  	JZ    sum_tail4
    66  
    67  	ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
    68  	ADDPS 16(X_PTR)(IDX*4), SUM_1
    69  	ADDQ  $8, IDX
    70  
    71  sum_tail4:
    72  	ADDPS SUM_1, SUM
    73  
    74  	TESTQ $4, TAIL
    75  	JZ    sum_tail2
    76  
    77  	ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
    78  	ADDQ  $4, IDX
    79  
    80  sum_tail2:
    81  	HADDPS SUM, SUM            // sum_i[:2] += sum_i[2:4]
    82  
    83  	TESTQ $2, TAIL
    84  	JZ    sum_tail1
    85  
    86  	MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1
    87  	ADDPS SUM_1, SUM            // sum_i += x[i:i+2]
    88  	ADDQ  $2, IDX
    89  
    90  sum_tail1:
    91  	HADDPS SUM, SUM // sum_i[0] += sum_i[1]
    92  
    93  	TESTQ $1, TAIL
    94  	JZ    sum_end
    95  
    96  	ADDSS (X_PTR)(IDX*4), SUM
    97  
    98  sum_end: // return sum
    99  	MOVSS SUM, ret+24(FP)
   100  	RET