// Copyright ©2018 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// Register roles for Sum.
#define X_PTR SI // base address of x
#define IDX AX   // index of the next unread element
#define LEN CX   // element count, later the 16-wide block count
#define TAIL BX  // alignment scratch, later the remaining element count
#define SUM X0   // packed accumulator 0; the result ends up in its low lane
#define SUM_1 X1 // packed accumulator 1
#define SUM_2 X2 // packed accumulator 2
#define SUM_3 X3 // packed accumulator 3

// func Sum(x []float64) float64
//
// Sum returns the sum of the elements of x. An empty slice yields 0.
//
// The elements are accumulated in four independent two-lane registers that
// are folded together at the end, so the additions are not performed in
// index order. When &x[0] is not 16-byte aligned the first element is added
// on its own before the packed loop starts, which means the exact order of
// the additions also depends on the address of x.
//
// The packed adds read their operands straight from memory and need a
// 16-byte aligned address; peeling a single element is only enough if x is
// 8-byte aligned.
// NOTE(review): assumes the backing array of x is 8-byte aligned - confirm
// for slices built with package unsafe.
//
// Only SSE2 instructions are used.
//
// Clobbers: AX, BX, CX, SI, X0-X3 and the flags.
TEXT ·Sum(SB), NOSPLIT, $0-32
	MOVQ  x_base+0(FP), X_PTR // X_PTR = &x[0]
	MOVQ  x_len+8(FP), LEN    // LEN = len(x)
	XORQ  IDX, IDX            // i = 0
	PXOR  SUM, SUM            // sum = 0
	TESTQ LEN, LEN            // if len(x) == 0 { return 0 }
	JZ    sum_end

	PXOR SUM_1, SUM_1
	PXOR SUM_2, SUM_2
	PXOR SUM_3, SUM_3

	MOVQ X_PTR, TAIL // Check memory alignment
	ANDQ $15, TAIL   // TAIL = &x[0] % 16
	JZ   no_trim     // if TAIL == 0 { goto no_trim }

	// Consume one element so that &x[i] lands on a 16-byte boundary.
	ADDSD (X_PTR), SUM // sum[0] += x[0]
	INCQ  IDX          // i++
	DECQ  LEN          // LEN--
	JZ    sum_end      // if LEN == 0 { return sum[0] }

no_trim:
	MOVQ LEN, TAIL // TAIL = elements left; bits 3..0 select the tail steps below
	SHRQ $4, LEN   // LEN = floor( TAIL / 16 )
	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }

sum_loop: // sum 16 elements per iteration: do {
	ADDPD (X_PTR)(IDX*8), SUM      // sum   += x[i:i+2]
	ADDPD 16(X_PTR)(IDX*8), SUM_1  // sum_1 += x[i+2:i+4]
	ADDPD 32(X_PTR)(IDX*8), SUM_2  // sum_2 += x[i+4:i+6]
	ADDPD 48(X_PTR)(IDX*8), SUM_3  // sum_3 += x[i+6:i+8]
	ADDPD 64(X_PTR)(IDX*8), SUM    // sum   += x[i+8:i+10]
	ADDPD 80(X_PTR)(IDX*8), SUM_1  // sum_1 += x[i+10:i+12]
	ADDPD 96(X_PTR)(IDX*8), SUM_2  // sum_2 += x[i+12:i+14]
	ADDPD 112(X_PTR)(IDX*8), SUM_3 // sum_3 += x[i+14:i+16]
	ADDQ  $16, IDX                 // i += 16
	DECQ  LEN
	JNZ   sum_loop                 // } while --LEN > 0

sum_tail8: // if TAIL & 8 != 0 { sum the next 8 elements }
	TESTQ $8, TAIL
	JZ    sum_tail4

	ADDPD (X_PTR)(IDX*8), SUM     // sum   += x[i:i+2]
	ADDPD 16(X_PTR)(IDX*8), SUM_1 // sum_1 += x[i+2:i+4]
	ADDPD 32(X_PTR)(IDX*8), SUM_2 // sum_2 += x[i+4:i+6]
	ADDPD 48(X_PTR)(IDX*8), SUM_3 // sum_3 += x[i+6:i+8]
	ADDQ  $8, IDX                 // i += 8

sum_tail4: // fold four accumulators into two; if TAIL & 4 != 0 { sum the next 4 }
	ADDPD SUM_3, SUM   // sum   += sum_3
	ADDPD SUM_2, SUM_1 // sum_1 += sum_2

	TESTQ $4, TAIL
	JZ    sum_tail2

	ADDPD (X_PTR)(IDX*8), SUM     // sum   += x[i:i+2]
	ADDPD 16(X_PTR)(IDX*8), SUM_1 // sum_1 += x[i+2:i+4]
	ADDQ  $4, IDX                 // i += 4

sum_tail2: // fold two accumulators into one; if TAIL & 2 != 0 { sum the next 2 }
	ADDPD SUM_1, SUM // sum += sum_1

	TESTQ $2, TAIL
	JZ    sum_tail1

	ADDPD (X_PTR)(IDX*8), SUM // sum += x[i:i+2]
	ADDQ  $2, IDX             // i += 2

sum_tail1: // reduce the two lanes to a scalar; if TAIL & 1 != 0 { add the last element }
	// Horizontal add without HADDPD (an SSE3 instruction). SUM_1 has
	// already been folded into SUM, so it is free to use as scratch.
	MOVAPD   SUM, SUM_1
	UNPCKHPD SUM_1, SUM_1 // sum_1[0] = sum[1]
	ADDSD    SUM_1, SUM   // sum[0] += sum[1]

	TESTQ $1, TAIL
	JZ    sum_end

	ADDSD (X_PTR)(IDX*8), SUM // sum[0] += x[i]

sum_end: // return sum[0]
	MOVSD SUM, ret+24(FP)
	RET