// Copyright ©2021 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !noasm && !gccgo && !safe
// +build !noasm,!gccgo,!safe

#include "textflag.h"

// Register roles used by Sum:
//   X_PTR  base address of x
//   IDX    index (in elements) of the next element to consume
//   LEN    elements left during alignment, then 16-wide iterations left
//   TAIL   bytes to the next 16-byte boundary during alignment (negative),
//          then the element count left after alignment (low bits pick tails)
//   SUM    primary accumulator; lane 0 carries the final result
//   SUM_1, SUM_2, SUM_3  extra accumulators for the unrolled loop
#define X_PTR SI
#define IDX AX
#define LEN CX
#define TAIL BX
#define SUM X0
#define SUM_1 X1
#define SUM_2 X2
#define SUM_3 X3

// func Sum(x []float32) float32
//
// Sum returns the sum of the elements of x. An empty slice yields 0.
//
// Leading elements are added one at a time until the read address is
// 16-byte aligned, so that the packed ADDPS memory operands that follow are
// aligned. The bulk is then consumed 16 elements per iteration using four
// independent accumulators, and the remainder is handled in chunks of
// 8, 4, 2 and 1 elements selected by the low bits of the remaining count.
// Because partial sums are kept in separate lanes, the rounding of the result
// may differ from that of a strictly left-to-right summation.
TEXT ·Sum(SB), NOSPLIT, $0
	MOVQ  x_base+0(FP), X_PTR // X_PTR = &x[0]
	MOVQ  x_len+8(FP), LEN    // LEN = len(x)
	XORQ  IDX, IDX            // i = 0
	PXOR  SUM, SUM            // SUM = 0 (also the result for empty input)
	TESTQ LEN, LEN            // if LEN == 0 { return 0 }
	JZ    sum_end

	PXOR SUM_1, SUM_1
	PXOR SUM_2, SUM_2
	PXOR SUM_3, SUM_3

	MOVQ X_PTR, TAIL // Check memory alignment
	ANDQ $15, TAIL   // TAIL = &x[0] % 16
	JZ   no_trim     // if TAIL == 0 { goto no_trim }
	SUBQ $16, TAIL   // TAIL = -(bytes up to the next 16-byte boundary)

sum_align: // Scalar adds until aligned on a 16-byte boundary: do {
	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]
	INCQ  IDX                 // i++
	DECQ  LEN                 // LEN--
	JZ    sum_end             // if LEN == 0 { return SUM[0] }
	ADDQ  $4, TAIL            // TAIL += 4 (one float32 consumed)
	JNZ   sum_align           // } while TAIL != 0

no_trim:
	MOVQ LEN, TAIL // TAIL = elements remaining; bits 3..0 select the tails
	SHRQ $4, LEN   // LEN = floor( remaining / 16 )
	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }

sum_loop: // Sum 16 elements per iteration: do {
	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
	ADDPS 32(X_PTR)(IDX*4), SUM_2 // SUM_2 += x[i+8:i+12]
	ADDPS 48(X_PTR)(IDX*4), SUM_3 // SUM_3 += x[i+12:i+16]

	ADDQ $16, IDX // i += 16
	DECQ LEN
	JNZ  sum_loop // } while --LEN > 0

sum_tail8:
	ADDPS SUM_3, SUM   // fold four accumulators down to two
	ADDPS SUM_2, SUM_1

	TESTQ $8, TAIL // if remaining & 8 == 0 { goto sum_tail4 }
	JZ    sum_tail4

	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
	ADDQ  $8, IDX                 // i += 8

sum_tail4:
	ADDPS SUM_1, SUM // fold two accumulators down to one

	TESTQ $4, TAIL // if remaining & 4 == 0 { goto sum_tail2 }
	JZ    sum_tail2

	ADDPS (X_PTR)(IDX*4), SUM // SUM += x[i:i+4]
	ADDQ  $4, IDX             // i += 4

sum_tail2:
	HADDPS SUM, SUM // SUM = {s0+s1, s2+s3, s0+s1, s2+s3}

	TESTQ $2, TAIL // if remaining & 2 == 0 { goto sum_tail1 }
	JZ    sum_tail1

	MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1: low two lanes = x[i], x[i+1]
	ADDPS SUM_1, SUM            // SUM[0] += x[i]; SUM[1] += x[i+1]
	ADDQ  $2, IDX               // i += 2

sum_tail1:
	HADDPS SUM, SUM // SUM[0] += SUM[1]; lane 0 now holds the running total

	TESTQ $1, TAIL // if remaining & 1 == 0 { return SUM[0] }
	JZ    sum_end

	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]

sum_end: // return SUM[0]
	MOVSS SUM, ret+24(FP)
	RET