gonum.org/v1/gonum@v0.14.0/internal/asm/f64/abssuminc_amd64.s (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  // func L1NormInc(x []float64, n, incX int) (sum float64)
    10  TEXT ·L1NormInc(SB), NOSPLIT, $0
    11  	MOVQ  x_base+0(FP), SI // SI = &x
    12  	MOVQ  n+24(FP), CX     // CX = n
    13  	MOVQ  incX+32(FP), AX  // AX =  increment * sizeof( float64 )
    14  	SHLQ  $3, AX
    15  	MOVQ  AX, DX           // DX = AX * 3
    16  	IMULQ $3, DX
    17  	PXOR  X0, X0           // p_sum_i = 0
    18  	PXOR  X1, X1
    19  	PXOR  X2, X2
    20  	PXOR  X3, X3
    21  	PXOR  X4, X4
    22  	PXOR  X5, X5
    23  	PXOR  X6, X6
    24  	PXOR  X7, X7
    25  	CMPQ  CX, $0           // if CX == 0 { return 0 }
    26  	JE    absum_end
    27  	MOVQ  CX, BX
    28  	ANDQ  $7, BX           // BX = n % 8
    29  	SHRQ  $3, CX           // CX = floor( n / 8 )
    30  	JZ    absum_tail_start // if CX == 0 { goto absum_tail_start }
    31  
    32  absum_loop: // do {
    33  	// p_sum = max( p_sum + x[i], p_sum - x[i] )
    34  	MOVSD  (SI), X8        // X_i[0] = x[i]
    35  	MOVSD  (SI)(AX*1), X9
    36  	MOVSD  (SI)(AX*2), X10
    37  	MOVSD  (SI)(DX*1), X11
    38  	LEAQ   (SI)(AX*4), SI  // SI = SI + 4
    39  	MOVHPD (SI), X8        // X_i[1] = x[i+4]
    40  	MOVHPD (SI)(AX*1), X9
    41  	MOVHPD (SI)(AX*2), X10
    42  	MOVHPD (SI)(DX*1), X11
    43  	ADDPD  X8, X0          // p_sum_i += X_i  ( positive values )
    44  	ADDPD  X9, X2
    45  	ADDPD  X10, X4
    46  	ADDPD  X11, X6
    47  	SUBPD  X8, X1          // p_sum_(i+1) -= X_i  ( negative values )
    48  	SUBPD  X9, X3
    49  	SUBPD  X10, X5
    50  	SUBPD  X11, X7
    51  	MAXPD  X1, X0          // p_sum_i = max( p_sum_i, p_sum_(i+1) )
    52  	MAXPD  X3, X2
    53  	MAXPD  X5, X4
    54  	MAXPD  X7, X6
    55  	MOVAPS X0, X1          // p_sum_(i+1) = p_sum_i
    56  	MOVAPS X2, X3
    57  	MOVAPS X4, X5
    58  	MOVAPS X6, X7
    59  	LEAQ   (SI)(AX*4), SI  // SI = SI + 4
    60  	LOOP   absum_loop      // } while --CX > 0
    61  
    62  	// p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) )
    63  	ADDPD X3, X0
    64  	ADDPD X5, X7
    65  	ADDPD X7, X0
    66  
    67  	// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
    68  	MOVAPS X0, X1
    69  	SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
    70  	ADDSD  X1, X0
    71  	CMPQ   BX, $0
    72  	JE     absum_end    // if BX == 0 { goto absum_end }
    73  
    74  absum_tail_start: // Reset loop registers
    75  	MOVQ  BX, CX // Loop counter:  CX = BX
    76  	XORPS X8, X8 // X_8 = 0
    77  
    78  absum_tail: // do {
    79  	// p_sum += max( p_sum + x[i], p_sum - x[i] )
    80  	MOVSD (SI), X8   // X_8 = x[i]
    81  	MOVSD X0, X1     // p_sum_1 = p_sum_0
    82  	ADDSD X8, X0     // p_sum_0 += X_8
    83  	SUBSD X8, X1     // p_sum_1 -= X_8
    84  	MAXSD X1, X0     // p_sum_0 = max( p_sum_0, p_sum_1 )
    85  	ADDQ  AX, SI     // i++
    86  	LOOP  absum_tail // } while --CX > 0
    87  
    88  absum_end: // return p_sum_0
    89  	MOVSD X0, sum+40(FP)
    90  	RET