gonum.org/v1/gonum@v0.14.0/internal/asm/f64/l1norm_amd64.s (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  // func L1Dist(s, t []float64) float64
    10  //
    11  // L1Dist returns the sum of |s[i] - t[i]| for 0 <= i < min(len(s), len(t)).
    12  // Register use: DI = &s[0], SI = &t[0], AX = i, CX = pairs left, BX = odd
    13  // element flag, X3 = two running partial sums (folded at l1_end).
    14  TEXT ·L1Dist(SB), NOSPLIT, $0
    15  	MOVQ    s_base+0(FP), DI  // DI = &s
    16  	MOVQ    t_base+24(FP), SI // SI = &t
    17  	MOVQ    s_len+8(FP), CX   // CX = len(s)
    18  	CMPQ    t_len+32(FP), CX  // CX = min( CX, len(t) )
    19  	CMOVQLE t_len+32(FP), CX
    20  	PXOR    X3, X3            // norm = 0
    21  	TESTQ   CX, CX            // if CX == 0 { return 0 }
    22  	JZ      l1_end
    23  	XORQ    AX, AX            // i = 0
    24  	MOVQ    CX, BX
    25  	ANDQ    $1, BX            // BX = CX % 2
    26  	SHRQ    $1, CX            // CX = floor( CX / 2 )
    27  	JZ      l1_tail           // if CX == 0 { goto l1_tail } (one element only)
    28  
    29  l1_loop: // Loop unrolled 2x  do {
    30  	MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+2]
    31  	MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+2]
    32  	MOVAPS X0, X2         // X2 = copy of t[i:i+2]
    33  	SUBPD  X1, X0         // X0 = t - s
    34  	SUBPD  X2, X1         // X1 = s - t
    35  	MAXPD  X1, X0         // X0 = max( t - s, s - t ) = |s - t|
    36  	ADDPD  X0, X3         // norm += X0
    37  	ADDQ   $2, AX         // i += 2
    38  	DECQ   CX             // DECQ+JNZ replaces the legacy LOOP instruction
    39  	JNZ    l1_loop        // } while --CX > 0
    40  	TESTQ  BX, BX         // if BX == 0 { return }
    41  	JZ     l1_end
    42  
    43  l1_tail: // Exactly one element remains (BX == 1); MOVSD loads zero the upper lanes.
    44  	MOVSD  (SI)(AX*8), X0 // X0 = t[i]
    45  	MOVSD  (DI)(AX*8), X1 // X1 = s[i]
    46  	MOVAPD X0, X2         // X2 = copy of t[i]
    47  	SUBSD  X1, X0         // X0 = t[i] - s[i]
    48  	SUBSD  X2, X1         // X1 = s[i] - t[i]
    49  	MAXSD  X1, X0         // X0 = max( X0, X1 ) = |s[i] - t[i]|
    50  	ADDSD  X0, X3         // norm[0] += X0
    51  
    52  l1_end:
    53  	MOVAPS X3, X2
    54  	SHUFPD $1, X2, X2     // X2 = lanes of X3 swapped
    55  	ADDSD  X3, X2         // X2 = X3[1] + X3[0]
    56  	MOVSD  X2, ret+48(FP) // return X2
    57  	RET
    58