github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/asm/l2_avx256_amd64.s (about)

     1  //go:build !noasm && amd64
     2  // AUTO-GENERATED BY GOAT -- DO NOT EDIT
     3  
     4  TEXT ·l2_256(SB), $0-32
        	// l2_256 stores the sum of (a[i]-b[i])^2 over n single-precision
        	// pairs into *res. It uses AVX and FMA instructions (vfmadd231ps).
        	//
        	// Arguments, 8 bytes each on the argument frame:
        	//   a   +0(FP)  -> DI  pointer to the first float input
        	//   b   +8(FP)  -> SI  pointer to the second float input
        	//   res +16(FP) -> DX  pointer to the float that receives the result
        	//   len +24(FP) -> CX  pointer to the element count n; the count is
        	//                      loaded as 64 bits but only eax is ever tested
        	//
        	// n <= 0       stores 0.0 (hand-added guard, see below)
        	// 1 <= n <= 7  scalar code only
        	// n >= 8       256-bit loops (32 floats, then 8 floats per pass)
        	//              followed by a scalar tail for the last n%8 elements
     5  	MOVQ a+0(FP), DI
     6  	MOVQ b+8(FP), SI
     7  	MOVQ res+16(FP), DX
     8  	MOVQ len+24(FP), CX
        	// Prologue: save rbp, remember rsp in it, align rsp down to 8 bytes.
     9  	BYTE $0x55               // pushq	%rbp
    10  	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp
    11  	LONG $0xf8e48348         // andq	$-8, %rsp
        	// rax = *len.
    12  	WORD $0x8b48; BYTE $0x01 // movq	(%rcx), %rax
        	// Guard for n <= 0. Without it the scalar path below treats n-1 as a
        	// huge unsigned value and enters the four-at-a-time loop with a
        	// limit its counter can only reach by wrapping around, reading far
        	// outside both inputs. NOTE(review): this guard is a manual edit to
        	// generated code; mirror it in the generator's input so that a
        	// regeneration does not drop it.
        	WORD $0xc085             // testl	%eax, %eax
        	JLE  LBB0_27
    13  	WORD $0xf883; BYTE $0x07 // cmpl	$7, %eax
    14  	JG   LBB0_9
        	// Scalar path, 1 <= n <= 7. r8d = n-1 is kept so that "n-1 >= 3"
        	// can later answer whether a full group of four elements exists.
    15  	LONG $0xff408d44         // leal	-1(%rax), %r8d
    16  	WORD $0x03a8             // testb	$3, %al
    17  	JE   LBB0_2
        	// r9d = n & 3 leading elements are handled one at a time.
    18  	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
    19  	LONG $0x03e18341         // andl	$3, %r9d
    20  	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
    21  	WORD $0xc931             // xorl	%ecx, %ecx
    22  
        // One element per pass: xmm0 += (*a - *b)^2, then advance a and b.
    23  LBB0_4:
    24  	LONG $0x0f10fac5         // vmovss	(%rdi), %xmm1
    25  	LONG $0x0e5cf2c5         // vsubss	(%rsi), %xmm1, %xmm1
    26  	LONG $0xc959f2c5         // vmulss	%xmm1, %xmm1, %xmm1
    27  	LONG $0xc158fac5         // vaddss	%xmm1, %xmm0, %xmm0
    28  	LONG $0x04c78348         // addq	$4, %rdi
    29  	LONG $0x04c68348         // addq	$4, %rsi
    30  	LONG $0x01c18348         // addq	$1, %rcx
    31  	WORD $0x3941; BYTE $0xc9 // cmpl	%ecx, %r9d
    32  	JNE  LBB0_4
        	// eax = elements still to do (a multiple of 4); continue with the
        	// four-at-a-time loop only if the original n was at least 4.
    33  	WORD $0xc829             // subl	%ecx, %eax
    34  	LONG $0x03f88341         // cmpl	$3, %r8d
    35  	JAE  LBB0_7
    36  
        // Exit of the scalar path: *res = xmm0, restore rsp/rbp and return.
        // Every path that reaches this label has only written xmm registers.
    37  LBB0_26:
    38  	LONG $0x0211fac5         // vmovss	%xmm0, (%rdx)
    39  	WORD $0x8948; BYTE $0xec // movq	%rbp, %rsp
    40  	BYTE $0x5d               // popq	%rbp
    41  	BYTE $0xc3               // retq

        // n <= 0: the result is 0.0; share the store and epilogue of LBB0_26.
        LBB0_27:
        	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
        	JMP  LBB0_26
    42  
        // Vector path, n >= 8. ymm0..ymm3 are four independent accumulators;
        // the VEX-encoded xmm vxorps clears the full ymm register.
    43  LBB0_9:
    44  	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
    45  	WORD $0xf883; BYTE $0x20 // cmpl	$32, %eax
    46  	JB   LBB0_10
    47  	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
    48  	LONG $0xc957f0c5         // vxorps	%xmm1, %xmm1, %xmm1
    49  	LONG $0xd257e8c5         // vxorps	%xmm2, %xmm2, %xmm2
    50  	LONG $0xdb57e0c5         // vxorps	%xmm3, %xmm3, %xmm3
    51  
        // 32 floats per pass: load four 8-float vectors of a, subtract the
        // matching vectors of b, and accumulate each squared difference into
        // its own accumulator (ymm3, ymm2, ymm1, ymm0).
    52  LBB0_16:
    53  	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
    54  	LONG $0x6f10fcc5; BYTE $0x20 // vmovups	32(%rdi), %ymm5
    55  	LONG $0x7710fcc5; BYTE $0x40 // vmovups	64(%rdi), %ymm6
    56  	LONG $0x7f10fcc5; BYTE $0x60 // vmovups	96(%rdi), %ymm7
    57  	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
    58  	LONG $0x6e5cd4c5; BYTE $0x20 // vsubps	32(%rsi), %ymm5, %ymm5
    59  	LONG $0x765cccc5; BYTE $0x40 // vsubps	64(%rsi), %ymm6, %ymm6
    60  	LONG $0x7e5cc4c5; BYTE $0x60 // vsubps	96(%rsi), %ymm7, %ymm7
    61  	LONG $0xb85de2c4; BYTE $0xdc // vfmadd231ps	%ymm4, %ymm4, %ymm3
    62  	LONG $0xb855e2c4; BYTE $0xd5 // vfmadd231ps	%ymm5, %ymm5, %ymm2
    63  	LONG $0xb84de2c4; BYTE $0xce // vfmadd231ps	%ymm6, %ymm6, %ymm1
    64  	LONG $0xb845e2c4; BYTE $0xc7 // vfmadd231ps	%ymm7, %ymm7, %ymm0
        	// eax -= 32; a and b advance by 128 bytes (written as "sub -128",
        	// whose immediate fits in one signed byte).
    65  	WORD $0xc083; BYTE $0xe0     // addl	$-32, %eax
    66  	LONG $0x80ef8348             // subq	$-128, %rdi
    67  	LONG $0x80ee8348             // subq	$-128, %rsi
    68  	WORD $0xf883; BYTE $0x1f     // cmpl	$31, %eax
    69  	JA   LBB0_16
        	// Fewer than 32 left: use the 8-float loop if at least 8 remain,
        	// otherwise go straight to the scalar tail.
    70  	WORD $0xf883; BYTE $0x08     // cmpl	$8, %eax
    71  	JAE  LBB0_11
    72  	JMP  LBB0_13
    73  
        // Entry for 8 <= n < 32: ymm0 was cleared at LBB0_9, clear the rest.
    74  LBB0_10:
    75  	LONG $0xc957f0c5 // vxorps	%xmm1, %xmm1, %xmm1
    76  	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
    77  	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
    78  
        // 8 floats per pass, accumulated into ymm3, while more than 7 remain.
    79  LBB0_11:
    80  	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
    81  	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
    82  	LONG $0xb85de2c4; BYTE $0xdc // vfmadd231ps	%ymm4, %ymm4, %ymm3
    83  	WORD $0xc083; BYTE $0xf8     // addl	$-8, %eax
    84  	LONG $0x20c78348             // addq	$32, %rdi
    85  	LONG $0x20c68348             // addq	$32, %rsi
    86  	WORD $0xf883; BYTE $0x07     // cmpl	$7, %eax
    87  	JA   LBB0_11
    88  
        // Scalar tail for the remaining 0..7 elements, summed into xmm4. It has
        // the same shape as the scalar path above: eax&3 single elements
        // first, then groups of four.
    89  LBB0_13:
    90  	WORD $0xc085             // testl	%eax, %eax
    91  	JE   LBB0_14
    92  	LONG $0xff408d44         // leal	-1(%rax), %r8d
    93  	WORD $0x03a8             // testb	$3, %al
    94  	JE   LBB0_18
    95  	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
    96  	LONG $0x03e18341         // andl	$3, %r9d
    97  	LONG $0xe457d8c5         // vxorps	%xmm4, %xmm4, %xmm4
    98  	WORD $0xc931             // xorl	%ecx, %ecx
    99  
        // One element per pass: xmm4 += (*a - *b)^2.
   100  LBB0_20:
   101  	LONG $0x2f10fac5         // vmovss	(%rdi), %xmm5
   102  	LONG $0x2e5cd2c5         // vsubss	(%rsi), %xmm5, %xmm5
   103  	LONG $0xed59d2c5         // vmulss	%xmm5, %xmm5, %xmm5
   104  	LONG $0xe558dac5         // vaddss	%xmm5, %xmm4, %xmm4
   105  	LONG $0x04c78348         // addq	$4, %rdi
   106  	LONG $0x04c68348         // addq	$4, %rsi
   107  	LONG $0x01c18348         // addq	$1, %rcx
   108  	WORD $0x3941; BYTE $0xc9 // cmpl	%ecx, %r9d
   109  	JNE  LBB0_20
   110  	WORD $0xc829             // subl	%ecx, %eax
   111  	LONG $0x03f88341         // cmpl	$3, %r8d
   112  	JAE  LBB0_23
   113  	JMP  LBB0_25
   114  
        // Scalar path, n a multiple of 4: nothing to peel, xmm0 starts at 0.
        // With 1 <= n <= 7 this means n == 4, so the JB below is not taken.
   115  LBB0_2:
   116  	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
   117  	LONG $0x03f88341 // cmpl	$3, %r8d
   118  	JB   LBB0_26
   119  
        // Scalar path, groups of four. The movl clears the upper half of rax;
        // rcx is the element index and eax the (multiple-of-4) limit.
   120  LBB0_7:
   121  	WORD $0xc089 // movl	%eax, %eax
   122  	WORD $0xc931 // xorl	%ecx, %ecx
   123  
        // Four elements per pass, each added to xmm0 as (a[i]-b[i])^2.
   124  LBB0_8:
   125  	LONG $0x0c10fac5; BYTE $0x8f   // vmovss	(%rdi,%rcx,4), %xmm1
   126  	LONG $0x5410fac5; WORD $0x048f // vmovss	4(%rdi,%rcx,4), %xmm2
   127  	LONG $0x0c5cf2c5; BYTE $0x8e   // vsubss	(%rsi,%rcx,4), %xmm1, %xmm1
   128  	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
   129  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   130  	LONG $0x4c5ceac5; WORD $0x048e // vsubss	4(%rsi,%rcx,4), %xmm2, %xmm1
   131  	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
   132  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   133  	LONG $0x4c10fac5; WORD $0x088f // vmovss	8(%rdi,%rcx,4), %xmm1
   134  	LONG $0x4c5cf2c5; WORD $0x088e // vsubss	8(%rsi,%rcx,4), %xmm1, %xmm1
   135  	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
   136  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   137  	LONG $0x4c10fac5; WORD $0x0c8f // vmovss	12(%rdi,%rcx,4), %xmm1
   138  	LONG $0x4c5cf2c5; WORD $0x0c8e // vsubss	12(%rsi,%rcx,4), %xmm1, %xmm1
   139  	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
   140  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   141  	LONG $0x04c18348               // addq	$4, %rcx
   142  	WORD $0xc839                   // cmpl	%ecx, %eax
   143  	JNE  LBB0_8
   144  	JMP  LBB0_26
   145  
        // Vector path with no tail elements: the tail sum xmm4 is 0.
   146  LBB0_14:
   147  	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
   148  	JMP  LBB0_25
   149  
        // Tail whose length is a multiple of 4: nothing to peel, xmm4 = 0.
   150  LBB0_18:
   151  	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
   152  	LONG $0x03f88341 // cmpl	$3, %r8d
   153  	JB   LBB0_25
   154  
        // Tail, groups of four: rcx is the element index, eax the limit.
   155  LBB0_23:
   156  	WORD $0xc089 // movl	%eax, %eax
   157  	WORD $0xc931 // xorl	%ecx, %ecx
   158  
        // Four elements per pass, each added to xmm4 as (a[i]-b[i])^2.
   159  LBB0_24:
   160  	LONG $0x2c10fac5; BYTE $0x8f   // vmovss	(%rdi,%rcx,4), %xmm5
   161  	LONG $0x7410fac5; WORD $0x048f // vmovss	4(%rdi,%rcx,4), %xmm6
   162  	LONG $0x2c5cd2c5; BYTE $0x8e   // vsubss	(%rsi,%rcx,4), %xmm5, %xmm5
   163  	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
   164  	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
   165  	LONG $0x6c5ccac5; WORD $0x048e // vsubss	4(%rsi,%rcx,4), %xmm6, %xmm5
   166  	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
   167  	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
   168  	LONG $0x6c10fac5; WORD $0x088f // vmovss	8(%rdi,%rcx,4), %xmm5
   169  	LONG $0x6c5cd2c5; WORD $0x088e // vsubss	8(%rsi,%rcx,4), %xmm5, %xmm5
   170  	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
   171  	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
   172  	LONG $0x6c10fac5; WORD $0x0c8f // vmovss	12(%rdi,%rcx,4), %xmm5
   173  	LONG $0x6c5cd2c5; WORD $0x0c8e // vsubss	12(%rsi,%rcx,4), %xmm5, %xmm5
   174  	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
   175  	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
   176  	LONG $0x04c18348               // addq	$4, %rcx
   177  	WORD $0xc839                   // cmpl	%ecx, %eax
   178  	JNE  LBB0_24
   179  
        // Reduction for the vector path. The four accumulators are added into
        // ymm0; two horizontal adds leave each 128-bit half holding the sum
        // of its own four floats; the upper half is extracted and added to
        // the lower one, then the scalar tail xmm4 is added. The result is
        // stored to *res, rsp/rbp are restored, and vzeroupper clears the
        // upper ymm halves before returning.
   180  LBB0_25:
   181  	LONG $0xd358ecc5               // vaddps	%ymm3, %ymm2, %ymm2
   182  	LONG $0xc058f4c5               // vaddps	%ymm0, %ymm1, %ymm0
   183  	LONG $0xc258fcc5               // vaddps	%ymm2, %ymm0, %ymm0
   184  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
   185  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
   186  	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
   187  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   188  	LONG $0xc058dac5               // vaddss	%xmm0, %xmm4, %xmm0
   189  	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
   190  	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
   191  	BYTE $0x5d                     // popq	%rbp
   192  	WORD $0xf8c5; BYTE $0x77       // vzeroupper
   193  	BYTE $0xc3                     // retq