github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/asm/l2_avx512_amd64.s (about)

     1  //go:build !noasm && amd64
     2  // AUTO-GENERATED BY GOAT -- DO NOT EDIT
     3  
     // l2_512 stores the sum of squared differences of two float32 arrays,
     // sum((a[i]-b[i])^2), to the float32 pointed to by res.
     //
     // Syntax: Go (Plan 9) amd64 assembler. Most instructions are emitted as
     // raw BYTE/WORD/LONG encodings; the trailing comment on each such line is
     // the AT&T-syntax instruction those bytes encode.
     //
     // Arguments (four 8-byte slots, 32 bytes of arguments, no locals):
     //   a   +0(FP)  -> DI  first float32 array
     //   b   +8(FP)  -> SI  second float32 array
     //   res +16(FP) -> DX  address the float32 result is written to
     //   len +24(FP) -> CX  address of the element count; 64 bits are loaded
     //                      but every comparison uses the low 32 bits only
     //
     // Paths, chosen by the element count:
     //   count <= 0   : stores 0.0 and returns (guard added; see below)
     //   count 1..7   : scalar only (vmovss/vsubss/vmulss/vaddss)
     //   count >= 8   : 256-bit ymm blocks with vfmadd231ps, then scalar tail
     //   count >= 128 : 512-bit zmm blocks first (128 floats per iteration)
     //
     // Clobbers: AX, CX, R8, R9, DI, SI, flags, and vector registers 0-16.
     // BP is pushed on entry and popped before every return.
TEXT ·l2_512(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX
	BYTE $0x55               // pushq	%rbp
	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp
	LONG $0xf8e48348         // andq	$-8, %rsp
	WORD $0x8b48; BYTE $0x01 // movq	(%rcx), %rax
	WORD $0xf883; BYTE $0x07 // cmpl	$7, %eax
	JG   LBB0_9

	// Guard for count <= 0: store 0.0 and return through the scalar exit.
	// Without it a zero count reaches LBB0_8 with eax = 0, whose exit test
	// (ecx == eax after ecx += 4) cannot succeed until ecx wraps, so the loop
	// reads far beyond both arrays. Negative counts fail the same way.
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	WORD $0xc085             // testl	%eax, %eax
	JLE  LBB0_36

	// Small-count path (1..7 elements): r8d = count-1, r9d = count & 3.
	LONG $0xff408d44         // leal	-1(%rax), %r8d
	WORD $0x03a8             // testb	$3, %al
	JE   LBB0_2
	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
	LONG $0x03e18341         // andl	$3, %r9d
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	WORD $0xc931             // xorl	%ecx, %ecx

// One element per iteration, (count & 3) iterations, accumulating in xmm0.
LBB0_4:
	LONG $0x0f10fac5         // vmovss	(%rdi), %xmm1
	LONG $0x0e5cf2c5         // vsubss	(%rsi), %xmm1, %xmm1
	LONG $0xc959f2c5         // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5         // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x04c78348         // addq	$4, %rdi
	LONG $0x04c68348         // addq	$4, %rsi
	LONG $0x01c18348         // addq	$1, %rcx
	WORD $0x3941; BYTE $0xc9 // cmpl	%ecx, %r9d
	JNE  LBB0_4
	WORD $0xc829             // subl	%ecx, %eax
	LONG $0x03f88341         // cmpl	$3, %r8d
	JAE  LBB0_7

// Scalar exit: store xmm0 to *res, restore the stack and return.
LBB0_36:
	LONG $0x0211fac5         // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq	%rbp, %rsp
	BYTE $0x5d               // popq	%rbp
	BYTE $0xc3               // retq

// count >= 8. Fewer than 128 elements skip the zmm loop entirely.
LBB0_9:
	LONG $0xc057f8c5             // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0x0000803d; BYTE $0x00 // cmpl	$128, %eax
	JB   LBB0_10
	LONG $0xc957f0c5             // vxorps	%xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5             // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5             // vxorps	%xmm3, %xmm3, %xmm3
	LONG $0xed57d0c5             // vxorps	%xmm5, %xmm5, %xmm5
	LONG $0xe457d8c5             // vxorps	%xmm4, %xmm4, %xmm4
	LONG $0xf657c8c5             // vxorps	%xmm6, %xmm6, %xmm6
	LONG $0xff57c0c5             // vxorps	%xmm7, %xmm7, %xmm7
	LONG $0x573841c4; BYTE $0xc0 // vxorps	%xmm8, %xmm8, %xmm8

// 512-bit loop: 8 zmm loads (128 floats, 512 bytes) from each array per
// iteration. Differences land in zmm9..zmm16 and their squares are
// accumulated into zmm1..zmm8 with fused multiply-add.
LBB0_22:
	LONG $0x487c7162; WORD $0x0f10             // vmovups	(%rdi), %zmm9
	LONG $0x487c7162; WORD $0x5710; BYTE $0x01 // vmovups	64(%rdi), %zmm10
	LONG $0x487c7162; WORD $0x5f10; BYTE $0x02 // vmovups	128(%rdi), %zmm11
	LONG $0x487c7162; WORD $0x6710; BYTE $0x03 // vmovups	192(%rdi), %zmm12
	LONG $0x487c7162; WORD $0x6f10; BYTE $0x04 // vmovups	256(%rdi), %zmm13
	LONG $0x487c7162; WORD $0x7710; BYTE $0x05 // vmovups	320(%rdi), %zmm14
	LONG $0x487c7162; WORD $0x7f10; BYTE $0x06 // vmovups	384(%rdi), %zmm15
	LONG $0x48347162; WORD $0x0e5c             // vsubps	(%rsi), %zmm9, %zmm9
	LONG $0x482c7162; WORD $0x565c; BYTE $0x01 // vsubps	64(%rsi), %zmm10, %zmm10
	LONG $0x48247162; WORD $0x5e5c; BYTE $0x02 // vsubps	128(%rsi), %zmm11, %zmm11
	LONG $0x481c7162; WORD $0x665c; BYTE $0x03 // vsubps	192(%rsi), %zmm12, %zmm12
	LONG $0x48147162; WORD $0x6e5c; BYTE $0x04 // vsubps	256(%rsi), %zmm13, %zmm13
	LONG $0x480c7162; WORD $0x765c; BYTE $0x05 // vsubps	320(%rsi), %zmm14, %zmm14
	LONG $0x48047162; WORD $0x7e5c; BYTE $0x06 // vsubps	384(%rsi), %zmm15, %zmm15
	LONG $0x487ce162; WORD $0x4710; BYTE $0x07 // vmovups	448(%rdi), %zmm16
	LONG $0x407ce162; WORD $0x465c; BYTE $0x07 // vsubps	448(%rsi), %zmm16, %zmm16
	LONG $0x4835d262; WORD $0xc9b8             // vfmadd231ps	%zmm9, %zmm9, %zmm1
	LONG $0x482dd262; WORD $0xd2b8             // vfmadd231ps	%zmm10, %zmm10, %zmm2
	LONG $0x4825d262; WORD $0xdbb8             // vfmadd231ps	%zmm11, %zmm11, %zmm3
	LONG $0x481dd262; WORD $0xecb8             // vfmadd231ps	%zmm12, %zmm12, %zmm5
	LONG $0x4815d262; WORD $0xe5b8             // vfmadd231ps	%zmm13, %zmm13, %zmm4
	LONG $0x480dd262; WORD $0xf6b8             // vfmadd231ps	%zmm14, %zmm14, %zmm6
	LONG $0x4805d262; WORD $0xffb8             // vfmadd231ps	%zmm15, %zmm15, %zmm7
	LONG $0x407d3262; WORD $0xc0b8             // vfmadd231ps	%zmm16, %zmm16, %zmm8
	WORD $0xc083; BYTE $0x80                   // addl	$-128, %eax
	LONG $0x00c78148; WORD $0x0002; BYTE $0x00 // addq	$512, %rdi
	LONG $0x00c68148; WORD $0x0002; BYTE $0x00 // addq	$512, %rsi

	// Repeat while the remaining count differs from the sign extension of
	// its own low byte, i.e. until it fits in a signed 8-bit value.
	WORD $0xbe0f; BYTE $0xc8                   // movsbl	%al, %ecx
	WORD $0xc139                               // cmpl	%eax, %ecx
	JNE  LBB0_22

	// Add the eight zmm accumulators into zmm1, then add its upper 256 bits
	// onto its lower 256 bits so the running total lives in ymm1.
	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
	LONG $0x4864f162; WORD $0xd558             // vaddps	%zmm5, %zmm3, %zmm2
	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
	LONG $0x485cf162; WORD $0xd658             // vaddps	%zmm6, %zmm4, %zmm2
	LONG $0x4844d162; WORD $0xd858             // vaddps	%zmm8, %zmm7, %zmm3
	LONG $0x486cf162; WORD $0xd358             // vaddps	%zmm3, %zmm2, %zmm2
	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
	LONG $0x48fdf362; WORD $0xcb1b; BYTE $0x01 // vextractf64x4	$1, %zmm1, %ymm3
	LONG $0xd257e8c5                           // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xca58f4c5                           // vaddps	%ymm2, %ymm1, %ymm1
	LONG $0xcb58f4c5                           // vaddps	%ymm3, %ymm1, %ymm1
	WORD $0xc085                               // testl	%eax, %eax
	JE   LBB0_24
	WORD $0xf883; BYTE $0x20                   // cmpl	$32, %eax
	JB   LBB0_12

// At least 32 elements remain: clear the three extra ymm accumulators
// (ymm1 already holds the running total or zero).
LBB0_25:
	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0

// 256-bit loop: 4 ymm loads (32 floats, 128 bytes) per iteration,
// accumulating into ymm1, ymm0, ymm3 and ymm2. Runs while eax > 31.
LBB0_26:
	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20 // vmovups	32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40 // vmovups	64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60 // vmovups	96(%rdi), %ymm7
	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
	LONG $0x6e5cd4c5; BYTE $0x20 // vsubps	32(%rsi), %ymm5, %ymm5
	LONG $0x765cccc5; BYTE $0x40 // vsubps	64(%rsi), %ymm6, %ymm6
	LONG $0x7e5cc4c5; BYTE $0x60 // vsubps	96(%rsi), %ymm7, %ymm7
	LONG $0xb85de2c4; BYTE $0xcc // vfmadd231ps	%ymm4, %ymm4, %ymm1
	LONG $0xb855e2c4; BYTE $0xc5 // vfmadd231ps	%ymm5, %ymm5, %ymm0
	LONG $0xb84de2c4; BYTE $0xde // vfmadd231ps	%ymm6, %ymm6, %ymm3
	LONG $0xb845e2c4; BYTE $0xd7 // vfmadd231ps	%ymm7, %ymm7, %ymm2
	WORD $0xc083; BYTE $0xe0     // addl	$-32, %eax
	LONG $0x80ef8348             // subq	$-128, %rdi
	LONG $0x80ee8348             // subq	$-128, %rsi
	WORD $0xf883; BYTE $0x1f     // cmpl	$31, %eax
	JA   LBB0_26
	WORD $0xf883; BYTE $0x08     // cmpl	$8, %eax
	JAE  LBB0_14
	JMP  LBB0_19

// 8 <= count < 128: no zmm work was done, so ymm1 starts at zero.
LBB0_10:
	LONG $0xc957f0c5         // vxorps	%xmm1, %xmm1, %xmm1
	WORD $0xf883; BYTE $0x20 // cmpl	$32, %eax
	JAE  LBB0_25

// Fewer than 32 elements remain and the 32-wide loop was skipped.
LBB0_12:
	LONG $0xdb57e0c5         // vxorps	%xmm3, %xmm3, %xmm3
	LONG $0xd257e8c5         // vxorps	%xmm2, %xmm2, %xmm2
	WORD $0xf883; BYTE $0x08 // cmpl	$8, %eax
	JB   LBB0_19

// At least 8 elements remain. r8d = eax-8 and ecx = (r8d >> 3) + 1 is the
// number of whole 8-float blocks. The (blocks & 3) leftover blocks are done
// one at a time in LBB0_16; groups of four blocks are done in LBB0_37.
LBB0_14:
	LONG $0xf8408d44         // leal	-8(%rax), %r8d
	WORD $0x8945; BYTE $0xc1 // movl	%r8d, %r9d
	LONG $0x03e9c141         // shrl	$3, %r9d
	LONG $0x01498d41         // leal	1(%r9), %ecx
	WORD $0xc1f6; BYTE $0x03 // testb	$3, %cl
	JE   LBB0_18
	LONG $0x01c18041         // addb	$1, %r9b
	LONG $0xc9b60f45         // movzbl	%r9b, %r9d
	LONG $0x03e18341         // andl	$3, %r9d
	LONG $0x03e1c149         // shlq	$3, %r9
	WORD $0xc931             // xorl	%ecx, %ecx

// One ymm block (8 floats) per iteration into ymm1; ecx counts elements.
LBB0_16:
	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
	LONG $0xb85de2c4; BYTE $0xcc // vfmadd231ps	%ymm4, %ymm4, %ymm1
	LONG $0x20c78348             // addq	$32, %rdi
	LONG $0x20c68348             // addq	$32, %rsi
	LONG $0x08c18348             // addq	$8, %rcx
	WORD $0x3941; BYTE $0xc9     // cmpl	%ecx, %r9d
	JNE  LBB0_16
	WORD $0xc829                 // subl	%ecx, %eax

LBB0_18:
	LONG $0x18f88341 // cmpl	$24, %r8d
	JB   LBB0_19

// Four ymm blocks per iteration, chained through a single accumulator:
// each vfmadd213ps squares a difference and adds the previous partial sum,
// ending with the new total back in ymm1. Runs while eax > 7.
LBB0_37:
	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20 // vmovups	32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40 // vmovups	64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60 // vmovups	96(%rdi), %ymm7
	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
	LONG $0x6e5cd4c5; BYTE $0x20 // vsubps	32(%rsi), %ymm5, %ymm5
	LONG $0xa85de2c4; BYTE $0xe1 // vfmadd213ps	%ymm1, %ymm4, %ymm4
	LONG $0xa855e2c4; BYTE $0xec // vfmadd213ps	%ymm4, %ymm5, %ymm5
	LONG $0x665cccc5; BYTE $0x40 // vsubps	64(%rsi), %ymm6, %ymm4
	LONG $0xa85de2c4; BYTE $0xe5 // vfmadd213ps	%ymm5, %ymm4, %ymm4
	LONG $0x4e5cc4c5; BYTE $0x60 // vsubps	96(%rsi), %ymm7, %ymm1
	LONG $0xa875e2c4; BYTE $0xcc // vfmadd213ps	%ymm4, %ymm1, %ymm1
	WORD $0xc083; BYTE $0xe0     // addl	$-32, %eax
	LONG $0x80ef8348             // subq	$-128, %rdi
	LONG $0x80ee8348             // subq	$-128, %rsi
	WORD $0xf883; BYTE $0x07     // cmpl	$7, %eax
	JA   LBB0_37

// Scalar tail after the vector loops; the tail sum is kept in xmm4.
// Same shape as the small-count path: (eax & 3) single elements first,
// then groups of four in LBB0_34.
LBB0_19:
	WORD $0xc085             // testl	%eax, %eax
	JE   LBB0_20
	LONG $0xff408d44         // leal	-1(%rax), %r8d
	WORD $0x03a8             // testb	$3, %al
	JE   LBB0_28
	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
	LONG $0x03e18341         // andl	$3, %r9d
	LONG $0xe457d8c5         // vxorps	%xmm4, %xmm4, %xmm4
	WORD $0xc931             // xorl	%ecx, %ecx

LBB0_30:
	LONG $0x2f10fac5         // vmovss	(%rdi), %xmm5
	LONG $0x2e5cd2c5         // vsubss	(%rsi), %xmm5, %xmm5
	LONG $0xed59d2c5         // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5         // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x04c78348         // addq	$4, %rdi
	LONG $0x04c68348         // addq	$4, %rsi
	LONG $0x01c18348         // addq	$1, %rcx
	WORD $0x3941; BYTE $0xc9 // cmpl	%ecx, %r9d
	JNE  LBB0_30
	WORD $0xc829             // subl	%ecx, %eax
	LONG $0x03f88341         // cmpl	$3, %r8d
	JAE  LBB0_33
	JMP  LBB0_35

// Small-count path when count is a multiple of 4. The guard at entry
// ensures count >= 1 here, so r8d = count-1 cannot be -1.
LBB0_2:
	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0x03f88341 // cmpl	$3, %r8d
	JB   LBB0_36

LBB0_7:
	WORD $0xc089 // movl	%eax, %eax
	WORD $0xc931 // xorl	%ecx, %ecx

// Four scalar elements per iteration into xmm0, indexed by rcx; exits when
// ecx equals the remaining count in eax.
LBB0_8:
	LONG $0x0c10fac5; BYTE $0x8f   // vmovss	(%rdi,%rcx,4), %xmm1
	LONG $0x5410fac5; WORD $0x048f // vmovss	4(%rdi,%rcx,4), %xmm2
	LONG $0x0c5cf2c5; BYTE $0x8e   // vsubss	(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x4c5ceac5; WORD $0x048e // vsubss	4(%rsi,%rcx,4), %xmm2, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x4c10fac5; WORD $0x088f // vmovss	8(%rdi,%rcx,4), %xmm1
	LONG $0x4c5cf2c5; WORD $0x088e // vsubss	8(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x4c10fac5; WORD $0x0c8f // vmovss	12(%rdi,%rcx,4), %xmm1
	LONG $0x4c5cf2c5; WORD $0x0c8e // vsubss	12(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x04c18348               // addq	$4, %rcx
	WORD $0xc839                   // cmpl	%ecx, %eax
	JNE  LBB0_8
	JMP  LBB0_36

// No scalar tail: the tail sum is zero.
LBB0_20:
	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
	JMP  LBB0_35

// Exit used when the zmm loop consumed every element: horizontally reduce
// ymm1 (ymm2 is zero here) to one float, store it, and return.
LBB0_24:
	LONG $0xc258f4c5               // vaddps	%ymm2, %ymm1, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0xc957f0c5               // vxorps	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
	BYTE $0x5d                     // popq	%rbp
	WORD $0xf8c5; BYTE $0x77       // vzeroupper
	BYTE $0xc3                     // retq

// Scalar tail when the remaining count is a multiple of 4.
LBB0_28:
	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
	LONG $0x03f88341 // cmpl	$3, %r8d
	JB   LBB0_35

LBB0_33:
	WORD $0xc089 // movl	%eax, %eax
	WORD $0xc931 // xorl	%ecx, %ecx

// Four scalar tail elements per iteration into xmm4.
LBB0_34:
	LONG $0x2c10fac5; BYTE $0x8f   // vmovss	(%rdi,%rcx,4), %xmm5
	LONG $0x7410fac5; WORD $0x048f // vmovss	4(%rdi,%rcx,4), %xmm6
	LONG $0x2c5cd2c5; BYTE $0x8e   // vsubss	(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x6c5ccac5; WORD $0x048e // vsubss	4(%rsi,%rcx,4), %xmm6, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x6c10fac5; WORD $0x088f // vmovss	8(%rdi,%rcx,4), %xmm5
	LONG $0x6c5cd2c5; WORD $0x088e // vsubss	8(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x6c10fac5; WORD $0x0c8f // vmovss	12(%rdi,%rcx,4), %xmm5
	LONG $0x6c5cd2c5; WORD $0x0c8e // vsubss	12(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x04c18348               // addq	$4, %rcx
	WORD $0xc839                   // cmpl	%ecx, %eax
	JNE  LBB0_34

// Vector exit: add the four ymm accumulators (ymm0..ymm3), horizontally
// reduce to one float, add the scalar tail in xmm4, store and return.
LBB0_35:
	LONG $0xc158fcc5               // vaddps	%ymm1, %ymm0, %ymm0
	LONG $0xca58e4c5               // vaddps	%ymm2, %ymm3, %ymm1
	LONG $0xc058f4c5               // vaddps	%ymm0, %ymm1, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0xc058dac5               // vaddss	%xmm0, %xmm4, %xmm0
	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
	BYTE $0x5d                     // popq	%rbp
	WORD $0xf8c5; BYTE $0x77       // vzeroupper
	BYTE $0xc3                     // retq