github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/asm/dot_avx512_amd64.s (about)

     1  //go:build !noasm && amd64
     2  // AUTO-GENERATED BY GOAT -- DO NOT EDIT
     3  
     4  TEXT ·dot_512(SB), $0-32
        // dot_512 accumulates a[i]*b[i] over single-precision floats with fused
        // multiply-adds and stores one float32 through the res pointer.
        //
        // Arguments, as loaded by the four MOVQs below:
        //   a   +0(FP)  -> DI  pointer to the first float32 array
        //   b   +8(FP)  -> SI  pointer to the second float32 array
        //   res +16(FP) -> DX  pointer the float32 result is written through
        //   len +24(FP) -> CX  pointer to the element count; 8 bytes are loaded
        //                      from it, but all later compares use only EAX
        //
        // Processing tiers (n = element count):
        //   n <= 7 (signed compare) : scalar FMAs only (LBB0_3 / LBB0_17), exit LBB0_5
        //   n >= 128                : LBB0_8, 128 floats per iteration in zmm1..zmm8
        //   remaining >= 32         : LBB0_11 / LBB0_20, 32 or 64 floats per step
        //   remaining >= 8          : LBB0_25 / LBB0_28, 8 or 32 floats per step
        //   remaining 1..7          : LBB0_32 / LBB0_37, scalar into xmm4
        //   LBB0_38                 : reduce ymm0..ymm3 and xmm4 to one float
        //
        // NOTE(review): tracing n == 0 through the branches below (r8d = n-1 is
        // 0xFFFFFFFF, which is not below 3 unsigned) reaches LBB0_17 with a loop
        // bound of 0, so that loop would not stop at the end of the input. It
        // looks like callers must guarantee n >= 1 (negative counts take the
        // same signed branch) -- confirm against the Go wrapper.
     5  	MOVQ a+0(FP), DI
     6  	MOVQ b+8(FP), SI
     7  	MOVQ res+16(FP), DX
     8  	MOVQ len+24(FP), CX
        // Prologue: save RBP, keep the incoming RSP in RBP, then round RSP down
        // to a multiple of 8. Every exit path restores RSP from RBP.
     9  	BYTE $0x55               // pushq	%rbp
    10  	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp
    11  	LONG $0xf8e48348         // andq	$-8, %rsp
        // rax = *len. From here on the count is handled as the 32-bit EAX.
    12  	WORD $0x8b48; BYTE $0x01 // movq	(%rcx), %rax
    13  	WORD $0xf883; BYTE $0x07 // cmpl	$7, %eax
    14  	JG   LBB0_6
        // ---- Scalar-only path (n <= 7) -------------------------------------
        // r8d = n-1 is kept for the "are at least 4 elements left" test below.
        // If n is a multiple of 4 the one-at-a-time loop is skipped (LBB0_15);
        // otherwise LBB0_3 consumes n&3 elements first.
    15  	LONG $0xff408d44         // leal	-1(%rax), %r8d
    16  	WORD $0x03a8             // testb	$3, %al
    17  	JE   LBB0_15
    18  	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
    19  	LONG $0x03e18341         // andl	$3, %r9d
    20  	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
    21  	WORD $0xc931             // xorl	%ecx, %ecx
    22  
    23  LBB0_3:
        // One element per iteration: xmm0 += (*a) * (*b); both pointers advance
        // by 4 bytes; ecx counts iterations up to r9d = n&3.
    24  	LONG $0x0f10fac5             // vmovss	(%rdi), %xmm1
    25  	LONG $0xb971e2c4; BYTE $0x06 // vfmadd231ss	(%rsi), %xmm1, %xmm0
    26  	LONG $0x04c78348             // addq	$4, %rdi
    27  	LONG $0x04c68348             // addq	$4, %rsi
    28  	LONG $0x01c18348             // addq	$1, %rcx
    29  	WORD $0x3941; BYTE $0xc9     // cmpl	%ecx, %r9d
    30  	JNE  LBB0_3
        // eax = elements still to do. If n-1 >= 3 (unsigned) continue with the
        // 4-per-iteration loop, otherwise store the result.
    31  	WORD $0xc829                 // subl	%ecx, %eax
    32  	LONG $0x03f88341             // cmpl	$3, %r8d
    33  	JAE  LBB0_16
    34  	JMP  LBB0_5
    35  
    36  LBB0_6:
        // ---- Vector path (n >= 8) ------------------------------------------
        // xmm0 is cleared here and is not written by the 512-bit loop; it is
        // later used as one of the four 256-bit accumulators summed at LBB0_38.
    37  	LONG $0xc057f8c5             // vxorps	%xmm0, %xmm0, %xmm0
    38  	LONG $0x0000803d; BYTE $0x00 // cmpl	$128, %eax
    39  	JB   LBB0_13
        // Clear the eight accumulators used by the 512-bit loop (zmm1..zmm8),
        // addressed here through their xmm names.
    40  	LONG $0xc957f0c5             // vxorps	%xmm1, %xmm1, %xmm1
    41  	LONG $0xd257e8c5             // vxorps	%xmm2, %xmm2, %xmm2
    42  	LONG $0xdb57e0c5             // vxorps	%xmm3, %xmm3, %xmm3
    43  	LONG $0xed57d0c5             // vxorps	%xmm5, %xmm5, %xmm5
    44  	LONG $0xe457d8c5             // vxorps	%xmm4, %xmm4, %xmm4
    45  	LONG $0xf657c8c5             // vxorps	%xmm6, %xmm6, %xmm6
    46  	LONG $0xff57c0c5             // vxorps	%xmm7, %xmm7, %xmm7
    47  	LONG $0x573841c4; BYTE $0xc0 // vxorps	%xmm8, %xmm8, %xmm8
    48  
    49  LBB0_8:
        // 128 floats (512 bytes of each input) per iteration. Seven 64-byte
        // chunks of a are loaded into zmm9..zmm15 and multiplied-added with the
        // matching chunks of b into zmm1, zmm2, zmm3, zmm5, zmm4, zmm6, zmm7;
        // the eighth chunk reuses zmm9 and accumulates into zmm8.
    50  	LONG $0x487c7162; WORD $0x0f10             // vmovups	(%rdi), %zmm9
    51  	LONG $0x487c7162; WORD $0x5710; BYTE $0x01 // vmovups	64(%rdi), %zmm10
    52  	LONG $0x487c7162; WORD $0x5f10; BYTE $0x02 // vmovups	128(%rdi), %zmm11
    53  	LONG $0x487c7162; WORD $0x6710; BYTE $0x03 // vmovups	192(%rdi), %zmm12
    54  	LONG $0x487c7162; WORD $0x6f10; BYTE $0x04 // vmovups	256(%rdi), %zmm13
    55  	LONG $0x487c7162; WORD $0x7710; BYTE $0x05 // vmovups	320(%rdi), %zmm14
    56  	LONG $0x487c7162; WORD $0x7f10; BYTE $0x06 // vmovups	384(%rdi), %zmm15
    57  	LONG $0x4835f262; WORD $0x0eb8             // vfmadd231ps	(%rsi), %zmm9, %zmm1
    58  	LONG $0x482df262; WORD $0x56b8; BYTE $0x01 // vfmadd231ps	64(%rsi), %zmm10, %zmm2
    59  	LONG $0x4825f262; WORD $0x5eb8; BYTE $0x02 // vfmadd231ps	128(%rsi), %zmm11, %zmm3
    60  	LONG $0x481df262; WORD $0x6eb8; BYTE $0x03 // vfmadd231ps	192(%rsi), %zmm12, %zmm5
    61  	LONG $0x4815f262; WORD $0x66b8; BYTE $0x04 // vfmadd231ps	256(%rsi), %zmm13, %zmm4
    62  	LONG $0x480df262; WORD $0x76b8; BYTE $0x05 // vfmadd231ps	320(%rsi), %zmm14, %zmm6
    63  	LONG $0x4805f262; WORD $0x7eb8; BYTE $0x06 // vfmadd231ps	384(%rsi), %zmm15, %zmm7
    64  	LONG $0x487c7162; WORD $0x4f10; BYTE $0x07 // vmovups	448(%rdi), %zmm9
    65  	LONG $0x48357262; WORD $0x46b8; BYTE $0x07 // vfmadd231ps	448(%rsi), %zmm9, %zmm8
    66  	WORD $0xc083; BYTE $0x80                   // addl	$-128, %eax
    67  	LONG $0x00c78148; WORD $0x0002; BYTE $0x00 // addq	$512, %rdi
    68  	LONG $0x00c68148; WORD $0x0002; BYTE $0x00 // addq	$512, %rsi
        // Loop exit test: continue while EAX differs from the sign-extension of
        // its own low byte. The count entered this loop >= 128 and drops by 128
        // per iteration, so the first value for which the two agree is the
        // first remaining count below 128.
    69  	WORD $0xbe0f; BYTE $0xc8                   // movsbl	%al, %ecx
    70  	WORD $0xc139                               // cmpl	%eax, %ecx
    71  	JNE  LBB0_8
        // Fold the eight 512-bit accumulators pairwise into zmm1 ...
    72  	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
    73  	LONG $0x4864f162; WORD $0xd558             // vaddps	%zmm5, %zmm3, %zmm2
    74  	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
    75  	LONG $0x485cf162; WORD $0xd658             // vaddps	%zmm6, %zmm4, %zmm2
    76  	LONG $0x4844d162; WORD $0xd858             // vaddps	%zmm8, %zmm7, %zmm3
    77  	LONG $0x486cf162; WORD $0xd358             // vaddps	%zmm3, %zmm2, %zmm2
    78  	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
        // ... then add the upper 256 bits of zmm1 (extracted to ymm3) onto the
        // lower 256 bits. xmm2 is zeroed first, so the ymm2 add contributes 0
        // and ymm2 stays zero for the LBB0_18 exit.
    79  	LONG $0x48fdf362; WORD $0xcb1b; BYTE $0x01 // vextractf64x4	$1, %zmm1, %ymm3
    80  	LONG $0xd257e8c5                           // vxorps	%xmm2, %xmm2, %xmm2
    81  	LONG $0xca58f4c5                           // vaddps	%ymm2, %ymm1, %ymm1
    82  	LONG $0xcb58f4c5                           // vaddps	%ymm3, %ymm1, %ymm1
        // Nothing left: reduce ymm1 and return (LBB0_18). Fewer than 32 left:
        // skip the 32-float stage (LBB0_14). Otherwise fall into LBB0_11.
    83  	WORD $0xc085                               // testl	%eax, %eax
    84  	JE   LBB0_18
    85  	WORD $0xf883; BYTE $0x20                   // cmpl	$32, %eax
    86  	JB   LBB0_14
    87  
    88  LBB0_11:
        // At least 32 floats remain. LBB0_20 consumes 64 per iteration, so a
        // single 32-float step is done here first when needed: (n-32) has bit 5
        // set exactly when n has it clear, and in that case the single step is
        // skipped and the accumulators are just zeroed at LBB0_19.
    89  	LONG $0xe0488d44               // leal	-32(%rax), %r9d
    90  	LONG $0x20c1f641               // testb	$32, %r9b
    91  	JNE  LBB0_19
        // Single 32-float step: ymm1 accumulates the first 8 products; ymm0,
        // ymm3 and ymm2 are loaded from a and become a*b + 0 (ymm4 is zeroed
        // after its first use), which initialises those three accumulators.
    92  	LONG $0x2710fcc5               // vmovups	(%rdi), %ymm4
    93  	LONG $0x4710fcc5; BYTE $0x20   // vmovups	32(%rdi), %ymm0
    94  	LONG $0x5f10fcc5; BYTE $0x40   // vmovups	64(%rdi), %ymm3
    95  	LONG $0x5710fcc5; BYTE $0x60   // vmovups	96(%rdi), %ymm2
    96  	LONG $0xb85de2c4; BYTE $0x0e   // vfmadd231ps	(%rsi), %ymm4, %ymm1
    97  	LONG $0xe457d8c5               // vxorps	%xmm4, %xmm4, %xmm4
    98  	LONG $0x985de2c4; WORD $0x2046 // vfmadd132ps	32(%rsi), %ymm4, %ymm0
    99  	LONG $0x985de2c4; WORD $0x405e // vfmadd132ps	64(%rsi), %ymm4, %ymm3
   100  	LONG $0x985de2c4; WORD $0x6056 // vfmadd132ps	96(%rsi), %ymm4, %ymm2
        // subq $-128 advances each pointer by 128 bytes (32 floats).
   101  	LONG $0x80ef8348               // subq	$-128, %rdi
   102  	LONG $0x80ee8348               // subq	$-128, %rsi
   103  	WORD $0x8944; BYTE $0xc8       // movl	%r9d, %eax
   104  	LONG $0x20f98341               // cmpl	$32, %r9d
   105  	JAE  LBB0_20
   106  	JMP  LBB0_22
   107  
   108  LBB0_13:
        // Entry for 8 <= n < 128: no 512-bit work was done, so ymm1 starts at 0.
   109  	LONG $0xc957f0c5         // vxorps	%xmm1, %xmm1, %xmm1
   110  	WORD $0xf883; BYTE $0x20 // cmpl	$32, %eax
   111  	JAE  LBB0_11
   112  
   113  LBB0_14:
        // Fewer than 32 floats remain: zero the two accumulators that have not
        // been initialised on this path and continue with the 8-float stage.
   114  	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
   115  	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
   116  	JMP  LBB0_21
   117  
   118  LBB0_15:
        // Scalar path, n a multiple of 4: start from a zero sum and enter the
        // 4-per-iteration loop unless n-1 < 3 (unsigned).
   119  	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
   120  	LONG $0x03f88341 // cmpl	$3, %r8d
   121  	JB   LBB0_5
   122  
   123  LBB0_16:
        // eax = loop bound (remaining element count), rcx = element index.
   124  	WORD $0xc089 // movl	%eax, %eax
   125  	WORD $0xc931 // xorl	%ecx, %ecx
   126  
   127  LBB0_17:
        // Four elements per iteration as one dependent FMA chain:
        // xmm0 -> xmm1 -> xmm1 -> xmm2 -> xmm0, so the running sum is back in
        // xmm0 at the end of every iteration. Exits when ecx == eax.
   128  	LONG $0x0c10fac5; BYTE $0x8f               // vmovss	(%rdi,%rcx,4), %xmm1
   129  	LONG $0x5410fac5; WORD $0x048f             // vmovss	4(%rdi,%rcx,4), %xmm2
   130  	LONG $0x9979e2c4; WORD $0x8e0c             // vfmadd132ss	(%rsi,%rcx,4), %xmm0, %xmm1
   131  	LONG $0xb969e2c4; WORD $0x8e4c; BYTE $0x04 // vfmadd231ss	4(%rsi,%rcx,4), %xmm2, %xmm1
   132  	LONG $0x5410fac5; WORD $0x088f             // vmovss	8(%rdi,%rcx,4), %xmm2
   133  	LONG $0x9971e2c4; WORD $0x8e54; BYTE $0x08 // vfmadd132ss	8(%rsi,%rcx,4), %xmm1, %xmm2
   134  	LONG $0x4410fac5; WORD $0x0c8f             // vmovss	12(%rdi,%rcx,4), %xmm0
   135  	LONG $0x9969e2c4; WORD $0x8e44; BYTE $0x0c // vfmadd132ss	12(%rsi,%rcx,4), %xmm2, %xmm0
   136  	LONG $0x04c18348                           // addq	$4, %rcx
   137  	WORD $0xc839                               // cmpl	%ecx, %eax
   138  	JNE  LBB0_17
   139  
   140  LBB0_5:
        // Exit of the scalar-only path: *res = xmm0, undo the prologue, return.
        // Only xmm-width instructions ran on this path; unlike the two vector
        // exits below, no vzeroupper is issued here.
   141  	LONG $0x0211fac5         // vmovss	%xmm0, (%rdx)
   142  	WORD $0x8948; BYTE $0xec // movq	%rbp, %rsp
   143  	BYTE $0x5d               // popq	%rbp
   144  	BYTE $0xc3               // retq
   145  
   146  LBB0_18:
        // Exit used when the 512-bit loop consumed every element. ymm2 is zero
        // here, so ymm0 = ymm1. Two vhaddps leave the sum of each 128-bit half
        // in that half's lowest element; the halves are then added as scalars.
        // The extra add of a zeroed xmm1 does not change the value.
   147  	LONG $0xc258f4c5               // vaddps	%ymm2, %ymm1, %ymm0
   148  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
   149  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
   150  	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
   151  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   152  	LONG $0xc957f0c5               // vxorps	%xmm1, %xmm1, %xmm1
   153  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   154  	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
   155  	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
   156  	BYTE $0x5d                     // popq	%rbp
   157  	WORD $0xf8c5; BYTE $0x77       // vzeroupper
   158  	BYTE $0xc3                     // retq
   159  
   160  LBB0_19:
        // Reached when the single 32-float step at LBB0_11 was skipped: zero
        // ymm2, ymm3 and ymm0 so all four 256-bit accumulators are defined.
        // r9d still holds n-32 from LBB0_11.
   161  	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
   162  	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
   163  	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
   164  	LONG $0x20f98341 // cmpl	$32, %r9d
   165  	JB   LBB0_22
   166  
   167  LBB0_20:
        // 64 floats (256 bytes of each input) per iteration. Each of the four
        // accumulators (ymm1, ymm0, ymm3, ymm2) goes through two chained FMAs:
        // first temp(ymm4..ymm7) = a*b + acc, then acc = a'*b' + temp, where
        // a', b' are the chunks 128 bytes further on.
   168  	LONG $0x2710fcc5                           // vmovups	(%rdi), %ymm4
   169  	LONG $0x6f10fcc5; BYTE $0x20               // vmovups	32(%rdi), %ymm5
   170  	LONG $0x7710fcc5; BYTE $0x40               // vmovups	64(%rdi), %ymm6
   171  	LONG $0x7f10fcc5; BYTE $0x60               // vmovups	96(%rdi), %ymm7
   172  	LONG $0x9875e2c4; BYTE $0x26               // vfmadd132ps	(%rsi), %ymm1, %ymm4
   173  	LONG $0x987de2c4; WORD $0x206e             // vfmadd132ps	32(%rsi), %ymm0, %ymm5
   174  	LONG $0x9865e2c4; WORD $0x4076             // vfmadd132ps	64(%rsi), %ymm3, %ymm6
   175  	LONG $0x986de2c4; WORD $0x607e             // vfmadd132ps	96(%rsi), %ymm2, %ymm7
   176  	QUAD $0x000000808f10fcc5                   // vmovups	128(%rdi), %ymm1
   177  	QUAD $0x000000a08710fcc5                   // vmovups	160(%rdi), %ymm0
   178  	QUAD $0x000000c09f10fcc5                   // vmovups	192(%rdi), %ymm3
   179  	QUAD $0x000000e09710fcc5                   // vmovups	224(%rdi), %ymm2
   180  	QUAD $0x0000808e985de2c4; BYTE $0x00       // vfmadd132ps	128(%rsi), %ymm4, %ymm1
   181  	QUAD $0x0000a0869855e2c4; BYTE $0x00       // vfmadd132ps	160(%rsi), %ymm5, %ymm0
   182  	QUAD $0x0000c09e984de2c4; BYTE $0x00       // vfmadd132ps	192(%rsi), %ymm6, %ymm3
   183  	QUAD $0x0000e0969845e2c4; BYTE $0x00       // vfmadd132ps	224(%rsi), %ymm7, %ymm2
   184  	WORD $0xc083; BYTE $0xc0                   // addl	$-64, %eax
   185  	LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq	$256, %rdi
   186  	LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq	$256, %rsi
        // Repeat while more than 31 floats remain (unsigned compare).
   187  	WORD $0xf883; BYTE $0x1f                   // cmpl	$31, %eax
   188  	JA   LBB0_20
   189  
   190  LBB0_21:
        // From here on the remaining count lives in r9d.
   191  	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
   192  
   193  LBB0_22:
        // ---- 8-float stage -------------------------------------------------
        // Skipped when fewer than 8 floats remain. Otherwise r8d = remaining-8
        // and ecx = (r8d >> 3) + 1 is the number of whole 8-float groups. If
        // that number is not a multiple of 4, LBB0_25 handles (groups & 3) of
        // them one at a time; rax becomes (groups & 3) * 8, the float count at
        // which that loop stops.
   194  	LONG $0x08f98341         // cmpl	$8, %r9d
   195  	JB   LBB0_29
   196  	LONG $0xf8418d45         // leal	-8(%r9), %r8d
   197  	WORD $0x8944; BYTE $0xc0 // movl	%r8d, %eax
   198  	WORD $0xe8c1; BYTE $0x03 // shrl	$3, %eax
   199  	WORD $0x488d; BYTE $0x01 // leal	1(%rax), %ecx
   200  	WORD $0xc1f6; BYTE $0x03 // testb	$3, %cl
   201  	JE   LBB0_27
   202  	WORD $0x0104             // addb	$1, %al
   203  	WORD $0xb60f; BYTE $0xc0 // movzbl	%al, %eax
   204  	WORD $0xe083; BYTE $0x03 // andl	$3, %eax
   205  	LONG $0x03e0c148         // shlq	$3, %rax
   206  	WORD $0xc931             // xorl	%ecx, %ecx
   207  
   208  LBB0_25:
        // One 8-float group per iteration into ymm1; ecx counts floats done.
   209  	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
   210  	LONG $0xb85de2c4; BYTE $0x0e // vfmadd231ps	(%rsi), %ymm4, %ymm1
   211  	LONG $0x20c78348             // addq	$32, %rdi
   212  	LONG $0x20c68348             // addq	$32, %rsi
   213  	LONG $0x08c18348             // addq	$8, %rcx
   214  	WORD $0xc839                 // cmpl	%ecx, %eax
   215  	JNE  LBB0_25
        // Remove the floats just processed from the remaining count.
   216  	WORD $0x2941; BYTE $0xc9     // subl	%ecx, %r9d
   217  
   218  LBB0_27:
        // The 4-groups-per-iteration loop only runs if the stage started with
        // at least 32 floats (r8d = start-8 >= 24).
   219  	LONG $0x18f88341 // cmpl	$24, %r8d
   220  	JB   LBB0_29
   221  
   222  LBB0_28:
        // Four 8-float groups per iteration, chained through ymm4 and ending
        // back in ymm1; r9d drops by 32 and the loop repeats while r9d > 7.
   223  	LONG $0x2710fcc5               // vmovups	(%rdi), %ymm4
   224  	LONG $0x6f10fcc5; BYTE $0x20   // vmovups	32(%rdi), %ymm5
   225  	LONG $0x7710fcc5; BYTE $0x40   // vmovups	64(%rdi), %ymm6
   226  	LONG $0x7f10fcc5; BYTE $0x60   // vmovups	96(%rdi), %ymm7
   227  	LONG $0x9875e2c4; BYTE $0x26   // vfmadd132ps	(%rsi), %ymm1, %ymm4
   228  	LONG $0xb855e2c4; WORD $0x2066 // vfmadd231ps	32(%rsi), %ymm5, %ymm4
   229  	LONG $0xb84de2c4; WORD $0x4066 // vfmadd231ps	64(%rsi), %ymm6, %ymm4
   230  	LONG $0xcc28fcc5               // vmovaps	%ymm4, %ymm1
   231  	LONG $0xb845e2c4; WORD $0x604e // vfmadd231ps	96(%rsi), %ymm7, %ymm1
   232  	LONG $0xe0c18341               // addl	$-32, %r9d
   233  	LONG $0x80ef8348               // subq	$-128, %rdi
   234  	LONG $0x80ee8348               // subq	$-128, %rsi
   235  	LONG $0x07f98341               // cmpl	$7, %r9d
   236  	JA   LBB0_28
   237  
   238  LBB0_29:
        // ---- Scalar tail of the vector path --------------------------------
        // r9d = floats still unprocessed. The tail sum is kept in xmm4, apart
        // from the ymm accumulators. Same shape as the n <= 7 path: r9d&3
        // single steps (LBB0_32), then 4 per iteration (LBB0_37) if r9d-1 >= 3.
   239  	WORD $0x8545; BYTE $0xc9 // testl	%r9d, %r9d
   240  	JE   LBB0_34
   241  	LONG $0xff418d45         // leal	-1(%r9), %r8d
   242  	LONG $0x03c1f641         // testb	$3, %r9b
   243  	JE   LBB0_35
   244  	WORD $0x8944; BYTE $0xc9 // movl	%r9d, %ecx
   245  	WORD $0xe183; BYTE $0x03 // andl	$3, %ecx
   246  	LONG $0xe457d8c5         // vxorps	%xmm4, %xmm4, %xmm4
   247  	WORD $0xc031             // xorl	%eax, %eax
   248  
   249  LBB0_32:
        // One element per iteration: xmm4 += (*a) * (*b); eax counts up to ecx.
   250  	LONG $0x2f10fac5             // vmovss	(%rdi), %xmm5
   251  	LONG $0xb951e2c4; BYTE $0x26 // vfmadd231ss	(%rsi), %xmm5, %xmm4
   252  	LONG $0x04c78348             // addq	$4, %rdi
   253  	LONG $0x04c68348             // addq	$4, %rsi
   254  	LONG $0x01c08348             // addq	$1, %rax
   255  	WORD $0xc139                 // cmpl	%eax, %ecx
   256  	JNE  LBB0_32
   257  	WORD $0x2941; BYTE $0xc1     // subl	%eax, %r9d
   258  	LONG $0x03f88341             // cmpl	$3, %r8d
   259  	JAE  LBB0_36
   260  	JMP  LBB0_38
   261  
   262  LBB0_34:
        // No tail elements: the tail sum is zero.
   263  	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
   264  	JMP  LBB0_38
   265  
   266  LBB0_35:
        // Tail count is a multiple of 4: start the tail sum at zero.
   267  	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
   268  	LONG $0x03f88341 // cmpl	$3, %r8d
   269  	JB   LBB0_38
   270  
   271  LBB0_36:
        // eax = loop bound (remaining tail count), rcx = element index.
   272  	WORD $0x8944; BYTE $0xc8 // movl	%r9d, %eax
   273  	WORD $0xc931             // xorl	%ecx, %ecx
   274  
   275  LBB0_37:
        // Four tail elements per iteration as one dependent FMA chain
        // (xmm4 -> xmm5 -> xmm5 -> xmm6 -> xmm4); exits when ecx == eax.
   276  	LONG $0x2c10fac5; BYTE $0x8f               // vmovss	(%rdi,%rcx,4), %xmm5
   277  	LONG $0x7410fac5; WORD $0x048f             // vmovss	4(%rdi,%rcx,4), %xmm6
   278  	LONG $0x9959e2c4; WORD $0x8e2c             // vfmadd132ss	(%rsi,%rcx,4), %xmm4, %xmm5
   279  	LONG $0xb949e2c4; WORD $0x8e6c; BYTE $0x04 // vfmadd231ss	4(%rsi,%rcx,4), %xmm6, %xmm5
   280  	LONG $0x7410fac5; WORD $0x088f             // vmovss	8(%rdi,%rcx,4), %xmm6
   281  	LONG $0x9951e2c4; WORD $0x8e74; BYTE $0x08 // vfmadd132ss	8(%rsi,%rcx,4), %xmm5, %xmm6
   282  	LONG $0x6410fac5; WORD $0x0c8f             // vmovss	12(%rdi,%rcx,4), %xmm4
   283  	LONG $0x9949e2c4; WORD $0x8e64; BYTE $0x0c // vfmadd132ss	12(%rsi,%rcx,4), %xmm6, %xmm4
   284  	LONG $0x04c18348                           // addq	$4, %rcx
   285  	WORD $0xc839                               // cmpl	%ecx, %eax
   286  	JNE  LBB0_37
   287  
   288  LBB0_38:
        // Final reduction of the vector path: sum the four 256-bit accumulators
        // (ymm0+ymm1, ymm3+ymm2), horizontally add within each 128-bit half,
        // add the two halves as scalars, then add the scalar tail sum from
        // xmm4. Store through res, undo the prologue, vzeroupper, return.
   289  	LONG $0xc158fcc5               // vaddps	%ymm1, %ymm0, %ymm0
   290  	LONG $0xca58e4c5               // vaddps	%ymm2, %ymm3, %ymm1
   291  	LONG $0xc058f4c5               // vaddps	%ymm0, %ymm1, %ymm0
   292  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
   293  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
   294  	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
   295  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
   296  	LONG $0xc058dac5               // vaddss	%xmm0, %xmm4, %xmm0
   297  	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
   298  	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
   299  	BYTE $0x5d                     // popq	%rbp
   300  	WORD $0xf8c5; BYTE $0x77       // vzeroupper
   301  	BYTE $0xc3                     // retq