github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/asm/dot_avx256_amd64.s (about)

     1  //go:build !noasm && amd64
     2  // AUTO-GENERATED BY GOAT -- DO NOT EDIT
     3  
     4  TEXT ·dot_256(SB), $0-32 // dot_256(a, b, res, len): accumulates a[i]*b[i] over n 4-byte floats and stores the scalar sum at *res; n = low 32 bits of *len
     5  	MOVQ a+0(FP), DI // DI (%rdi) = a, address of the first input vector
     6  	MOVQ b+8(FP), SI // SI (%rsi) = b, address of the second input vector
     7  	MOVQ res+16(FP), DX // DX (%rdx) = res, address the result float is written to
     8  	MOVQ len+24(FP), CX // CX (%rcx) = len, address the element count is read from
     9  	BYTE $0x55               // pushq	%rbp
    10  	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp -- rbp keeps the entry rsp; both exits restore rsp from it
    11  	LONG $0xf8e48348         // andq	$-8, %rsp
    12  	WORD $0x8b4c; BYTE $0x09 // movq	(%rcx), %r9 -- r9 = *len (64-bit load; only its low 32 bits are consulted below)
    13  	LONG $0x07f98341         // cmpl	$7, %r9d
    14  	JG   LBB0_9 // signed n > 7: take the AVX path; otherwise stay on the scalar small path
    15  	LONG $0xff418d45         // leal	-1(%r9), %r8d -- r8d = n - 1
    16  	LONG $0x03c1f641         // testb	$3, %r9b
    17  	JE   LBB0_2 // n & 3 == 0: skip the one-element-at-a-time loop
    18  	WORD $0x8944; BYTE $0xc8 // movl	%r9d, %eax
    19  	WORD $0xe083; BYTE $0x03 // andl	$3, %eax -- eax = n & 3 = iteration count for LBB0_4
    20  	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0 -- xmm0 = running sum = 0
    21  	WORD $0xc931             // xorl	%ecx, %ecx -- ecx = iteration counter
    22  // Small path, step 1: one scalar multiply-add per iteration for the first n & 3 elements.
    23  LBB0_4:
    24  	LONG $0x0f10fac5             // vmovss	(%rdi), %xmm1 -- xmm1 = *a
    25  	LONG $0xb971e2c4; BYTE $0x06 // vfmadd231ss	(%rsi), %xmm1, %xmm0 -- xmm0 = xmm1 * *b + xmm0
    26  	LONG $0x04c78348             // addq	$4, %rdi -- a += 1 float
    27  	LONG $0x04c68348             // addq	$4, %rsi -- b += 1 float
    28  	LONG $0x01c18348             // addq	$1, %rcx
    29  	WORD $0xc839                 // cmpl	%ecx, %eax
    30  	JNE  LBB0_4
    31  	WORD $0x2941; BYTE $0xc9     // subl	%ecx, %r9d -- r9d = n - (n & 3), elements still to process
    32  	LONG $0x03f88341             // cmpl	$3, %r8d
    33  	JAE  LBB0_7 // n - 1 >= 3 (unsigned): continue with the 4x unrolled loop
    34  // Small-path exit: the sum is in xmm0. This exit has no vzeroupper, unlike the one after LBB0_30.
    35  LBB0_31:
    36  	LONG $0x0211fac5         // vmovss	%xmm0, (%rdx) -- *res = sum
    37  	WORD $0x8948; BYTE $0xec // movq	%rbp, %rsp
    38  	BYTE $0x5d               // popq	%rbp
    39  	BYTE $0xc3               // retq
    40  // AVX path (n > 7): ymm0..ymm3 are four 8-float accumulators.
    41  LBB0_9:
    42  	LONG $0xc057f8c5               // vxorps	%xmm0, %xmm0, %xmm0
    43  	LONG $0x20f98341               // cmpl	$32, %r9d
    44  	JB   LBB0_10 // n < 32: no 32-float block fits, go to the 8-float loop
    45  	LONG $0xe0498d41               // leal	-32(%r9), %ecx -- ecx = n - 32
    46  	WORD $0xc1f6; BYTE $0x20       // testb	$32, %cl
    47  	JNE  LBB0_12 // bit 5 of n - 32 set: skip the single peeled 32-float block below
    48  	LONG $0x1f10fcc5               // vmovups	(%rdi), %ymm3 -- ymm3 = a[0..7]
    49  	LONG $0x5710fcc5; BYTE $0x20   // vmovups	32(%rdi), %ymm2 -- ymm2 = a[8..15]
    50  	LONG $0x4f10fcc5; BYTE $0x40   // vmovups	64(%rdi), %ymm1 -- ymm1 = a[16..23]
    51  	LONG $0x4710fcc5; BYTE $0x60   // vmovups	96(%rdi), %ymm0 -- ymm0 = a[24..31]
    52  	LONG $0xe457d8c5               // vxorps	%xmm4, %xmm4, %xmm4 -- zero addend for the four FMAs below
    53  	LONG $0x985de2c4; BYTE $0x1e   // vfmadd132ps	(%rsi), %ymm4, %ymm3 -- ymm3 = ymm3 * b[0..7] + 0
    54  	LONG $0x985de2c4; WORD $0x2056 // vfmadd132ps	32(%rsi), %ymm4, %ymm2 -- ymm2 = ymm2 * b[8..15] + 0
    55  	LONG $0x985de2c4; WORD $0x404e // vfmadd132ps	64(%rsi), %ymm4, %ymm1 -- ymm1 = ymm1 * b[16..23] + 0
    56  	LONG $0x985de2c4; WORD $0x6046 // vfmadd132ps	96(%rsi), %ymm4, %ymm0 -- ymm0 = ymm0 * b[24..31] + 0
    57  	LONG $0x80ef8348               // subq	$-128, %rdi -- a += 32 floats
    58  	LONG $0x80ee8348               // subq	$-128, %rsi -- b += 32 floats
    59  	WORD $0x8941; BYTE $0xc9       // movl	%ecx, %r9d -- r9d = n - 32, elements left after the peeled block
    60  	WORD $0xf983; BYTE $0x20       // cmpl	$32, %ecx
    61  	JAE  LBB0_20 // at least 32 left: enter the 64-float loop
    62  	JMP  LBB0_15
    63  // n is in [8, 31]: clear the other three accumulators and go to the 8-float loop.
    64  LBB0_10:
    65  	LONG $0xc957f0c5 // vxorps	%xmm1, %xmm1, %xmm1
    66  	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
    67  	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
    68  	JMP  LBB0_16
    69  // Small path entry when n & 3 == 0.
    70  LBB0_2:
    71  	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0 -- xmm0 = running sum = 0
    72  	LONG $0x03f88341 // cmpl	$3, %r8d
    73  	JB   LBB0_31 // n - 1 < 3 (unsigned): nothing left, store the sum
    74  // NOTE(review): for n == 0, r8d is 0xFFFFFFFF so the JB above is not taken and LBB0_8 starts with eax == 0; ecx is first compared after it reaches 4. Presumably callers never pass n == 0 -- confirm.
    75  LBB0_7:
    76  	WORD $0x8944; BYTE $0xc8 // movl	%r9d, %eax -- eax = number of elements for LBB0_8
    77  	WORD $0xc931             // xorl	%ecx, %ecx -- ecx = element index i
    78  // Small path, step 2: four chained scalar FMAs per iteration; xmm0 carries the sum between iterations.
    79  LBB0_8:
    80  	LONG $0x0c10fac5; BYTE $0x8f               // vmovss	(%rdi,%rcx,4), %xmm1 -- xmm1 = a[i]
    81  	LONG $0x5410fac5; WORD $0x048f             // vmovss	4(%rdi,%rcx,4), %xmm2 -- xmm2 = a[i+1]
    82  	LONG $0x9979e2c4; WORD $0x8e0c             // vfmadd132ss	(%rsi,%rcx,4), %xmm0, %xmm1 -- xmm1 = xmm1 * b[i] + xmm0
    83  	LONG $0xb969e2c4; WORD $0x8e4c; BYTE $0x04 // vfmadd231ss	4(%rsi,%rcx,4), %xmm2, %xmm1 -- xmm1 = xmm2 * b[i+1] + xmm1
    84  	LONG $0x5410fac5; WORD $0x088f             // vmovss	8(%rdi,%rcx,4), %xmm2 -- xmm2 = a[i+2]
    85  	LONG $0x9971e2c4; WORD $0x8e54; BYTE $0x08 // vfmadd132ss	8(%rsi,%rcx,4), %xmm1, %xmm2 -- xmm2 = xmm2 * b[i+2] + xmm1
    86  	LONG $0x4410fac5; WORD $0x0c8f             // vmovss	12(%rdi,%rcx,4), %xmm0 -- xmm0 = a[i+3]
    87  	LONG $0x9969e2c4; WORD $0x8e44; BYTE $0x0c // vfmadd132ss	12(%rsi,%rcx,4), %xmm2, %xmm0 -- xmm0 = xmm0 * b[i+3] + xmm2
    88  	LONG $0x04c18348                           // addq	$4, %rcx -- i += 4
    89  	WORD $0xc839                               // cmpl	%ecx, %eax
    90  	JNE  LBB0_8
    91  	JMP  LBB0_31
    92  // No peeled block was taken: start all four accumulators at zero (r9d still holds n, ecx = n - 32).
    93  LBB0_12:
    94  	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
    95  	LONG $0xc957f0c5         // vxorps	%xmm1, %xmm1, %xmm1
    96  	LONG $0xd257e8c5         // vxorps	%xmm2, %xmm2, %xmm2
    97  	LONG $0xdb57e0c5         // vxorps	%xmm3, %xmm3, %xmm3
    98  	WORD $0xf983; BYTE $0x20 // cmpl	$32, %ecx
    99  	JB   LBB0_15
   100  // Main loop: 64 floats per iteration as two groups of four 8-wide FMAs; sums alternate between ymm4..ymm7 and ymm3..ymm0.
   101  LBB0_20:
   102  	LONG $0x2710fcc5                           // vmovups	(%rdi), %ymm4 -- ymm4 = a[0..7]
   103  	LONG $0x6f10fcc5; BYTE $0x20               // vmovups	32(%rdi), %ymm5 -- ymm5 = a[8..15]
   104  	LONG $0x7710fcc5; BYTE $0x40               // vmovups	64(%rdi), %ymm6 -- ymm6 = a[16..23]
   105  	LONG $0x7f10fcc5; BYTE $0x60               // vmovups	96(%rdi), %ymm7 -- ymm7 = a[24..31]
   106  	LONG $0x9865e2c4; BYTE $0x26               // vfmadd132ps	(%rsi), %ymm3, %ymm4 -- ymm4 = ymm4 * b[0..7] + ymm3
   107  	LONG $0x986de2c4; WORD $0x206e             // vfmadd132ps	32(%rsi), %ymm2, %ymm5 -- ymm5 = ymm5 * b[8..15] + ymm2
   108  	LONG $0x9875e2c4; WORD $0x4076             // vfmadd132ps	64(%rsi), %ymm1, %ymm6 -- ymm6 = ymm6 * b[16..23] + ymm1
   109  	LONG $0x987de2c4; WORD $0x607e             // vfmadd132ps	96(%rsi), %ymm0, %ymm7 -- ymm7 = ymm7 * b[24..31] + ymm0
   110  	QUAD $0x000000809f10fcc5                   // vmovups	128(%rdi), %ymm3 -- ymm3 = a[32..39]
   111  	QUAD $0x000000a09710fcc5                   // vmovups	160(%rdi), %ymm2 -- ymm2 = a[40..47]
   112  	QUAD $0x000000c08f10fcc5                   // vmovups	192(%rdi), %ymm1 -- ymm1 = a[48..55]
   113  	QUAD $0x000000e08710fcc5                   // vmovups	224(%rdi), %ymm0 -- ymm0 = a[56..63]
   114  	QUAD $0x0000809e985de2c4; BYTE $0x00       // vfmadd132ps	128(%rsi), %ymm4, %ymm3 -- ymm3 = ymm3 * b[32..39] + ymm4
   115  	QUAD $0x0000a0969855e2c4; BYTE $0x00       // vfmadd132ps	160(%rsi), %ymm5, %ymm2 -- ymm2 = ymm2 * b[40..47] + ymm5
   116  	QUAD $0x0000c08e984de2c4; BYTE $0x00       // vfmadd132ps	192(%rsi), %ymm6, %ymm1 -- ymm1 = ymm1 * b[48..55] + ymm6
   117  	QUAD $0x0000e0869845e2c4; BYTE $0x00       // vfmadd132ps	224(%rsi), %ymm7, %ymm0 -- ymm0 = ymm0 * b[56..63] + ymm7
   118  	LONG $0xc0c18341                           // addl	$-64, %r9d -- 64 elements consumed
   119  	LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq	$256, %rdi -- a += 64 floats
   120  	LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq	$256, %rsi -- b += 64 floats
   121  	LONG $0x1ff98341                           // cmpl	$31, %r9d
   122  	JA   LBB0_20 // more than 31 elements left (unsigned): run another iteration
   123  	WORD $0x8944; BYTE $0xc9                   // movl	%r9d, %ecx
   124  // Fewer than 32 elements remain; the count is in ecx.
   125  LBB0_15:
   126  	WORD $0x8941; BYTE $0xc9 // movl	%ecx, %r9d
   127  	WORD $0xf983; BYTE $0x08 // cmpl	$8, %ecx
   128  	JB   LBB0_18 // fewer than 8 left: skip the 8-float loop
   129  // Entry to the 8-float loop; the remaining count is in r9d.
   130  LBB0_16:
   131  	WORD $0x8944; BYTE $0xc9 // movl	%r9d, %ecx
   132  // 8 floats per iteration, accumulated into ymm3 only.
   133  LBB0_17:
   134  	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4 -- ymm4 = a[0..7]
   135  	LONG $0xb85de2c4; BYTE $0x1e // vfmadd231ps	(%rsi), %ymm4, %ymm3 -- ymm3 = ymm4 * b[0..7] + ymm3
   136  	WORD $0xc183; BYTE $0xf8     // addl	$-8, %ecx -- 8 elements consumed
   137  	LONG $0x20c78348             // addq	$32, %rdi -- a += 8 floats
   138  	LONG $0x20c68348             // addq	$32, %rsi -- b += 8 floats
   139  	WORD $0xf983; BYTE $0x07     // cmpl	$7, %ecx
   140  	JA   LBB0_17 // more than 7 elements left (unsigned)
   141  // Scalar tail for the last ecx elements; its sum is kept separately in xmm4.
   142  LBB0_18:
   143  	WORD $0xc985             // testl	%ecx, %ecx
   144  	JE   LBB0_19 // nothing left
   145  	LONG $0xff418d44         // leal	-1(%rcx), %r8d -- r8d = remaining - 1
   146  	WORD $0xc1f6; BYTE $0x03 // testb	$3, %cl
   147  	JE   LBB0_23 // remaining & 3 == 0: skip the one-element-at-a-time loop
   148  	WORD $0x8941; BYTE $0xc9 // movl	%ecx, %r9d
   149  	LONG $0x03e18341         // andl	$3, %r9d -- r9d = remaining & 3 = iteration count for LBB0_25
   150  	LONG $0xe457d8c5         // vxorps	%xmm4, %xmm4, %xmm4 -- xmm4 = tail sum = 0
   151  	WORD $0xc031             // xorl	%eax, %eax -- eax = iteration counter
   152  // Tail, step 1: one scalar multiply-add per iteration for remaining & 3 elements.
   153  LBB0_25:
   154  	LONG $0x2f10fac5             // vmovss	(%rdi), %xmm5 -- xmm5 = *a
   155  	LONG $0xb951e2c4; BYTE $0x26 // vfmadd231ss	(%rsi), %xmm5, %xmm4 -- xmm4 = xmm5 * *b + xmm4
   156  	LONG $0x04c78348             // addq	$4, %rdi -- a += 1 float
   157  	LONG $0x04c68348             // addq	$4, %rsi -- b += 1 float
   158  	LONG $0x01c08348             // addq	$1, %rax
   159  	WORD $0x3941; BYTE $0xc1     // cmpl	%eax, %r9d
   160  	JNE  LBB0_25
   161  	WORD $0xc129                 // subl	%eax, %ecx -- ecx = elements still to process
   162  	LONG $0x03f88341             // cmpl	$3, %r8d
   163  	JAE  LBB0_28 // remaining - 1 >= 3 (unsigned): continue with the 4x unrolled loop
   164  	JMP  LBB0_30
   165  // Empty tail: the scalar partial sum is zero.
   166  LBB0_19:
   167  	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
   168  	JMP  LBB0_30
   169  // Tail length is a non-zero multiple of 4.
   170  LBB0_23:
   171  	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4 -- xmm4 = tail sum = 0
   172  	LONG $0x03f88341 // cmpl	$3, %r8d
   173  	JB   LBB0_30
   174  // Set up the unrolled tail loop.
   175  LBB0_28:
   176  	WORD $0xc889 // movl	%ecx, %eax -- eax = number of elements for LBB0_29
   177  	WORD $0xc931 // xorl	%ecx, %ecx -- ecx = element index i
   178  // Tail, step 2: four chained scalar FMAs per iteration; xmm4 carries the tail sum between iterations.
   179  LBB0_29:
   180  	LONG $0x2c10fac5; BYTE $0x8f               // vmovss	(%rdi,%rcx,4), %xmm5 -- xmm5 = a[i]
   181  	LONG $0x7410fac5; WORD $0x048f             // vmovss	4(%rdi,%rcx,4), %xmm6 -- xmm6 = a[i+1]
   182  	LONG $0x9959e2c4; WORD $0x8e2c             // vfmadd132ss	(%rsi,%rcx,4), %xmm4, %xmm5 -- xmm5 = xmm5 * b[i] + xmm4
   183  	LONG $0xb949e2c4; WORD $0x8e6c; BYTE $0x04 // vfmadd231ss	4(%rsi,%rcx,4), %xmm6, %xmm5 -- xmm5 = xmm6 * b[i+1] + xmm5
   184  	LONG $0x7410fac5; WORD $0x088f             // vmovss	8(%rdi,%rcx,4), %xmm6 -- xmm6 = a[i+2]
   185  	LONG $0x9951e2c4; WORD $0x8e74; BYTE $0x08 // vfmadd132ss	8(%rsi,%rcx,4), %xmm5, %xmm6 -- xmm6 = xmm6 * b[i+2] + xmm5
   186  	LONG $0x6410fac5; WORD $0x0c8f             // vmovss	12(%rdi,%rcx,4), %xmm4 -- xmm4 = a[i+3]
   187  	LONG $0x9949e2c4; WORD $0x8e64; BYTE $0x0c // vfmadd132ss	12(%rsi,%rcx,4), %xmm6, %xmm4 -- xmm4 = xmm4 * b[i+3] + xmm6
   188  	LONG $0x04c18348                           // addq	$4, %rcx -- i += 4
   189  	WORD $0xc839                               // cmpl	%ecx, %eax
   190  	JNE  LBB0_29
   191  // AVX-path exit: fold ymm0..ymm3 into one float, add the scalar tail sum from xmm4, store it and return.
   192  LBB0_30:
   193  	LONG $0xd358ecc5               // vaddps	%ymm3, %ymm2, %ymm2 -- ymm2 = ymm2 + ymm3
   194  	LONG $0xc058f4c5               // vaddps	%ymm0, %ymm1, %ymm0 -- ymm0 = ymm1 + ymm0
   195  	LONG $0xc258fcc5               // vaddps	%ymm2, %ymm0, %ymm0 -- ymm0 = sum of all four accumulators, 8 lanes
   196  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0 -- pairwise sums within each 128-bit half
   197  	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0 -- element 0 of each 128-bit half now holds that half's total
   198  	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1 -- xmm1 = upper 128-bit half
   199  	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0 -- xmm0 = low-half total + high-half total
   200  	LONG $0xc058dac5               // vaddss	%xmm0, %xmm4, %xmm0 -- xmm0 = scalar tail sum + vector sum
   201  	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx) -- *res = result
   202  	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
   203  	BYTE $0x5d                     // popq	%rbp
   204  	WORD $0xf8c5; BYTE $0x77       // vzeroupper -- clear the upper ymm halves before returning
   205  	BYTE $0xc3                     // retq