//go:build !noasm && amd64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT
//
// Provenance: github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/asm/dot_avx256_amd64.s
//
// NOTE(review): the "non-positive length guard" below was added by hand and is
// not GOAT output. Regenerating this file will drop it unless the same guard is
// added to the C source this file is generated from.

// dot_256 accumulates a[i]*b[i] over the first n elements of two float32
// arrays and stores the single float32 sum through res.
//
// Arguments (Go stack frame, 32 bytes of arguments, no results):
//   a+0(FP)    pointer to the first float32 array           -> DI
//   b+8(FP)    pointer to the second float32 array          -> SI
//   res+16(FP) pointer that receives the float32 result     -> DX
//   len+24(FP) pointer to the element count; the 64-bit     -> CX
//              value is loaded but only its low 32 bits are
//              used, and they are compared as signed
//
// Code paths:
//   n <= 0        stores 0.0 and returns (hand-added guard).
//   1 <= n <= 7   scalar FMA loop accumulating in xmm0, returns via LBB0_31.
//   n >= 8        ymm0-ymm3 are four 8-lane accumulators:
//                   - optional single 32-float block (LBB0_9 fall-through),
//                   - 64 floats per iteration (LBB0_20),
//                   - 8 floats per iteration into ymm3 (LBB0_17),
//                   - scalar tail into xmm4 (LBB0_25 / LBB0_29),
//                   - horizontal reduction and store (LBB0_30).
//
// The body saves RBP with a raw pushq and restores it before every return.
// All array loads are unaligned-tolerant (vmovups / vmovss).
TEXT ·dot_256(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX
	BYTE $0x55               // pushq %rbp
	WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp
	LONG $0xf8e48348         // andq $-8, %rsp
	WORD $0x8b4c; BYTE $0x09 // movq (%rcx), %r9          ; r9d = n

	// Non-positive length guard (hand-added).
	// Without it, n == 0 reaches LBB0_8 with eax == 0; that loop only exits
	// when ecx == eax after ecx has already been advanced by 4, so it would
	// walk ~2^32 elements past both arrays. Here we store 0.0 instead.
	LONG $0xc057f8c5         // vxorps %xmm0, %xmm0, %xmm0 ; result = 0.0
	TESTL R9, R9             // testl %r9d, %r9d
	JLE  LBB0_31             // n <= 0 -> store xmm0 and return

	LONG $0x07f98341         // cmpl $7, %r9d
	JG   LBB0_9              // n >= 8 -> vector path

	// ---- small path: 1 <= n <= 7, scalar only ----
	LONG $0xff418d45         // leal -1(%r9), %r8d         ; r8d = n - 1
	LONG $0x03c1f641         // testb $3, %r9b
	JE   LBB0_2              // n % 4 == 0 -> skip the 1-at-a-time loop
	WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax
	WORD $0xe083; BYTE $0x03 // andl $3, %eax              ; eax = n % 4
	LONG $0xc057f8c5         // vxorps %xmm0, %xmm0, %xmm0
	WORD $0xc931             // xorl %ecx, %ecx

LBB0_4:
	// One element per iteration, n % 4 iterations.
	LONG $0x0f10fac5             // vmovss (%rdi), %xmm1
	LONG $0xb971e2c4; BYTE $0x06 // vfmadd231ss (%rsi), %xmm1, %xmm0
	LONG $0x04c78348             // addq $4, %rdi
	LONG $0x04c68348             // addq $4, %rsi
	LONG $0x01c18348             // addq $1, %rcx
	WORD $0xc839                 // cmpl %ecx, %eax
	JNE  LBB0_4
	WORD $0x2941; BYTE $0xc9     // subl %ecx, %r9d          ; remaining is now a multiple of 4
	LONG $0x03f88341             // cmpl $3, %r8d
	JAE  LBB0_7                  // original n >= 4 -> 4-at-a-time loop

LBB0_31:
	// Scalar-path exit: store xmm0, restore the frame, return.
	LONG $0x0211fac5         // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq %rbp, %rsp
	BYTE $0x5d               // popq %rbp
	BYTE $0xc3               // retq

LBB0_9:
	// ---- vector path: n >= 8 ----
	LONG $0xc057f8c5         // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x20f98341         // cmpl $32, %r9d
	JB   LBB0_10             // 8 <= n < 32 -> straight to the 8-float loop
	LONG $0xe0498d41         // leal -32(%r9), %ecx        ; ecx = n - 32
	WORD $0xc1f6; BYTE $0x20 // testb $32, %cl
	JNE  LBB0_12             // bit 5 of (n - 32) set -> no single 32-float block

	// Single 32-float block: accumulators start as a*b + 0.
	LONG $0x1f10fcc5               // vmovups (%rdi), %ymm3
	LONG $0x5710fcc5; BYTE $0x20   // vmovups 32(%rdi), %ymm2
	LONG $0x4f10fcc5; BYTE $0x40   // vmovups 64(%rdi), %ymm1
	LONG $0x4710fcc5; BYTE $0x60   // vmovups 96(%rdi), %ymm0
	LONG $0xe457d8c5               // vxorps %xmm4, %xmm4, %xmm4
	LONG $0x985de2c4; BYTE $0x1e   // vfmadd132ps (%rsi), %ymm4, %ymm3
	LONG $0x985de2c4; WORD $0x2056 // vfmadd132ps 32(%rsi), %ymm4, %ymm2
	LONG $0x985de2c4; WORD $0x404e // vfmadd132ps 64(%rsi), %ymm4, %ymm1
	LONG $0x985de2c4; WORD $0x6046 // vfmadd132ps 96(%rsi), %ymm4, %ymm0
	LONG $0x80ef8348               // subq $-128, %rdi         ; advance 32 floats
	LONG $0x80ee8348               // subq $-128, %rsi
	WORD $0x8941; BYTE $0xc9       // movl %ecx, %r9d
	WORD $0xf983; BYTE $0x20       // cmpl $32, %ecx
	JAE  LBB0_20
	JMP  LBB0_15

LBB0_10:
	// 8 <= n < 32: clear the remaining accumulators (ymm0 is already zero).
	LONG $0xc957f0c5 // vxorps %xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5 // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5 // vxorps %xmm3, %xmm3, %xmm3
	JMP  LBB0_16

LBB0_2:
	// Small path with n % 4 == 0.
	LONG $0xc057f8c5 // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x03f88341 // cmpl $3, %r8d
	JB   LBB0_31

LBB0_7:
	WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax            ; eax = elements left (multiple of 4)
	WORD $0xc931             // xorl %ecx, %ecx

LBB0_8:
	// Small path, four elements per iteration; the running sum ends in xmm0.
	LONG $0x0c10fac5; BYTE $0x8f               // vmovss (%rdi,%rcx,4), %xmm1
	LONG $0x5410fac5; WORD $0x048f             // vmovss 4(%rdi,%rcx,4), %xmm2
	LONG $0x9979e2c4; WORD $0x8e0c             // vfmadd132ss (%rsi,%rcx,4), %xmm0, %xmm1
	LONG $0xb969e2c4; WORD $0x8e4c; BYTE $0x04 // vfmadd231ss 4(%rsi,%rcx,4), %xmm2, %xmm1
	LONG $0x5410fac5; WORD $0x088f             // vmovss 8(%rdi,%rcx,4), %xmm2
	LONG $0x9971e2c4; WORD $0x8e54; BYTE $0x08 // vfmadd132ss 8(%rsi,%rcx,4), %xmm1, %xmm2
	LONG $0x4410fac5; WORD $0x0c8f             // vmovss 12(%rdi,%rcx,4), %xmm0
	LONG $0x9969e2c4; WORD $0x8e44; BYTE $0x0c // vfmadd132ss 12(%rsi,%rcx,4), %xmm2, %xmm0
	LONG $0x04c18348                           // addq $4, %rcx
	WORD $0xc839                               // cmpl %ecx, %eax
	JNE  LBB0_8
	JMP  LBB0_31

LBB0_12:
	// No single 32-float block: start all four accumulators at zero.
	LONG $0xc057f8c5         // vxorps %xmm0, %xmm0, %xmm0
	LONG $0xc957f0c5         // vxorps %xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5         // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5         // vxorps %xmm3, %xmm3, %xmm3
	WORD $0xf983; BYTE $0x20 // cmpl $32, %ecx
	JB   LBB0_15

LBB0_20:
	// Main loop: 64 floats (256 bytes per array) per iteration.
	// The first half lands in ymm4-ymm7, the second half folds back into ymm3-ymm0.
	LONG $0x2710fcc5                           // vmovups (%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20               // vmovups 32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40               // vmovups 64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60               // vmovups 96(%rdi), %ymm7
	LONG $0x9865e2c4; BYTE $0x26               // vfmadd132ps (%rsi), %ymm3, %ymm4
	LONG $0x986de2c4; WORD $0x206e             // vfmadd132ps 32(%rsi), %ymm2, %ymm5
	LONG $0x9875e2c4; WORD $0x4076             // vfmadd132ps 64(%rsi), %ymm1, %ymm6
	LONG $0x987de2c4; WORD $0x607e             // vfmadd132ps 96(%rsi), %ymm0, %ymm7
	QUAD $0x000000809f10fcc5                   // vmovups 128(%rdi), %ymm3
	QUAD $0x000000a09710fcc5                   // vmovups 160(%rdi), %ymm2
	QUAD $0x000000c08f10fcc5                   // vmovups 192(%rdi), %ymm1
	QUAD $0x000000e08710fcc5                   // vmovups 224(%rdi), %ymm0
	QUAD $0x0000809e985de2c4; BYTE $0x00       // vfmadd132ps 128(%rsi), %ymm4, %ymm3
	QUAD $0x0000a0969855e2c4; BYTE $0x00       // vfmadd132ps 160(%rsi), %ymm5, %ymm2
	QUAD $0x0000c08e984de2c4; BYTE $0x00       // vfmadd132ps 192(%rsi), %ymm6, %ymm1
	QUAD $0x0000e0869845e2c4; BYTE $0x00       // vfmadd132ps 224(%rsi), %ymm7, %ymm0
	LONG $0xc0c18341                           // addl $-64, %r9d
	LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi
	LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq $256, %rsi
	LONG $0x1ff98341                           // cmpl $31, %r9d
	JA   LBB0_20
	WORD $0x8944; BYTE $0xc9                   // movl %r9d, %ecx

LBB0_15:
	WORD $0x8941; BYTE $0xc9 // movl %ecx, %r9d
	WORD $0xf983; BYTE $0x08 // cmpl $8, %ecx
	JB   LBB0_18             // fewer than 8 left -> scalar tail

LBB0_16:
	WORD $0x8944; BYTE $0xc9 // movl %r9d, %ecx

LBB0_17:
	// 8 floats per iteration, accumulated into ymm3 only.
	LONG $0x2710fcc5             // vmovups (%rdi), %ymm4
	LONG $0xb85de2c4; BYTE $0x1e // vfmadd231ps (%rsi), %ymm4, %ymm3
	WORD $0xc183; BYTE $0xf8     // addl $-8, %ecx
	LONG $0x20c78348             // addq $32, %rdi
	LONG $0x20c68348             // addq $32, %rsi
	WORD $0xf983; BYTE $0x07     // cmpl $7, %ecx
	JA   LBB0_17

LBB0_18:
	// Scalar tail for the last 0..7 elements; partial sum lives in xmm4.
	WORD $0xc985             // testl %ecx, %ecx
	JE   LBB0_19
	LONG $0xff418d44         // leal -1(%rcx), %r8d
	WORD $0xc1f6; BYTE $0x03 // testb $3, %cl
	JE   LBB0_23
	WORD $0x8941; BYTE $0xc9 // movl %ecx, %r9d
	LONG $0x03e18341         // andl $3, %r9d              ; r9d = remaining % 4
	LONG $0xe457d8c5         // vxorps %xmm4, %xmm4, %xmm4
	WORD $0xc031             // xorl %eax, %eax

LBB0_25:
	// One element per iteration, remaining % 4 iterations.
	LONG $0x2f10fac5             // vmovss (%rdi), %xmm5
	LONG $0xb951e2c4; BYTE $0x26 // vfmadd231ss (%rsi), %xmm5, %xmm4
	LONG $0x04c78348             // addq $4, %rdi
	LONG $0x04c68348             // addq $4, %rsi
	LONG $0x01c08348             // addq $1, %rax
	WORD $0x3941; BYTE $0xc1     // cmpl %eax, %r9d
	JNE  LBB0_25
	WORD $0xc129                 // subl %eax, %ecx
	LONG $0x03f88341             // cmpl $3, %r8d
	JAE  LBB0_28
	JMP  LBB0_30

LBB0_19:
	// Nothing left for the scalar tail.
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	JMP  LBB0_30

LBB0_23:
	// Tail length is a non-zero multiple of 4.
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	LONG $0x03f88341 // cmpl $3, %r8d
	JB   LBB0_30

LBB0_28:
	WORD $0xc889 // movl %ecx, %eax
	WORD $0xc931 // xorl %ecx, %ecx

LBB0_29:
	// Tail, four elements per iteration; the running sum ends in xmm4.
	LONG $0x2c10fac5; BYTE $0x8f               // vmovss (%rdi,%rcx,4), %xmm5
	LONG $0x7410fac5; WORD $0x048f             // vmovss 4(%rdi,%rcx,4), %xmm6
	LONG $0x9959e2c4; WORD $0x8e2c             // vfmadd132ss (%rsi,%rcx,4), %xmm4, %xmm5
	LONG $0xb949e2c4; WORD $0x8e6c; BYTE $0x04 // vfmadd231ss 4(%rsi,%rcx,4), %xmm6, %xmm5
	LONG $0x7410fac5; WORD $0x088f             // vmovss 8(%rdi,%rcx,4), %xmm6
	LONG $0x9951e2c4; WORD $0x8e74; BYTE $0x08 // vfmadd132ss 8(%rsi,%rcx,4), %xmm5, %xmm6
	LONG $0x6410fac5; WORD $0x0c8f             // vmovss 12(%rdi,%rcx,4), %xmm4
	LONG $0x9949e2c4; WORD $0x8e64; BYTE $0x0c // vfmadd132ss 12(%rsi,%rcx,4), %xmm6, %xmm4
	LONG $0x04c18348                           // addq $4, %rcx
	WORD $0xc839                               // cmpl %ecx, %eax
	JNE  LBB0_29

LBB0_30:
	// Combine the four vector accumulators, reduce horizontally, add the
	// scalar tail (xmm4), store the result and return.
	LONG $0xd358ecc5               // vaddps %ymm3, %ymm2, %ymm2
	LONG $0xc058f4c5               // vaddps %ymm0, %ymm1, %ymm0
	LONG $0xc258fcc5               // vaddps %ymm2, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1
	LONG $0xc158fac5               // vaddss %xmm1, %xmm0, %xmm0
	LONG $0xc058dac5               // vaddss %xmm0, %xmm4, %xmm0
	LONG $0x0211fac5               // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec       // movq %rbp, %rsp
	BYTE $0x5d                     // popq %rbp
	WORD $0xf8c5; BYTE $0x77       // vzeroupper
	BYTE $0xc3                     // retq