//go:build !noasm && amd64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT

// Upstream location:
// github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/asm/dot_avx512_amd64.s
//
// REVIEW NOTE: the "n <= 0" guard placed right after the length load in
// dot_512 is a manual patch, not generator output. Mirror it in the
// generator's input before regenerating, otherwise it will be lost.
//
// Instructions are emitted as raw bytes (BYTE/WORD/LONG/QUAD); the trailing
// comment on each such line is the AT&T-syntax disassembly of those bytes.

// dot_512 stores sum(a[i] * b[i]) for i in [0, n) through res. Elements of a
// and b are read as 32-bit floats; n is the low 32 bits of the 64-bit value
// that len points to. For n <= 0 it stores 0.0.
//
// Arguments (Go stack, 32 bytes of arguments, no locals):
//   a+0(FP)    -> DI  first input; advanced as elements are consumed
//   b+8(FP)    -> SI  second input; advanced in step with DI
//   res+16(FP) -> DX  address that receives the float32 result
//   len+24(FP) -> CX  address of the element count (CX is reused as a counter)
//
// Working registers:
//   AX      elements still to process
//   R8, R9  remainder bookkeeping
//   X0      scalar sum on the n < 8 path
//   Z1-Z8   accumulators of the 128-element loop
//   Y0-Y3   accumulators of the 32/64-element blocks
//   X4      scalar sum of the final (< 8 element) tail
//
// Shape of the routine:
//   n <= 0    store 0.0 and return (LBB0_5)
//   n < 8     scalar only (LBB0_3, LBB0_17); result stored at LBB0_5
//   n >= 128  LBB0_8 consumes 128 elements per pass, then folds into Y1
//   >= 32     LBB0_11 / LBB0_20 consume 32 / 64 elements per pass
//   >= 8      LBB0_25 / LBB0_28 consume 8 / 32 elements per pass
//   rest      LBB0_32 / LBB0_37 scalar tail; everything is summed at LBB0_38
//
// All loads are vmovups / vmovss, i.e. the unaligned forms. The code uses zmm
// registers and vfmadd* instructions; presumably the Go caller performs the
// CPU feature check before dispatching here -- confirm against the caller.
TEXT ·dot_512(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX

	// Generator prologue: save RBP and keep the old RSP in it; every exit
	// path below restores RSP from RBP and pops RBP again.
	BYTE $0x55 // pushq %rbp
	WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp
	LONG $0xf8e48348 // andq $-8, %rsp

	// n = *len. The load is 64-bit, but only EAX is inspected from here on.
	WORD $0x8b48; BYTE $0x01 // movq (%rcx), %rax

	// Manual guard for n <= 0: leave through the scalar epilogue with X0 == 0.
	// Without it, n == 0 reaches LBB0_17 with AX == 0, whose exit test
	// (CX == AX after CX += 4) cannot be met until CX wraps around, so the
	// loop reads far outside both inputs; negative n misbehaves the same way.
	LONG $0xc057f8c5 // vxorps %xmm0, %xmm0, %xmm0
	WORD $0xc085 // testl %eax, %eax
	JLE LBB0_5

	// n > 7 takes the vector path.
	WORD $0xf883; BYTE $0x07 // cmpl $7, %eax
	JG LBB0_6

	// Scalar path (1 <= n <= 7): R8 = n - 1, R9 = n & 3.
	// The n & 3 leading elements are handled one at a time, the rest in
	// groups of four.
	LONG $0xff408d44 // leal -1(%rax), %r8d
	WORD $0x03a8 // testb $3, %al
	JE LBB0_15
	WORD $0x8941; BYTE $0xc1 // movl %eax, %r9d
	LONG $0x03e18341 // andl $3, %r9d
	LONG $0xc057f8c5 // vxorps %xmm0, %xmm0, %xmm0
	WORD $0xc931 // xorl %ecx, %ecx

// One element per iteration: X0 += a[0] * b[0]; repeats R9 times.
LBB0_3:
	LONG $0x0f10fac5 // vmovss (%rdi), %xmm1
	LONG $0xb971e2c4; BYTE $0x06 // vfmadd231ss (%rsi), %xmm1, %xmm0
	LONG $0x04c78348 // addq $4, %rdi
	LONG $0x04c68348 // addq $4, %rsi
	LONG $0x01c18348 // addq $1, %rcx
	WORD $0x3941; BYTE $0xc9 // cmpl %ecx, %r9d
	JNE LBB0_3
	// AX = elements left; continue with groups of four only if n - 1 >= 3.
	WORD $0xc829 // subl %ecx, %eax
	LONG $0x03f88341 // cmpl $3, %r8d
	JAE LBB0_16
	JMP LBB0_5

// Vector path entry (n >= 8).
LBB0_6:
	LONG $0xc057f8c5 // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x0000803d; BYTE $0x00 // cmpl $128, %eax
	JB LBB0_13
	// Clear the eight accumulators used by the 128-element loop.
	LONG $0xc957f0c5 // vxorps %xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5 // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5 // vxorps %xmm3, %xmm3, %xmm3
	LONG $0xed57d0c5 // vxorps %xmm5, %xmm5, %xmm5
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	LONG $0xf657c8c5 // vxorps %xmm6, %xmm6, %xmm6
	LONG $0xff57c0c5 // vxorps %xmm7, %xmm7, %xmm7
	LONG $0x573841c4; BYTE $0xc0 // vxorps %xmm8, %xmm8, %xmm8

// 128 elements (512 bytes of each input) per iteration: eight zmm loads from
// a, each multiplied with the matching 64 bytes of b and added into its own
// accumulator (Z1, Z2, Z3, Z5, Z4, Z6, Z7, Z8 in that order).
LBB0_8:
	LONG $0x487c7162; WORD $0x0f10 // vmovups (%rdi), %zmm9
	LONG $0x487c7162; WORD $0x5710; BYTE $0x01 // vmovups 64(%rdi), %zmm10
	LONG $0x487c7162; WORD $0x5f10; BYTE $0x02 // vmovups 128(%rdi), %zmm11
	LONG $0x487c7162; WORD $0x6710; BYTE $0x03 // vmovups 192(%rdi), %zmm12
	LONG $0x487c7162; WORD $0x6f10; BYTE $0x04 // vmovups 256(%rdi), %zmm13
	LONG $0x487c7162; WORD $0x7710; BYTE $0x05 // vmovups 320(%rdi), %zmm14
	LONG $0x487c7162; WORD $0x7f10; BYTE $0x06 // vmovups 384(%rdi), %zmm15
	LONG $0x4835f262; WORD $0x0eb8 // vfmadd231ps (%rsi), %zmm9, %zmm1
	LONG $0x482df262; WORD $0x56b8; BYTE $0x01 // vfmadd231ps 64(%rsi), %zmm10, %zmm2
	LONG $0x4825f262; WORD $0x5eb8; BYTE $0x02 // vfmadd231ps 128(%rsi), %zmm11, %zmm3
	LONG $0x481df262; WORD $0x6eb8; BYTE $0x03 // vfmadd231ps 192(%rsi), %zmm12, %zmm5
	LONG $0x4815f262; WORD $0x66b8; BYTE $0x04 // vfmadd231ps 256(%rsi), %zmm13, %zmm4
	LONG $0x480df262; WORD $0x76b8; BYTE $0x05 // vfmadd231ps 320(%rsi), %zmm14, %zmm6
	LONG $0x4805f262; WORD $0x7eb8; BYTE $0x06 // vfmadd231ps 384(%rsi), %zmm15, %zmm7
	LONG $0x487c7162; WORD $0x4f10; BYTE $0x07 // vmovups 448(%rdi), %zmm9
	LONG $0x48357262; WORD $0x46b8; BYTE $0x07 // vfmadd231ps 448(%rsi), %zmm9, %zmm8
	WORD $0xc083; BYTE $0x80 // addl $-128, %eax
	LONG $0x00c78148; WORD $0x0002; BYTE $0x00 // addq $512, %rdi
	LONG $0x00c68148; WORD $0x0002; BYTE $0x00 // addq $512, %rsi
	// Loop until AX equals the sign extension of its own low byte. AX is
	// non-negative on this path, so that means "fewer than 128 left".
	WORD $0xbe0f; BYTE $0xc8 // movsbl %al, %ecx
	WORD $0xc139 // cmpl %eax, %ecx
	JNE LBB0_8

	// Pairwise reduction of the eight accumulators into Z1.
	LONG $0x4874f162; WORD $0xca58 // vaddps %zmm2, %zmm1, %zmm1
	LONG $0x4864f162; WORD $0xd558 // vaddps %zmm5, %zmm3, %zmm2
	LONG $0x4874f162; WORD $0xca58 // vaddps %zmm2, %zmm1, %zmm1
	LONG $0x485cf162; WORD $0xd658 // vaddps %zmm6, %zmm4, %zmm2
	LONG $0x4844d162; WORD $0xd858 // vaddps %zmm8, %zmm7, %zmm3
	LONG $0x486cf162; WORD $0xd358 // vaddps %zmm3, %zmm2, %zmm2
	LONG $0x4874f162; WORD $0xca58 // vaddps %zmm2, %zmm1, %zmm1
	// Fold the upper 256 bits of Z1 onto the lower 256 bits; Y1 carries the
	// running sum into the narrower stages. Y2 is left zeroed.
	LONG $0x48fdf362; WORD $0xcb1b; BYTE $0x01 // vextractf64x4 $1, %zmm1, %ymm3
	LONG $0xd257e8c5 // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xca58f4c5 // vaddps %ymm2, %ymm1, %ymm1
	LONG $0xcb58f4c5 // vaddps %ymm3, %ymm1, %ymm1
	// Nothing left: reduce Y1 and return. Fewer than 32 left: skip the
	// 32/64-element stage.
	WORD $0xc085 // testl %eax, %eax
	JE LBB0_18
	WORD $0xf883; BYTE $0x20 // cmpl $32, %eax
	JB LBB0_14

// At least 32 elements left. R9 = AX - 32. If bit 5 of R9 is set, the
// remaining 32-element blocks already pair up and LBB0_19 goes straight to the
// 64-element loop. Otherwise one 32-element block is consumed here first:
// the first 8 products are added into Y1, the other 24 start Y0, Y3 and Y2
// (Y4 is zeroed and used as the addend).
LBB0_11:
	LONG $0xe0488d44 // leal -32(%rax), %r9d
	LONG $0x20c1f641 // testb $32, %r9b
	JNE LBB0_19
	LONG $0x2710fcc5 // vmovups (%rdi), %ymm4
	LONG $0x4710fcc5; BYTE $0x20 // vmovups 32(%rdi), %ymm0
	LONG $0x5f10fcc5; BYTE $0x40 // vmovups 64(%rdi), %ymm3
	LONG $0x5710fcc5; BYTE $0x60 // vmovups 96(%rdi), %ymm2
	LONG $0xb85de2c4; BYTE $0x0e // vfmadd231ps (%rsi), %ymm4, %ymm1
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	LONG $0x985de2c4; WORD $0x2046 // vfmadd132ps 32(%rsi), %ymm4, %ymm0
	LONG $0x985de2c4; WORD $0x405e // vfmadd132ps 64(%rsi), %ymm4, %ymm3
	LONG $0x985de2c4; WORD $0x6056 // vfmadd132ps 96(%rsi), %ymm4, %ymm2
	// subq $-128 advances both cursors by 128 bytes.
	LONG $0x80ef8348 // subq $-128, %rdi
	LONG $0x80ee8348 // subq $-128, %rsi
	WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax
	LONG $0x20f98341 // cmpl $32, %r9d
	JAE LBB0_20
	JMP LBB0_22

// 8 <= n < 128: start with an empty Y1 and join the common code.
LBB0_13:
	LONG $0xc957f0c5 // vxorps %xmm1, %xmm1, %xmm1
	WORD $0xf883; BYTE $0x20 // cmpl $32, %eax
	JAE LBB0_11

// Fewer than 32 elements left: Y3 and Y2 contribute zero to the final sum.
LBB0_14:
	LONG $0xdb57e0c5 // vxorps %xmm3, %xmm3, %xmm3
	LONG $0xd257e8c5 // vxorps %xmm2, %xmm2, %xmm2
	JMP LBB0_21

// Scalar path, n is a multiple of four: no single-element prefix needed.
LBB0_15:
	LONG $0xc057f8c5 // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x03f88341 // cmpl $3, %r8d
	JB LBB0_5

LBB0_16:
	WORD $0xc089 // movl %eax, %eax
	WORD $0xc931 // xorl %ecx, %ecx

// Four elements per iteration, indexed by CX; the running sum is threaded
// X0 -> X1 -> X2 -> X0 through the chained fused multiply-adds.
LBB0_17:
	LONG $0x0c10fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm1
	LONG $0x5410fac5; WORD $0x048f // vmovss 4(%rdi,%rcx,4), %xmm2
	LONG $0x9979e2c4; WORD $0x8e0c // vfmadd132ss (%rsi,%rcx,4), %xmm0, %xmm1
	LONG $0xb969e2c4; WORD $0x8e4c; BYTE $0x04 // vfmadd231ss 4(%rsi,%rcx,4), %xmm2, %xmm1
	LONG $0x5410fac5; WORD $0x088f // vmovss 8(%rdi,%rcx,4), %xmm2
	LONG $0x9971e2c4; WORD $0x8e54; BYTE $0x08 // vfmadd132ss 8(%rsi,%rcx,4), %xmm1, %xmm2
	LONG $0x4410fac5; WORD $0x0c8f // vmovss 12(%rdi,%rcx,4), %xmm0
	LONG $0x9969e2c4; WORD $0x8e44; BYTE $0x0c // vfmadd132ss 12(%rsi,%rcx,4), %xmm2, %xmm0
	LONG $0x04c18348 // addq $4, %rcx
	WORD $0xc839 // cmpl %ecx, %eax
	JNE LBB0_17

// Scalar exit: *res = X0, undo the prologue, return.
LBB0_5:
	LONG $0x0211fac5 // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq %rbp, %rsp
	BYTE $0x5d // popq %rbp
	BYTE $0xc3 // retq

// Exit for counts that were consumed entirely by the 128-element loop:
// horizontally add the eight lanes of Y1 (Y2 is zero here) and store.
LBB0_18:
	LONG $0xc258f4c5 // vaddps %ymm2, %ymm1, %ymm0
	LONG $0xc07cffc5 // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5 // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1
	LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0
	LONG $0xc957f0c5 // vxorps %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x0211fac5 // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq %rbp, %rsp
	BYTE $0x5d // popq %rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // retq

// No odd 32-element block: clear Y2, Y3, Y0 and enter the 64-element loop.
LBB0_19:
	LONG $0xd257e8c5 // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5 // vxorps %xmm3, %xmm3, %xmm3
	LONG $0xc057f8c5 // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x20f98341 // cmpl $32, %r9d
	JB LBB0_22

// 64 elements per iteration in two halves: the first half accumulates from
// Y1/Y0/Y3/Y2 into Y4..Y7, the second half accumulates from Y4..Y7 back into
// Y1/Y0/Y3/Y2. Runs while more than 31 elements remain.
LBB0_20:
	LONG $0x2710fcc5 // vmovups (%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20 // vmovups 32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40 // vmovups 64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60 // vmovups 96(%rdi), %ymm7
	LONG $0x9875e2c4; BYTE $0x26 // vfmadd132ps (%rsi), %ymm1, %ymm4
	LONG $0x987de2c4; WORD $0x206e // vfmadd132ps 32(%rsi), %ymm0, %ymm5
	LONG $0x9865e2c4; WORD $0x4076 // vfmadd132ps 64(%rsi), %ymm3, %ymm6
	LONG $0x986de2c4; WORD $0x607e // vfmadd132ps 96(%rsi), %ymm2, %ymm7
	QUAD $0x000000808f10fcc5 // vmovups 128(%rdi), %ymm1
	QUAD $0x000000a08710fcc5 // vmovups 160(%rdi), %ymm0
	QUAD $0x000000c09f10fcc5 // vmovups 192(%rdi), %ymm3
	QUAD $0x000000e09710fcc5 // vmovups 224(%rdi), %ymm2
	QUAD $0x0000808e985de2c4; BYTE $0x00 // vfmadd132ps 128(%rsi), %ymm4, %ymm1
	QUAD $0x0000a0869855e2c4; BYTE $0x00 // vfmadd132ps 160(%rsi), %ymm5, %ymm0
	QUAD $0x0000c09e984de2c4; BYTE $0x00 // vfmadd132ps 192(%rsi), %ymm6, %ymm3
	QUAD $0x0000e0969845e2c4; BYTE $0x00 // vfmadd132ps 224(%rsi), %ymm7, %ymm2
	WORD $0xc083; BYTE $0xc0 // addl $-64, %eax
	LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi
	LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq $256, %rsi
	WORD $0xf883; BYTE $0x1f // cmpl $31, %eax
	JA LBB0_20

// From here on R9 holds the number of elements left.
LBB0_21:
	WORD $0x8941; BYTE $0xc1 // movl %eax, %r9d

// 8-element stage. R8 = R9 - 8 and (R8 >> 3) + 1 is the number of whole
// 8-element vectors left; that count modulo 4 is handled one vector at a time
// in LBB0_25, the rest four vectors at a time in LBB0_28.
LBB0_22:
	LONG $0x08f98341 // cmpl $8, %r9d
	JB LBB0_29
	LONG $0xf8418d45 // leal -8(%r9), %r8d
	WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax
	WORD $0xe8c1; BYTE $0x03 // shrl $3, %eax
	WORD $0x488d; BYTE $0x01 // leal 1(%rax), %ecx
	WORD $0xc1f6; BYTE $0x03 // testb $3, %cl
	JE LBB0_27
	// AX = (vector count & 3) * 8 = number of elements LBB0_25 will consume.
	WORD $0x0104 // addb $1, %al
	WORD $0xb60f; BYTE $0xc0 // movzbl %al, %eax
	WORD $0xe083; BYTE $0x03 // andl $3, %eax
	LONG $0x03e0c148 // shlq $3, %rax
	WORD $0xc931 // xorl %ecx, %ecx

// 8 elements per iteration into Y1.
LBB0_25:
	LONG $0x2710fcc5 // vmovups (%rdi), %ymm4
	LONG $0xb85de2c4; BYTE $0x0e // vfmadd231ps (%rsi), %ymm4, %ymm1
	LONG $0x20c78348 // addq $32, %rdi
	LONG $0x20c68348 // addq $32, %rsi
	LONG $0x08c18348 // addq $8, %rcx
	WORD $0xc839 // cmpl %ecx, %eax
	JNE LBB0_25
	WORD $0x2941; BYTE $0xc9 // subl %ecx, %r9d

// Skip the 4-vector loop unless at least four vectors were available.
LBB0_27:
	LONG $0x18f88341 // cmpl $24, %r8d
	JB LBB0_29

// 32 elements per iteration, all accumulated into Y1 (via Y4).
LBB0_28:
	LONG $0x2710fcc5 // vmovups (%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20 // vmovups 32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40 // vmovups 64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60 // vmovups 96(%rdi), %ymm7
	LONG $0x9875e2c4; BYTE $0x26 // vfmadd132ps (%rsi), %ymm1, %ymm4
	LONG $0xb855e2c4; WORD $0x2066 // vfmadd231ps 32(%rsi), %ymm5, %ymm4
	LONG $0xb84de2c4; WORD $0x4066 // vfmadd231ps 64(%rsi), %ymm6, %ymm4
	LONG $0xcc28fcc5 // vmovaps %ymm4, %ymm1
	LONG $0xb845e2c4; WORD $0x604e // vfmadd231ps 96(%rsi), %ymm7, %ymm1
	LONG $0xe0c18341 // addl $-32, %r9d
	LONG $0x80ef8348 // subq $-128, %rdi
	LONG $0x80ee8348 // subq $-128, %rsi
	LONG $0x07f98341 // cmpl $7, %r9d
	JA LBB0_28

// Scalar tail for the last R9 (< 8) elements, summed in X4. Same scheme as the
// n < 8 path: R9 & 3 single elements first, then groups of four.
LBB0_29:
	WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d
	JE LBB0_34
	LONG $0xff418d45 // leal -1(%r9), %r8d
	LONG $0x03c1f641 // testb $3, %r9b
	JE LBB0_35
	WORD $0x8944; BYTE $0xc9 // movl %r9d, %ecx
	WORD $0xe183; BYTE $0x03 // andl $3, %ecx
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	WORD $0xc031 // xorl %eax, %eax

// One element per iteration: X4 += a[0] * b[0]; repeats CX times.
LBB0_32:
	LONG $0x2f10fac5 // vmovss (%rdi), %xmm5
	LONG $0xb951e2c4; BYTE $0x26 // vfmadd231ss (%rsi), %xmm5, %xmm4
	LONG $0x04c78348 // addq $4, %rdi
	LONG $0x04c68348 // addq $4, %rsi
	LONG $0x01c08348 // addq $1, %rax
	WORD $0xc139 // cmpl %eax, %ecx
	JNE LBB0_32
	WORD $0x2941; BYTE $0xc1 // subl %eax, %r9d
	LONG $0x03f88341 // cmpl $3, %r8d
	JAE LBB0_36
	JMP LBB0_38

// No tail elements: the tail contributes zero.
LBB0_34:
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	JMP LBB0_38

// Tail length is a multiple of four: no single-element prefix needed.
LBB0_35:
	LONG $0xe457d8c5 // vxorps %xmm4, %xmm4, %xmm4
	LONG $0x03f88341 // cmpl $3, %r8d
	JB LBB0_38

LBB0_36:
	WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax
	WORD $0xc931 // xorl %ecx, %ecx

// Four tail elements per iteration, indexed by CX; the running sum is threaded
// X4 -> X5 -> X6 -> X4.
LBB0_37:
	LONG $0x2c10fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm5
	LONG $0x7410fac5; WORD $0x048f // vmovss 4(%rdi,%rcx,4), %xmm6
	LONG $0x9959e2c4; WORD $0x8e2c // vfmadd132ss (%rsi,%rcx,4), %xmm4, %xmm5
	LONG $0xb949e2c4; WORD $0x8e6c; BYTE $0x04 // vfmadd231ss 4(%rsi,%rcx,4), %xmm6, %xmm5
	LONG $0x7410fac5; WORD $0x088f // vmovss 8(%rdi,%rcx,4), %xmm6
	LONG $0x9951e2c4; WORD $0x8e74; BYTE $0x08 // vfmadd132ss 8(%rsi,%rcx,4), %xmm5, %xmm6
	LONG $0x6410fac5; WORD $0x0c8f // vmovss 12(%rdi,%rcx,4), %xmm4
	LONG $0x9949e2c4; WORD $0x8e64; BYTE $0x0c // vfmadd132ss 12(%rsi,%rcx,4), %xmm6, %xmm4
	LONG $0x04c18348 // addq $4, %rcx
	WORD $0xc839 // cmpl %ecx, %eax
	JNE LBB0_37

// Final reduction: add the four ymm accumulators, sum the eight lanes
// horizontally, add the scalar tail from X4, store to *res and return.
LBB0_38:
	LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0
	LONG $0xca58e4c5 // vaddps %ymm2, %ymm3, %ymm1
	LONG $0xc058f4c5 // vaddps %ymm0, %ymm1, %ymm0
	LONG $0xc07cffc5 // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5 // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1
	LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0
	LONG $0xc058dac5 // vaddss %xmm0, %xmm4, %xmm0
	LONG $0x0211fac5 // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq %rbp, %rsp
	BYTE $0x5d // popq %rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // retq