//go:build !noasm && amd64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT

// NOTE(review): two things in this file are NOT generator output and will be
// lost on regeneration unless the originating source gets the same change:
//   * the "count <= 0" guard right after the count is loaded, together with
//     the LBB0_empty block at the very end of the routine;
//   * all explanatory comments other than the per-instruction disassembly.
//
// l2_512 stores into the 4-byte float at *res the sum over i of
// (a[i] - b[i])^2 for two arrays of 4-byte floats. No square root is applied.
// The element count is read from memory: len points at a 64-bit word and the
// low 32 bits of that word (eax) are used as the count.
//
// Go-side declaration is not part of this file. The frame spec $0-32 and the
// four 8-byte argument slots suggest
//     func l2_512(a, b, res, len unsafe.Pointer)
// TODO confirm against the Go stub.
//
// Register roles after the argument loads:
//   DI / rdi   read cursor into a
//   SI / rsi   read cursor into b
//   DX / rdx   res, destination of the final vmovss
//   CX / rcx   len pointer; reused as a loop counter once the count is in eax
//   AX / eax   number of elements still to process
//
// Paths, selected by count:
//   count <= 0   store 0.0 (hand-added guard, LBB0_empty)
//   1..7         scalar only, accumulator xmm0, returns through LBB0_36
//   8 and up     LBB0_22  128 floats per iteration, zmm registers
//                         (entered only when count >= 128)
//                LBB0_26  32 floats per iteration, ymm registers
//                LBB0_16  8 floats per iteration
//                LBB0_30 / LBB0_34  scalar tail, accumulator xmm4
//                LBB0_35 (or LBB0_24)  horizontal reduction and store
TEXT ·l2_512(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX

	// Save rbp, remember rsp in rbp, round rsp down to a multiple of 8.
	// Every return path restores rsp from rbp and pops rbp again.
	BYTE $0x55                                  // pushq %rbp
	WORD $0x8948; BYTE $0xe5                    // movq %rsp, %rbp
	LONG $0xf8e48348                            // andq $-8, %rsp
	WORD $0x8b48; BYTE $0x01                    // movq (%rcx), %rax

	// Guard (hand-added). Without it a count of 0 reaches LBB0_2 with
	// r8d = -1, fails the unsigned "r8d < 3" test there and enters LBB0_8
	// with eax = 0, which keeps reading until ecx wraps back to 0. Negative
	// counts run away the same way. Treat count <= 0 as empty input.
	WORD $0xc085                                // testl %eax, %eax
	JLE  LBB0_empty

	WORD $0xf883; BYTE $0x07                    // cmpl $7, %eax
	JG   LBB0_9

	// ---- count in 1..7: scalar path, accumulator xmm0 -------------------
	// r8d = count-1 is kept for the "at least 4 elements" test below.
	LONG $0xff408d44                            // leal -1(%rax), %r8d
	WORD $0x03a8                                // testb $3, %al
	JE   LBB0_2
	WORD $0x8941; BYTE $0xc1                    // movl %eax, %r9d
	LONG $0x03e18341                            // andl $3, %r9d
	LONG $0xc057f8c5                            // vxorps %xmm0, %xmm0, %xmm0
	WORD $0xc931                                // xorl %ecx, %ecx

	// Peel count&3 elements one at a time so that what is left is a
	// multiple of 4 for LBB0_8.
LBB0_4:
	LONG $0x0f10fac5                            // vmovss (%rdi), %xmm1
	LONG $0x0e5cf2c5                            // vsubss (%rsi), %xmm1, %xmm1
	LONG $0xc959f2c5                            // vmulss %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x04c78348                            // addq $4, %rdi
	LONG $0x04c68348                            // addq $4, %rsi
	LONG $0x01c18348                            // addq $1, %rcx
	WORD $0x3941; BYTE $0xc9                    // cmpl %ecx, %r9d
	JNE  LBB0_4
	WORD $0xc829                                // subl %ecx, %eax
	LONG $0x03f88341                            // cmpl $3, %r8d
	JAE  LBB0_7

	// Store xmm0 and return. Only xmm registers were written on the way
	// here, and this exit does not execute vzeroupper.
LBB0_36:
	LONG $0x0211fac5                            // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec                    // movq %rbp, %rsp
	BYTE $0x5d                                  // popq %rbp
	BYTE $0xc3                                  // retq

	// ---- count >= 8: vector path ----------------------------------------
	// xmm0 is zeroed here and not written by the 512-bit loop, so it is
	// still zero when used as an accumulator further down.
LBB0_9:
	LONG $0xc057f8c5                            // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x0000803d; BYTE $0x00                // cmpl $128, %eax
	JB   LBB0_10
	LONG $0xc957f0c5                            // vxorps %xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5                            // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5                            // vxorps %xmm3, %xmm3, %xmm3
	LONG $0xed57d0c5                            // vxorps %xmm5, %xmm5, %xmm5
	LONG $0xe457d8c5                            // vxorps %xmm4, %xmm4, %xmm4
	LONG $0xf657c8c5                            // vxorps %xmm6, %xmm6, %xmm6
	LONG $0xff57c0c5                            // vxorps %xmm7, %xmm7, %xmm7
	LONG $0x573841c4; BYTE $0xc0                // vxorps %xmm8, %xmm8, %xmm8

	// 128 floats (512 bytes from each input) per iteration: eight 16-float
	// differences in zmm9..zmm16, squared and accumulated into zmm1..zmm8.
LBB0_22:
	LONG $0x487c7162; WORD $0x0f10              // vmovups (%rdi), %zmm9
	LONG $0x487c7162; WORD $0x5710; BYTE $0x01  // vmovups 64(%rdi), %zmm10
	LONG $0x487c7162; WORD $0x5f10; BYTE $0x02  // vmovups 128(%rdi), %zmm11
	LONG $0x487c7162; WORD $0x6710; BYTE $0x03  // vmovups 192(%rdi), %zmm12
	LONG $0x487c7162; WORD $0x6f10; BYTE $0x04  // vmovups 256(%rdi), %zmm13
	LONG $0x487c7162; WORD $0x7710; BYTE $0x05  // vmovups 320(%rdi), %zmm14
	LONG $0x487c7162; WORD $0x7f10; BYTE $0x06  // vmovups 384(%rdi), %zmm15
	LONG $0x48347162; WORD $0x0e5c              // vsubps (%rsi), %zmm9, %zmm9
	LONG $0x482c7162; WORD $0x565c; BYTE $0x01  // vsubps 64(%rsi), %zmm10, %zmm10
	LONG $0x48247162; WORD $0x5e5c; BYTE $0x02  // vsubps 128(%rsi), %zmm11, %zmm11
	LONG $0x481c7162; WORD $0x665c; BYTE $0x03  // vsubps 192(%rsi), %zmm12, %zmm12
	LONG $0x48147162; WORD $0x6e5c; BYTE $0x04  // vsubps 256(%rsi), %zmm13, %zmm13
	LONG $0x480c7162; WORD $0x765c; BYTE $0x05  // vsubps 320(%rsi), %zmm14, %zmm14
	LONG $0x48047162; WORD $0x7e5c; BYTE $0x06  // vsubps 384(%rsi), %zmm15, %zmm15
	LONG $0x487ce162; WORD $0x4710; BYTE $0x07  // vmovups 448(%rdi), %zmm16
	LONG $0x407ce162; WORD $0x465c; BYTE $0x07  // vsubps 448(%rsi), %zmm16, %zmm16
	LONG $0x4835d262; WORD $0xc9b8              // vfmadd231ps %zmm9, %zmm9, %zmm1
	LONG $0x482dd262; WORD $0xd2b8              // vfmadd231ps %zmm10, %zmm10, %zmm2
	LONG $0x4825d262; WORD $0xdbb8              // vfmadd231ps %zmm11, %zmm11, %zmm3
	LONG $0x481dd262; WORD $0xecb8              // vfmadd231ps %zmm12, %zmm12, %zmm5
	LONG $0x4815d262; WORD $0xe5b8              // vfmadd231ps %zmm13, %zmm13, %zmm4
	LONG $0x480dd262; WORD $0xf6b8              // vfmadd231ps %zmm14, %zmm14, %zmm6
	LONG $0x4805d262; WORD $0xffb8              // vfmadd231ps %zmm15, %zmm15, %zmm7
	LONG $0x407d3262; WORD $0xc0b8              // vfmadd231ps %zmm16, %zmm16, %zmm8
	WORD $0xc083; BYTE $0x80                    // addl $-128, %eax
	LONG $0x00c78148; WORD $0x0002; BYTE $0x00  // addq $512, %rdi
	LONG $0x00c68148; WORD $0x0002; BYTE $0x00  // addq $512, %rsi

	// Loop while eax differs from its own sign-extended low byte. eax
	// starts at >= 128 and drops by 128 per pass, so the loop ends on the
	// first pass that leaves eax below 128.
	WORD $0xbe0f; BYTE $0xc8                    // movsbl %al, %ecx
	WORD $0xc139                                // cmpl %eax, %ecx
	JNE  LBB0_22

	// Fold the eight 512-bit accumulators into zmm1, then add its upper
	// 256 bits (via ymm3) onto its lower 256 bits, leaving the partial
	// sums in ymm1. xmm2 is re-zeroed as part of this.
	LONG $0x4874f162; WORD $0xca58              // vaddps %zmm2, %zmm1, %zmm1
	LONG $0x4864f162; WORD $0xd558              // vaddps %zmm5, %zmm3, %zmm2
	LONG $0x4874f162; WORD $0xca58              // vaddps %zmm2, %zmm1, %zmm1
	LONG $0x485cf162; WORD $0xd658              // vaddps %zmm6, %zmm4, %zmm2
	LONG $0x4844d162; WORD $0xd858              // vaddps %zmm8, %zmm7, %zmm3
	LONG $0x486cf162; WORD $0xd358              // vaddps %zmm3, %zmm2, %zmm2
	LONG $0x4874f162; WORD $0xca58              // vaddps %zmm2, %zmm1, %zmm1
	LONG $0x48fdf362; WORD $0xcb1b; BYTE $0x01  // vextractf64x4 $1, %zmm1, %ymm3
	LONG $0xd257e8c5                            // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xca58f4c5                            // vaddps %ymm2, %ymm1, %ymm1
	LONG $0xcb58f4c5                            // vaddps %ymm3, %ymm1, %ymm1
	WORD $0xc085                                // testl %eax, %eax
	JE   LBB0_24
	WORD $0xf883; BYTE $0x20                    // cmpl $32, %eax
	JB   LBB0_12

	// ---- at least 32 elements left: 256-bit loop ------------------------
	// Accumulators: ymm1 (carries any earlier partial sums), ymm0, ymm3, ymm2.
LBB0_25:
	LONG $0xd257e8c5                            // vxorps %xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5                            // vxorps %xmm3, %xmm3, %xmm3
	LONG $0xc057f8c5                            // vxorps %xmm0, %xmm0, %xmm0

	// 32 floats (128 bytes from each input) per iteration.
LBB0_26:
	LONG $0x2710fcc5                            // vmovups (%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20                // vmovups 32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40                // vmovups 64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60                // vmovups 96(%rdi), %ymm7
	LONG $0x265cdcc5                            // vsubps (%rsi), %ymm4, %ymm4
	LONG $0x6e5cd4c5; BYTE $0x20                // vsubps 32(%rsi), %ymm5, %ymm5
	LONG $0x765cccc5; BYTE $0x40                // vsubps 64(%rsi), %ymm6, %ymm6
	LONG $0x7e5cc4c5; BYTE $0x60                // vsubps 96(%rsi), %ymm7, %ymm7
	LONG $0xb85de2c4; BYTE $0xcc                // vfmadd231ps %ymm4, %ymm4, %ymm1
	LONG $0xb855e2c4; BYTE $0xc5                // vfmadd231ps %ymm5, %ymm5, %ymm0
	LONG $0xb84de2c4; BYTE $0xde                // vfmadd231ps %ymm6, %ymm6, %ymm3
	LONG $0xb845e2c4; BYTE $0xd7                // vfmadd231ps %ymm7, %ymm7, %ymm2
	WORD $0xc083; BYTE $0xe0                    // addl $-32, %eax
	LONG $0x80ef8348                            // subq $-128, %rdi
	LONG $0x80ee8348                            // subq $-128, %rsi
	WORD $0xf883; BYTE $0x1f                    // cmpl $31, %eax
	JA   LBB0_26
	WORD $0xf883; BYTE $0x08                    // cmpl $8, %eax
	JAE  LBB0_14
	JMP  LBB0_19

	// Entry for 8 <= count < 128: nothing accumulated yet, so ymm1 starts
	// at zero (xmm0 was already cleared at LBB0_9).
LBB0_10:
	LONG $0xc957f0c5                            // vxorps %xmm1, %xmm1, %xmm1
	WORD $0xf883; BYTE $0x20                    // cmpl $32, %eax
	JAE  LBB0_25

	// Fewer than 32 elements left and the 256-bit loop was skipped: clear
	// the two accumulators it would have initialised.
LBB0_12:
	LONG $0xdb57e0c5                            // vxorps %xmm3, %xmm3, %xmm3
	LONG $0xd257e8c5                            // vxorps %xmm2, %xmm2, %xmm2
	WORD $0xf883; BYTE $0x08                    // cmpl $8, %eax
	JB   LBB0_19

	// ---- 8..31 elements left: 8-float steps ------------------------------
	// r8d = eax-8; (r8d>>3)+1 is the number of whole 8-float chunks. If
	// that is not a multiple of 4, LBB0_16 runs (chunks & 3) times; r9d
	// holds that iteration count times 8 because ecx advances by 8.
LBB0_14:
	LONG $0xf8408d44                            // leal -8(%rax), %r8d
	WORD $0x8945; BYTE $0xc1                    // movl %r8d, %r9d
	LONG $0x03e9c141                            // shrl $3, %r9d
	LONG $0x01498d41                            // leal 1(%r9), %ecx
	WORD $0xc1f6; BYTE $0x03                    // testb $3, %cl
	JE   LBB0_18
	LONG $0x01c18041                            // addb $1, %r9b
	LONG $0xc9b60f45                            // movzbl %r9b, %r9d
	LONG $0x03e18341                            // andl $3, %r9d
	LONG $0x03e1c149                            // shlq $3, %r9
	WORD $0xc931                                // xorl %ecx, %ecx

LBB0_16:
	LONG $0x2710fcc5                            // vmovups (%rdi), %ymm4
	LONG $0x265cdcc5                            // vsubps (%rsi), %ymm4, %ymm4
	LONG $0xb85de2c4; BYTE $0xcc                // vfmadd231ps %ymm4, %ymm4, %ymm1
	LONG $0x20c78348                            // addq $32, %rdi
	LONG $0x20c68348                            // addq $32, %rsi
	LONG $0x08c18348                            // addq $8, %rcx
	WORD $0x3941; BYTE $0xc9                    // cmpl %ecx, %r9d
	JNE  LBB0_16
	WORD $0xc829                                // subl %ecx, %eax

	// r8d < 24 means fewer than four 8-float chunks existed at LBB0_14.
	// NOTE(review): every jump into LBB0_14 arrives with eax in 8..31, so
	// r8d <= 23 and this branch looks always-taken, which would make
	// LBB0_37 unreachable -- confirm before relying on it.
LBB0_18:
	LONG $0x18f88341                            // cmpl $24, %r8d
	JB   LBB0_19

	// Four 8-float chunks per iteration, chained through one accumulator:
	// each vfmadd213ps computes diff*diff + running sum, ending in ymm1.
LBB0_37:
	LONG $0x2710fcc5                            // vmovups (%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20                // vmovups 32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40                // vmovups 64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60                // vmovups 96(%rdi), %ymm7
	LONG $0x265cdcc5                            // vsubps (%rsi), %ymm4, %ymm4
	LONG $0x6e5cd4c5; BYTE $0x20                // vsubps 32(%rsi), %ymm5, %ymm5
	LONG $0xa85de2c4; BYTE $0xe1                // vfmadd213ps %ymm1, %ymm4, %ymm4
	LONG $0xa855e2c4; BYTE $0xec                // vfmadd213ps %ymm4, %ymm5, %ymm5
	LONG $0x665cccc5; BYTE $0x40                // vsubps 64(%rsi), %ymm6, %ymm4
	LONG $0xa85de2c4; BYTE $0xe5                // vfmadd213ps %ymm5, %ymm4, %ymm4
	LONG $0x4e5cc4c5; BYTE $0x60                // vsubps 96(%rsi), %ymm7, %ymm1
	LONG $0xa875e2c4; BYTE $0xcc                // vfmadd213ps %ymm4, %ymm1, %ymm1
	WORD $0xc083; BYTE $0xe0                    // addl $-32, %eax
	LONG $0x80ef8348                            // subq $-128, %rdi
	LONG $0x80ee8348                            // subq $-128, %rsi
	WORD $0xf883; BYTE $0x07                    // cmpl $7, %eax
	JA   LBB0_37

	// ---- scalar tail of the vector path, accumulator xmm4 ----------------
	// Same shape as the short path above, but with an explicit eax == 0
	// check first.
LBB0_19:
	WORD $0xc085                                // testl %eax, %eax
	JE   LBB0_20
	LONG $0xff408d44                            // leal -1(%rax), %r8d
	WORD $0x03a8                                // testb $3, %al
	JE   LBB0_28
	WORD $0x8941; BYTE $0xc1                    // movl %eax, %r9d
	LONG $0x03e18341                            // andl $3, %r9d
	LONG $0xe457d8c5                            // vxorps %xmm4, %xmm4, %xmm4
	WORD $0xc931                                // xorl %ecx, %ecx

	// Peel eax&3 elements one at a time.
LBB0_30:
	LONG $0x2f10fac5                            // vmovss (%rdi), %xmm5
	LONG $0x2e5cd2c5                            // vsubss (%rsi), %xmm5, %xmm5
	LONG $0xed59d2c5                            // vmulss %xmm5, %xmm5, %xmm5
	LONG $0xe558dac5                            // vaddss %xmm5, %xmm4, %xmm4
	LONG $0x04c78348                            // addq $4, %rdi
	LONG $0x04c68348                            // addq $4, %rsi
	LONG $0x01c18348                            // addq $1, %rcx
	WORD $0x3941; BYTE $0xc9                    // cmpl %ecx, %r9d
	JNE  LBB0_30
	WORD $0xc829                                // subl %ecx, %eax
	LONG $0x03f88341                            // cmpl $3, %r8d
	JAE  LBB0_33
	JMP  LBB0_35

	// Short path, count already a multiple of 4: nothing to peel.
LBB0_2:
	LONG $0xc057f8c5                            // vxorps %xmm0, %xmm0, %xmm0
	LONG $0x03f88341                            // cmpl $3, %r8d
	JB   LBB0_36

LBB0_7:
	WORD $0xc089                                // movl %eax, %eax
	WORD $0xc931                                // xorl %ecx, %ecx

	// Short path, four elements per iteration indexed by rcx; ends when
	// ecx equals the remaining count in eax.
LBB0_8:
	LONG $0x0c10fac5; BYTE $0x8f                // vmovss (%rdi,%rcx,4), %xmm1
	LONG $0x5410fac5; WORD $0x048f              // vmovss 4(%rdi,%rcx,4), %xmm2
	LONG $0x0c5cf2c5; BYTE $0x8e                // vsubss (%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5                            // vmulss %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x4c5ceac5; WORD $0x048e              // vsubss 4(%rsi,%rcx,4), %xmm2, %xmm1
	LONG $0xc959f2c5                            // vmulss %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x4c10fac5; WORD $0x088f              // vmovss 8(%rdi,%rcx,4), %xmm1
	LONG $0x4c5cf2c5; WORD $0x088e              // vsubss 8(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5                            // vmulss %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x4c10fac5; WORD $0x0c8f              // vmovss 12(%rdi,%rcx,4), %xmm1
	LONG $0x4c5cf2c5; WORD $0x0c8e              // vsubss 12(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5                            // vmulss %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x04c18348                            // addq $4, %rcx
	WORD $0xc839                                // cmpl %ecx, %eax
	JNE  LBB0_8
	JMP  LBB0_36

	// Vector path with no scalar remainder: the tail sum is zero.
LBB0_20:
	LONG $0xe457d8c5                            // vxorps %xmm4, %xmm4, %xmm4
	JMP  LBB0_35

	// Count was an exact multiple of 128: reduce ymm1 (ymm2 and xmm1 are
	// zero when added here) to one float, store it, and return.
LBB0_24:
	LONG $0xc258f4c5                            // vaddps %ymm2, %ymm1, %ymm0
	LONG $0xc07cffc5                            // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5                            // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1              // vextractf128 $1, %ymm0, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0xc957f0c5                            // vxorps %xmm1, %xmm1, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0x0211fac5                            // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec                    // movq %rbp, %rsp
	BYTE $0x5d                                  // popq %rbp
	WORD $0xf8c5; BYTE $0x77                    // vzeroupper
	BYTE $0xc3                                  // retq

	// Tail, remaining count already a multiple of 4: nothing to peel.
LBB0_28:
	LONG $0xe457d8c5                            // vxorps %xmm4, %xmm4, %xmm4
	LONG $0x03f88341                            // cmpl $3, %r8d
	JB   LBB0_35

LBB0_33:
	WORD $0xc089                                // movl %eax, %eax
	WORD $0xc931                                // xorl %ecx, %ecx

	// Tail, four elements per iteration indexed by rcx, summed into xmm4.
LBB0_34:
	LONG $0x2c10fac5; BYTE $0x8f                // vmovss (%rdi,%rcx,4), %xmm5
	LONG $0x7410fac5; WORD $0x048f              // vmovss 4(%rdi,%rcx,4), %xmm6
	LONG $0x2c5cd2c5; BYTE $0x8e                // vsubss (%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5                            // vmulss %xmm5, %xmm5, %xmm5
	LONG $0xe558dac5                            // vaddss %xmm5, %xmm4, %xmm4
	LONG $0x6c5ccac5; WORD $0x048e              // vsubss 4(%rsi,%rcx,4), %xmm6, %xmm5
	LONG $0xed59d2c5                            // vmulss %xmm5, %xmm5, %xmm5
	LONG $0xe558dac5                            // vaddss %xmm5, %xmm4, %xmm4
	LONG $0x6c10fac5; WORD $0x088f              // vmovss 8(%rdi,%rcx,4), %xmm5
	LONG $0x6c5cd2c5; WORD $0x088e              // vsubss 8(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5                            // vmulss %xmm5, %xmm5, %xmm5
	LONG $0xe558dac5                            // vaddss %xmm5, %xmm4, %xmm4
	LONG $0x6c10fac5; WORD $0x0c8f              // vmovss 12(%rdi,%rcx,4), %xmm5
	LONG $0x6c5cd2c5; WORD $0x0c8e              // vsubss 12(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5                            // vmulss %xmm5, %xmm5, %xmm5
	LONG $0xe558dac5                            // vaddss %xmm5, %xmm4, %xmm4
	LONG $0x04c18348                            // addq $4, %rcx
	WORD $0xc839                                // cmpl %ecx, %eax
	JNE  LBB0_34

	// Final reduction: add the four 256-bit accumulators (ymm0..ymm3),
	// sum the eight lanes horizontally, add the scalar tail in xmm4, store
	// the result and return.
LBB0_35:
	LONG $0xc158fcc5                            // vaddps %ymm1, %ymm0, %ymm0
	LONG $0xca58e4c5                            // vaddps %ymm2, %ymm3, %ymm1
	LONG $0xc058f4c5                            // vaddps %ymm0, %ymm1, %ymm0
	LONG $0xc07cffc5                            // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5                            // vhaddps %ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1              // vextractf128 $1, %ymm0, %xmm1
	LONG $0xc158fac5                            // vaddss %xmm1, %xmm0, %xmm0
	LONG $0xc058dac5                            // vaddss %xmm0, %xmm4, %xmm0
	LONG $0x0211fac5                            // vmovss %xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec                    // movq %rbp, %rsp
	BYTE $0x5d                                  // popq %rbp
	WORD $0xf8c5; BYTE $0x77                    // vzeroupper
	BYTE $0xc3                                  // retq

	// Hand-added target of the count <= 0 guard: result is 0.0. Reuses the
	// store and epilogue at LBB0_36 (the prologue has already run).
LBB0_empty:
	LONG $0xc057f8c5                            // vxorps %xmm0, %xmm0, %xmm0
	JMP  LBB0_36