//go:build !noasm && amd64

// AUTO-GENERATED BY GOAT -- DO NOT EDIT
//
// NOTE(review): the "n <= 0" guard (the testl/JLE pair after the first JG,
// and the LBB0_27 block) was added by hand. Regenerating this file will drop
// it unless the generator input gets an equivalent guard.
//
// l2_256 stores the sum of squared differences of two float32 arrays:
//
//	*res = sum over i in [0, n) of (a[i] - b[i])^2
//
// Arguments (four 8-byte words, 32 bytes total, no Go stack frame):
//	a   +0(FP)  -> DI  address of the first float32 array
//	b   +8(FP)  -> SI  address of the second float32 array
//	res +16(FP) -> DX  address that receives the float32 result
//	len +24(FP) -> CX  address of a 64-bit word holding the element count;
//	                   only its low 32 bits (EAX) are used as n
// Presumably declared on the Go side with four pointer-sized parameters --
// confirm against the stub file.
//
// Needs AVX and FMA (vfmadd231ps is used in the vector loops).
//
// Control flow:
//	n <= 0   LBB0_27: result is 0.0, nothing is read from a or b
//	n 1..7   scalar only: LBB0_4 peels n%4 elements one at a time, LBB0_8
//	         handles the rest four per iteration; sum lives in xmm0
//	n >= 8   LBB0_16: 32 floats per iteration into ymm0..ymm3
//	         LBB0_11: 8 floats per iteration into ymm3
//	         LBB0_20 / LBB0_24: scalar tail (same shape as above) into xmm4
//	         LBB0_25: adds the four accumulators, reduces horizontally,
//	         adds the scalar tail and stores the result
//
// The trailing comment on each directive is the AT&T disassembly of the
// encoded bytes.

TEXT ·l2_256(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX
	BYTE $0x55               // pushq	%rbp
	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp
	LONG $0xf8e48348         // andq	$-8, %rsp
	WORD $0x8b48; BYTE $0x01 // movq	(%rcx), %rax
	WORD $0xf883; BYTE $0x07 // cmpl	$7, %eax
	JG   LBB0_9
	// Guard: the scalar loops below test their counters only after the first
	// iteration, so a count of zero (or a negative one) must not reach them.
	WORD $0xc085             // testl	%eax, %eax
	JLE  LBB0_27
	LONG $0xff408d44         // leal	-1(%rax), %r8d
	WORD $0x03a8             // testb	$3, %al
	JE   LBB0_2
	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
	LONG $0x03e18341         // andl	$3, %r9d
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	WORD $0xc931             // xorl	%ecx, %ecx

// Small path: peel n%4 elements (r9d of them) one at a time.
LBB0_4:
	LONG $0x0f10fac5         // vmovss	(%rdi), %xmm1
	LONG $0x0e5cf2c5         // vsubss	(%rsi), %xmm1, %xmm1
	LONG $0xc959f2c5         // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5         // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x04c78348         // addq	$4, %rdi
	LONG $0x04c68348         // addq	$4, %rsi
	LONG $0x01c18348         // addq	$1, %rcx
	WORD $0x3941; BYTE $0xc9 // cmpl	%ecx, %r9d
	JNE  LBB0_4
	WORD $0xc829             // subl	%ecx, %eax
	LONG $0x03f88341         // cmpl	$3, %r8d
	JAE  LBB0_7

// Small-path exit: store xmm0 and return. Only 128-bit VEX instructions ran
// on this path.
LBB0_26:
	LONG $0x0211fac5         // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq	%rbp, %rsp
	BYTE $0x5d               // popq	%rbp
	BYTE $0xc3               // retq

// n <= 0: the result is 0.0; reuse the small-path epilogue.
LBB0_27:
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	JMP  LBB0_26

// n >= 8: vector path.
LBB0_9:
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	WORD $0xf883; BYTE $0x20 // cmpl	$32, %eax
	JB   LBB0_10
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0xc957f0c5         // vxorps	%xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5         // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5         // vxorps	%xmm3, %xmm3, %xmm3

// 32 floats per iteration, one accumulator per 8-float lane group.
LBB0_16:
	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20 // vmovups	32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40 // vmovups	64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60 // vmovups	96(%rdi), %ymm7
	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
	LONG $0x6e5cd4c5; BYTE $0x20 // vsubps	32(%rsi), %ymm5, %ymm5
	LONG $0x765cccc5; BYTE $0x40 // vsubps	64(%rsi), %ymm6, %ymm6
	LONG $0x7e5cc4c5; BYTE $0x60 // vsubps	96(%rsi), %ymm7, %ymm7
	LONG $0xb85de2c4; BYTE $0xdc // vfmadd231ps	%ymm4, %ymm4, %ymm3
	LONG $0xb855e2c4; BYTE $0xd5 // vfmadd231ps	%ymm5, %ymm5, %ymm2
	LONG $0xb84de2c4; BYTE $0xce // vfmadd231ps	%ymm6, %ymm6, %ymm1
	LONG $0xb845e2c4; BYTE $0xc7 // vfmadd231ps	%ymm7, %ymm7, %ymm0
	WORD $0xc083; BYTE $0xe0     // addl	$-32, %eax
	LONG $0x80ef8348             // subq	$-128, %rdi
	LONG $0x80ee8348             // subq	$-128, %rsi
	WORD $0xf883; BYTE $0x1f     // cmpl	$31, %eax
	JA   LBB0_16
	WORD $0xf883; BYTE $0x08     // cmpl	$8, %eax
	JAE  LBB0_11
	JMP  LBB0_13

// 8 <= n < 32 on entry: xmm0 was cleared at LBB0_9, clear the other three.
LBB0_10:
	LONG $0xc957f0c5 // vxorps	%xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3

// 8 floats per iteration into ymm3.
LBB0_11:
	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
	LONG $0x265cdcc5             // vsubps	(%rsi), %ymm4, %ymm4
	LONG $0xb85de2c4; BYTE $0xdc // vfmadd231ps	%ymm4, %ymm4, %ymm3
	WORD $0xc083; BYTE $0xf8     // addl	$-8, %eax
	LONG $0x20c78348             // addq	$32, %rdi
	LONG $0x20c68348             // addq	$32, %rsi
	WORD $0xf883; BYTE $0x07     // cmpl	$7, %eax
	JA   LBB0_11

// Scalar tail of the vector path: eax holds the 0..7 elements still to do.
LBB0_13:
	WORD $0xc085             // testl	%eax, %eax
	JE   LBB0_14
	LONG $0xff408d44         // leal	-1(%rax), %r8d
	WORD $0x03a8             // testb	$3, %al
	JE   LBB0_18
	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
	LONG $0x03e18341         // andl	$3, %r9d
	LONG $0xe457d8c5         // vxorps	%xmm4, %xmm4, %xmm4
	WORD $0xc931             // xorl	%ecx, %ecx

// Tail: peel remaining%4 elements one at a time into xmm4.
LBB0_20:
	LONG $0x2f10fac5         // vmovss	(%rdi), %xmm5
	LONG $0x2e5cd2c5         // vsubss	(%rsi), %xmm5, %xmm5
	LONG $0xed59d2c5         // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5         // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x04c78348         // addq	$4, %rdi
	LONG $0x04c68348         // addq	$4, %rsi
	LONG $0x01c18348         // addq	$1, %rcx
	WORD $0x3941; BYTE $0xc9 // cmpl	%ecx, %r9d
	JNE  LBB0_20
	WORD $0xc829             // subl	%ecx, %eax
	LONG $0x03f88341         // cmpl	$3, %r8d
	JAE  LBB0_23
	JMP  LBB0_25

// Small path, n is a multiple of 4: nothing to peel.
LBB0_2:
	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0x03f88341 // cmpl	$3, %r8d
	JB   LBB0_26

LBB0_7:
	WORD $0xc089 // movl	%eax, %eax
	WORD $0xc931 // xorl	%ecx, %ecx

// Small path: four elements per iteration. The loop ends only when ecx == eax,
// so eax must be a positive multiple of 4 here.
LBB0_8:
	LONG $0x0c10fac5; BYTE $0x8f   // vmovss	(%rdi,%rcx,4), %xmm1
	LONG $0x5410fac5; WORD $0x048f // vmovss	4(%rdi,%rcx,4), %xmm2
	LONG $0x0c5cf2c5; BYTE $0x8e   // vsubss	(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x4c5ceac5; WORD $0x048e // vsubss	4(%rsi,%rcx,4), %xmm2, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x4c10fac5; WORD $0x088f // vmovss	8(%rdi,%rcx,4), %xmm1
	LONG $0x4c5cf2c5; WORD $0x088e // vsubss	8(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x4c10fac5; WORD $0x0c8f // vmovss	12(%rdi,%rcx,4), %xmm1
	LONG $0x4c5cf2c5; WORD $0x0c8e // vsubss	12(%rsi,%rcx,4), %xmm1, %xmm1
	LONG $0xc959f2c5               // vmulss	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x04c18348               // addq	$4, %rcx
	WORD $0xc839                   // cmpl	%ecx, %eax
	JNE  LBB0_8
	JMP  LBB0_26

// Vector path, no scalar tail: the tail accumulator is just zero.
LBB0_14:
	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
	JMP  LBB0_25

// Vector path, tail is a multiple of 4: nothing to peel.
LBB0_18:
	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
	LONG $0x03f88341 // cmpl	$3, %r8d
	JB   LBB0_25

LBB0_23:
	WORD $0xc089 // movl	%eax, %eax
	WORD $0xc931 // xorl	%ecx, %ecx

// Tail: four elements per iteration into xmm4.
LBB0_24:
	LONG $0x2c10fac5; BYTE $0x8f   // vmovss	(%rdi,%rcx,4), %xmm5
	LONG $0x7410fac5; WORD $0x048f // vmovss	4(%rdi,%rcx,4), %xmm6
	LONG $0x2c5cd2c5; BYTE $0x8e   // vsubss	(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x6c5ccac5; WORD $0x048e // vsubss	4(%rsi,%rcx,4), %xmm6, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x6c10fac5; WORD $0x088f // vmovss	8(%rdi,%rcx,4), %xmm5
	LONG $0x6c5cd2c5; WORD $0x088e // vsubss	8(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x6c10fac5; WORD $0x0c8f // vmovss	12(%rdi,%rcx,4), %xmm5
	LONG $0x6c5cd2c5; WORD $0x0c8e // vsubss	12(%rsi,%rcx,4), %xmm5, %xmm5
	LONG $0xed59d2c5               // vmulss	%xmm5, %xmm5, %xmm5
	LONG $0xe558dac5               // vaddss	%xmm5, %xmm4, %xmm4
	LONG $0x04c18348               // addq	$4, %rcx
	WORD $0xc839                   // cmpl	%ecx, %eax
	JNE  LBB0_24

// Combine the four vector accumulators, reduce to one float, add the scalar
// tail (xmm4), store, and clear the upper ymm halves before returning.
LBB0_25:
	LONG $0xd358ecc5               // vaddps	%ymm3, %ymm2, %ymm2
	LONG $0xc058f4c5               // vaddps	%ymm0, %ymm1, %ymm0
	LONG $0xc258fcc5               // vaddps	%ymm2, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0xc058dac5               // vaddss	%xmm0, %xmm4, %xmm0
	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
	BYTE $0x5d                     // popq	%rbp
	WORD $0xf8c5; BYTE $0x77       // vzeroupper
	BYTE $0xc3                     // retq