//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// Origin: gitee.com/quant1x/num@v0.3.2/asm/floats_avx_amd64.s
//
// AVX/FMA float32 kernels in Go assembler syntax. Instructions are emitted as
// raw LONG/WORD/BYTE encodings; the trailing comment on each line gives the
// Intel-syntax mnemonic that the bytes encode.
//
// Shape shared by every routine below:
//   1. n is split with a signed divide by 8 (lea/cmovns/sar) into a block
//      count and a remainder; only the low 32 bits of each are tested.
//   2. The block loop handles eight float32 values per 256-bit operation.
//   3. The remainder is handled one float32 at a time. Each routine also has
//      a 32-element-wide remainder loop guarded by `cmp rax, 32`; because the
//      remainder computed in step 1 is at most 7, that path looks unreachable
//      and is kept exactly as generated.
//   4. VZEROUPPER is issued before RET.
//
// Arguments are loaded from the Go frame into DI, SI, DX, CX. Scratch
// registers used: AX, R8-R11, and BL in the routines that do overlap checks.

// ___mm256_mul_const_add_to: c[i] += a[i] * b[0].
//   a+0(FP)  -> DI (rdi)  input vector
//   b+8(FP)  -> SI (rsi)  address of the single float32 multiplier
//   c+16(FP) -> DX (rdx)  vector that is read, accumulated into and written
//   n+24(FP) -> CX (rcx)  element count
TEXT ·___mm256_mul_const_add_to(SB), $0-32

    MOVQ a+0(FP), DI
    MOVQ b+8(FP), SI
    MOVQ c+16(FP), DX
    MOVQ n+24(FP), CX

    // rax = n / 8 (rounded toward zero), rcx = n - 8*rax.
    LONG $0x07418d4c // lea r8, [rcx + 7]
    WORD $0x8548; BYTE $0xc9 // test rcx, rcx
    LONG $0xc1490f4c // cmovns r8, rcx
    WORD $0x894c; BYTE $0xc0 // mov rax, r8
    LONG $0x03f8c148 // sar rax, 3
    LONG $0xf8e08349 // and r8, -8
    WORD $0x294c; BYTE $0xc1 // sub rcx, r8
    WORD $0xc085 // test eax, eax
    JLE LBB0_6
    WORD $0xf883; BYTE $0x01 // cmp eax, 1
    JE LBB0_4
    WORD $0x8941; BYTE $0xc0 // mov r8d, eax
    LONG $0xfee08341 // and r8d, -2

    // Block loop, two 8-float blocks per iteration; r8d counts blocks left.
LBB0_3:
    LONG $0x0710fcc5 // vmovups ymm0, yword [rdi]
    LONG $0x187de2c4; BYTE $0x0e // vbroadcastss ymm1, dword [rsi]
    LONG $0xa87de2c4; BYTE $0x0a // vfmadd213ps ymm1, ymm0, yword [rdx]
    LONG $0x0a11fcc5 // vmovups yword [rdx], ymm1
    LONG $0x4710fcc5; BYTE $0x20 // vmovups ymm0, yword [rdi + 32]
    LONG $0x187de2c4; BYTE $0x0e // vbroadcastss ymm1, dword [rsi]
    LONG $0xa87de2c4; WORD $0x204a // vfmadd213ps ymm1, ymm0, yword [rdx + 32]
    LONG $0x4a11fcc5; BYTE $0x20 // vmovups yword [rdx + 32], ymm1
    LONG $0x40c78348 // add rdi, 64
    LONG $0x40c28348 // add rdx, 64
    LONG $0xfec08341 // add r8d, -2
    JNE LBB0_3

    // Odd block count: one more 8-float block.
LBB0_4:
    WORD $0x01a8 // test al, 1
    JE LBB0_6
    LONG $0x0710fcc5 // vmovups ymm0, yword [rdi]
    LONG $0x187de2c4; BYTE $0x0e // vbroadcastss ymm1, dword [rsi]
    LONG $0xa87de2c4; BYTE $0x0a // vfmadd213ps ymm1, ymm0, yword [rdx]
    LONG $0x0a11fcc5 // vmovups yword [rdx], ymm1
    LONG $0x20c78348 // add rdi, 32
    LONG $0x20c28348 // add rdx, 32

    // Remainder: rax = number of leftover elements, r8 = index already done.
LBB0_6:
    WORD $0xc985 // test ecx, ecx
    JLE LBB0_18
    WORD $0xc889 // mov eax, ecx
    LONG $0x20f88348 // cmp rax, 32
    JAE LBB0_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB0_14

    // Wide remainder path: fall back to scalar code if c overlaps a or *b.
LBB0_9:
    LONG $0x82048d4c // lea r8, [rdx + 4*rax]
    LONG $0x870c8d4c // lea r9, [rdi + 4*rax]
    LONG $0x04568d4c // lea r10, [rsi + 4]
    WORD $0x394c; BYTE $0xca // cmp rdx, r9
    LONG $0xd3920f41 // setb r11b
    WORD $0x394c; BYTE $0xc7 // cmp rdi, r8
    WORD $0x920f; BYTE $0xd3 // setb bl
    WORD $0x394c; BYTE $0xd2 // cmp rdx, r10
    LONG $0xd1920f41 // setb r9b
    WORD $0x3949; BYTE $0xf0 // cmp r8, rsi
    LONG $0xd2970f41 // seta r10b
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    WORD $0x8441; BYTE $0xdb // test r11b, bl
    JNE LBB0_14
    WORD $0x2045; BYTE $0xd1 // and r9b, r10b
    JNE LBB0_14
    WORD $0x8941; BYTE $0xc9 // mov r9d, ecx
    LONG $0x1fe18341 // and r9d, 31
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    WORD $0x294d; BYTE $0xc8 // sub r8, r9
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
LBB0_12:
    LONG $0x107ca1c4; WORD $0x970c // vmovups ymm1, yword [rdi + 4*r10]
    LONG $0x107ca1c4; WORD $0x9754; BYTE $0x20 // vmovups ymm2, yword [rdi + 4*r10 + 32]
    LONG $0x107ca1c4; WORD $0x975c; BYTE $0x40 // vmovups ymm3, yword [rdi + 4*r10 + 64]
    LONG $0x107ca1c4; WORD $0x9764; BYTE $0x60 // vmovups ymm4, yword [rdi + 4*r10 + 96]
    LONG $0xa87da2c4; WORD $0x920c // vfmadd213ps ymm1, ymm0, yword [rdx + 4*r10]
    LONG $0xa87da2c4; WORD $0x9254; BYTE $0x20 // vfmadd213ps ymm2, ymm0, yword [rdx + 4*r10 + 32]
    LONG $0xa87da2c4; WORD $0x925c; BYTE $0x40 // vfmadd213ps ymm3, ymm0, yword [rdx + 4*r10 + 64]
    LONG $0xa87da2c4; WORD $0x9264; BYTE $0x60 // vfmadd213ps ymm4, ymm0, yword [rdx + 4*r10 + 96]
    LONG $0x117ca1c4; WORD $0x920c // vmovups yword [rdx + 4*r10], ymm1
    LONG $0x117ca1c4; WORD $0x9254; BYTE $0x20 // vmovups yword [rdx + 4*r10 + 32], ymm2
    LONG $0x117ca1c4; WORD $0x925c; BYTE $0x40 // vmovups yword [rdx + 4*r10 + 64], ymm3
    LONG $0x117ca1c4; WORD $0x9264; BYTE $0x60 // vmovups yword [rdx + 4*r10 + 96], ymm4
    LONG $0x20c28349 // add r10, 32
    WORD $0x394d; BYTE $0xd0 // cmp r8, r10
    JNE LBB0_12
    WORD $0x854d; BYTE $0xc9 // test r9, r9
    JE LBB0_18

    // Scalar remainder: peel one element if the count is odd, then go by two.
LBB0_14:
    WORD $0x2944; BYTE $0xc1 // sub ecx, r8d
    LONG $0x01488d4d // lea r9, [r8 + 1]
    WORD $0xc1f6; BYTE $0x01 // test cl, 1
    JE LBB0_16
    LONG $0x107aa1c4; WORD $0x8704 // vmovss xmm0, dword [rdi + 4*r8]
    LONG $0x0e10fac5 // vmovss xmm1, dword [rsi]
    LONG $0xa979a2c4; WORD $0x820c // vfmadd213ss xmm1, xmm0, dword [rdx + 4*r8]
    LONG $0x117aa1c4; WORD $0x820c // vmovss dword [rdx + 4*r8], xmm1
    WORD $0x894d; BYTE $0xc8 // mov r8, r9
LBB0_16:
    WORD $0x394c; BYTE $0xc8 // cmp rax, r9
    JE LBB0_18
LBB0_17:
    LONG $0x107aa1c4; WORD $0x8704 // vmovss xmm0, dword [rdi + 4*r8]
    LONG $0x0e10fac5 // vmovss xmm1, dword [rsi]
    LONG $0xa979a2c4; WORD $0x820c // vfmadd213ss xmm1, xmm0, dword [rdx + 4*r8]
    LONG $0x117aa1c4; WORD $0x820c // vmovss dword [rdx + 4*r8], xmm1
    LONG $0x107aa1c4; WORD $0x8744; BYTE $0x04 // vmovss xmm0, dword [rdi + 4*r8 + 4]
    LONG $0x0e10fac5 // vmovss xmm1, dword [rsi]
    LONG $0xa979a2c4; WORD $0x824c; BYTE $0x04 // vfmadd213ss xmm1, xmm0, dword [rdx + 4*r8 + 4]
    LONG $0x117aa1c4; WORD $0x824c; BYTE $0x04 // vmovss dword [rdx + 4*r8 + 4], xmm1
    LONG $0x02c08349 // add r8, 2
    WORD $0x394c; BYTE $0xc0 // cmp rax, r8
    JNE LBB0_17
LBB0_18:
    VZEROUPPER
    RET

// ___mm256_mul_const_to: c[i] = a[i] * b[0].
//   a+0(FP)  -> DI (rdi)  input vector
//   b+8(FP)  -> SI (rsi)  address of the single float32 multiplier
//   c+16(FP) -> DX (rdx)  output vector
//   n+24(FP) -> CX (rcx)  element count
TEXT ·___mm256_mul_const_to(SB), $0-32

    MOVQ a+0(FP), DI
    MOVQ b+8(FP), SI
    MOVQ c+16(FP), DX
    MOVQ n+24(FP), CX

    // rax = n / 8 (rounded toward zero), rcx = n - 8*rax.
    LONG $0x07418d4c // lea r8, [rcx + 7]
    WORD $0x8548; BYTE $0xc9 // test rcx, rcx
    LONG $0xc1490f4c // cmovns r8, rcx
    WORD $0x894c; BYTE $0xc0 // mov rax, r8
    LONG $0x03f8c148 // sar rax, 3
    LONG $0xf8e08349 // and r8, -8
    WORD $0x294c; BYTE $0xc1 // sub rcx, r8
    WORD $0xc085 // test eax, eax
    JLE LBB1_6
    WORD $0xf883; BYTE $0x01 // cmp eax, 1
    JE LBB1_4
    WORD $0x8941; BYTE $0xc0 // mov r8d, eax
    LONG $0xfee08341 // and r8d, -2

    // Block loop, two 8-float blocks per iteration; r8d counts blocks left.
LBB1_3:
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x0211fcc5 // vmovups yword [rdx], ymm0
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    LONG $0x4759fcc5; BYTE $0x20 // vmulps ymm0, ymm0, yword [rdi + 32]
    LONG $0x4211fcc5; BYTE $0x20 // vmovups yword [rdx + 32], ymm0
    LONG $0x40c78348 // add rdi, 64
    LONG $0x40c28348 // add rdx, 64
    LONG $0xfec08341 // add r8d, -2
    JNE LBB1_3

    // Odd block count: one more 8-float block.
LBB1_4:
    WORD $0x01a8 // test al, 1
    JE LBB1_6
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x0211fcc5 // vmovups yword [rdx], ymm0
    LONG $0x20c78348 // add rdi, 32
    LONG $0x20c28348 // add rdx, 32

    // Remainder: rax = number of leftover elements, r8 = index already done.
LBB1_6:
    WORD $0xc985 // test ecx, ecx
    JLE LBB1_18
    WORD $0xc889 // mov eax, ecx
    LONG $0x20f88348 // cmp rax, 32
    JAE LBB1_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB1_14

    // Wide remainder path: fall back to scalar code if c overlaps a or *b.
LBB1_9:
    LONG $0x82048d4c // lea r8, [rdx + 4*rax]
    LONG $0x870c8d4c // lea r9, [rdi + 4*rax]
    LONG $0x04568d4c // lea r10, [rsi + 4]
    WORD $0x394c; BYTE $0xca // cmp rdx, r9
    LONG $0xd3920f41 // setb r11b
    WORD $0x394c; BYTE $0xc7 // cmp rdi, r8
    WORD $0x920f; BYTE $0xd3 // setb bl
    WORD $0x394c; BYTE $0xd2 // cmp rdx, r10
    LONG $0xd1920f41 // setb r9b
    WORD $0x3949; BYTE $0xf0 // cmp r8, rsi
    LONG $0xd2970f41 // seta r10b
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    WORD $0x8441; BYTE $0xdb // test r11b, bl
    JNE LBB1_14
    WORD $0x2045; BYTE $0xd1 // and r9b, r10b
    JNE LBB1_14
    WORD $0x8941; BYTE $0xc9 // mov r9d, ecx
    LONG $0x1fe18341 // and r9d, 31
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    WORD $0x294d; BYTE $0xc8 // sub r8, r9
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
LBB1_12:
    LONG $0x597ca1c4; WORD $0x970c // vmulps ymm1, ymm0, yword [rdi + 4*r10]
    LONG $0x597ca1c4; WORD $0x9754; BYTE $0x20 // vmulps ymm2, ymm0, yword [rdi + 4*r10 + 32]
    LONG $0x597ca1c4; WORD $0x975c; BYTE $0x40 // vmulps ymm3, ymm0, yword [rdi + 4*r10 + 64]
    LONG $0x597ca1c4; WORD $0x9764; BYTE $0x60 // vmulps ymm4, ymm0, yword [rdi + 4*r10 + 96]
    LONG $0x117ca1c4; WORD $0x920c // vmovups yword [rdx + 4*r10], ymm1
    LONG $0x117ca1c4; WORD $0x9254; BYTE $0x20 // vmovups yword [rdx + 4*r10 + 32], ymm2
    LONG $0x117ca1c4; WORD $0x925c; BYTE $0x40 // vmovups yword [rdx + 4*r10 + 64], ymm3
    LONG $0x117ca1c4; WORD $0x9264; BYTE $0x60 // vmovups yword [rdx + 4*r10 + 96], ymm4
    LONG $0x20c28349 // add r10, 32
    WORD $0x394d; BYTE $0xd0 // cmp r8, r10
    JNE LBB1_12
    WORD $0x854d; BYTE $0xc9 // test r9, r9
    JE LBB1_18

    // Scalar remainder: (count & 3) single steps first, then groups of four.
    // r9 = rax - r8 - 1 decides whether any group of four is left.
LBB1_14:
    WORD $0x2944; BYTE $0xc1 // sub ecx, r8d
    WORD $0x894d; BYTE $0xc1 // mov r9, r8
    WORD $0xf749; BYTE $0xd1 // not r9
    WORD $0x0149; BYTE $0xc1 // add r9, rax
    LONG $0x03e18348 // and rcx, 3
    JE LBB1_16
LBB1_15:
    LONG $0x0610fac5 // vmovss xmm0, dword [rsi]
    LONG $0x597aa1c4; WORD $0x8704 // vmulss xmm0, xmm0, dword [rdi + 4*r8]
    LONG $0x117aa1c4; WORD $0x8204 // vmovss dword [rdx + 4*r8], xmm0
    WORD $0xff49; BYTE $0xc0 // inc r8
    WORD $0xff48; BYTE $0xc9 // dec rcx
    JNE LBB1_15
LBB1_16:
    LONG $0x03f98349 // cmp r9, 3
    JB LBB1_18
LBB1_17:
    LONG $0x0610fac5 // vmovss xmm0, dword [rsi]
    LONG $0x597aa1c4; WORD $0x8704 // vmulss xmm0, xmm0, dword [rdi + 4*r8]
    LONG $0x117aa1c4; WORD $0x8204 // vmovss dword [rdx + 4*r8], xmm0
    LONG $0x0610fac5 // vmovss xmm0, dword [rsi]
    LONG $0x597aa1c4; WORD $0x8744; BYTE $0x04 // vmulss xmm0, xmm0, dword [rdi + 4*r8 + 4]
    LONG $0x117aa1c4; WORD $0x8244; BYTE $0x04 // vmovss dword [rdx + 4*r8 + 4], xmm0
    LONG $0x0610fac5 // vmovss xmm0, dword [rsi]
    LONG $0x597aa1c4; WORD $0x8744; BYTE $0x08 // vmulss xmm0, xmm0, dword [rdi + 4*r8 + 8]
    LONG $0x117aa1c4; WORD $0x8244; BYTE $0x08 // vmovss dword [rdx + 4*r8 + 8], xmm0
    LONG $0x0610fac5 // vmovss xmm0, dword [rsi]
    LONG $0x597aa1c4; WORD $0x8744; BYTE $0x0c // vmulss xmm0, xmm0, dword [rdi + 4*r8 + 12]
    LONG $0x117aa1c4; WORD $0x8244; BYTE $0x0c // vmovss dword [rdx + 4*r8 + 12], xmm0
    LONG $0x04c08349 // add r8, 4
    WORD $0x394c; BYTE $0xc0 // cmp rax, r8
    JNE LBB1_17
LBB1_18:
    VZEROUPPER
    RET

// ___mm256_mul_const: a[i] *= b[0], in place.
//   a+0(FP)  -> DI (rdi)  vector that is read and overwritten
//   b+8(FP)  -> SI (rsi)  address of the single float32 multiplier
//   n+16(FP) -> DX (rdx)  element count
TEXT ·___mm256_mul_const(SB), $0-24

    MOVQ a+0(FP), DI
    MOVQ b+8(FP), SI
    MOVQ n+16(FP), DX

    // rax = n / 8 (rounded toward zero), rdx = n - 8*rax.
    LONG $0x074a8d48 // lea rcx, [rdx + 7]
    WORD $0x8548; BYTE $0xd2 // test rdx, rdx
    LONG $0xca490f48 // cmovns rcx, rdx
    WORD $0x8948; BYTE $0xc8 // mov rax, rcx
    LONG $0x03f8c148 // sar rax, 3
    LONG $0xf8e18348 // and rcx, -8
    WORD $0x2948; BYTE $0xca // sub rdx, rcx
    WORD $0xc085 // test eax, eax
    JLE LBB2_6
    WORD $0xf883; BYTE $0x01 // cmp eax, 1
    JE LBB2_4
    WORD $0xc189 // mov ecx, eax
    WORD $0xe183; BYTE $0xfe // and ecx, -2

    // Block loop, two 8-float blocks per iteration; ecx counts blocks left.
LBB2_3:
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x0711fcc5 // vmovups yword [rdi], ymm0
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    LONG $0x4759fcc5; BYTE $0x20 // vmulps ymm0, ymm0, yword [rdi + 32]
    LONG $0x4711fcc5; BYTE $0x20 // vmovups yword [rdi + 32], ymm0
    LONG $0x40c78348 // add rdi, 64
    WORD $0xc183; BYTE $0xfe // add ecx, -2
    JNE LBB2_3

    // Odd block count: one more 8-float block.
LBB2_4:
    WORD $0x01a8 // test al, 1
    JE LBB2_6
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x0711fcc5 // vmovups yword [rdi], ymm0
    LONG $0x20c78348 // add rdi, 32

    // Remainder: rax = number of leftover elements. The wide path (LBB2_12)
    // is taken only for 32 or more elements that do not overlap *b.
LBB2_6:
    WORD $0xd285 // test edx, edx
    JLE LBB2_19
    WORD $0xd089 // mov eax, edx
    LONG $0x20f88348 // cmp rax, 32
    JB LBB2_8
    LONG $0x044e8d48 // lea rcx, [rsi + 4]
    WORD $0x3948; BYTE $0xcf // cmp rdi, rcx
    JAE LBB2_12
    LONG $0x870c8d48 // lea rcx, [rdi + 4*rax]
    WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi
    JBE LBB2_12
LBB2_8:
    WORD $0xc931 // xor ecx, ecx

    // Scalar remainder starting at index rcx: (count & 3) single steps first,
    // then groups of four. r8 = rax - rcx - 1.
LBB2_15:
    WORD $0xca29 // sub edx, ecx
    WORD $0x8949; BYTE $0xc8 // mov r8, rcx
    WORD $0xf749; BYTE $0xd0 // not r8
    WORD $0x0149; BYTE $0xc0 // add r8, rax
    LONG $0x03e28348 // and rdx, 3
    JE LBB2_17
LBB2_16:
    LONG $0x0410fac5; BYTE $0x8f // vmovss xmm0, dword [rdi + 4*rcx]
    LONG $0x0659fac5 // vmulss xmm0, xmm0, dword [rsi]
    LONG $0x0411fac5; BYTE $0x8f // vmovss dword [rdi + 4*rcx], xmm0
    WORD $0xff48; BYTE $0xc1 // inc rcx
    WORD $0xff48; BYTE $0xca // dec rdx
    JNE LBB2_16
LBB2_17:
    LONG $0x03f88349 // cmp r8, 3
    JB LBB2_19
LBB2_18:
    LONG $0x0410fac5; BYTE $0x8f // vmovss xmm0, dword [rdi + 4*rcx]
    LONG $0x0659fac5 // vmulss xmm0, xmm0, dword [rsi]
    LONG $0x4c10fac5; WORD $0x048f // vmovss xmm1, dword [rdi + 4*rcx + 4]
    LONG $0x0411fac5; BYTE $0x8f // vmovss dword [rdi + 4*rcx], xmm0
    LONG $0x0659f2c5 // vmulss xmm0, xmm1, dword [rsi]
    LONG $0x4411fac5; WORD $0x048f // vmovss dword [rdi + 4*rcx + 4], xmm0
    LONG $0x4410fac5; WORD $0x088f // vmovss xmm0, dword [rdi + 4*rcx + 8]
    LONG $0x0659fac5 // vmulss xmm0, xmm0, dword [rsi]
    LONG $0x4411fac5; WORD $0x088f // vmovss dword [rdi + 4*rcx + 8], xmm0
    LONG $0x4410fac5; WORD $0x0c8f // vmovss xmm0, dword [rdi + 4*rcx + 12]
    LONG $0x0659fac5 // vmulss xmm0, xmm0, dword [rsi]
    LONG $0x4411fac5; WORD $0x0c8f // vmovss dword [rdi + 4*rcx + 12], xmm0
    LONG $0x04c18348 // add rcx, 4
    WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
    JNE LBB2_18
    JMP LBB2_19

    // Wide remainder path: 32 floats per iteration, then back to LBB2_15 for
    // whatever is left (r8 = count & 31).
LBB2_12:
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    LONG $0x1fe08341 // and r8d, 31
    WORD $0x8948; BYTE $0xc1 // mov rcx, rax
    WORD $0x294c; BYTE $0xc1 // sub rcx, r8
    LONG $0x187de2c4; BYTE $0x06 // vbroadcastss ymm0, dword [rsi]
    WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
LBB2_13:
    LONG $0x597ca1c4; WORD $0x8f0c // vmulps ymm1, ymm0, yword [rdi + 4*r9]
    LONG $0x597ca1c4; WORD $0x8f54; BYTE $0x20 // vmulps ymm2, ymm0, yword [rdi + 4*r9 + 32]
    LONG $0x597ca1c4; WORD $0x8f5c; BYTE $0x40 // vmulps ymm3, ymm0, yword [rdi + 4*r9 + 64]
    LONG $0x597ca1c4; WORD $0x8f64; BYTE $0x60 // vmulps ymm4, ymm0, yword [rdi + 4*r9 + 96]
    LONG $0x117ca1c4; WORD $0x8f0c // vmovups yword [rdi + 4*r9], ymm1
    LONG $0x117ca1c4; WORD $0x8f54; BYTE $0x20 // vmovups yword [rdi + 4*r9 + 32], ymm2
    LONG $0x117ca1c4; WORD $0x8f5c; BYTE $0x40 // vmovups yword [rdi + 4*r9 + 64], ymm3
    LONG $0x117ca1c4; WORD $0x8f64; BYTE $0x60 // vmovups yword [rdi + 4*r9 + 96], ymm4
    LONG $0x20c18349 // add r9, 32
    WORD $0x394c; BYTE $0xc9 // cmp rcx, r9
    JNE LBB2_13
    WORD $0x854d; BYTE $0xc0 // test r8, r8
    JNE LBB2_15
LBB2_19:
    VZEROUPPER
    RET

// ___mm256_mul_to: c[i] = a[i] * b[i].
//   a+0(FP)  -> DI (rdi)  first input vector
//   b+8(FP)  -> SI (rsi)  second input vector
//   c+16(FP) -> DX (rdx)  output vector
//   n+24(FP) -> CX (rcx)  element count
TEXT ·___mm256_mul_to(SB), $0-32

    MOVQ a+0(FP), DI
    MOVQ b+8(FP), SI
    MOVQ c+16(FP), DX
    MOVQ n+24(FP), CX

    // r8 = n / 8 (rounded toward zero), rcx = n - 8*r8.
    LONG $0x07418d48 // lea rax, [rcx + 7]
    WORD $0x8548; BYTE $0xc9 // test rcx, rcx
    LONG $0xc1490f48 // cmovns rax, rcx
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    LONG $0x03f8c149 // sar r8, 3
    LONG $0xf8e08348 // and rax, -8
    WORD $0x2948; BYTE $0xc1 // sub rcx, rax
    WORD $0x8545; BYTE $0xc0 // test r8d, r8d
    JLE LBB3_6
    WORD $0x8944; BYTE $0xc0 // mov eax, r8d
    WORD $0xe083; BYTE $0x03 // and eax, 3
    LONG $0x04f88341 // cmp r8d, 4
    JB LBB3_4
    LONG $0xfce08341 // and r8d, -4

    // Block loop, four 8-float blocks per iteration; r8d counts blocks left.
LBB3_3:
    LONG $0x0610fcc5 // vmovups ymm0, yword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x0211fcc5 // vmovups yword [rdx], ymm0
    LONG $0x4610fcc5; BYTE $0x20 // vmovups ymm0, yword [rsi + 32]
    LONG $0x4759fcc5; BYTE $0x20 // vmulps ymm0, ymm0, yword [rdi + 32]
    LONG $0x4211fcc5; BYTE $0x20 // vmovups yword [rdx + 32], ymm0
    LONG $0x4610fcc5; BYTE $0x40 // vmovups ymm0, yword [rsi + 64]
    LONG $0x4759fcc5; BYTE $0x40 // vmulps ymm0, ymm0, yword [rdi + 64]
    LONG $0x4211fcc5; BYTE $0x40 // vmovups yword [rdx + 64], ymm0
    LONG $0x4610fcc5; BYTE $0x60 // vmovups ymm0, yword [rsi + 96]
    LONG $0x4759fcc5; BYTE $0x60 // vmulps ymm0, ymm0, yword [rdi + 96]
    LONG $0x4211fcc5; BYTE $0x60 // vmovups yword [rdx + 96], ymm0
    LONG $0x80ef8348 // sub rdi, -128
    LONG $0x80ee8348 // sub rsi, -128
    LONG $0x80ea8348 // sub rdx, -128
    LONG $0xfcc08341 // add r8d, -4
    JNE LBB3_3

    // Leftover blocks (block count & 3), one 8-float block per iteration.
LBB3_4:
    WORD $0xc085 // test eax, eax
    JE LBB3_6
LBB3_5:
    LONG $0x0610fcc5 // vmovups ymm0, yword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x0211fcc5 // vmovups yword [rdx], ymm0
    LONG $0x20c78348 // add rdi, 32
    LONG $0x20c68348 // add rsi, 32
    LONG $0x20c28348 // add rdx, 32
    WORD $0xc8ff // dec eax
    JNE LBB3_5

    // Remainder: rax = number of leftover elements, r8 = index already done.
LBB3_6:
    WORD $0xc985 // test ecx, ecx
    JLE LBB3_18
    WORD $0xc889 // mov eax, ecx
    LONG $0x20f88348 // cmp rax, 32
    JAE LBB3_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB3_14

    // Wide remainder path: taken only when c is at least 128 bytes above both
    // a and b (unsigned distance), otherwise fall back to scalar code.
LBB3_9:
    WORD $0x8949; BYTE $0xd1 // mov r9, rdx
    WORD $0x2949; BYTE $0xf9 // sub r9, rdi
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128
    JB LBB3_14
    WORD $0x8949; BYTE $0xd1 // mov r9, rdx
    WORD $0x2949; BYTE $0xf1 // sub r9, rsi
    LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128
    JB LBB3_14
    WORD $0x8941; BYTE $0xc9 // mov r9d, ecx
    LONG $0x1fe18341 // and r9d, 31
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    WORD $0x294d; BYTE $0xc8 // sub r8, r9
    WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
LBB3_12:
    LONG $0x107ca1c4; WORD $0x9604 // vmovups ymm0, yword [rsi + 4*r10]
    LONG $0x107ca1c4; WORD $0x964c; BYTE $0x20 // vmovups ymm1, yword [rsi + 4*r10 + 32]
    LONG $0x107ca1c4; WORD $0x9654; BYTE $0x40 // vmovups ymm2, yword [rsi + 4*r10 + 64]
    LONG $0x107ca1c4; WORD $0x965c; BYTE $0x60 // vmovups ymm3, yword [rsi + 4*r10 + 96]
    LONG $0x597ca1c4; WORD $0x9704 // vmulps ymm0, ymm0, yword [rdi + 4*r10]
    LONG $0x5974a1c4; WORD $0x974c; BYTE $0x20 // vmulps ymm1, ymm1, yword [rdi + 4*r10 + 32]
    LONG $0x596ca1c4; WORD $0x9754; BYTE $0x40 // vmulps ymm2, ymm2, yword [rdi + 4*r10 + 64]
    LONG $0x5964a1c4; WORD $0x975c; BYTE $0x60 // vmulps ymm3, ymm3, yword [rdi + 4*r10 + 96]
    LONG $0x117ca1c4; WORD $0x9204 // vmovups yword [rdx + 4*r10], ymm0
    LONG $0x117ca1c4; WORD $0x924c; BYTE $0x20 // vmovups yword [rdx + 4*r10 + 32], ymm1
    LONG $0x117ca1c4; WORD $0x9254; BYTE $0x40 // vmovups yword [rdx + 4*r10 + 64], ymm2
    LONG $0x117ca1c4; WORD $0x925c; BYTE $0x60 // vmovups yword [rdx + 4*r10 + 96], ymm3
    LONG $0x20c28349 // add r10, 32
    WORD $0x394d; BYTE $0xd0 // cmp r8, r10
    JNE LBB3_12
    WORD $0x854d; BYTE $0xc9 // test r9, r9
    JE LBB3_18

    // Scalar remainder: (count & 3) single steps first, then groups of four.
    // r9 = rax - r8 - 1 decides whether any group of four is left.
LBB3_14:
    WORD $0x2944; BYTE $0xc1 // sub ecx, r8d
    WORD $0x894d; BYTE $0xc1 // mov r9, r8
    WORD $0xf749; BYTE $0xd1 // not r9
    WORD $0x0149; BYTE $0xc1 // add r9, rax
    LONG $0x03e18348 // and rcx, 3
    JE LBB3_16
LBB3_15:
    LONG $0x107aa1c4; WORD $0x8604 // vmovss xmm0, dword [rsi + 4*r8]
    LONG $0x597aa1c4; WORD $0x8704 // vmulss xmm0, xmm0, dword [rdi + 4*r8]
    LONG $0x117aa1c4; WORD $0x8204 // vmovss dword [rdx + 4*r8], xmm0
    WORD $0xff49; BYTE $0xc0 // inc r8
    WORD $0xff48; BYTE $0xc9 // dec rcx
    JNE LBB3_15
LBB3_16:
    LONG $0x03f98349 // cmp r9, 3
    JB LBB3_18
LBB3_17:
    LONG $0x107aa1c4; WORD $0x8604 // vmovss xmm0, dword [rsi + 4*r8]
    LONG $0x597aa1c4; WORD $0x8704 // vmulss xmm0, xmm0, dword [rdi + 4*r8]
    LONG $0x117aa1c4; WORD $0x8204 // vmovss dword [rdx + 4*r8], xmm0
    LONG $0x107aa1c4; WORD $0x8644; BYTE $0x04 // vmovss xmm0, dword [rsi + 4*r8 + 4]
    LONG $0x597aa1c4; WORD $0x8744; BYTE $0x04 // vmulss xmm0, xmm0, dword [rdi + 4*r8 + 4]
    LONG $0x117aa1c4; WORD $0x8244; BYTE $0x04 // vmovss dword [rdx + 4*r8 + 4], xmm0
    LONG $0x107aa1c4; WORD $0x8644; BYTE $0x08 // vmovss xmm0, dword [rsi + 4*r8 + 8]
    LONG $0x597aa1c4; WORD $0x8744; BYTE $0x08 // vmulss xmm0, xmm0, dword [rdi + 4*r8 + 8]
    LONG $0x117aa1c4; WORD $0x8244; BYTE $0x08 // vmovss dword [rdx + 4*r8 + 8], xmm0
    LONG $0x107aa1c4; WORD $0x8644; BYTE $0x0c // vmovss xmm0, dword [rsi + 4*r8 + 12]
    LONG $0x597aa1c4; WORD $0x8744; BYTE $0x0c // vmulss xmm0, xmm0, dword [rdi + 4*r8 + 12]
    LONG $0x117aa1c4; WORD $0x8244; BYTE $0x0c // vmovss dword [rdx + 4*r8 + 12], xmm0
    LONG $0x04c08349 // add r8, 4
    WORD $0x394c; BYTE $0xc0 // cmp rax, r8
    JNE LBB3_17
LBB3_18:
    VZEROUPPER
    RET

// ___mm256_dot: *ret = sum over i of a[i] * b[i].
//   a+0(FP)    -> DI (rdi)  first input vector
//   b+8(FP)    -> SI (rsi)  second input vector
//   n+16(FP)   -> DX (rdx)  element count
//   ret+24(FP) -> CX (rcx)  address the float32 result is stored to
//
// ymm0 is the running accumulator. It is seeded by the first 8-float product,
// extended by the block loops, reduced to one float and stored to *ret; the
// scalar remainder then keeps accumulating in xmm0 and re-stores *ret after
// every element.
TEXT ·___mm256_dot(SB), $0-32

    MOVQ a+0(FP), DI
    MOVQ b+8(FP), SI
    MOVQ n+16(FP), DX
    MOVQ ret+24(FP), CX

    // r9 = n / 8 (rounded toward zero), rdx = n - 8*r9.
    LONG $0x07428d48 // lea rax, [rdx + 7]
    WORD $0x8548; BYTE $0xd2 // test rdx, rdx
    LONG $0xc2490f48 // cmovns rax, rdx
    WORD $0x8949; BYTE $0xc1 // mov r9, rax
    LONG $0x03f9c149 // sar r9, 3
    LONG $0xf8e08348 // and rax, -8
    WORD $0x2948; BYTE $0xc2 // sub rdx, rax
    WORD $0x8545; BYTE $0xc9 // test r9d, r9d
    JLE LBB4_1

    // First block seeds the accumulator: ymm0 = a[0:8] * b[0:8].
    LONG $0x0610fcc5 // vmovups ymm0, yword [rsi]
    LONG $0x0759fcc5 // vmulps ymm0, ymm0, yword [rdi]
    LONG $0x20c78348 // add rdi, 32
    LONG $0x20c68348 // add rsi, 32
    LONG $0x01f98341 // cmp r9d, 1
    JE LBB4_8
    LONG $0xff418d45 // lea r8d, [r9 - 1]
    LONG $0xfec18341 // add r9d, -2
    WORD $0x8944; BYTE $0xc0 // mov eax, r8d
    WORD $0xe083; BYTE $0x03 // and eax, 3
    LONG $0x03f98341 // cmp r9d, 3
    JB LBB4_6
    LONG $0xfce08341 // and r8d, -4

    // Block loop, four 8-float blocks per iteration; r8d counts blocks left.
LBB4_5:
    LONG $0x0e10fcc5 // vmovups ymm1, yword [rsi]
    LONG $0x5610fcc5; BYTE $0x20 // vmovups ymm2, yword [rsi + 32]
    LONG $0x5e10fcc5; BYTE $0x40 // vmovups ymm3, yword [rsi + 64]
    LONG $0x987de2c4; BYTE $0x0f // vfmadd132ps ymm1, ymm0, yword [rdi]
    LONG $0xb86de2c4; WORD $0x204f // vfmadd231ps ymm1, ymm2, yword [rdi + 32]
    LONG $0xb865e2c4; WORD $0x404f // vfmadd231ps ymm1, ymm3, yword [rdi + 64]
    LONG $0x5610fcc5; BYTE $0x60 // vmovups ymm2, yword [rsi + 96]
    LONG $0xc128fcc5 // vmovaps ymm0, ymm1
    LONG $0xb86de2c4; WORD $0x6047 // vfmadd231ps ymm0, ymm2, yword [rdi + 96]
    LONG $0x80ef8348 // sub rdi, -128
    LONG $0x80ee8348 // sub rsi, -128
    LONG $0xfcc08341 // add r8d, -4
    JNE LBB4_5

    // Leftover blocks ((block count - 1) & 3), one 8-float block per iteration.
LBB4_6:
    WORD $0xc085 // test eax, eax
    JE LBB4_8
LBB4_7:
    LONG $0x0e10fcc5 // vmovups ymm1, yword [rsi]
    LONG $0xb875e2c4; BYTE $0x07 // vfmadd231ps ymm0, ymm1, yword [rdi]
    LONG $0x20c78348 // add rdi, 32
    LONG $0x20c68348 // add rsi, 32
    WORD $0xc8ff // dec eax
    JNE LBB4_7
    JMP LBB4_8

    // Fewer than eight elements: no block ran, so nothing has written ymm0.
    // Clear it here; otherwise the reduction below would fold whatever the
    // caller left in the register into the result. The VEX.128 form also
    // zeroes the upper half of ymm0.
    // NOTE(review): hand-applied to generated code; the C source this file was
    // produced from presumably needs the same initialization — confirm.
LBB4_1:
    LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0

    // Horizontal sum of the eight lanes of ymm0, stored to *ret.
LBB4_8:
    LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
    LONG $0xc058f0c5 // vaddps xmm0, xmm1, xmm0
    LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
    LONG $0xc058f0c5 // vaddps xmm0, xmm1, xmm0
    LONG $0xc816fac5 // vmovshdup xmm1, xmm0
    LONG $0xc158fac5 // vaddss xmm0, xmm0, xmm1
    LONG $0x0111fac5 // vmovss dword [rcx], xmm0

    // Remainder: rax = number of leftover elements, r8 = index already done.
    WORD $0xd285 // test edx, edx
    JLE LBB4_20
    WORD $0xd089 // mov eax, edx
    LONG $0x20f88348 // cmp rax, 32
    JAE LBB4_11
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB4_16

    // Wide remainder path: fall back to scalar code if *ret overlaps a or b.
LBB4_11:
    LONG $0x04418d4c // lea r8, [rcx + 4]
    LONG $0x870c8d4c // lea r9, [rdi + 4*rax]
    LONG $0x86148d4c // lea r10, [rsi + 4*rax]
    WORD $0x3949; BYTE $0xc9 // cmp r9, rcx
    LONG $0xd3970f41 // seta r11b
    WORD $0x394c; BYTE $0xc7 // cmp rdi, r8
    WORD $0x920f; BYTE $0xd3 // setb bl
    WORD $0x3949; BYTE $0xca // cmp r10, rcx
    LONG $0xd1970f41 // seta r9b
    WORD $0x394c; BYTE $0xc6 // cmp rsi, r8
    LONG $0xd2920f41 // setb r10b
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    WORD $0x8441; BYTE $0xdb // test r11b, bl
    JNE LBB4_16
    WORD $0x2045; BYTE $0xd1 // and r9b, r10b
    JNE LBB4_16
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    LONG $0x1fe18341 // and r9d, 31
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    WORD $0x294d; BYTE $0xc8 // sub r8, r9
    // Keep only lane 0 of xmm0 (the running sum) and zero the other
    // accumulators before the 4-way unrolled loop.
    LONG $0xc957f0c5 // vxorps xmm1, xmm1, xmm1
    LONG $0x0c71e3c4; WORD $0x01c0 // vblendps xmm0, xmm1, xmm0, 1
    LONG $0xc957f0c5 // vxorps xmm1, xmm1, xmm1
    WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
    LONG $0xd257e8c5 // vxorps xmm2, xmm2, xmm2
    LONG $0xdb57e0c5 // vxorps xmm3, xmm3, xmm3
LBB4_14:
    LONG $0x107ca1c4; WORD $0x9624 // vmovups ymm4, yword [rsi + 4*r10]
    LONG $0x107ca1c4; WORD $0x966c; BYTE $0x20 // vmovups ymm5, yword [rsi + 4*r10 + 32]
    LONG $0x107ca1c4; WORD $0x9674; BYTE $0x40 // vmovups ymm6, yword [rsi + 4*r10 + 64]
    LONG $0x107ca1c4; WORD $0x967c; BYTE $0x60 // vmovups ymm7, yword [rsi + 4*r10 + 96]
    LONG $0xb85da2c4; WORD $0x9704 // vfmadd231ps ymm0, ymm4, yword [rdi + 4*r10]
    LONG $0xb855a2c4; WORD $0x974c; BYTE $0x20 // vfmadd231ps ymm1, ymm5, yword [rdi + 4*r10 + 32]
    LONG $0xb84da2c4; WORD $0x9754; BYTE $0x40 // vfmadd231ps ymm2, ymm6, yword [rdi + 4*r10 + 64]
    LONG $0xb845a2c4; WORD $0x975c; BYTE $0x60 // vfmadd231ps ymm3, ymm7, yword [rdi + 4*r10 + 96]
    LONG $0x20c28349 // add r10, 32
    WORD $0x394d; BYTE $0xd0 // cmp r8, r10
    JNE LBB4_14
    LONG $0xc058f4c5 // vaddps ymm0, ymm1, ymm0
    LONG $0xc058ecc5 // vaddps ymm0, ymm2, ymm0
    LONG $0xc058e4c5 // vaddps ymm0, ymm3, ymm0
    LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
    LONG $0xc158f8c5 // vaddps xmm0, xmm0, xmm1
    LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
    LONG $0xc158f8c5 // vaddps xmm0, xmm0, xmm1
    LONG $0xc816fac5 // vmovshdup xmm1, xmm0
    LONG $0xc158fac5 // vaddss xmm0, xmm0, xmm1
    LONG $0x0111fac5 // vmovss dword [rcx], xmm0
    WORD $0x854d; BYTE $0xc9 // test r9, r9
    JE LBB4_20

    // Scalar remainder: (count & 3) single steps first, then groups of four.
    // r9 = rax - r8 - 1 decides whether any group of four is left.
LBB4_16:
    WORD $0x2944; BYTE $0xc2 // sub edx, r8d
    WORD $0x894d; BYTE $0xc1 // mov r9, r8
    WORD $0xf749; BYTE $0xd1 // not r9
    WORD $0x0149; BYTE $0xc1 // add r9, rax
    LONG $0x03e28348 // and rdx, 3
    JE LBB4_18
LBB4_17:
    LONG $0x107aa1c4; WORD $0x860c // vmovss xmm1, dword [rsi + 4*r8]
    LONG $0xb971a2c4; WORD $0x8704 // vfmadd231ss xmm0, xmm1, dword [rdi + 4*r8]
    LONG $0x0111fac5 // vmovss dword [rcx], xmm0
    WORD $0xff49; BYTE $0xc0 // inc r8
    WORD $0xff48; BYTE $0xca // dec rdx
    JNE LBB4_17
LBB4_18:
    LONG $0x03f98349 // cmp r9, 3
    JB LBB4_20
    // The running sum alternates between xmm0 and xmm1 and ends each
    // iteration back in xmm0.
LBB4_19:
    LONG $0x107aa1c4; WORD $0x860c // vmovss xmm1, dword [rsi + 4*r8]
    LONG $0x9979a2c4; WORD $0x870c // vfmadd132ss xmm1, xmm0, dword [rdi + 4*r8]
    LONG $0x0911fac5 // vmovss dword [rcx], xmm1
    LONG $0x107aa1c4; WORD $0x8644; BYTE $0x04 // vmovss xmm0, dword [rsi + 4*r8 + 4]
    LONG $0x9971a2c4; WORD $0x8744; BYTE $0x04 // vfmadd132ss xmm0, xmm1, dword [rdi + 4*r8 + 4]
    LONG $0x0111fac5 // vmovss dword [rcx], xmm0
    LONG $0x107aa1c4; WORD $0x864c; BYTE $0x08 // vmovss xmm1, dword [rsi + 4*r8 + 8]
    LONG $0x9979a2c4; WORD $0x874c; BYTE $0x08 // vfmadd132ss xmm1, xmm0, dword [rdi + 4*r8 + 8]
    LONG $0x0911fac5 // vmovss dword [rcx], xmm1
    LONG $0x107aa1c4; WORD $0x8644; BYTE $0x0c // vmovss xmm0, dword [rsi + 4*r8 + 12]
    LONG $0x9971a2c4; WORD $0x8744; BYTE $0x0c // vfmadd132ss xmm0, xmm1, dword [rdi + 4*r8 + 12]
    LONG $0x0111fac5 // vmovss dword [rcx], xmm0
    LONG $0x04c08349 // add r8, 4
    WORD $0x394c; BYTE $0xc0 // cmp rax, r8
    JNE LBB4_19
LBB4_20:
    VZEROUPPER
    RET