# gitee.com/quant1x/gox@v1.7.6/num/asm/_avx2/matrix.s
Mat4Mul_F64_V(double*, double*, double*): # @Mat4Mul_F64_V(double*, double*, double*)
        vbroadcastsd (%rsi), %ymm0
        vmovupd (%rdx), %ymm1
        vmovupd 32(%rdx), %ymm2
        vmovupd 64(%rdx), %ymm3
        vmovupd 96(%rdx), %ymm4
        vmulpd %ymm0, %ymm1, %ymm0
        vbroadcastsd 8(%rsi), %ymm5
        vfmadd213pd %ymm0, %ymm2, %ymm5 # ymm5 = (ymm2 * ymm5) + ymm0
        vbroadcastsd 16(%rsi), %ymm0
        vfmadd213pd %ymm5, %ymm3, %ymm0 # ymm0 = (ymm3 * ymm0) + ymm5
        vbroadcastsd 24(%rsi), %ymm5
        vfmadd213pd %ymm0, %ymm4, %ymm5 # ymm5 = (ymm4 * ymm5) + ymm0
        vmovupd %ymm5, (%rdi)
        vbroadcastsd 32(%rsi), %ymm0
        vmulpd %ymm0, %ymm1, %ymm0
        vbroadcastsd 40(%rsi), %ymm1
        vfmadd213pd %ymm0, %ymm2, %ymm1 # ymm1 = (ymm2 * ymm1) + ymm0
        vbroadcastsd 48(%rsi), %ymm0
        vfmadd213pd %ymm1, %ymm3, %ymm0 # ymm0 = (ymm3 * ymm0) + ymm1
        vbroadcastsd 56(%rsi), %ymm1
        vfmadd213pd %ymm0, %ymm4, %ymm1 # ymm1 = (ymm4 * ymm1) + ymm0
        vmovupd %ymm1, 32(%rdi)
        vbroadcastsd 64(%rsi), %ymm0
        vmovupd (%rdx), %ymm1
        vmovupd 32(%rdx), %ymm2
        vmovupd 64(%rdx), %ymm3
        vmovupd 96(%rdx), %ymm4
        vmulpd %ymm0, %ymm1, %ymm0
        vbroadcastsd 72(%rsi), %ymm5
        vfmadd213pd %ymm0, %ymm2, %ymm5 # ymm5 = (ymm2 * ymm5) + ymm0
        vbroadcastsd 80(%rsi), %ymm0
        vfmadd213pd %ymm5, %ymm3, %ymm0 # ymm0 = (ymm3 * ymm0) + ymm5
        vbroadcastsd 88(%rsi), %ymm5
        vfmadd213pd %ymm0, %ymm4, %ymm5 # ymm5 = (ymm4 * ymm5) + ymm0
        vmovupd %ymm5, 64(%rdi)
        vbroadcastsd 96(%rsi), %ymm0
        vmulpd %ymm0, %ymm1, %ymm0
        vbroadcastsd 104(%rsi), %ymm1
        vfmadd213pd %ymm0, %ymm2, %ymm1 # ymm1 = (ymm2 * ymm1) + ymm0
        vbroadcastsd 112(%rsi), %ymm0
        vfmadd213pd %ymm1, %ymm3, %ymm0 # ymm0 = (ymm3 * ymm0) + ymm1
        vbroadcastsd 120(%rsi), %ymm1
        vfmadd213pd %ymm0, %ymm4, %ymm1 # ymm1 = (ymm4 * ymm1) + ymm0
        vmovupd %ymm1, 96(%rdi)
        vzeroupper
        retq
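# --- Annotation (added; not part of the compiler output). Mat4Mul_F64_V above appears
# to compute the row-major 4x4 float64 product dst = a * b, one result row per group
# of FMAs: each a[i][k] is broadcast and multiplied against row k of b. The parameter
# names dst/a/b are inferred from the System V argument registers rdi/rsi/rdx.
# Reference sketch in C (illustrative only):
#   for (int i = 0; i < 4; i++)
#       for (int j = 0; j < 4; j++) {
#           double s = 0.0;
#           for (int k = 0; k < 4; k++) s += a[i*4 + k] * b[k*4 + j];
#           dst[i*4 + j] = s;
#       }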
Mat4Mul_F32_V(float*, float*, float*): # @Mat4Mul_F32_V(float*, float*, float*)
        vbroadcastf128 (%rdx), %ymm0 # ymm0 = mem[0,1,0,1]
        vbroadcastf128 16(%rdx), %ymm1 # ymm1 = mem[0,1,0,1]
        vbroadcastf128 32(%rdx), %ymm2 # ymm2 = mem[0,1,0,1]
        vbroadcastf128 48(%rdx), %ymm3 # ymm3 = mem[0,1,0,1]
        vmovss 16(%rsi), %xmm4 # xmm4 = mem[0],zero,zero,zero
        vmovss (%rsi), %xmm5 # xmm5 = mem[0],zero,zero,zero
        vshufps $0, %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0,0],xmm4[0,0]
        vmovss 4(%rsi), %xmm5 # xmm5 = mem[0],zero,zero,zero
        vmovss 8(%rsi), %xmm6 # xmm6 = mem[0],zero,zero,zero
        vmovss 12(%rsi), %xmm7 # xmm7 = mem[0],zero,zero,zero
        vpermpd $80, %ymm4, %ymm4 # ymm4 = ymm4[0,0,1,1]
        vmulps %ymm4, %ymm0, %ymm0
        vmovss 20(%rsi), %xmm4 # xmm4 = mem[0],zero,zero,zero
        vshufps $0, %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0,0],xmm4[0,0]
        vpermpd $80, %ymm4, %ymm4 # ymm4 = ymm4[0,0,1,1]
        vfmadd213ps %ymm0, %ymm1, %ymm4 # ymm4 = (ymm1 * ymm4) + ymm0
        vmovss 24(%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero
        vshufps $0, %xmm0, %xmm6, %xmm0 # xmm0 = xmm6[0,0],xmm0[0,0]
        vpermpd $80, %ymm0, %ymm0 # ymm0 = ymm0[0,0,1,1]
        vfmadd213ps %ymm4, %ymm2, %ymm0 # ymm0 = (ymm2 * ymm0) + ymm4
        vmovss 28(%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero
        vshufps $0, %xmm1, %xmm7, %xmm1 # xmm1 = xmm7[0,0],xmm1[0,0]
        vpermpd $80, %ymm1, %ymm1 # ymm1 = ymm1[0,0,1,1]
        vfmadd213ps %ymm0, %ymm3, %ymm1 # ymm1 = (ymm3 * ymm1) + ymm0
        vbroadcastf128 (%rdx), %ymm0 # ymm0 = mem[0,1,0,1]
        vbroadcastf128 16(%rdx), %ymm2 # ymm2 = mem[0,1,0,1]
        vbroadcastf128 32(%rdx), %ymm3 # ymm3 = mem[0,1,0,1]
        vmovups %ymm1, (%rdi)
        vbroadcastf128 48(%rdx), %ymm1 # ymm1 = mem[0,1,0,1]
        vmovss 48(%rsi), %xmm4 # xmm4 = mem[0],zero,zero,zero
        vmovss 32(%rsi), %xmm5 # xmm5 = mem[0],zero,zero,zero
        vshufps $0, %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0,0],xmm4[0,0]
        vmovss 36(%rsi), %xmm5 # xmm5 = mem[0],zero,zero,zero
        vmovss 40(%rsi), %xmm6 # xmm6 = mem[0],zero,zero,zero
        vmovss 44(%rsi), %xmm7 # xmm7 = mem[0],zero,zero,zero
        vpermpd $80, %ymm4, %ymm4 # ymm4 = ymm4[0,0,1,1]
        vmulps %ymm4, %ymm0, %ymm0
        vmovss 52(%rsi), %xmm4 # xmm4 = mem[0],zero,zero,zero
        vshufps $0, %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0,0],xmm4[0,0]
        vpermpd $80, %ymm4, %ymm4 # ymm4 = ymm4[0,0,1,1]
        vfmadd213ps %ymm0, %ymm2, %ymm4 # ymm4 = (ymm2 * ymm4) + ymm0
        vmovss 56(%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero
        vshufps $0, %xmm0, %xmm6, %xmm0 # xmm0 = xmm6[0,0],xmm0[0,0]
        vpermpd $80, %ymm0, %ymm0 # ymm0 = ymm0[0,0,1,1]
        vfmadd213ps %ymm4, %ymm3, %ymm0 # ymm0 = (ymm3 * ymm0) + ymm4
        vmovss 60(%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
        vshufps $0, %xmm2, %xmm7, %xmm2 # xmm2 = xmm7[0,0],xmm2[0,0]
        vpermpd $80, %ymm2, %ymm2 # ymm2 = ymm2[0,0,1,1]
        vfmadd213ps %ymm0, %ymm1, %ymm2 # ymm2 = (ymm1 * ymm2) + ymm0
        vmovups %ymm2, 32(%rdi)
        vzeroupper
        retq
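# --- Annotation (added; not part of the compiler output). Mat4Mul_F32_V above appears
# to be the float32 counterpart of Mat4Mul_F64_V: vbroadcastf128 duplicates a 4-float
# row of b into both 128-bit lanes, the vshufps/vpermpd pairs splat a[i][k] and
# a[i+1][k] into those lanes, so each chain of FMAs produces two result rows
# (8 floats) per ymm store. The same reference loop as for Mat4Mul_F64_V applies,
# with float in place of double.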
MatMul_F64_V(double*, double*, double*, unsigned long, unsigned long, unsigned long): # @MatMul_F64_V(double*, double*, double*, unsigned long, unsigned long, unsigned long)
        pushq %rbp
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        pushq %rbx
        movq %rdx, -16(%rsp) # 8-byte Spill
        movq %rcx, -8(%rsp) # 8-byte Spill
        testq %rcx, %rcx
        je .LBB4_13
        testq %r8, %r8
        je .LBB4_13
        testq %r9, %r9
        je .LBB4_13
        movq %r9, %r12
        andq $-16, %r12
        movq -16(%rsp), %rax # 8-byte Reload
        leaq 96(%rax), %rcx
        leaq (,%r9,8), %r11
        leaq 96(%rdi), %rbx
        xorl %r14d, %r14d
        jmp .LBB4_4
.LBB4_12: # in Loop: Header=BB4_4 Depth=1
        addq $1, %r14
        addq %r11, %rbx
        addq %r11, %rdi
        cmpq -8(%rsp), %r14 # 8-byte Folded Reload
        je .LBB4_13
.LBB4_4: # =>This Loop Header: Depth=1
        movq %r14, %r15
        imulq %r8, %r15
        movq -16(%rsp), %r13 # 8-byte Reload
        movq %rcx, %rax
        xorl %ebp, %ebp
        jmp .LBB4_5
.LBB4_11: # in Loop: Header=BB4_5 Depth=2
        addq $1, %rbp
        addq %r11, %rax
        addq %r11, %r13
        cmpq %r8, %rbp
        je .LBB4_12
.LBB4_5: # Parent Loop BB4_4 Depth=1
        leaq (%r15,%rbp), %rdx
        vmovsd (%rsi,%rdx,8), %xmm0 # xmm0 = mem[0],zero
        cmpq $16, %r9
        jae .LBB4_7
        xorl %edx, %edx
        jmp .LBB4_10
.LBB4_7: # in Loop: Header=BB4_5 Depth=2
        vbroadcastsd %xmm0, %ymm1
        xorl %r10d, %r10d
.LBB4_8: # Parent Loop BB4_4 Depth=1
        vmovupd -96(%rax,%r10,8), %ymm2
        vmovupd -64(%rax,%r10,8), %ymm3
        vmovupd -32(%rax,%r10,8), %ymm4
        vmovupd (%rax,%r10,8), %ymm5
        vfmadd213pd -96(%rbx,%r10,8), %ymm1, %ymm2 # ymm2 = (ymm1 * ymm2) + mem
        vfmadd213pd -64(%rbx,%r10,8), %ymm1, %ymm3 # ymm3 = (ymm1 * ymm3) + mem
        vfmadd213pd -32(%rbx,%r10,8), %ymm1, %ymm4 # ymm4 = (ymm1 * ymm4) + mem
        vfmadd213pd (%rbx,%r10,8), %ymm1, %ymm5 # ymm5 = (ymm1 * ymm5) + mem
        vmovupd %ymm2, -96(%rbx,%r10,8)
        vmovupd %ymm3, -64(%rbx,%r10,8)
        vmovupd %ymm4, -32(%rbx,%r10,8)
        vmovupd %ymm5, (%rbx,%r10,8)
        addq $16, %r10
        cmpq %r10, %r12
        jne .LBB4_8
        movq %r12, %rdx
        cmpq %r9, %r12
        je .LBB4_11
.LBB4_10: # Parent Loop BB4_4 Depth=1
        vmovsd (%r13,%rdx,8), %xmm1 # xmm1 = mem[0],zero
        vfmadd213sd (%rdi,%rdx,8), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem
        vmovsd %xmm1, (%rdi,%rdx,8)
        addq $1, %rdx
        cmpq %rdx, %r9
        jne .LBB4_10
        jmp .LBB4_11
.LBB4_13:
        popq %rbx
        popq %r12
        popq %r13
        popq %r14
        popq %r15
        popq %rbp
        vzeroupper
        retq
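# --- Annotation (added; not part of the compiler output). MatMul_F64_V above appears
# to accumulate a general row-major product into dst: dst[i][j] += a[i][p] * b[p][j],
# so the caller is expected to pass dst already initialized (typically zeroed). Each
# a[i][p] is broadcast into ymm1 and FMA'd against 16 doubles of row p of b per
# .LBB4_8 iteration, with a scalar remainder loop at .LBB4_10. The names
# dst/a/b/m/k/n are inferred from the argument registers rdi/rsi/rdx/rcx/r8/r9.
# Reference sketch in C (illustrative only):
#   for (size_t i = 0; i < m; i++)
#       for (size_t p = 0; p < k; p++)
#           for (size_t j = 0; j < n; j++)
#               dst[i*n + j] += a[i*k + p] * b[p*n + j];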
MatMul_F32_V(float*, float*, float*, unsigned long, unsigned long, unsigned long): # @MatMul_F32_V(float*, float*, float*, unsigned long, unsigned long, unsigned long)
        pushq %rbp
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        pushq %rbx
        movq %rdx, -16(%rsp) # 8-byte Spill
        movq %rcx, -8(%rsp) # 8-byte Spill
        testq %rcx, %rcx
        je .LBB5_13
        testq %r8, %r8
        je .LBB5_13
        testq %r9, %r9
        je .LBB5_13
        movq %r9, %r12
        andq $-32, %r12
        movq -16(%rsp), %rax # 8-byte Reload
        leaq 96(%rax), %rcx
        leaq (,%r9,4), %r11
        leaq 96(%rdi), %rbx
        xorl %r14d, %r14d
        jmp .LBB5_4
.LBB5_12: # in Loop: Header=BB5_4 Depth=1
        addq $1, %r14
        addq %r11, %rbx
        addq %r11, %rdi
        cmpq -8(%rsp), %r14 # 8-byte Folded Reload
        je .LBB5_13
.LBB5_4: # =>This Loop Header: Depth=1
        movq %r14, %r15
        imulq %r8, %r15
        movq -16(%rsp), %r13 # 8-byte Reload
        movq %rcx, %rax
        xorl %ebp, %ebp
        jmp .LBB5_5
.LBB5_11: # in Loop: Header=BB5_5 Depth=2
        addq $1, %rbp
        addq %r11, %rax
        addq %r11, %r13
        cmpq %r8, %rbp
        je .LBB5_12
.LBB5_5: # Parent Loop BB5_4 Depth=1
        leaq (%r15,%rbp), %rdx
        vmovss (%rsi,%rdx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
        cmpq $32, %r9
        jae .LBB5_7
        xorl %edx, %edx
        jmp .LBB5_10
.LBB5_7: # in Loop: Header=BB5_5 Depth=2
        vbroadcastss %xmm0, %ymm1
        xorl %r10d, %r10d
.LBB5_8: # Parent Loop BB5_4 Depth=1
        vmovups -96(%rax,%r10,4), %ymm2
        vmovups -64(%rax,%r10,4), %ymm3
        vmovups -32(%rax,%r10,4), %ymm4
        vmovups (%rax,%r10,4), %ymm5
        vfmadd213ps -96(%rbx,%r10,4), %ymm1, %ymm2 # ymm2 = (ymm1 * ymm2) + mem
        vfmadd213ps -64(%rbx,%r10,4), %ymm1, %ymm3 # ymm3 = (ymm1 * ymm3) + mem
        vfmadd213ps -32(%rbx,%r10,4), %ymm1, %ymm4 # ymm4 = (ymm1 * ymm4) + mem
        vfmadd213ps (%rbx,%r10,4), %ymm1, %ymm5 # ymm5 = (ymm1 * ymm5) + mem
        vmovups %ymm2, -96(%rbx,%r10,4)
        vmovups %ymm3, -64(%rbx,%r10,4)
        vmovups %ymm4, -32(%rbx,%r10,4)
        vmovups %ymm5, (%rbx,%r10,4)
        addq $32, %r10
        cmpq %r10, %r12
        jne .LBB5_8
        movq %r12, %rdx
        cmpq %r9, %r12
        je .LBB5_11
.LBB5_10: # Parent Loop BB5_4 Depth=1
        vmovss (%r13,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
        vfmadd213ss (%rdi,%rdx,4), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem
        vmovss %xmm1, (%rdi,%rdx,4)
        addq $1, %rdx
        cmpq %rdx, %r9
        jne .LBB5_10
        jmp .LBB5_11
.LBB5_13:
        popq %rbx
        popq %r12
        popq %r13
        popq %r14
        popq %r15
        popq %rbp
        vzeroupper
        retq
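# --- Annotation (added; not part of the compiler output). MatMul_F32_V above mirrors
# MatMul_F64_V in float32: the same i/p/j accumulation, with the vector body at
# .LBB5_8 covering 32 floats of row p of b per iteration and the scalar remainder
# handled at .LBB5_10.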
MatMulVec_F64_V(double*, double*, double*, unsigned long, unsigned long): # @MatMulVec_F64_V(double*, double*, double*, unsigned long, unsigned long)
        pushq %rbx
        testq %rcx, %rcx
        je .LBB6_10
        testq %r8, %r8
        je .LBB6_10
        movq %r8, %r9
        andq $-16, %r9
        leaq 96(%rsi), %rax
        leaq (,%r8,8), %r10
        xorl %r11d, %r11d
        jmp .LBB6_3
.LBB6_9: # in Loop: Header=BB6_3 Depth=1
        vmovsd %xmm0, (%rdi,%r11,8)
        addq $1, %r11
        addq %r10, %rax
        addq %r10, %rsi
        cmpq %rcx, %r11
        je .LBB6_10
.LBB6_3: # =>This Loop Header: Depth=1
        vmovq (%rdi,%r11,8), %xmm0 # xmm0 = mem[0],zero
        cmpq $16, %r8
        jae .LBB6_5
        xorl %ebx, %ebx
        jmp .LBB6_8
.LBB6_5: # in Loop: Header=BB6_3 Depth=1
        vmovq %xmm0, %xmm0 # xmm0 = xmm0[0],zero
        vxorpd %xmm1, %xmm1, %xmm1
        xorl %ebx, %ebx
        vxorpd %xmm2, %xmm2, %xmm2
        vxorpd %xmm3, %xmm3, %xmm3
.LBB6_6: # Parent Loop BB6_3 Depth=1
        vmovupd (%rdx,%rbx,8), %ymm4
        vmovupd 32(%rdx,%rbx,8), %ymm5
        vmovupd 64(%rdx,%rbx,8), %ymm6
        vmovupd 96(%rdx,%rbx,8), %ymm7
        vfmadd231pd -96(%rax,%rbx,8), %ymm4, %ymm0 # ymm0 = (ymm4 * mem) + ymm0
        vfmadd231pd -64(%rax,%rbx,8), %ymm5, %ymm1 # ymm1 = (ymm5 * mem) + ymm1
        vfmadd231pd -32(%rax,%rbx,8), %ymm6, %ymm2 # ymm2 = (ymm6 * mem) + ymm2
        vfmadd231pd (%rax,%rbx,8), %ymm7, %ymm3 # ymm3 = (ymm7 * mem) + ymm3
        addq $16, %rbx
        cmpq %rbx, %r9
        jne .LBB6_6
        vaddpd %ymm0, %ymm1, %ymm0
        vaddpd %ymm0, %ymm2, %ymm0
        vaddpd %ymm0, %ymm3, %ymm0
        vextractf128 $1, %ymm0, %xmm1
        vaddpd %xmm1, %xmm0, %xmm0
        vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
        vaddsd %xmm1, %xmm0, %xmm0
        movq %r9, %rbx
        cmpq %r8, %r9
        je .LBB6_9
.LBB6_8: # Parent Loop BB6_3 Depth=1
        vmovsd (%rdx,%rbx,8), %xmm1 # xmm1 = mem[0],zero
        vfmadd231sd (%rsi,%rbx,8), %xmm1, %xmm0 # xmm0 = (xmm1 * mem) + xmm0
        addq $1, %rbx
        cmpq %rbx, %r8
        jne .LBB6_8
        jmp .LBB6_9
.LBB6_10:
        popq %rbx
        vzeroupper
        retq
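# --- Annotation (added; not part of the compiler output). MatMulVec_F64_V above
# appears to compute a matrix-vector product that accumulates onto dst: starting from
# the value already stored in dst[i], row i of mat is FMA'd against vec, 16 doubles
# per .LBB6_6 iteration across four accumulators, followed by a horizontal reduction
# and a scalar remainder loop at .LBB6_8. The names dst/mat/vec/rows/cols are inferred
# from the argument registers rdi/rsi/rdx/rcx/r8. Reference sketch in C (illustrative only):
#   for (size_t i = 0; i < rows; i++) {
#       double s = dst[i];
#       for (size_t j = 0; j < cols; j++) s += mat[i*cols + j] * vec[j];
#       dst[i] = s;
#   }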
MatMulVec_F32_V(float*, float*, float*, unsigned long, unsigned long): # @MatMulVec_F32_V(float*, float*, float*, unsigned long, unsigned long)
        pushq %rbx
        testq %rcx, %rcx
        je .LBB7_10
        testq %r8, %r8
        je .LBB7_10
        movq %r8, %r9
        andq $-32, %r9
        leaq 96(%rsi), %rax
        leaq (,%r8,4), %r10
        xorl %r11d, %r11d
        vxorps %xmm0, %xmm0, %xmm0
        jmp .LBB7_3
.LBB7_9: # in Loop: Header=BB7_3 Depth=1
        vmovss %xmm1, (%rdi,%r11,4)
        addq $1, %r11
        addq %r10, %rax
        addq %r10, %rsi
        cmpq %rcx, %r11
        je .LBB7_10
.LBB7_3: # =>This Loop Header: Depth=1
        vmovss (%rdi,%r11,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
        cmpq $32, %r8
        jae .LBB7_5
        xorl %ebx, %ebx
        jmp .LBB7_8
.LBB7_5: # in Loop: Header=BB7_3 Depth=1
        vblendps $1, %xmm1, %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[1,2,3]
        vxorps %xmm2, %xmm2, %xmm2
        xorl %ebx, %ebx
        vxorps %xmm3, %xmm3, %xmm3
        vxorps %xmm4, %xmm4, %xmm4
.LBB7_6: # Parent Loop BB7_3 Depth=1
        vmovups (%rdx,%rbx,4), %ymm5
        vmovups 32(%rdx,%rbx,4), %ymm6
        vmovups 64(%rdx,%rbx,4), %ymm7
        vmovups 96(%rdx,%rbx,4), %ymm8
        vfmadd231ps -96(%rax,%rbx,4), %ymm5, %ymm1 # ymm1 = (ymm5 * mem) + ymm1
        vfmadd231ps -64(%rax,%rbx,4), %ymm6, %ymm2 # ymm2 = (ymm6 * mem) + ymm2
        vfmadd231ps -32(%rax,%rbx,4), %ymm7, %ymm3 # ymm3 = (ymm7 * mem) + ymm3
        vfmadd231ps (%rax,%rbx,4), %ymm8, %ymm4 # ymm4 = (ymm8 * mem) + ymm4
        addq $32, %rbx
        cmpq %rbx, %r9
        jne .LBB7_6
        vaddps %ymm1, %ymm2, %ymm1
        vaddps %ymm1, %ymm3, %ymm1
        vaddps %ymm1, %ymm4, %ymm1
        vextractf128 $1, %ymm1, %xmm2
        vaddps %xmm2, %xmm1, %xmm1
        vpermilpd $1, %xmm1, %xmm2 # xmm2 = xmm1[1,0]
        vaddps %xmm2, %xmm1, %xmm1
        vmovshdup %xmm1, %xmm2 # xmm2 = xmm1[1,1,3,3]
        vaddss %xmm2, %xmm1, %xmm1
        movq %r9, %rbx
        cmpq %r8, %r9
        je .LBB7_9
.LBB7_8: # Parent Loop BB7_3 Depth=1
        vmovss (%rdx,%rbx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
        vfmadd231ss (%rsi,%rbx,4), %xmm2, %xmm1 # xmm1 = (xmm2 * mem) + xmm1
        addq $1, %rbx
        cmpq %rbx, %r8
        jne .LBB7_8
        jmp .LBB7_9
.LBB7_10:
        popq %rbx
        vzeroupper
        retq
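# --- Annotation (added; not part of the compiler output). MatMulVec_F32_V above is the
# float32 counterpart of MatMulVec_F64_V: 32 floats per .LBB7_6 iteration across four
# accumulators, a vaddps/vextractf128/vmovshdup horizontal reduction, then a scalar
# remainder loop at .LBB7_8.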
MatMulTiled_F64_V(double*, double*, double*, unsigned long, unsigned long, unsigned long): # @MatMulTiled_F64_V(double*, double*, double*, unsigned long, unsigned long, unsigned long)
        pushq %rbp
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        pushq %rbx
        subq $72, %rsp
        movq %r9, -128(%rsp) # 8-byte Spill
        movq %r8, -104(%rsp) # 8-byte Spill
        movq %rdx, -88(%rsp) # 8-byte Spill
        movq %rdi, -112(%rsp) # 8-byte Spill
        movq %rcx, -64(%rsp) # 8-byte Spill
        addq $7, %rcx
        movq %rcx, -72(%rsp) # 8-byte Spill
        je .LBB8_21
        movq -104(%rsp), %rax # 8-byte Reload
        addq $255, %rax
        movq %rax, 8(%rsp) # 8-byte Spill
        je .LBB8_21
        movq -128(%rsp), %rax # 8-byte Reload
        addq $255, %rax
        movq %rax, -40(%rsp) # 8-byte Spill
        je .LBB8_21
        movq -88(%rsp), %rax # 8-byte Reload
        addq $96, %rax
        movq %rax, -48(%rsp) # 8-byte Spill
        movq -128(%rsp), %rax # 8-byte Reload
        leaq (,%rax,8), %rbx
        movq -112(%rsp), %rcx # 8-byte Reload
        addq $96, %rcx
        movq %rcx, -96(%rsp) # 8-byte Spill
        shlq $6, %rax
        movq %rax, -80(%rsp) # 8-byte Spill
        xorl %edx, %edx
        jmp .LBB8_4
.LBB8_20: # in Loop: Header=BB8_4 Depth=1
        movq -80(%rsp), %rax # 8-byte Reload
        addq %rax, -96(%rsp) # 8-byte Folded Spill
        addq %rax, -112(%rsp) # 8-byte Folded Spill
        movq -56(%rsp), %rax # 8-byte Reload
        movq %rax, %rdx
        cmpq -72(%rsp), %rax # 8-byte Folded Reload
        jae .LBB8_21
.LBB8_4: # =>This Loop Header: Depth=1
        leaq 8(%rdx), %rax
        movq -64(%rsp), %rcx # 8-byte Reload
        cmpq %rcx, %rax
        movq %rax, -56(%rsp) # 8-byte Spill
        cmovaq %rcx, %rax
        cltq
        movq %rdx, -16(%rsp) # 8-byte Spill
        movq %rax, 24(%rsp) # 8-byte Spill
        cmpq %rax, %rdx
        jae .LBB8_20
        xorl %eax, %eax
        movq %rax, -120(%rsp) # 8-byte Spill
        movl $256, %edx # imm = 0x100
        xorl %eax, %eax
        jmp .LBB8_6
.LBB8_19: # in Loop: Header=BB8_6 Depth=2
        movq -120(%rsp), %rax # 8-byte Reload
        addl $1, %eax
        movq %rax, -120(%rsp) # 8-byte Spill
        movq -24(%rsp), %rdx # 8-byte Reload
        addq $256, %rdx # imm = 0x100
        movq -32(%rsp), %rax # 8-byte Reload
        cmpq -40(%rsp), %rax # 8-byte Folded Reload
        jae .LBB8_20
.LBB8_6: # Parent Loop BB8_4 Depth=1
        movl %eax, %edi
        movq -128(%rsp), %rbp # 8-byte Reload
        cmpq %rdx, %rbp
        movq %rdx, -24(%rsp) # 8-byte Spill
        cmovbq %rbp, %rdx
        addq $256, %rax # imm = 0x100
        cmpq %rax, %rbp
        movq %rax, %rcx
        cmovbq %rbp, %rcx
        movq %rax, -32(%rsp) # 8-byte Spill
        cmovbq %rbp, %rax
        cmpl %eax, %edi
        jge .LBB8_19
        movslq %edi, %r14
        movq -96(%rsp), %rdi # 8-byte Reload
        leaq (%rdi,%r14,8), %rdi
        movq %rdi, (%rsp) # 8-byte Spill
        movslq %edx, %r11
        subq %r14, %r11
        andq $-16, %r11
        movslq %ecx, %r12
        movq -120(%rsp), %rcx # 8-byte Reload
        shll $8, %ecx
        movslq %ecx, %rcx
        subq %rcx, %r12
        movslq %eax, %rdx
        movq %r12, %rcx
        andq $-16, %rcx
        movq -48(%rsp), %rax # 8-byte Reload
        leaq (%rax,%r14,8), %rax
        movq %rax, -8(%rsp) # 8-byte Spill
        movq %r14, %r13
        movq %rcx, 64(%rsp) # 8-byte Spill
        addq %rcx, %r13
        xorl %eax, %eax
        jmp .LBB8_8
.LBB8_18: # in Loop: Header=BB8_8 Depth=3
        movq 16(%rsp), %rax # 8-byte Reload
        cmpq 8(%rsp), %rax # 8-byte Folded Reload
        jae .LBB8_19
.LBB8_8: # Parent Loop BB8_4 Depth=1
        movl %eax, %ecx
        addq $256, %rax # imm = 0x100
        movq -104(%rsp), %rdi # 8-byte Reload
        cmpq %rdi, %rax
        movq %rax, 16(%rsp) # 8-byte Spill
        cmovaq %rdi, %rax
        cmpl %eax, %ecx
        jge .LBB8_18
        movslq %ecx, %rdi
        movq -128(%rsp), %rcx # 8-byte Reload
        movq %rdi, 48(%rsp) # 8-byte Spill
        imulq %rdi, %rcx
        movq -88(%rsp), %rdi # 8-byte Reload
        leaq (%rdi,%rcx,8), %rdi
        movq %rdi, 40(%rsp) # 8-byte Spill
        movq -8(%rsp), %rdi # 8-byte Reload
        leaq (%rdi,%rcx,8), %rcx
        movq %rcx, 32(%rsp) # 8-byte Spill
        cltq
        movq -112(%rsp), %rcx # 8-byte Reload
        movq (%rsp), %r10 # 8-byte Reload
        movq -16(%rsp), %r8 # 8-byte Reload
        jmp .LBB8_10
.LBB8_17: # in Loop: Header=BB8_10 Depth=4
        movq 56(%rsp), %r8 # 8-byte Reload
        addq $1, %r8
        addq %rbx, %r10
        addq %rbx, %rcx
        cmpq 24(%rsp), %r8 # 8-byte Folded Reload
        jae .LBB8_18
.LBB8_10: # Parent Loop BB8_4 Depth=1
        movq %r8, 56(%rsp) # 8-byte Spill
        imulq -104(%rsp), %r8 # 8-byte Folded Reload
        movq 40(%rsp), %r15 # 8-byte Reload
        movq 32(%rsp), %rdi # 8-byte Reload
        movq 48(%rsp), %r9 # 8-byte Reload
        jmp .LBB8_11
.LBB8_16: # in Loop: Header=BB8_11 Depth=5
        addq $1, %r9
        addq %rbx, %rdi
        addq %rbx, %r15
        cmpq %rax, %r9
        jge .LBB8_17
.LBB8_11: # Parent Loop BB8_4 Depth=1
        leaq (%r9,%r8), %rbp
        vmovsd (%rsi,%rbp,8), %xmm0 # xmm0 = mem[0],zero
        movq %r14, %rbp
        cmpq $16, %r12
        jb .LBB8_15
        vbroadcastsd %xmm0, %ymm1
        xorl %ebp, %ebp
.LBB8_13: # Parent Loop BB8_4 Depth=1
        vmovupd -96(%rdi,%rbp,8), %ymm2
        vmovupd -64(%rdi,%rbp,8), %ymm3
        vmovupd -32(%rdi,%rbp,8), %ymm4
        vmovupd (%rdi,%rbp,8), %ymm5
        vfmadd213pd -96(%r10,%rbp,8), %ymm1, %ymm2 # ymm2 = (ymm1 * ymm2) + mem
        vfmadd213pd -64(%r10,%rbp,8), %ymm1, %ymm3 # ymm3 = (ymm1 * ymm3) + mem
        vfmadd213pd -32(%r10,%rbp,8), %ymm1, %ymm4 # ymm4 = (ymm1 * ymm4) + mem
        vfmadd213pd (%r10,%rbp,8), %ymm1, %ymm5 # ymm5 = (ymm1 * ymm5) + mem
        vmovupd %ymm2, -96(%r10,%rbp,8)
        vmovupd %ymm3, -64(%r10,%rbp,8)
        vmovupd %ymm4, -32(%r10,%rbp,8)
        vmovupd %ymm5, (%r10,%rbp,8)
        addq $16, %rbp
        cmpq %rbp, %r11
        jne .LBB8_13
        movq %r13, %rbp
        cmpq 64(%rsp), %r12 # 8-byte Folded Reload
        je .LBB8_16
.LBB8_15: # Parent Loop BB8_4 Depth=1
        vmovsd (%r15,%rbp,8), %xmm1 # xmm1 = mem[0],zero
        vfmadd213sd (%rcx,%rbp,8), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem
        vmovsd %xmm1, (%rcx,%rbp,8)
        addq $1, %rbp
        cmpq %rdx, %rbp
        jl .LBB8_15
        jmp .LBB8_16
.LBB8_21:
        addq $72, %rsp
        popq %rbx
        popq %r12
        popq %r13
        popq %r14
        popq %r15
        popq %rbp
        vzeroupper
        retq
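# --- Annotation (added; not part of the compiler output). MatMulTiled_F64_V above
# appears to be a cache-blocked variant of MatMul_F64_V: rows are processed in tiles
# of 8 (the loop at .LBB8_4) and the column and depth dimensions in tiles of 256
# (.LBB8_6 over n, .LBB8_8 over k), with the same broadcast-and-FMA accumulate kernel
# (.LBB8_13) plus a scalar remainder loop (.LBB8_15) inside each tile. The names
# dst/a/b/m/k/n are inferred from the argument registers; the tile sizes are inferred
# from the 8/256 loop strides. Reference sketch in C (illustrative only; min() denotes
# the smaller of its two arguments):
#   for (size_t i0 = 0; i0 < m; i0 += 8)
#     for (size_t j0 = 0; j0 < n; j0 += 256)
#       for (size_t p0 = 0; p0 < k; p0 += 256)
#         for (size_t i = i0; i < min(i0 + 8, m); i++)
#           for (size_t p = p0; p < min(p0 + 256, k); p++)
#             for (size_t j = j0; j < min(j0 + 256, n); j++)
#               dst[i*n + j] += a[i*k + p] * b[p*n + j];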
MatMulTiled_F32_V(float*, float*, float*, unsigned long, unsigned long, unsigned long): # @MatMulTiled_F32_V(float*, float*, float*, unsigned long, unsigned long, unsigned long)
        pushq %rbp
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        pushq %rbx
        subq $72, %rsp
        movq %r9, -128(%rsp) # 8-byte Spill
        movq %r8, -104(%rsp) # 8-byte Spill
        movq %rdx, -88(%rsp) # 8-byte Spill
        movq %rdi, -112(%rsp) # 8-byte Spill
        movq %rcx, -64(%rsp) # 8-byte Spill
        addq $7, %rcx
        movq %rcx, -72(%rsp) # 8-byte Spill
        je .LBB9_21
        movq -104(%rsp), %rax # 8-byte Reload
        addq $255, %rax
        movq %rax, 8(%rsp) # 8-byte Spill
        je .LBB9_21
        movq -128(%rsp), %rax # 8-byte Reload
        addq $255, %rax
        movq %rax, -40(%rsp) # 8-byte Spill
        je .LBB9_21
        movq -88(%rsp), %rax # 8-byte Reload
        addq $96, %rax
        movq %rax, -48(%rsp) # 8-byte Spill
        movq -128(%rsp), %rax # 8-byte Reload
        leaq (,%rax,4), %rbx
        movq -112(%rsp), %rcx # 8-byte Reload
        addq $96, %rcx
        movq %rcx, -96(%rsp) # 8-byte Spill
        shlq $5, %rax
        movq %rax, -80(%rsp) # 8-byte Spill
        xorl %edx, %edx
        jmp .LBB9_4
.LBB9_20: # in Loop: Header=BB9_4 Depth=1
        movq -80(%rsp), %rax # 8-byte Reload
        addq %rax, -96(%rsp) # 8-byte Folded Spill
        addq %rax, -112(%rsp) # 8-byte Folded Spill
        movq -56(%rsp), %rax # 8-byte Reload
        movq %rax, %rdx
        cmpq -72(%rsp), %rax # 8-byte Folded Reload
        jae .LBB9_21
.LBB9_4: # =>This Loop Header: Depth=1
        leaq 8(%rdx), %rax
        movq -64(%rsp), %rcx # 8-byte Reload
        cmpq %rcx, %rax
        movq %rax, -56(%rsp) # 8-byte Spill
        cmovaq %rcx, %rax
        cltq
        movq %rdx, -16(%rsp) # 8-byte Spill
        movq %rax, 24(%rsp) # 8-byte Spill
        cmpq %rax, %rdx
        jae .LBB9_20
        xorl %eax, %eax
        movq %rax, -120(%rsp) # 8-byte Spill
        movl $256, %edx # imm = 0x100
        xorl %eax, %eax
        jmp .LBB9_6
.LBB9_19: # in Loop: Header=BB9_6 Depth=2
        movq -120(%rsp), %rax # 8-byte Reload
        addl $1, %eax
        movq %rax, -120(%rsp) # 8-byte Spill
        movq -24(%rsp), %rdx # 8-byte Reload
        addq $256, %rdx # imm = 0x100
        movq -32(%rsp), %rax # 8-byte Reload
        cmpq -40(%rsp), %rax # 8-byte Folded Reload
        jae .LBB9_20
.LBB9_6: # Parent Loop BB9_4 Depth=1
        movl %eax, %edi
        movq -128(%rsp), %rbp # 8-byte Reload
        cmpq %rdx, %rbp
        movq %rdx, -24(%rsp) # 8-byte Spill
        cmovbq %rbp, %rdx
        addq $256, %rax # imm = 0x100
        cmpq %rax, %rbp
        movq %rax, %rcx
        cmovbq %rbp, %rcx
        movq %rax, -32(%rsp) # 8-byte Spill
        cmovbq %rbp, %rax
        cmpl %eax, %edi
        jge .LBB9_19
        movslq %edi, %r14
        movq -96(%rsp), %rdi # 8-byte Reload
        leaq (%rdi,%r14,4), %rdi
        movq %rdi, (%rsp) # 8-byte Spill
        movslq %edx, %r11
        subq %r14, %r11
        andq $-32, %r11
        movslq %ecx, %r12
        movq -120(%rsp), %rcx # 8-byte Reload
        shll $8, %ecx
        movslq %ecx, %rcx
        subq %rcx, %r12
        movslq %eax, %rdx
        movq %r12, %rcx
        andq $-32, %rcx
        movq -48(%rsp), %rax # 8-byte Reload
        leaq (%rax,%r14,4), %rax
        movq %rax, -8(%rsp) # 8-byte Spill
        movq %r14, %r13
        movq %rcx, 64(%rsp) # 8-byte Spill
        addq %rcx, %r13
        xorl %eax, %eax
        jmp .LBB9_8
.LBB9_18: # in Loop: Header=BB9_8 Depth=3
        movq 16(%rsp), %rax # 8-byte Reload
        cmpq 8(%rsp), %rax # 8-byte Folded Reload
        jae .LBB9_19
.LBB9_8: # Parent Loop BB9_4 Depth=1
        movl %eax, %ecx
        addq $256, %rax # imm = 0x100
        movq -104(%rsp), %rdi # 8-byte Reload
        cmpq %rdi, %rax
        movq %rax, 16(%rsp) # 8-byte Spill
        cmovaq %rdi, %rax
        cmpl %eax, %ecx
        jge .LBB9_18
        movslq %ecx, %rdi
        movq -128(%rsp), %rcx # 8-byte Reload
        movq %rdi, 48(%rsp) # 8-byte Spill
        imulq %rdi, %rcx
        movq -88(%rsp), %rdi # 8-byte Reload
        leaq (%rdi,%rcx,4), %rdi
        movq %rdi, 40(%rsp) # 8-byte Spill
        movq -8(%rsp), %rdi # 8-byte Reload
        leaq (%rdi,%rcx,4), %rcx
        movq %rcx, 32(%rsp) # 8-byte Spill
        cltq
        movq -112(%rsp), %rcx # 8-byte Reload
        movq (%rsp), %r10 # 8-byte Reload
        movq -16(%rsp), %r8 # 8-byte Reload
        jmp .LBB9_10
.LBB9_17: # in Loop: Header=BB9_10 Depth=4
        movq 56(%rsp), %r8 # 8-byte Reload
        addq $1, %r8
        addq %rbx, %r10
        addq %rbx, %rcx
        cmpq 24(%rsp), %r8 # 8-byte Folded Reload
        jae .LBB9_18
.LBB9_10: # Parent Loop BB9_4 Depth=1
        movq %r8, 56(%rsp) # 8-byte Spill
        imulq -104(%rsp), %r8 # 8-byte Folded Reload
        movq 40(%rsp), %r15 # 8-byte Reload
        movq 32(%rsp), %rdi # 8-byte Reload
        movq 48(%rsp), %r9 # 8-byte Reload
        jmp .LBB9_11
.LBB9_16: # in Loop: Header=BB9_11 Depth=5
        addq $1, %r9
        addq %rbx, %rdi
        addq %rbx, %r15
        cmpq %rax, %r9
        jge .LBB9_17
.LBB9_11: # Parent Loop BB9_4 Depth=1
        leaq (%r9,%r8), %rbp
        vmovss (%rsi,%rbp,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
        movq %r14, %rbp
        cmpq $32, %r12
        jb .LBB9_15
        vbroadcastss %xmm0, %ymm1
        xorl %ebp, %ebp
.LBB9_13: # Parent Loop BB9_4 Depth=1
        vmovups -96(%rdi,%rbp,4), %ymm2
        vmovups -64(%rdi,%rbp,4), %ymm3
        vmovups -32(%rdi,%rbp,4), %ymm4
        vmovups (%rdi,%rbp,4), %ymm5
        vfmadd213ps -96(%r10,%rbp,4), %ymm1, %ymm2 # ymm2 = (ymm1 * ymm2) + mem
        vfmadd213ps -64(%r10,%rbp,4), %ymm1, %ymm3 # ymm3 = (ymm1 * ymm3) + mem
        vfmadd213ps -32(%r10,%rbp,4), %ymm1, %ymm4 # ymm4 = (ymm1 * ymm4) + mem
        vfmadd213ps (%r10,%rbp,4), %ymm1, %ymm5 # ymm5 = (ymm1 * ymm5) + mem
        vmovups %ymm2, -96(%r10,%rbp,4)
        vmovups %ymm3, -64(%r10,%rbp,4)
        vmovups %ymm4, -32(%r10,%rbp,4)
        vmovups %ymm5, (%r10,%rbp,4)
        addq $32, %rbp
        cmpq %rbp, %r11
        jne .LBB9_13
        movq %r13, %rbp
        cmpq 64(%rsp), %r12 # 8-byte Folded Reload
        je .LBB9_16
.LBB9_15: # Parent Loop BB9_4 Depth=1
        vmovss (%r15,%rbp,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
        vfmadd213ss (%rcx,%rbp,4), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem
        vmovss %xmm1, (%rcx,%rbp,4)
        addq $1, %rbp
        cmpq %rdx, %rbp
        jl .LBB9_15
        jmp .LBB9_16
.LBB9_21:
        addq $72, %rsp
        popq %rbx
        popq %r12
        popq %r13
        popq %r14
        popq %r15
        popq %rbp
        vzeroupper
        retq
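# --- Annotation (added; not part of the compiler output). MatMulTiled_F32_V above is
# the float32 counterpart of MatMulTiled_F64_V: the same 8 x 256 x 256 tiling and
# accumulate kernel, with the vector body at .LBB9_13 covering 32 floats per iteration
# and the scalar remainder handled at .LBB9_15.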