# ----------------------------------------------------------------------------
# boolean.s -- AVX2 kernels over arrays of bool (one byte per element).
#
# Syntax : AT&T / GAS (source operand first, % registers, $ immediates).
# ABI    : arguments arrive in rdi, rsi, rdx, rcx and results leave in rax/al,
#          which matches the System V AMD64 convention. Only caller-saved
#          registers are touched and no stack is used.
# Origin : compiler-generated listing (LLVM-style .LBB<fn>_<block> labels).
#
# NOTE(review): the function labels are demangled C++ signatures exactly as in
# the original listing. They are kept byte-for-byte because they are the
# interface of this file; a plain assembler would need the mangled symbols
# instead -- confirm what the consuming tool expects.
#
# Local labels are unique per function: blocks of function k are .LBBk_*.
# ----------------------------------------------------------------------------

# Constants used by Not_V: a vector of 0x01 bytes, XOR-ed into the data to
# flip bit 0 of every bool. Both are loaded with vmovaps, which faults on a
# misaligned memory operand, so the 32-byte constant is aligned to 32 bytes;
# the 16-byte constant follows at +32 and is therefore 16-byte aligned.
        .p2align 5
.LCPI0_0:
        .zero   32,1
.LCPI0_1:
        .zero   16,1

# ----------------------------------------------------------------------------
# void Not_V(bool *x, unsigned long n)
# In   : rdi = x, rsi = n
# Out  : x[i] ^= 1 for i in [0, n), in place
# Clobb: rax, rcx, rdx, r8, ymm0-ymm4, flags
# Shape: 128-byte blocks (unrolled x2) -> 16-byte blocks -> single bytes.
# ----------------------------------------------------------------------------
Not_V(bool*, unsigned long):                    # @Not_V(bool*, unsigned long)
        testq   %rsi, %rsi
        je      .LBB0_17                        # n == 0: nothing to do
        cmpq    $16, %rsi
        jae     .LBB0_3
        xorl    %eax, %eax                      # n < 16: byte loop from index 0
        jmp     .LBB0_16
.LBB0_3:
        cmpq    $128, %rsi
        jae     .LBB0_5
        xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from index 0
        jmp     .LBB0_13
.LBB0_5:
        movq    %rsi, %rax
        andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
        leaq    -128(%rax), %rcx
        movq    %rcx, %r8
        shrq    $7, %r8
        addq    $1, %r8                         # r8 = number of 128-byte blocks
        testq   %rcx, %rcx
        je      .LBB0_6                         # exactly one block: skip the unrolled loop
        movq    %r8, %rdx
        andq    $-2, %rdx                       # rdx = block count rounded down to even
        xorl    %ecx, %ecx                      # rcx = byte offset
        vmovaps .LCPI0_0(%rip), %ymm0           # ymm0 = 32 x 0x01
.LBB0_8:                                        # =>This Inner Loop Header: Depth=1
        vxorps  (%rdi,%rcx), %ymm0, %ymm1
        vxorps  32(%rdi,%rcx), %ymm0, %ymm2
        vxorps  64(%rdi,%rcx), %ymm0, %ymm3
        vxorps  96(%rdi,%rcx), %ymm0, %ymm4
        vmovups %ymm1, (%rdi,%rcx)
        vmovups %ymm2, 32(%rdi,%rcx)
        vmovups %ymm3, 64(%rdi,%rcx)
        vmovups %ymm4, 96(%rdi,%rcx)
        vxorps  128(%rdi,%rcx), %ymm0, %ymm1
        vxorps  160(%rdi,%rcx), %ymm0, %ymm2
        vxorps  192(%rdi,%rcx), %ymm0, %ymm3
        vxorps  224(%rdi,%rcx), %ymm0, %ymm4
        vmovups %ymm1, 128(%rdi,%rcx)
        vmovups %ymm2, 160(%rdi,%rcx)
        vmovups %ymm3, 192(%rdi,%rcx)
        vmovups %ymm4, 224(%rdi,%rcx)
        addq    $256, %rcx                      # imm = 0x100
        addq    $-2, %rdx
        jne     .LBB0_8
        testb   $1, %r8b
        je      .LBB0_11                        # even block count: no leftover block
.LBB0_10:                                       # one remaining 128-byte block
        vmovaps .LCPI0_0(%rip), %ymm0           # ymm0 = 32 x 0x01
        vxorps  (%rdi,%rcx), %ymm0, %ymm1
        vxorps  32(%rdi,%rcx), %ymm0, %ymm2
        vxorps  64(%rdi,%rcx), %ymm0, %ymm3
        vxorps  96(%rdi,%rcx), %ymm0, %ymm0
        vmovups %ymm1, (%rdi,%rcx)
        vmovups %ymm2, 32(%rdi,%rcx)
        vmovups %ymm3, 64(%rdi,%rcx)
        vmovups %ymm0, 96(%rdi,%rcx)
.LBB0_11:
        cmpq    %rsi, %rax
        je      .LBB0_17                        # n was a multiple of 128
        testb   $112, %sil                      # any whole 16-byte chunk left in n mod 128?
        je      .LBB0_16
.LBB0_13:
        movq    %rax, %rcx                      # rcx = first unprocessed index
        movq    %rsi, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
        vmovaps .LCPI0_1(%rip), %xmm0           # xmm0 = 16 x 0x01
.LBB0_14:                                       # =>This Inner Loop Header: Depth=1
        vxorps  (%rdi,%rcx), %xmm0, %xmm1
        vmovups %xmm1, (%rdi,%rcx)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB0_14
        cmpq    %rsi, %rax
        je      .LBB0_17
.LBB0_16:                                       # =>This Inner Loop Header: Depth=1
        xorb    $1, (%rdi,%rax)                 # scalar tail, rax = index
        addq    $1, %rax
        cmpq    %rax, %rsi
        jne     .LBB0_16
.LBB0_17:
        vzeroupper
        retq
.LBB0_6:                                        # single 128-byte block
        xorl    %ecx, %ecx
        testb   $1, %r8b
        jne     .LBB0_10
        jmp     .LBB0_11

# ----------------------------------------------------------------------------
# void And_V(bool *x, bool *y, unsigned long n)
# In   : rdi = x, rsi = y, rdx = n
# Out  : x[i] &= y[i] for i in [0, n), in place
# Clobb: rax, rcx, ymm0-ymm3, flags
# Shape: 128-byte blocks -> 16-byte blocks -> single bytes.
# ----------------------------------------------------------------------------
And_V(bool*, bool*, unsigned long):             # @And_V(bool*, bool*, unsigned long)
        testq   %rdx, %rdx
        je      .LBB1_13                        # n == 0
        cmpq    $16, %rdx
        jae     .LBB1_3
        xorl    %eax, %eax                      # n < 16: byte loop from index 0
        jmp     .LBB1_12
.LBB1_3:
        cmpq    $128, %rdx
        jae     .LBB1_5
        xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from index 0
        jmp     .LBB1_9
.LBB1_5:
        movq    %rdx, %rax
        andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
        xorl    %ecx, %ecx                      # rcx = byte offset
.LBB1_6:                                        # =>This Inner Loop Header: Depth=1
        vmovups (%rsi,%rcx), %ymm0
        vmovups 32(%rsi,%rcx), %ymm1
        vmovups 64(%rsi,%rcx), %ymm2
        vmovups 96(%rsi,%rcx), %ymm3
        vandps  (%rdi,%rcx), %ymm0, %ymm0
        vandps  32(%rdi,%rcx), %ymm1, %ymm1
        vandps  64(%rdi,%rcx), %ymm2, %ymm2
        vandps  96(%rdi,%rcx), %ymm3, %ymm3
        vmovups %ymm0, (%rdi,%rcx)
        vmovups %ymm1, 32(%rdi,%rcx)
        vmovups %ymm2, 64(%rdi,%rcx)
        vmovups %ymm3, 96(%rdi,%rcx)
        subq    $-128, %rcx                     # rcx += 128
        cmpq    %rcx, %rax
        jne     .LBB1_6
        cmpq    %rdx, %rax
        je      .LBB1_13                        # n was a multiple of 128
        testb   $112, %dl                       # any whole 16-byte chunk left?
        je      .LBB1_12
.LBB1_9:
        movq    %rax, %rcx                      # rcx = first unprocessed index
        movq    %rdx, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
.LBB1_10:                                       # =>This Inner Loop Header: Depth=1
        vmovups (%rsi,%rcx), %xmm0
        vandps  (%rdi,%rcx), %xmm0, %xmm0
        vmovups %xmm0, (%rdi,%rcx)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB1_10
        cmpq    %rdx, %rax
        je      .LBB1_13
.LBB1_12:                                       # =>This Inner Loop Header: Depth=1
        movzbl  (%rsi,%rax), %ecx               # scalar tail, rax = index
        andb    %cl, (%rdi,%rax)
        addq    $1, %rax
        cmpq    %rax, %rdx
        jne     .LBB1_12
.LBB1_13:
        vzeroupper
        retq

# ----------------------------------------------------------------------------
# void Or_V(bool *x, bool *y, unsigned long n)
# In   : rdi = x, rsi = y, rdx = n
# Out  : x[i] |= y[i] for i in [0, n), in place
# Clobb: rax, rcx, ymm0-ymm3, flags
# Shape: 128-byte blocks -> 16-byte blocks -> single bytes.
# ----------------------------------------------------------------------------
Or_V(bool*, bool*, unsigned long):              # @Or_V(bool*, bool*, unsigned long)
        testq   %rdx, %rdx
        je      .LBB2_13                        # n == 0
        cmpq    $16, %rdx
        jae     .LBB2_3
        xorl    %eax, %eax                      # n < 16: byte loop from index 0
        jmp     .LBB2_12
.LBB2_3:
        cmpq    $128, %rdx
        jae     .LBB2_5
        xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from index 0
        jmp     .LBB2_9
.LBB2_5:
        movq    %rdx, %rax
        andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
        xorl    %ecx, %ecx                      # rcx = byte offset
.LBB2_6:                                        # =>This Inner Loop Header: Depth=1
        vmovups (%rsi,%rcx), %ymm0
        vmovups 32(%rsi,%rcx), %ymm1
        vmovups 64(%rsi,%rcx), %ymm2
        vmovups 96(%rsi,%rcx), %ymm3
        vorps   (%rdi,%rcx), %ymm0, %ymm0
        vorps   32(%rdi,%rcx), %ymm1, %ymm1
        vorps   64(%rdi,%rcx), %ymm2, %ymm2
        vorps   96(%rdi,%rcx), %ymm3, %ymm3
        vmovups %ymm0, (%rdi,%rcx)
        vmovups %ymm1, 32(%rdi,%rcx)
        vmovups %ymm2, 64(%rdi,%rcx)
        vmovups %ymm3, 96(%rdi,%rcx)
        subq    $-128, %rcx                     # rcx += 128
        cmpq    %rcx, %rax
        jne     .LBB2_6
        cmpq    %rdx, %rax
        je      .LBB2_13                        # n was a multiple of 128
        testb   $112, %dl                       # any whole 16-byte chunk left?
        je      .LBB2_12
.LBB2_9:
        movq    %rax, %rcx                      # rcx = first unprocessed index
        movq    %rdx, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
.LBB2_10:                                       # =>This Inner Loop Header: Depth=1
        vmovups (%rsi,%rcx), %xmm0
        vorps   (%rdi,%rcx), %xmm0, %xmm0
        vmovups %xmm0, (%rdi,%rcx)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB2_10
        cmpq    %rdx, %rax
        je      .LBB2_13
.LBB2_12:                                       # =>This Inner Loop Header: Depth=1
        movzbl  (%rsi,%rax), %ecx               # scalar tail, rax = index
        orb     %cl, (%rdi,%rax)
        addq    $1, %rax
        cmpq    %rax, %rdx
        jne     .LBB2_12
.LBB2_13:
        vzeroupper
        retq

# ----------------------------------------------------------------------------
# void Xor_V(bool *x, bool *y, unsigned long n)
# In   : rdi = x, rsi = y, rdx = n
# Out  : x[i] ^= y[i] for i in [0, n), in place
# Clobb: rax, rcx, ymm0-ymm3, flags
# Shape: 128-byte blocks -> 16-byte blocks -> single bytes.
# ----------------------------------------------------------------------------
Xor_V(bool*, bool*, unsigned long):             # @Xor_V(bool*, bool*, unsigned long)
        testq   %rdx, %rdx
        je      .LBB3_13                        # n == 0
        cmpq    $16, %rdx
        jae     .LBB3_3
        xorl    %eax, %eax                      # n < 16: byte loop from index 0
        jmp     .LBB3_12
.LBB3_3:
        cmpq    $128, %rdx
        jae     .LBB3_5
        xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from index 0
        jmp     .LBB3_9
.LBB3_5:
        movq    %rdx, %rax
        andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
        xorl    %ecx, %ecx                      # rcx = byte offset
.LBB3_6:                                        # =>This Inner Loop Header: Depth=1
        vmovups (%rsi,%rcx), %ymm0
        vmovups 32(%rsi,%rcx), %ymm1
        vmovups 64(%rsi,%rcx), %ymm2
        vmovups 96(%rsi,%rcx), %ymm3
        vxorps  (%rdi,%rcx), %ymm0, %ymm0
        vxorps  32(%rdi,%rcx), %ymm1, %ymm1
        vxorps  64(%rdi,%rcx), %ymm2, %ymm2
        vxorps  96(%rdi,%rcx), %ymm3, %ymm3
        vmovups %ymm0, (%rdi,%rcx)
        vmovups %ymm1, 32(%rdi,%rcx)
        vmovups %ymm2, 64(%rdi,%rcx)
        vmovups %ymm3, 96(%rdi,%rcx)
        subq    $-128, %rcx                     # rcx += 128
        cmpq    %rcx, %rax
        jne     .LBB3_6
        cmpq    %rdx, %rax
        je      .LBB3_13                        # n was a multiple of 128
        testb   $112, %dl                       # any whole 16-byte chunk left?
        je      .LBB3_12
.LBB3_9:
        movq    %rax, %rcx                      # rcx = first unprocessed index
        movq    %rdx, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
.LBB3_10:                                       # =>This Inner Loop Header: Depth=1
        vmovups (%rsi,%rcx), %xmm0
        vxorps  (%rdi,%rcx), %xmm0, %xmm0
        vmovups %xmm0, (%rdi,%rcx)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB3_10
        cmpq    %rdx, %rax
        je      .LBB3_13
.LBB3_12:                                       # =>This Inner Loop Header: Depth=1
        movzbl  (%rsi,%rax), %ecx               # scalar tail, rax = index
        xorb    %cl, (%rdi,%rax)
        addq    $1, %rax
        cmpq    %rax, %rdx
        jne     .LBB3_12
.LBB3_13:
        vzeroupper
        retq

# ----------------------------------------------------------------------------
# Select_F64_I(double *dst, double *x, bool *y, unsigned long n)
# In   : rdi = dst, rsi = x, rdx = y, rcx = n
# Out  : copies x[i] to dst[k++] for every i in [0, n) with y[i] != 0;
#        rax = k (number of elements written)
# Clobb: r8, r9, xmm0, flags
# Shape: scalar loop unrolled x2, then one optional odd element.
# ----------------------------------------------------------------------------
Select_F64_I(double*, double*, bool*, unsigned long): # @Select_F64_I(double*, double*, bool*, unsigned long)
        testq   %rcx, %rcx
        je      .LBB4_1                         # n == 0: return 0
        cmpq    $1, %rcx
        jne     .LBB4_4
        xorl    %r8d, %r8d                      # n == 1: only the odd-element step runs
        xorl    %eax, %eax
.LBB4_10:                                       # handle the last element when n is odd
        testb   $1, %cl
        je      .LBB4_13
        cmpb    $0, (%rdx,%r8)
        je      .LBB4_13
        vmovsd  (%rsi,%r8,8), %xmm0             # xmm0 = mem[0],zero
        vmovsd  %xmm0, (%rdi,%rax,8)
        addq    $1, %rax
.LBB4_13:
        retq
.LBB4_1:
        xorl    %eax, %eax
        retq
.LBB4_4:
        movq    %rcx, %r9
        andq    $-2, %r9                        # r9 = n rounded down to even
        xorl    %r8d, %r8d                      # r8 = read index
        xorl    %eax, %eax                      # rax = write index
        jmp     .LBB4_5
.LBB4_9:                                        #   in Loop: Header=BB4_5 Depth=1
        addq    $2, %r8
        cmpq    %r8, %r9
        je      .LBB4_10
.LBB4_5:                                        # =>This Inner Loop Header: Depth=1
        cmpb    $0, (%rdx,%r8)
        je      .LBB4_7
        vmovsd  (%rsi,%r8,8), %xmm0             # xmm0 = mem[0],zero
        vmovsd  %xmm0, (%rdi,%rax,8)
        addq    $1, %rax
.LBB4_7:                                        #   in Loop: Header=BB4_5 Depth=1
        cmpb    $0, 1(%rdx,%r8)
        je      .LBB4_9
        vmovsd  8(%rsi,%r8,8), %xmm0            # xmm0 = mem[0],zero
        vmovsd  %xmm0, (%rdi,%rax,8)
        addq    $1, %rax
        jmp     .LBB4_9

# ----------------------------------------------------------------------------
# Select_F32_I(float *dst, float *x, bool *y, unsigned long n)
# In   : rdi = dst, rsi = x, rdx = y, rcx = n
# Out  : copies x[i] to dst[k++] for every i in [0, n) with y[i] != 0;
#        rax = k (number of elements written)
# Clobb: r8, r9, xmm0, flags
# Shape: scalar loop unrolled x2, then one optional odd element.
# ----------------------------------------------------------------------------
Select_F32_I(float*, float*, bool*, unsigned long): # @Select_F32_I(float*, float*, bool*, unsigned long)
        testq   %rcx, %rcx
        je      .LBB5_1                         # n == 0: return 0
        cmpq    $1, %rcx
        jne     .LBB5_4
        xorl    %r8d, %r8d                      # n == 1: only the odd-element step runs
        xorl    %eax, %eax
.LBB5_10:                                       # handle the last element when n is odd
        testb   $1, %cl
        je      .LBB5_13
        cmpb    $0, (%rdx,%r8)
        je      .LBB5_13
        vmovss  (%rsi,%r8,4), %xmm0             # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm0, (%rdi,%rax,4)
        addq    $1, %rax
.LBB5_13:
        retq
.LBB5_1:
        xorl    %eax, %eax
        retq
.LBB5_4:
        movq    %rcx, %r9
        andq    $-2, %r9                        # r9 = n rounded down to even
        xorl    %r8d, %r8d                      # r8 = read index
        xorl    %eax, %eax                      # rax = write index
        jmp     .LBB5_5
.LBB5_9:                                        #   in Loop: Header=BB5_5 Depth=1
        addq    $2, %r8
        cmpq    %r8, %r9
        je      .LBB5_10
.LBB5_5:                                        # =>This Inner Loop Header: Depth=1
        cmpb    $0, (%rdx,%r8)
        je      .LBB5_7
        vmovss  (%rsi,%r8,4), %xmm0             # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm0, (%rdi,%rax,4)
        addq    $1, %rax
.LBB5_7:                                        #   in Loop: Header=BB5_5 Depth=1
        cmpb    $0, 1(%rdx,%r8)
        je      .LBB5_9
        vmovss  4(%rsi,%r8,4), %xmm0            # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm0, (%rdi,%rax,4)
        addq    $1, %rax
        jmp     .LBB5_9

# ----------------------------------------------------------------------------
# All_I(bool *x, unsigned long n)
# In   : rdi = x, rsi = n
# Out  : al = 1 if no byte of x[0..n) is zero (also for n == 0), else 0
# Clobb: rcx, rdx, rsi, ymm0, ymm1, flags
# Shape: 32-byte blocks with early exit, then a byte loop for the tail.
# ----------------------------------------------------------------------------
All_I(bool*, unsigned long):                    # @All_I(bool*, unsigned long)
        movq    %rsi, %rax
        xorl    %ecx, %ecx                      # rcx = index
        andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
        je      .LBB6_1                         # fewer than 32 elements: tail only
        vpxor   %xmm0, %xmm0, %xmm0             # ymm0 = 0 (VEX write clears the upper half)
.LBB6_8:                                        # =>This Inner Loop Header: Depth=1
        vpcmpeqb (%rdi,%rcx), %ymm0, %ymm1      # 0xFF in every lane where x[i] == 0
        vptest  %ymm1, %ymm1
        jne     .LBB6_9                         # some byte was zero: result is false
        addq    $32, %rcx
        cmpq    %rax, %rcx
        jb      .LBB6_8
.LBB6_1:
        movb    $1, %al
        cmpq    %rsi, %rcx
        jae     .LBB6_6                         # no tail: every block passed
        addq    $-1, %rsi                       # rsi = last valid index
.LBB6_3:                                        # =>This Inner Loop Header: Depth=1
        movzbl  (%rdi,%rcx), %eax
        testb   %al, %al
        je      .LBB6_5                         # zero byte found
        leaq    1(%rcx), %rdx
        cmpq    %rcx, %rsi
        movq    %rdx, %rcx
        jne     .LBB6_3
.LBB6_5:
        testb   %al, %al
        setne   %al                             # al = (last byte examined != 0)
.LBB6_6:
        vzeroupper
        retq
.LBB6_9:
        xorl    %eax, %eax
        vzeroupper
        retq

# ----------------------------------------------------------------------------
# Any_I(bool *x, unsigned long n)
# In   : rdi = x, rsi = n
# Out  : al = 1 if some byte of x[0..n) is non-zero, else 0 (0 for n == 0)
# Clobb: rcx, rdx, rsi, ymm0, flags
# Shape: 32-byte blocks with early exit, then a byte loop for the tail.
# ----------------------------------------------------------------------------
Any_I(bool*, unsigned long):                    # @Any_I(bool*, unsigned long)
        movq    %rsi, %rcx
        xorl    %eax, %eax                      # rax = index
        andq    $-32, %rcx                      # rcx = n rounded down to a multiple of 32
        je      .LBB7_1                         # fewer than 32 elements: tail only
.LBB7_4:                                        # =>This Inner Loop Header: Depth=1
        vmovdqu (%rdi,%rax), %ymm0
        vptest  %ymm0, %ymm0
        jne     .LBB7_5                         # a non-zero byte exists: result is true
        addq    $32, %rax
        cmpq    %rcx, %rax
        jb      .LBB7_4
.LBB7_1:
        cmpq    %rsi, %rax
        jae     .LBB7_2                         # no tail and nothing found
        addq    $-1, %rsi                       # rsi = last valid index
.LBB7_7:                                        # =>This Inner Loop Header: Depth=1
        movzbl  (%rdi,%rax), %ecx
        testb   %cl, %cl
        jne     .LBB7_9                         # non-zero byte found
        leaq    1(%rax), %rdx
        cmpq    %rax, %rsi
        movq    %rdx, %rax
        jne     .LBB7_7
.LBB7_9:
        testb   %cl, %cl
        setne   %al                             # al = (last byte examined != 0)
        vzeroupper
        retq
.LBB7_5:
        movb    $1, %al
        vzeroupper
        retq
.LBB7_2:
        xorl    %eax, %eax
        vzeroupper
        retq

# ----------------------------------------------------------------------------
# None_I(bool *x, unsigned long n)
# In   : rdi = x, rsi = n
# Out  : al = 1 if every byte of x[0..n) is zero (also for n == 0), else 0
# Clobb: rcx, rdx, rsi, ymm0, flags
# Shape: 32-byte blocks with early exit, then a byte loop for the tail.
# ----------------------------------------------------------------------------
None_I(bool*, unsigned long):                   # @None_I(bool*, unsigned long)
        movq    %rsi, %rax
        xorl    %ecx, %ecx                      # rcx = index
        andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
        je      .LBB8_1                         # fewer than 32 elements: tail only
.LBB8_7:                                        # =>This Inner Loop Header: Depth=1
        vmovdqu (%rdi,%rcx), %ymm0
        vptest  %ymm0, %ymm0
        jne     .LBB8_8                         # a non-zero byte exists: result is false
        addq    $32, %rcx
        cmpq    %rax, %rcx
        jb      .LBB8_7
.LBB8_1:
        movb    $1, %al
        cmpq    %rsi, %rcx
        jae     .LBB8_5                         # no tail: every block was all-zero
        addq    $-1, %rsi                       # rsi = last valid index
.LBB8_3:                                        # =>This Inner Loop Header: Depth=1
        cmpb    $0, (%rdi,%rcx)
        sete    %al                             # al = (x[rcx] == 0); sete leaves flags intact
        jne     .LBB8_5                         # non-zero byte: return with al = 0
        leaq    1(%rcx), %rdx
        cmpq    %rcx, %rsi
        movq    %rdx, %rcx
        jne     .LBB8_3
.LBB8_5:
        vzeroupper
        retq
.LBB8_8:
        xorl    %eax, %eax
        vzeroupper
        retq

# ----------------------------------------------------------------------------
# Count_I(bool *x, unsigned long n)
# In   : rdi = x, rsi = n
# Out  : rax = sum of the bytes x[0..n), which is the number of true elements
#        when every element is stored as 0 or 1
# Clobb: rcx, rdx, r8, ymm0-ymm5, flags
# Shape: 16-byte blocks (unrolled x2) accumulated in four vectors of 4 x u64,
#        horizontal reduction, then a byte loop for the tail.
# ----------------------------------------------------------------------------
Count_I(bool*, unsigned long):                  # @Count_I(bool*, unsigned long)
        testq   %rsi, %rsi
        je      .LBB9_1                         # n == 0: return 0
        cmpq    $16, %rsi
        jae     .LBB9_4
        xorl    %ecx, %ecx                      # n < 16: byte loop from index 0
        xorl    %eax, %eax
        jmp     .LBB9_11
.LBB9_1:
        xorl    %eax, %eax
        retq
.LBB9_4:
        movq    %rsi, %rcx
        andq    $-16, %rcx                      # rcx = n rounded down to a multiple of 16
        leaq    -16(%rcx), %rax
        movq    %rax, %r8
        shrq    $4, %r8
        addq    $1, %r8                         # r8 = number of 16-byte blocks
        testq   %rax, %rax
        je      .LBB9_5                         # exactly one block: skip the unrolled loop
        movq    %r8, %rdx
        andq    $-2, %rdx                       # rdx = block count rounded down to even
        vpxor   %xmm0, %xmm0, %xmm0             # ymm0..ymm3 = accumulators
        xorl    %eax, %eax                      # rax = byte offset
        vpxor   %xmm1, %xmm1, %xmm1
        vpxor   %xmm2, %xmm2, %xmm2
        vpxor   %xmm3, %xmm3, %xmm3
.LBB9_7:                                        # =>This Inner Loop Header: Depth=1
        vpmovzxbq (%rdi,%rax), %ymm4            # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm0, %ymm0
        vpmovzxbq 4(%rdi,%rax), %ymm4           # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm1, %ymm1
        vpmovzxbq 8(%rdi,%rax), %ymm4           # 4 bytes zero-extended to 4 x u64
        vpmovzxbq 12(%rdi,%rax), %ymm5          # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm2, %ymm2
        vpaddq  %ymm5, %ymm3, %ymm3
        vpmovzxbq 16(%rdi,%rax), %ymm4          # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm0, %ymm0
        vpmovzxbq 20(%rdi,%rax), %ymm4          # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm1, %ymm1
        vpmovzxbq 24(%rdi,%rax), %ymm4          # 4 bytes zero-extended to 4 x u64
        vpmovzxbq 28(%rdi,%rax), %ymm5          # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm2, %ymm2
        vpaddq  %ymm5, %ymm3, %ymm3
        addq    $32, %rax
        addq    $-2, %rdx
        jne     .LBB9_7
        testb   $1, %r8b
        je      .LBB9_10                        # even block count: no leftover block
.LBB9_9:                                        # one remaining 16-byte block
        vpmovzxbq (%rdi,%rax), %ymm4            # 4 bytes zero-extended to 4 x u64
        vpmovzxbq 4(%rdi,%rax), %ymm5           # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm0, %ymm0
        vpaddq  %ymm5, %ymm1, %ymm1
        vpmovzxbq 8(%rdi,%rax), %ymm4           # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm2, %ymm2
        vpmovzxbq 12(%rdi,%rax), %ymm4          # 4 bytes zero-extended to 4 x u64
        vpaddq  %ymm4, %ymm3, %ymm3
.LBB9_10:                                       # reduce the four accumulators to one u64
        vpaddq  %ymm3, %ymm1, %ymm1
        vpaddq  %ymm2, %ymm0, %ymm0
        vpaddq  %ymm1, %ymm0, %ymm0
        vextracti128 $1, %ymm0, %xmm1
        vpaddq  %xmm1, %xmm0, %xmm0
        vpshufd $238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
        vpaddq  %xmm1, %xmm0, %xmm0
        vmovq   %xmm0, %rax
        cmpq    %rsi, %rcx
        je      .LBB9_12                        # n was a multiple of 16
.LBB9_11:                                       # =>This Inner Loop Header: Depth=1
        movzbl  (%rdi,%rcx), %edx               # scalar tail, rcx = index
        addq    %rdx, %rax
        addq    $1, %rcx
        cmpq    %rcx, %rsi
        jne     .LBB9_11
.LBB9_12:
        vzeroupper
        retq
.LBB9_5:                                        # single 16-byte block
        vpxor   %xmm0, %xmm0, %xmm0
        xorl    %eax, %eax
        vpxor   %xmm1, %xmm1, %xmm1
        vpxor   %xmm2, %xmm2, %xmm2
        vpxor   %xmm3, %xmm3, %xmm3
        testb   $1, %r8b
        jne     .LBB9_9
        jmp     .LBB9_10