gitee.com/quant1x/gox@v1.7.6/num/asm/_avx2/boolean.s (about)

     1  .LCPI0_0:                               # 32 bytes of 0x01: XOR mask for one 256-bit vector
     2          .zero   32,1
     3  .LCPI0_1:                               # 16 bytes of 0x01: XOR mask for one 128-bit vector
     4          .zero   16,1
     5  Not_V(bool*, unsigned long):                            # @Not_V(bool*, unsigned long)
        # In-place logical NOT of a byte buffer: every byte is XORed with 1.
        #   rdi = buffer base, rsi = number of bytes (n).
        # Only bit 0 of each byte is flipped, so a byte holding a value other
        # than 0/1 becomes value^1 rather than a normalised boolean.
        # Strategy: 128-byte chunks (unrolled x2), then 16-byte chunks, then bytes.
        # Clobbers rax, rcx, rdx, r8, ymm0-ymm4 and flags.
        # The mask constants are fetched with unaligned loads (vmovups): nothing
        # in this listing aligns .LCPI0_0/.LCPI0_1, and an aligned vmovaps load
        # would fault if they ended up on a non-32/16-byte boundary.
     6          testq   %rsi, %rsi
     7          je      .LBB0_17                        # n == 0: nothing to do
     8          cmpq    $16, %rsi
     9          jae     .LBB0_3
    10          xorl    %eax, %eax                      # n < 16: byte loop from offset 0
    11          jmp     .LBB0_16
    12  .LBB0_3:
    13          cmpq    $128, %rsi
    14          jae     .LBB0_5
    15          xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from offset 0
    16          jmp     .LBB0_13
    17  .LBB0_5:
    18          movq    %rsi, %rax
    19          andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
    20          leaq    -128(%rax), %rcx
    21          movq    %rcx, %r8
    22          shrq    $7, %r8
    23          addq    $1, %r8                         # r8 = number of 128-byte chunks
    24          testq   %rcx, %rcx
    25          je      .LBB0_6                         # exactly one chunk: skip the unrolled loop
    26          movq    %r8, %rdx
    27          andq    $-2, %rdx                       # rdx = chunk count rounded down to even
    28          xorl    %ecx, %ecx                      # rcx = byte offset
    29          vmovups .LCPI0_0(%rip), %ymm0           # ymm0 = 32 x 0x01 (unaligned-safe load)
    30  .LBB0_8:                                # 256 bytes (two chunks) per iteration
    31          vxorps  (%rdi,%rcx), %ymm0, %ymm1
    32          vxorps  32(%rdi,%rcx), %ymm0, %ymm2
    33          vxorps  64(%rdi,%rcx), %ymm0, %ymm3
    34          vxorps  96(%rdi,%rcx), %ymm0, %ymm4
    35          vmovups %ymm1, (%rdi,%rcx)
    36          vmovups %ymm2, 32(%rdi,%rcx)
    37          vmovups %ymm3, 64(%rdi,%rcx)
    38          vmovups %ymm4, 96(%rdi,%rcx)
    39          vxorps  128(%rdi,%rcx), %ymm0, %ymm1
    40          vxorps  160(%rdi,%rcx), %ymm0, %ymm2
    41          vxorps  192(%rdi,%rcx), %ymm0, %ymm3
    42          vxorps  224(%rdi,%rcx), %ymm0, %ymm4
    43          vmovups %ymm1, 128(%rdi,%rcx)
    44          vmovups %ymm2, 160(%rdi,%rcx)
    45          vmovups %ymm3, 192(%rdi,%rcx)
    46          vmovups %ymm4, 224(%rdi,%rcx)
    47          addq    $256, %rcx                      # imm = 0x100
    48          addq    $-2, %rdx
    49          jne     .LBB0_8
    50          testb   $1, %r8b
    51          je      .LBB0_11                        # even chunk count: no leftover chunk
    52  .LBB0_10:                               # one remaining 128-byte chunk at offset rcx
    53          vmovups .LCPI0_0(%rip), %ymm0           # ymm0 = 32 x 0x01 (unaligned-safe load)
    54          vxorps  (%rdi,%rcx), %ymm0, %ymm1
    55          vxorps  32(%rdi,%rcx), %ymm0, %ymm2
    56          vxorps  64(%rdi,%rcx), %ymm0, %ymm3
    57          vxorps  96(%rdi,%rcx), %ymm0, %ymm0
    58          vmovups %ymm1, (%rdi,%rcx)
    59          vmovups %ymm2, 32(%rdi,%rcx)
    60          vmovups %ymm3, 64(%rdi,%rcx)
    61          vmovups %ymm0, 96(%rdi,%rcx)
    62  .LBB0_11:
    63          cmpq    %rsi, %rax
    64          je      .LBB0_17                        # n was a multiple of 128: done
    65          testb   $112, %sil                      # 112 = 0x70: any whole 16-byte block left?
    66          je      .LBB0_16                        # no: fewer than 16 bytes remain
    67  .LBB0_13:
    68          movq    %rax, %rcx                      # rcx = first unprocessed offset
    69          movq    %rsi, %rax
    70          andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
    71          vmovups .LCPI0_1(%rip), %xmm0           # xmm0 = 16 x 0x01 (unaligned-safe load)
    72  .LBB0_14:                               # 16 bytes per iteration
    73          vxorps  (%rdi,%rcx), %xmm0, %xmm1
    74          vmovups %xmm1, (%rdi,%rcx)
    75          addq    $16, %rcx
    76          cmpq    %rcx, %rax
    77          jne     .LBB0_14
    78          cmpq    %rsi, %rax
    79          je      .LBB0_17                        # n was a multiple of 16: done
    80  .LBB0_16:                               # byte tail: offsets rax .. n-1
    81          xorb    $1, (%rdi,%rax)
    82          addq    $1, %rax
    83          cmpq    %rax, %rsi
    84          jne     .LBB0_16
    85  .LBB0_17:
    86          vzeroupper
    87          retq
    88  .LBB0_6:                                # single 128-byte chunk case
    89          xorl    %ecx, %ecx
    90          testb   $1, %r8b
    91          jne     .LBB0_10
    92          jmp     .LBB0_11
    93  And_V(bool*, bool*, unsigned long):                          # @And_V(bool*, bool*, unsigned long)
        # In-place byte-wise AND: for each i in [0, rdx): rdi[i] &= rsi[i].
        #   rdi = destination (read and written), rsi = second operand (read only),
        #   rdx = number of bytes (n).
        # Work is split into 128-byte chunks, then 16-byte chunks, then single bytes.
        # Uses rax, rcx and ymm0-ymm3 as scratch.
    94          testq   %rdx, %rdx
    95          je      .LBB1_13                        # n == 0: nothing to do
    96          cmpq    $16, %rdx
    97          jae     .LBB1_3
    98          xorl    %eax, %eax                      # n < 16: byte loop from offset 0
    99          jmp     .LBB1_12
   100  .LBB1_3:
   101          cmpq    $128, %rdx
   102          jae     .LBB1_5
   103          xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from offset 0
   104          jmp     .LBB1_9
   105  .LBB1_5:
   106          movq    %rdx, %rax
   107          andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
   108          xorl    %ecx, %ecx                      # rcx = byte offset
   109  .LBB1_6:                                # 128 bytes per iteration
   110          vmovups (%rsi,%rcx), %ymm0
   111          vmovups 32(%rsi,%rcx), %ymm1
   112          vmovups 64(%rsi,%rcx), %ymm2
   113          vmovups 96(%rsi,%rcx), %ymm3
   114          vandps  (%rdi,%rcx), %ymm0, %ymm0
   115          vandps  32(%rdi,%rcx), %ymm1, %ymm1
   116          vandps  64(%rdi,%rcx), %ymm2, %ymm2
   117          vandps  96(%rdi,%rcx), %ymm3, %ymm3
   118          vmovups %ymm0, (%rdi,%rcx)
   119          vmovups %ymm1, 32(%rdi,%rcx)
   120          vmovups %ymm2, 64(%rdi,%rcx)
   121          vmovups %ymm3, 96(%rdi,%rcx)
   122          subq    $-128, %rcx                     # rcx += 128
   123          cmpq    %rcx, %rax
   124          jne     .LBB1_6
   125          cmpq    %rdx, %rax
   126          je      .LBB1_13                        # n was a multiple of 128: done
   127          testb   $112, %dl                       # 112 = 0x70: any whole 16-byte block left?
   128          je      .LBB1_12                        # no: fewer than 16 bytes remain
   129  .LBB1_9:
   130          movq    %rax, %rcx                      # rcx = first unprocessed offset
   131          movq    %rdx, %rax
   132          andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
   133  .LBB1_10:                               # 16 bytes per iteration
   134          vmovups (%rsi,%rcx), %xmm0
   135          vandps  (%rdi,%rcx), %xmm0, %xmm0
   136          vmovups %xmm0, (%rdi,%rcx)
   137          addq    $16, %rcx
   138          cmpq    %rcx, %rax
   139          jne     .LBB1_10
   140          cmpq    %rdx, %rax
   141          je      .LBB1_13                        # n was a multiple of 16: done
   142  .LBB1_12:                               # byte tail: offsets rax .. n-1
   143          movzbl  (%rsi,%rax), %ecx
   144          andb    %cl, (%rdi,%rax)
   145          addq    $1, %rax
   146          cmpq    %rax, %rdx
   147          jne     .LBB1_12
   148  .LBB1_13:
   149          vzeroupper
   150          retq
   151  Or_V(bool*, bool*, unsigned long):                           # @Or_V(bool*, bool*, unsigned long)
        # In-place byte-wise OR: for each i in [0, rdx): rdi[i] |= rsi[i].
        #   rdi = destination (read and written), rsi = second operand (read only),
        #   rdx = number of bytes (n).
        # Work is split into 128-byte chunks, then 16-byte chunks, then single bytes.
        # Uses rax, rcx and ymm0-ymm3 as scratch.
   152          testq   %rdx, %rdx
   153          je      .LBB2_13                        # n == 0: nothing to do
   154          cmpq    $16, %rdx
   155          jae     .LBB2_3
   156          xorl    %eax, %eax                      # n < 16: byte loop from offset 0
   157          jmp     .LBB2_12
   158  .LBB2_3:
   159          cmpq    $128, %rdx
   160          jae     .LBB2_5
   161          xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from offset 0
   162          jmp     .LBB2_9
   163  .LBB2_5:
   164          movq    %rdx, %rax
   165          andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
   166          xorl    %ecx, %ecx                      # rcx = byte offset
   167  .LBB2_6:                                # 128 bytes per iteration
   168          vmovups (%rsi,%rcx), %ymm0
   169          vmovups 32(%rsi,%rcx), %ymm1
   170          vmovups 64(%rsi,%rcx), %ymm2
   171          vmovups 96(%rsi,%rcx), %ymm3
   172          vorps   (%rdi,%rcx), %ymm0, %ymm0
   173          vorps   32(%rdi,%rcx), %ymm1, %ymm1
   174          vorps   64(%rdi,%rcx), %ymm2, %ymm2
   175          vorps   96(%rdi,%rcx), %ymm3, %ymm3
   176          vmovups %ymm0, (%rdi,%rcx)
   177          vmovups %ymm1, 32(%rdi,%rcx)
   178          vmovups %ymm2, 64(%rdi,%rcx)
   179          vmovups %ymm3, 96(%rdi,%rcx)
   180          subq    $-128, %rcx                     # rcx += 128
   181          cmpq    %rcx, %rax
   182          jne     .LBB2_6
   183          cmpq    %rdx, %rax
   184          je      .LBB2_13                        # n was a multiple of 128: done
   185          testb   $112, %dl                       # 112 = 0x70: any whole 16-byte block left?
   186          je      .LBB2_12                        # no: fewer than 16 bytes remain
   187  .LBB2_9:
   188          movq    %rax, %rcx                      # rcx = first unprocessed offset
   189          movq    %rdx, %rax
   190          andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
   191  .LBB2_10:                               # 16 bytes per iteration
   192          vmovups (%rsi,%rcx), %xmm0
   193          vorps   (%rdi,%rcx), %xmm0, %xmm0
   194          vmovups %xmm0, (%rdi,%rcx)
   195          addq    $16, %rcx
   196          cmpq    %rcx, %rax
   197          jne     .LBB2_10
   198          cmpq    %rdx, %rax
   199          je      .LBB2_13                        # n was a multiple of 16: done
   200  .LBB2_12:                               # byte tail: offsets rax .. n-1
   201          movzbl  (%rsi,%rax), %ecx
   202          orb     %cl, (%rdi,%rax)
   203          addq    $1, %rax
   204          cmpq    %rax, %rdx
   205          jne     .LBB2_12
   206  .LBB2_13:
   207          vzeroupper
   208          retq
   209  Xor_V(bool*, bool*, unsigned long):                          # @Xor_V(bool*, bool*, unsigned long)
        # In-place byte-wise XOR: for each i in [0, rdx): rdi[i] ^= rsi[i].
        #   rdi = destination (read and written), rsi = second operand (read only),
        #   rdx = number of bytes (n).
        # Work is split into 128-byte chunks, then 16-byte chunks, then single bytes.
        # Uses rax, rcx and ymm0-ymm3 as scratch.
   210          testq   %rdx, %rdx
   211          je      .LBB3_13                        # n == 0: nothing to do
   212          cmpq    $16, %rdx
   213          jae     .LBB3_3
   214          xorl    %eax, %eax                      # n < 16: byte loop from offset 0
   215          jmp     .LBB3_12
   216  .LBB3_3:
   217          cmpq    $128, %rdx
   218          jae     .LBB3_5
   219          xorl    %eax, %eax                      # 16 <= n < 128: 16-byte loop from offset 0
   220          jmp     .LBB3_9
   221  .LBB3_5:
   222          movq    %rdx, %rax
   223          andq    $-128, %rax                     # rax = n rounded down to a multiple of 128
   224          xorl    %ecx, %ecx                      # rcx = byte offset
   225  .LBB3_6:                                # 128 bytes per iteration
   226          vmovups (%rsi,%rcx), %ymm0
   227          vmovups 32(%rsi,%rcx), %ymm1
   228          vmovups 64(%rsi,%rcx), %ymm2
   229          vmovups 96(%rsi,%rcx), %ymm3
   230          vxorps  (%rdi,%rcx), %ymm0, %ymm0
   231          vxorps  32(%rdi,%rcx), %ymm1, %ymm1
   232          vxorps  64(%rdi,%rcx), %ymm2, %ymm2
   233          vxorps  96(%rdi,%rcx), %ymm3, %ymm3
   234          vmovups %ymm0, (%rdi,%rcx)
   235          vmovups %ymm1, 32(%rdi,%rcx)
   236          vmovups %ymm2, 64(%rdi,%rcx)
   237          vmovups %ymm3, 96(%rdi,%rcx)
   238          subq    $-128, %rcx                     # rcx += 128
   239          cmpq    %rcx, %rax
   240          jne     .LBB3_6
   241          cmpq    %rdx, %rax
   242          je      .LBB3_13                        # n was a multiple of 128: done
   243          testb   $112, %dl                       # 112 = 0x70: any whole 16-byte block left?
   244          je      .LBB3_12                        # no: fewer than 16 bytes remain
   245  .LBB3_9:
   246          movq    %rax, %rcx                      # rcx = first unprocessed offset
   247          movq    %rdx, %rax
   248          andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
   249  .LBB3_10:                               # 16 bytes per iteration
   250          vmovups (%rsi,%rcx), %xmm0
   251          vxorps  (%rdi,%rcx), %xmm0, %xmm0
   252          vmovups %xmm0, (%rdi,%rcx)
   253          addq    $16, %rcx
   254          cmpq    %rcx, %rax
   255          jne     .LBB3_10
   256          cmpq    %rdx, %rax
   257          je      .LBB3_13                        # n was a multiple of 16: done
   258  .LBB3_12:                               # byte tail: offsets rax .. n-1
   259          movzbl  (%rsi,%rax), %ecx
   260          xorb    %cl, (%rdi,%rax)
   261          addq    $1, %rax
   262          cmpq    %rax, %rdx
   263          jne     .LBB3_12
   264  .LBB3_13:
   265          vzeroupper
   266          retq
   267  Select_F64_I(double*, double*, bool*, unsigned long):                # @Select_F64_I(double*, double*, bool*, unsigned long)
        # Compacting copy of 8-byte elements selected by a byte mask.
        #   rdi = output, rsi = input, rdx = mask (one byte per element),
        #   rcx = number of input elements (n).
        # For every i with mask[i] != 0, input[i] is appended to output.
        # Returns in rax the number of elements written.
        # The main loop handles two elements per iteration; an odd final
        # element is handled at .LBB4_10.
   268          testq   %rcx, %rcx
   269          je      .LBB4_1                         # n == 0: return 0
   270          cmpq    $1, %rcx
   271          jne     .LBB4_4
   272          xorl    %r8d, %r8d                      # n == 1: r8 = input index 0
   273          xorl    %eax, %eax                      # rax = output count 0
   274  .LBB4_10:                               # odd-element tail; r8 = index of the candidate element
   275          testb   $1, %cl
   276          je      .LBB4_13                        # n even: no leftover element
   277          cmpb    $0, (%rdx,%r8)
   278          je      .LBB4_13                        # mask byte is zero: skip it
   279          vmovsd  (%rsi,%r8,8), %xmm0             # xmm0 = input[r8]
   280          vmovsd  %xmm0, (%rdi,%rax,8)            # output[rax] = xmm0
   281          addq    $1, %rax
   282  .LBB4_13:
   283          retq
   284  .LBB4_1:
   285          xorl    %eax, %eax
   286          retq
   287  .LBB4_4:
   288          movq    %rcx, %r9
   289          andq    $-2, %r9                        # r9 = n rounded down to even
   290          xorl    %r8d, %r8d                      # r8 = input index
   291          xorl    %eax, %eax                      # rax = output count
   292          jmp     .LBB4_5
   293  .LBB4_9:                                # loop latch: advance by two elements
   294          addq    $2, %r8
   295          cmpq    %r8, %r9
   296          je      .LBB4_10                        # all pairs done: handle odd tail
   297  .LBB4_5:                                # first element of the pair
   298          cmpb    $0, (%rdx,%r8)
   299          je      .LBB4_7
   300          vmovsd  (%rsi,%r8,8), %xmm0             # xmm0 = input[r8]
   301          vmovsd  %xmm0, (%rdi,%rax,8)
   302          addq    $1, %rax
   303  .LBB4_7:                                # second element of the pair
   304          cmpb    $0, 1(%rdx,%r8)
   305          je      .LBB4_9
   306          vmovsd  8(%rsi,%r8,8), %xmm0            # xmm0 = input[r8 + 1]
   307          vmovsd  %xmm0, (%rdi,%rax,8)
   308          addq    $1, %rax
   309          jmp     .LBB4_9
   310  Select_F32_I(float*, float*, bool*, unsigned long):                # @Select_F32_I(float*, float*, bool*, unsigned long)
        # Compacting copy of 4-byte elements selected by a byte mask.
        #   rdi = output, rsi = input, rdx = mask (one byte per element),
        #   rcx = number of input elements (n).
        # For every i with mask[i] != 0, input[i] is appended to output.
        # Returns in rax the number of elements written.
        # The main loop handles two elements per iteration; an odd final
        # element is handled at .LBB5_10.
   311          testq   %rcx, %rcx
   312          je      .LBB5_1                         # n == 0: return 0
   313          cmpq    $1, %rcx
   314          jne     .LBB5_4
   315          xorl    %r8d, %r8d                      # n == 1: r8 = input index 0
   316          xorl    %eax, %eax                      # rax = output count 0
   317  .LBB5_10:                               # odd-element tail; r8 = index of the candidate element
   318          testb   $1, %cl
   319          je      .LBB5_13                        # n even: no leftover element
   320          cmpb    $0, (%rdx,%r8)
   321          je      .LBB5_13                        # mask byte is zero: skip it
   322          vmovss  (%rsi,%r8,4), %xmm0             # xmm0 = input[r8]
   323          vmovss  %xmm0, (%rdi,%rax,4)            # output[rax] = xmm0
   324          addq    $1, %rax
   325  .LBB5_13:
   326          retq
   327  .LBB5_1:
   328          xorl    %eax, %eax
   329          retq
   330  .LBB5_4:
   331          movq    %rcx, %r9
   332          andq    $-2, %r9                        # r9 = n rounded down to even
   333          xorl    %r8d, %r8d                      # r8 = input index
   334          xorl    %eax, %eax                      # rax = output count
   335          jmp     .LBB5_5
   336  .LBB5_9:                                # loop latch: advance by two elements
   337          addq    $2, %r8
   338          cmpq    %r8, %r9
   339          je      .LBB5_10                        # all pairs done: handle odd tail
   340  .LBB5_5:                                # first element of the pair
   341          cmpb    $0, (%rdx,%r8)
   342          je      .LBB5_7
   343          vmovss  (%rsi,%r8,4), %xmm0             # xmm0 = input[r8]
   344          vmovss  %xmm0, (%rdi,%rax,4)
   345          addq    $1, %rax
   346  .LBB5_7:                                # second element of the pair
   347          cmpb    $0, 1(%rdx,%r8)
   348          je      .LBB5_9
   349          vmovss  4(%rsi,%r8,4), %xmm0            # xmm0 = input[r8 + 1]
   350          vmovss  %xmm0, (%rdi,%rax,4)
   351          addq    $1, %rax
   352          jmp     .LBB5_9
   353  All_I(bool*, unsigned long):                            # @All_I(bool*, unsigned long)
        # Returns al = 1 when every byte in rdi[0 .. rsi) is non-zero, else al = 0.
        #   rdi = buffer base, rsi = number of bytes (n). n == 0 yields 1.
        # Whole 32-byte blocks are checked with AVX2; the remainder byte by byte.
        # Local labels use the .LBB6_* family: the .LBB0_* names this routine
        # previously used are also defined by Not_V in this file, which makes
        # the symbols duplicate and the jump targets ambiguous.
   354          movq    %rsi, %rax
   355          xorl    %ecx, %ecx                      # rcx = current byte offset
   356          andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
   357          je      .LBB6_1                         # fewer than 32 bytes: scalar loop only
   358          vpxor   %xmm0, %xmm0, %xmm0             # all-zero comparand
   359  .LBB6_8:                                # 32 bytes per iteration
   360          vpcmpeqb        (%rdi,%rcx), %ymm0, %ymm1       # 0xFF in every lane whose byte is 0
   361          vptest  %ymm1, %ymm1
   362          jne     .LBB6_9                         # at least one zero byte: result is false
   363          addq    $32, %rcx
   364          cmpq    %rax, %rcx
   365          jb      .LBB6_8
   366  .LBB6_1:
   367          movb    $1, %al                         # default result: true
   368          cmpq    %rsi, %rcx
   369          jae     .LBB6_6                         # nothing left to check
   370          addq    $-1, %rsi                       # rsi = index of the last byte
   371  .LBB6_3:                                # scalar tail
   372          movzbl  (%rdi,%rcx), %eax
   373          testb   %al, %al
   374          je      .LBB6_5                         # zero byte found
   375          leaq    1(%rcx), %rdx
   376          cmpq    %rcx, %rsi                      # was this the last index?
   377          movq    %rdx, %rcx                      # mov leaves the cmp flags intact
   378          jne     .LBB6_3
   379  .LBB6_5:
   380          testb   %al, %al
   381          setne   %al                             # al = (last byte examined != 0)
   382  .LBB6_6:
   383          vzeroupper
   384          retq
   385  .LBB6_9:
   386          xorl    %eax, %eax
   387          vzeroupper
   388          retq
   389  Any_I(bool*, unsigned long):                            # @Any_I(bool*, unsigned long)
        # Returns al = 1 when at least one byte in rdi[0 .. rsi) is non-zero, else al = 0.
        #   rdi = buffer base, rsi = number of bytes (n). n == 0 yields 0.
        # Whole 32-byte blocks are checked with AVX; the remainder byte by byte.
        # Local labels use the .LBB7_* family: the .LBB1_* names this routine
        # previously used are also defined by And_V in this file, which makes
        # the symbols duplicate and the jump targets ambiguous.
   390          movq    %rsi, %rcx
   391          xorl    %eax, %eax                      # rax = current byte offset
   392          andq    $-32, %rcx                      # rcx = n rounded down to a multiple of 32
   393          je      .LBB7_1                         # fewer than 32 bytes: scalar loop only
   394  .LBB7_4:                                # 32 bytes per iteration
   395          vmovdqu (%rdi,%rax), %ymm0
   396          vptest  %ymm0, %ymm0
   397          jne     .LBB7_5                         # some bit set in the block: result is true
   398          addq    $32, %rax
   399          cmpq    %rcx, %rax
   400          jb      .LBB7_4
   401  .LBB7_1:
   402          cmpq    %rsi, %rax
   403          jae     .LBB7_2                         # nothing left to check: result is false
   404          addq    $-1, %rsi                       # rsi = index of the last byte
   405  .LBB7_7:                                # scalar tail
   406          movzbl  (%rdi,%rax), %ecx
   407          testb   %cl, %cl
   408          jne     .LBB7_9                         # non-zero byte found
   409          leaq    1(%rax), %rdx
   410          cmpq    %rax, %rsi                      # was this the last index?
   411          movq    %rdx, %rax                      # mov leaves the cmp flags intact
   412          jne     .LBB7_7
   413  .LBB7_9:
   414          testb   %cl, %cl
   415          setne   %al                             # al = (last byte examined != 0)
   416          vzeroupper
   417          retq
   418  .LBB7_5:
   419          movb    $1, %al
   420          vzeroupper
   421          retq
   422  .LBB7_2:
   423          xorl    %eax, %eax
   424          vzeroupper
   425          retq
   426  None_I(bool*, unsigned long):                           # @None_I(bool*, unsigned long)
        # Returns al = 1 when every byte in rdi[0 .. rsi) is zero, else al = 0.
        #   rdi = buffer base, rsi = number of bytes (n). n == 0 yields 1.
        # Whole 32-byte blocks are checked with AVX; the remainder byte by byte.
        # Local labels use the .LBB8_* family: the .LBB2_* names this routine
        # previously used are also defined by Or_V in this file, which makes
        # the symbols duplicate and the jump targets ambiguous.
   427          movq    %rsi, %rax
   428          xorl    %ecx, %ecx                      # rcx = current byte offset
   429          andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
   430          je      .LBB8_1                         # fewer than 32 bytes: scalar loop only
   431  .LBB8_7:                                # 32 bytes per iteration
   432          vmovdqu (%rdi,%rcx), %ymm0
   433          vptest  %ymm0, %ymm0
   434          jne     .LBB8_8                         # some bit set in the block: result is false
   435          addq    $32, %rcx
   436          cmpq    %rax, %rcx
   437          jb      .LBB8_7
   438  .LBB8_1:
   439          movb    $1, %al                         # default result: true
   440          cmpq    %rsi, %rcx
   441          jae     .LBB8_5                         # nothing left to check
   442          addq    $-1, %rsi                       # rsi = index of the last byte
   443  .LBB8_3:                                # scalar tail
   444          cmpb    $0, (%rdi,%rcx)
   445          sete    %al                             # al = (byte == 0); sete leaves flags intact
   446          jne     .LBB8_5                         # non-zero byte: return with al = 0
   447          leaq    1(%rcx), %rdx
   448          cmpq    %rcx, %rsi                      # was this the last index?
   449          movq    %rdx, %rcx                      # mov leaves the cmp flags intact
   450          jne     .LBB8_3
   451  .LBB8_5:
   452          vzeroupper
   453          retq
   454  .LBB8_8:
   455          xorl    %eax, %eax
   456          vzeroupper
   457          retq
   458  Count_I(bool*, unsigned long):                          # @Count_I(bool*, unsigned long)
        # Returns in rax the sum of the bytes rdi[0 .. rsi), each zero-extended to 64 bits.
        #   rdi = buffer base, rsi = number of bytes (n).
        # The result equals the number of true elements only when every element
        # is stored as 0 or 1, because byte values are added rather than tested.
        # 16-byte chunks are accumulated in four 4 x 64-bit vector accumulators
        # (ymm0-ymm3), two chunks per loop iteration; leftover bytes are added one by one.
   459          testq   %rsi, %rsi
   460          je      .LBB9_1                         # n == 0: return 0
   461          cmpq    $16, %rsi
   462          jae     .LBB9_4
   463          xorl    %ecx, %ecx                      # n < 16: scalar loop from offset 0
   464          xorl    %eax, %eax                      # running sum = 0
   465          jmp     .LBB9_11
   466  .LBB9_1:
   467          xorl    %eax, %eax
   468          retq
   469  .LBB9_4:
   470          movq    %rsi, %rcx
   471          andq    $-16, %rcx                      # rcx = n rounded down to a multiple of 16
   472          leaq    -16(%rcx), %rax
   473          movq    %rax, %r8
   474          shrq    $4, %r8
   475          addq    $1, %r8                         # r8 = number of 16-byte chunks
   476          testq   %rax, %rax
   477          je      .LBB9_5                         # exactly one chunk: skip the unrolled loop
   478          movq    %r8, %rdx
   479          andq    $-2, %rdx                       # rdx = chunk count rounded down to even
   480          vpxor   %xmm0, %xmm0, %xmm0             # clear accumulator 0
   481          xorl    %eax, %eax                      # rax = byte offset
   482          vpxor   %xmm1, %xmm1, %xmm1             # clear accumulator 1
   483          vpxor   %xmm2, %xmm2, %xmm2             # clear accumulator 2
   484          vpxor   %xmm3, %xmm3, %xmm3             # clear accumulator 3
   485  .LBB9_7:                                # 32 bytes (two chunks) per iteration
   486          vpmovzxbq       (%rdi,%rax), %ymm4      # ymm4 = bytes +0..+3, zero-extended to 4 qwords
   487          vpaddq  %ymm4, %ymm0, %ymm0
   488          vpmovzxbq       4(%rdi,%rax), %ymm4     # ymm4 = bytes +4..+7, zero-extended to 4 qwords
   489          vpaddq  %ymm4, %ymm1, %ymm1
   490          vpmovzxbq       8(%rdi,%rax), %ymm4     # ymm4 = bytes +8..+11, zero-extended to 4 qwords
   491          vpmovzxbq       12(%rdi,%rax), %ymm5    # ymm5 = bytes +12..+15, zero-extended to 4 qwords
   492          vpaddq  %ymm4, %ymm2, %ymm2
   493          vpaddq  %ymm5, %ymm3, %ymm3
   494          vpmovzxbq       16(%rdi,%rax), %ymm4    # ymm4 = bytes +16..+19, zero-extended to 4 qwords
   495          vpaddq  %ymm4, %ymm0, %ymm0
   496          vpmovzxbq       20(%rdi,%rax), %ymm4    # ymm4 = bytes +20..+23, zero-extended to 4 qwords
   497          vpaddq  %ymm4, %ymm1, %ymm1
   498          vpmovzxbq       24(%rdi,%rax), %ymm4    # ymm4 = bytes +24..+27, zero-extended to 4 qwords
   499          vpmovzxbq       28(%rdi,%rax), %ymm5    # ymm5 = bytes +28..+31, zero-extended to 4 qwords
   500          vpaddq  %ymm4, %ymm2, %ymm2
   501          vpaddq  %ymm5, %ymm3, %ymm3
   502          addq    $32, %rax
   503          addq    $-2, %rdx
   504          jne     .LBB9_7
   505          testb   $1, %r8b
   506          je      .LBB9_10                        # even chunk count: no leftover chunk
   507  .LBB9_9:                                # one remaining 16-byte chunk at offset rax
   508          vpmovzxbq       (%rdi,%rax), %ymm4      # ymm4 = bytes +0..+3, zero-extended to 4 qwords
   509          vpmovzxbq       4(%rdi,%rax), %ymm5     # ymm5 = bytes +4..+7, zero-extended to 4 qwords
   510          vpaddq  %ymm4, %ymm0, %ymm0
   511          vpaddq  %ymm5, %ymm1, %ymm1
   512          vpmovzxbq       8(%rdi,%rax), %ymm4     # ymm4 = bytes +8..+11, zero-extended to 4 qwords
   513          vpaddq  %ymm4, %ymm2, %ymm2
   514          vpmovzxbq       12(%rdi,%rax), %ymm4    # ymm4 = bytes +12..+15, zero-extended to 4 qwords
   515          vpaddq  %ymm4, %ymm3, %ymm3
   516  .LBB9_10:                               # horizontal reduction of the four accumulators
   517          vpaddq  %ymm3, %ymm1, %ymm1
   518          vpaddq  %ymm2, %ymm0, %ymm0
   519          vpaddq  %ymm1, %ymm0, %ymm0
   520          vextracti128    $1, %ymm0, %xmm1        # xmm1 = upper 128 bits of ymm0
   521          vpaddq  %xmm1, %xmm0, %xmm0
   522          vpshufd $238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3] (high qword moved low)
   523          vpaddq  %xmm1, %xmm0, %xmm0
   524          vmovq   %xmm0, %rax                     # rax = vector total
   525          cmpq    %rsi, %rcx
   526          je      .LBB9_12                        # n was a multiple of 16: done
   527  .LBB9_11:                               # scalar tail: add bytes rcx .. n-1
   528          movzbl  (%rdi,%rcx), %edx
   529          addq    %rdx, %rax
   530          addq    $1, %rcx
   531          cmpq    %rcx, %rsi
   532          jne     .LBB9_11
   533  .LBB9_12:
   534          vzeroupper
   535          retq
   536  .LBB9_5:                                # single 16-byte chunk: clear accumulators, then handle it
   537          vpxor   %xmm0, %xmm0, %xmm0
   538          xorl    %eax, %eax
   539          vpxor   %xmm1, %xmm1, %xmm1
   540          vpxor   %xmm2, %xmm2, %xmm2
   541          vpxor   %xmm3, %xmm3, %xmm3
   542          testb   $1, %r8b
   543          jne     .LBB9_9
   544          jmp     .LBB9_10