github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/null_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  // func nullIndex8(bits *uint64, rows sparse.Array)
     6  TEXT ·nullIndex8(SB), NOSPLIT, $0-32
     7      MOVQ bits+0(FP), AX
     8      MOVQ rows_array_ptr+8(FP), BX
     9      MOVQ rows_array_len+16(FP), DI
    10      MOVQ rows_array_off+24(FP), DX
    11  
    12      MOVQ $1, CX
    13      XORQ SI, SI
    14  
    15      CMPQ DI, $0
    16      JE done
    17  loop1x1:
    18      XORQ R8, R8
    19      MOVB (BX), R9
    20      CMPB R9, $0
    21      JE next1x1
    22  
    23      MOVQ SI, R10
    24      SHRQ $6, R10
    25      ORQ CX, (AX)(R10*8)
    26  next1x1:
    27      ADDQ DX, BX
    28      ROLQ $1, CX
    29      INCQ SI
    30      CMPQ SI, DI
    31      JNE loop1x1
    32  done:
    33      RET
    34  
    35  // func nullIndex32(bits *uint64, rows sparse.Array)
    36  TEXT ·nullIndex32(SB), NOSPLIT, $0-32
    37      MOVQ bits+0(FP), AX
    38      MOVQ rows_array_ptr+8(FP), BX
    39      MOVQ rows_array_len+16(FP), DI
    40      MOVQ rows_array_off+24(FP), DX
    41  
    42      MOVQ $1, CX
    43      XORQ SI, SI
    44  
    45      CMPQ DI, $0
    46      JE done
    47  
    48      CMPQ DI, $8
    49      JB loop1x4
    50  
    51      CMPB ·hasAVX2(SB), $0
    52      JE loop1x4
    53  
    54      MOVQ DI, R8
    55      SHRQ $3, R8
    56      SHLQ $3, R8
    57  
    58      VPBROADCASTD rows_array_off+24(FP), Y0
    59      VPMULLD ·range0n8(SB), Y0, Y0
    60      VPCMPEQD Y1, Y1, Y1
    61      VPCMPEQD Y2, Y2, Y2
    62      VPXOR Y3, Y3, Y3
    63  loop8x4:
    64      VPGATHERDD Y1, (BX)(Y0*1), Y4
    65      VPCMPEQD Y3, Y4, Y4
    66      VMOVMSKPS Y4, R9
    67      VMOVDQU Y2, Y1
    68  
    69      NOTQ R9
    70      ANDQ $0b11111111, R9
    71  
    72      MOVQ SI, CX
    73      ANDQ $0b111111, CX
    74  
    75      MOVQ SI, R10
    76      SHRQ $6, R10
    77  
    78      SHLQ CX, R9
    79      ORQ R9, (AX)(R10*8)
    80  
    81      LEAQ (BX)(DX*8), BX
    82      ADDQ $8, SI
    83      CMPQ SI, R8
    84      JNE loop8x4
    85      VZEROUPPER
    86  
    87      CMPQ SI, DI
    88      JE done
    89  
    90      MOVQ $1, R8
    91      MOVQ SI, CX
    92      ANDQ $0b111111, R8
    93      SHLQ CX, R8
    94      MOVQ R8, CX
    95  
    96  loop1x4:
    97      MOVL (BX), R8
    98      CMPL R8, $0
    99      JE next1x4
   100  
   101      MOVQ SI, R9
   102      SHRQ $6, R9
   103      ORQ CX, (AX)(R9*8)
   104  next1x4:
   105      ADDQ DX, BX
   106      ROLQ $1, CX
   107      INCQ SI
   108      CMPQ SI, DI
   109      JNE loop1x4
   110  done:
   111      RET
   112  
   113  // func nullIndex64(bits *uint64, rows sparse.Array)
   114  TEXT ·nullIndex64(SB), NOSPLIT, $0-32
   115      MOVQ bits+0(FP), AX
   116      MOVQ rows_array_ptr+8(FP), BX
   117      MOVQ rows_array_len+16(FP), DI
   118      MOVQ rows_array_off+24(FP), DX
   119  
   120      MOVQ $1, CX
   121      XORQ SI, SI
   122  
   123      CMPQ DI, $0
   124      JE done
   125  
   126      CMPQ DI, $4
   127      JB loop1x8
   128  
   129      CMPB ·hasAVX2(SB), $0
   130      JE loop1x8
   131  
   132      MOVQ DI, R8
   133      SHRQ $2, R8
   134      SHLQ $2, R8
   135  
   136      VPBROADCASTQ rows_array_off+24(FP), Y0
   137      VPMULLD scale4x8<>(SB), Y0, Y0
   138      VPCMPEQQ Y1, Y1, Y1
   139      VPCMPEQQ Y2, Y2, Y2
   140      VPXOR Y3, Y3, Y3
   141  loop4x8:
   142      VPGATHERQQ Y1, (BX)(Y0*1), Y4
   143      VPCMPEQQ Y3, Y4, Y4
   144      VMOVMSKPD Y4, R9
   145      VMOVDQU Y2, Y1
   146  
   147      NOTQ R9
   148      ANDQ $0b1111, R9
   149  
   150      MOVQ SI, CX
   151      ANDQ $0b111111, CX
   152  
   153      MOVQ SI, R10
   154      SHRQ $6, R10
   155  
   156      SHLQ CX, R9
   157      ORQ R9, (AX)(R10*8)
   158  
   159      LEAQ (BX)(DX*4), BX
   160      ADDQ $4, SI
   161      CMPQ SI, R8
   162      JNE loop4x8
   163      VZEROUPPER
   164  
   165      CMPQ SI, DI
   166      JE done
   167  
   168      MOVQ $1, R8
   169      MOVQ SI, CX
   170      ANDQ $0b111111, R8
   171      SHLQ CX, R8
   172      MOVQ R8, CX
   173  
   174  loop1x8:
   175      MOVQ (BX), R8
   176      CMPQ R8, $0
   177      JE next1x8
   178  
   179      MOVQ SI, R9
   180      SHRQ $6, R9
   181      ORQ CX, (AX)(R9*8)
   182  next1x8:
   183      ADDQ DX, BX
   184      ROLQ $1, CX
   185      INCQ SI
   186      CMPQ SI, DI
   187      JNE loop1x8
   188  done:
   189      RET
   190  
   191  GLOBL scale4x8<>(SB), RODATA|NOPTR, $32
   192  DATA scale4x8<>+0(SB)/8,  $0
   193  DATA scale4x8<>+8(SB)/8,  $1
   194  DATA scale4x8<>+16(SB)/8, $2
   195  DATA scale4x8<>+24(SB)/8, $3
   196  
   197  // func nullIndex128(bits *uint64, rows sparse.Array)
   198  TEXT ·nullIndex128(SB), NOSPLIT, $0-32
   199      MOVQ bits+0(FP), AX
   200      MOVQ rows_array_ptr+8(FP), BX
   201      MOVQ rows_array_len+16(FP), DI
   202      MOVQ rows_array_off+24(FP), DX
   203  
   204      CMPQ DI, $0
   205      JE done
   206  
   207      MOVQ $1, CX
   208      XORQ SI, SI
   209      PXOR X0, X0
   210  loop1x16:
   211      MOVOU (BX), X1
   212      PCMPEQQ X0, X1
   213      MOVMSKPD X1, R8
   214      CMPB R8, $0b11
   215      JE next1x16
   216  
   217      MOVQ SI, R9
   218      SHRQ $6, R9
   219      ORQ CX, (AX)(R9*8)
   220  next1x16:
   221      ADDQ DX, BX
   222      ROLQ $1, CX
   223      INCQ SI
   224      CMPQ SI, DI
   225      JNE loop1x16
   226  done:
   227      RET