//+build !noasm !appengine gc

#include "common.h"

// func _find_structural_bits_avx512(p1, p3 unsafe.Pointer, prev_iter_inside_quote,
//                                   error_mask, prev_iter_ends_pseudo_pred unsafe.Pointer) (structurals uint64)
//
// Single-shot scan of one 64-byte block: classifies the block pointed to by p1
// and returns the structural-character bitmask in structurals+48(FP).
// Argument offsets (frame $0-56): p1+0, p3+8, prev_iter_inside_quote+16,
// error_mask+24, prev_iter_ends_pseudo_pred+40, structurals+48.
// NOTE(review): register/mask-register roles (Z8 = input block, K_ERRORMASK, and
// the AX/DX/CX/R8 contracts of the __find_* helpers) are defined in common.h and
// the companion avx512 routines — confirm against those files.
TEXT ·_find_structural_bits_avx512(SB), $0-56

	// Initialize lookup constants/registers used by the helper routines below.
	CALL ·__init_odd_backslash_sequences_avx512(SB)
	CALL ·__init_quote_mask_and_bits_avx512(SB)
	CALL ·__init_whitespace_and_structurals_avx512(SB)
	CALL ·__init_newline_delimiters_avx512(SB)

	MOVQ p1+0(FP), DI
	MOVQ p3+8(FP), DX

	// NOTE(review): OR-ing K_ERRORMASK with itself leaves it unchanged; presumably
	// intended to (re)materialize the mask register for this call — confirm.
	KORQ K_ERRORMASK, K_ERRORMASK, K_ERRORMASK

	VMOVDQU32 (DI), Z8                         // Z8 = the 64 input bytes

	CALL ·__find_odd_backslash_sequences_avx512(SB)

	MOVQ AX, DX                                // odd_ends + 16
	MOVQ prev_iter_inside_quote+16(FP), CX

	CALL ·__find_quote_mask_and_bits_avx512(SB)
	PUSHQ AX                                   // MOVQ AX, quote_mask + 64 (preserved across next CALL)

	CALL ·__find_whitespace_and_structurals_avx512(SB)

	POPQ DX                                    // DX = quote_mask
	MOVQ prev_iter_ends_pseudo_pred+40(FP), R8 // R8 = &prev_iter_ends_pseudo_pred

	CALL ·__finalize_structurals_avx512(SB)

	VZEROUPPER                                 // leave AVX state clean before returning to Go code
	MOVQ error_mask+24(FP), R9
	KMOVQ K_ERRORMASK, (R9)                    // write accumulated error mask back to caller
	MOVQ AX, structurals+48(FP)                // return the structural bitmask
	RET

// MASK_WHITESPACE(MAX, Y) replaces the bytes of Y beyond the message end with
// whitespace. CX holds the number of valid tail bytes; MAX is the register's
// top lane index (0x1f for the low 32 bytes, 0x3f for the high 32 bytes), so
// MASKTABLE + (MAX - CX) yields 0xff for valid bytes and 0x00 for padding.
// Expects Y13 = broadcast whitespace (spaces); clobbers DX, BX, Y10, Y11, Y12.
#define MASK_WHITESPACE(MAX, Y) \
	LEAQ MASKTABLE<>(SB), DX \
	MOVQ $MAX, BX            \
	SUBQ CX, BX              \
	VMOVDQU (DX)(BX*1), Y10  \ // Load mask
	VPCMPEQB Y11, Y11, Y11   \ // Set all bits
	VPXOR Y11, Y10, Y12      \ // Invert mask
	VPAND Y13, Y12, Y12      \ // Mask whitespace
	VPAND Y10, Y, Y          \ // Mask message
	VPOR Y12, Y, Y           // Combine together

// func _find_structural_bits_in_slice_avx512(buf unsafe.Pointer, len uint64, p3,
//          prev_iter_inside_quote, error_mask, prev_iter_ends_pseudo_pred,
//          indexes, index unsafe.Pointer, indexes_len uint64, carried,
//          position unsafe.Pointer, ndjson uint64) (processed uint64)
//
// Streaming scan over a whole buffer: iterates in 64-byte blocks, flattening the
// structural bitmask of each block into the indexes buffer incrementally, and
// whitespace-pads the final partial block. Returns in processed+96(FP) the number
// of bytes consumed (may be < len when the indexes buffer fills up).
// Argument offsets (frame $0-104): buf+0, len+8, p3+16, prev_iter_inside_quote+24,
// error_mask+32, prev_iter_ends_pseudo_pred+40, indexes+48, index+56,
// indexes_len+64, carried+72, position+80, ndjson+88, processed+96.
TEXT ·_find_structural_bits_in_slice_avx512(SB), $0-104

	// Initialize lookup constants/registers used by the helper routines below.
	CALL ·__init_odd_backslash_sequences_avx512(SB)
	CALL ·__init_quote_mask_and_bits_avx512(SB)
	CALL ·__init_whitespace_and_structurals_avx512(SB)
	CALL ·__init_newline_delimiters_avx512(SB)

	// Resume the error mask carried in from a previous invocation.
	MOVQ error_mask+32(FP), R9
	KMOVQ (R9), K_ERRORMASK

	XORQ AX, AX                     // AX = byte offset into buf
	MOVQ len+8(FP), CX
	ANDQ $0xffffffffffffffc0, CX    // CX = len rounded down to a 64-byte multiple
	CMPQ AX, CX
	JEQ  check_partial_load         // buffer shorter than one full block

loop:
	MOVQ buf+0(FP), DI
	VMOVDQU32 (DI)(AX*1), Z8        // Z8 = next 64-byte block
	ADDQ $0x40, AX

loop_after_load:
	// Helpers clobber CX/AX, so keep loop bound and offset on the stack.
	PUSHQ CX
	PUSHQ AX

	MOVQ p3+16(FP), DX
	CALL ·__find_odd_backslash_sequences_avx512(SB)

	MOVQ AX, DX                     // odd_ends + 16
	MOVQ prev_iter_inside_quote+24(FP), CX

	CALL ·__find_quote_mask_and_bits_avx512(SB)
	PUSHQ AX                        // MOVQ AX, quote_mask + 64 (preserved across next CALL)

	CALL ·__find_whitespace_and_structurals_avx512(SB)

	POPQ DX                         // DX = quote_mask
	PUSHQ DX                        // Save again for newline determination
	MOVQ prev_iter_ends_pseudo_pred+40(FP), R8 // R8 = &prev_iter_ends_pseudo_pred

	CALL ·__finalize_structurals_avx512(SB)

	POPQ DX                         // DX = quote_mask
	CMPQ ndjson+88(FP), $0
	JZ   skip_ndjson_detection      // plain JSON: no newline delimiters wanted
	CALL ·__find_newline_delimiters_avx512(SB)
	ORQ  BX, AX                     // merge newline delimiter bits into structurals

skip_ndjson_detection:
	// Flatten the structural bitmask (AX) into the indexes buffer, threading the
	// caller's running state (index / carried / position) through memory.
	MOVQ indexes+48(FP), DI
	MOVQ index+56(FP), SI; MOVQ (SI), BX     // BX = index
	MOVQ carried+72(FP), R11; MOVQ (R11), DX // DX = carried
	MOVQ position+80(FP), R12; MOVQ (R12), R10 // R10 = position
	CALL ·__flatten_bits_incremental(SB)
	MOVQ BX, (SI)                   // *index = BX
	MOVQ DX, (R11)                  // *carried = DX
	MOVQ R10, (R12)                 // *position = R10

	POPQ AX                         // restore byte offset
	POPQ CX                         // restore aligned-length loop bound

	// Stop early when the indexes buffer is (nearly) full; caller resumes later.
	CMPQ BX, indexes_len+64(FP)
	JGE  done

	CMPQ AX, CX
	JLT  loop                       // more full 64-byte blocks remain

	// Check if AX is not aligned on a 64-byte boundary, this signals the last (partial) iteration
	MOVQ AX, BX
	ANDQ $0x3f, BX
	CMPQ BX, $0
	JNE  done

check_partial_load:
	MOVQ len+8(FP), CX
	ANDQ $0x3f, CX                  // CX = number of trailing bytes past the last full block
	CMPQ CX, $0
	JNE  masking                    // end of message is not aligned on 64-byte boundary, so mask the remaining bytes

done:
	VZEROUPPER                      // leave AVX state clean before returning to Go code
	MOVQ error_mask+32(FP), R9
	KMOVQ K_ERRORMASK, (R9)         // persist error mask for the next invocation
	MOVQ AX, processed+96(FP)       // return number of bytes processed
	RET

masking:
	// Do a partial load and mask out bytes after the end of the message with whitespace
	VPBROADCASTQ WHITESPACE<>(SB), Y13 // Load padding whitespace constant

	MOVQ buf+0(FP), DI
	VMOVDQU (DI)(AX*1), Y8          // Always load low 32-bytes
	CMPQ CX, $0x20
	JGE  masking_high

	// Perform masking on low 32-bytes
	MASK_WHITESPACE(0x1f, Y8)
	VMOVDQU Y13, Y9                 // high 32 bytes are entirely padding
	JMP masking_done

masking_high:
	// Perform masking on high 32-bytes
	VMOVDQU 0x20(DI)(AX*1), Y9      // Load high 32-bytes
	MASK_WHITESPACE(0x3f, Y9)

masking_done:
	ADDQ CX, AX                     // account for the partial tail bytes

	// Merge Y9 into upper half of Z8 (shift Y9 up 8 dwords, then OR).
	VPXORD Z10, Z10, Z10
	VALIGND $8, Z10, Z9, Z9
	VPORD Z9, Z8, Z8

	JMP loop_after_load             // Rejoin loop after regular loading

// 64-byte mask table: bytes 0-30 are 0xff, the rest 0x00. Loaded at offset
// (MAX - CX) by MASK_WHITESPACE to produce CX leading 0xff bytes.
DATA MASKTABLE<>+0x000(SB)/8, $0xffffffffffffffff
DATA MASKTABLE<>+0x008(SB)/8, $0xffffffffffffffff
DATA MASKTABLE<>+0x010(SB)/8, $0xffffffffffffffff
DATA MASKTABLE<>+0x018(SB)/8, $0x00ffffffffffffff
DATA MASKTABLE<>+0x020(SB)/8, $0x0000000000000000
DATA MASKTABLE<>+0x028(SB)/8, $0x0000000000000000
DATA MASKTABLE<>+0x030(SB)/8, $0x0000000000000000
DATA MASKTABLE<>+0x038(SB)/8, $0x0000000000000000
GLOBL MASKTABLE<>(SB), 8, $64     // flag 8 = RODATA

// Eight ASCII spaces (0x20), broadcast to pad past end-of-message.
DATA WHITESPACE<>+0x000(SB)/8, $0x2020202020202020
GLOBL WHITESPACE<>(SB), 8, $8     // flag 8 = RODATA