// Build constraints: the two +build lines below are ANDed, so this file is
// assembled only when neither the "noasm" nor the "appengine" tag is set.
// (Space-separated terms on a single +build line are ORed, which would keep
// the file in the build whenever only one of the two tags is given.)
// NOTE(review): the Go file that declares these functions is not visible
// here; it should carry the same constraints - confirm.

// +build !noasm
// +build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

// func crc32sse(a []byte) hash
//
// Runs the SSE4.2 CRC32 instruction once over the 4 bytes at the start of a,
// with a zero initial value, and returns the 32-bit result.
// The length of a is not read, so the caller must supply at least 4 bytes.
TEXT ·crc32sse(SB), 7, $0
	MOVQ a+0(FP), R10 // R10: pointer to a's data
	XORQ BX, BX       // zero initial CRC value

	// CRC32   dword (R10), EBX
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP)
	RET

// func crc32sseAll(a []byte, dst []hash)
//
// For every offset i in [0, len(a)-4] stores into the i'th 4-byte slot of dst
// the CRC32 (zero initial value) of the 4 bytes starting at a[i].
// Returns without writing anything when len(a) < 4.
// The length of dst is not read, so the caller must provide room for
// len(a)-3 entries.
TEXT ·crc32sseAll(SB), 7, $0
	MOVQ  a+0(FP), R8      // R8: src
	MOVQ  a_len+8(FP), R10 // input length
	MOVQ  dst+24(FP), R9   // R9: dst
	SUBQ  $4, R10          // R10: index of the last 4-byte window
	JS    end              // fewer than 4 input bytes: nothing to do
	JZ    one_crc          // exactly 4 input bytes: a single window
	MOVQ  R10, R13
	SHRQ  $2, R10          // len/4: iterations of the 4-at-a-time loop
	ANDQ  $3, R13          // len&3
	XORQ  BX, BX
	ADDQ  $1, R13          // R13: windows left for the one-at-a-time loop
	TESTQ R10, R10
	JZ    rem_loop

	// Each iteration loads 8 bytes and hashes the four 32-bit windows that
	// start at byte offsets 0, 1, 2 and 3 of that load.
crc_loop:
	MOVQ (R8), R11
	XORQ BX, BX
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11
	MOVQ R12, AX  // AX: window at offset 0
	MOVQ R11, CX  // CX: window at offset 1
	SHRQ $16, R12
	SHRQ $16, R11 // R11: window at offset 3 (shifted 24 bits in total)
	MOVQ R12, SI  // SI: window at offset 2

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32   ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32   ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe
	MOVL BX, (R9)
	MOVL DX, 4(R9)
	MOVL DI, 8(R9)

	XORQ BX, BX
	MOVL R11, AX

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
	MOVL BX, 12(R9)

	ADDQ $16, R9 // four 4-byte results written
	ADDQ $4, R8  // four input offsets consumed
	XORQ BX, BX  // leave a zero initial value for rem_loop
	SUBQ $1, R10
	JNZ  crc_loop

	// Hashes the remaining R13 windows one input offset at a time.
rem_loop:
	MOVL (R8), AX

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX
	SUBQ $1, R13
	JNZ  rem_loop

end:
	RET

	// Input is exactly 4 bytes long: run rem_loop for a single window.
one_crc:
	MOVQ $1, R13
	XORQ BX, BX
	JMP  rem_loop

// func matchLenSSE4(a, b []byte, max int) int
//
// Returns the number of leading bytes that are equal in a and b, examining at
// most max bytes. Full 16-byte blocks are compared with PCMPESTRI, the
// remaining max&15 bytes with REP CMPSB.
// The lengths of a and b are not read, so the caller must ensure that max
// does not exceed either of them.
TEXT ·matchLenSSE4(SB), 7, $0
	MOVQ  a+0(FP), SI     // RSI: &a
	MOVQ  b+24(FP), DI    // RDI: &b
	MOVQ  max+48(FP), R10 // R10: max
	XORQ  R11, R11        // R11: match length
	MOVQ  R10, R12        // R12: Remainder
	SHRQ  $4, R10         // max / 16
	MOVQ  $16, AX         // Set length for PCMPESTRI
	MOVQ  $16, DX         // Set length for PCMPESTRI
	ANDQ  $15, R12        // max & 15
	TESTQ R10, R10
	JZ    matchlen_verysmall

loopback_matchlen:
	MOVOU (SI), X0 // a[x]
	MOVOU (DI), X1 // b[x]

	// PCMPESTRI $0x18, X1, X0
	// 0x18 = _SIDD_UBYTE_OPS (0x0) | _SIDD_CMP_EQUAL_EACH (0x8) | _SIDD_NEGATIVE_POLARITY (0x10)
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
	BYTE $0x61; BYTE $0xc1; BYTE $0x18

	// Carry is set when the block contains a mismatch; CX then holds the
	// index of the first differing byte.
	JC match_ended

	ADDQ $16, SI
	ADDQ $16, DI
	ADDQ $16, R11

	SUBQ $1, R10
	JNZ  loopback_matchlen

	// Check the remainder using REP CMPSB
matchlen_verysmall:
	TESTQ R12, R12
	JZ    done_matchlen
	MOVQ  R12, CX
	ADDQ  R12, R11 // assume the whole remainder matches; corrected below if not

	// Compare CX bytes at [SI] [DI]
	// Subtract one from CX for every match.
	// Terminates when CX is zero (checked pre-compare)
	CLD
	REP; CMPSB

	// Check if last was a match.
	JZ done_matchlen

	// Subtract remaining bytes, plus one for the mismatching byte itself
	// (CX was already decremented for it).
	SUBQ CX, R11
	SUBQ $1, R11
	MOVQ R11, ret+56(FP)
	RET

match_ended:
	ADDQ CX, R11 // add the matching bytes that precede the mismatch

done_matchlen:
	MOVQ R11, ret+56(FP)
	RET

// func histogram(b []byte, h []int32)
//
// For every byte value v in b, increments the 32-bit counter h[v].
// Input is consumed 8 bytes per iteration, then the len(b)&7 trailing bytes
// one at a time.
// The length of h is not read; indices range over all byte values, so the
// caller must provide at least 256 entries.
TEXT ·histogram(SB), 7, $0
	MOVQ b+0(FP), SI     // SI: &b
	MOVQ b_len+8(FP), R9 // R9: len(b)
	MOVQ h+24(FP), DI    // DI: Histogram
	MOVQ R9, R8
	SHRQ $3, R8          // R8: number of full 8-byte groups
	JZ   hist1
	XORQ R11, R11        // upper bits stay zero; MOVB only replaces the low byte

loop_hist8:
	MOVQ (SI), R10 // load 8 input bytes at once

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	// After seven shifts only the eighth byte is left in R10.
	INCL (DI)(R10*4)

	ADDQ $8, SI
	DECQ R8
	JNZ  loop_hist8

hist1:
	ANDQ $7, R9   // R9: trailing bytes not covered by the 8-byte loop
	JZ   end_hist
	XORQ R10, R10 // upper bits stay zero; MOVB only replaces the low byte

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ  loop_hist1

end_hist:
	RET