github.com/coyove/common@v0.0.0-20240403014525-f70e643f9de8/shoco/shoco.go (about) 1 package shoco 2 3 /* 4 * this is a go implementation of shoco alogrithm 5 * shoco is a C library to compress and decompress short strings. 6 * shoco is free software, distributed under the MIT license 7 * original project address: https://github.com/Ed-von-Schleck/shoco 8 */ 9 10 import ( 11 "encoding/binary" 12 "fmt" 13 ) 14 15 const SIZE_MAX = -1 16 17 var _ = fmt.Println 18 19 func decode_header(val byte) int { 20 i := -1 21 v := uint8(val) 22 23 for int8(v) < 0 { 24 v <<= 1 25 i++ 26 } 27 28 return i 29 } 30 31 func check_indices(indices []int16, pack_n int) bool { 32 for i := 0; i < int(packs[pack_n].bytes_unpacked); i++ { 33 if indices[i] > packs[pack_n].masks[i] { 34 return false 35 } 36 } 37 38 return true 39 } 40 41 func find_best_encoding(indices []int16, n_consecutive int) int { 42 for p := PACK_COUNT - 1; p >= 0; p-- { 43 if uint32(n_consecutive) >= packs[p].bytes_unpacked && check_indices(indices, p) { 44 return p 45 } 46 } 47 48 return -1 49 } 50 51 func shoco_compress(in []byte, out []byte) int { 52 in = append(in, 0) // append NULL like C 53 54 indices := make([]int16, MAX_SUCCESSOR_N+1) 55 last_chr_index := int32(0) 56 current_index := int32(0) 57 current_out_index := 0 58 successor_index := int32(0) 59 n_consecutive := 0 60 pack_n := 0 61 62 last_resort := func() (bool, int) { 63 if (in[0] & 0x80) != 0 { 64 // non-ascii case 65 if current_out_index+2 > len(out) { 66 return true, len(out) // + 1 67 } 68 // put in a sentinel byte 69 // *o++ = 0x00; 70 out[current_out_index] = 0x00 71 current_out_index++ 72 } else { 73 // an ascii byte 74 if current_out_index+1 > len(out) { 75 return true, len(out) // + 1 76 } 77 } 78 // *o++ = *in++; 79 out[current_out_index] = in[0] 80 current_out_index++ 81 in = in[1:] 82 83 return false, 0 84 } 85 86 for len(in) > 0 { 87 indices[0] = int16(chr_ids_by_chr[in[0]]) 88 last_chr_index = int32(indices[0]) 89 if last_chr_index < 0 { 90 if r, v := last_resort(); r { 91 return v 92 } else { 93 continue 94 } 95 } 96 97 rest := len(in) 98 for n_consecutive = 1; n_consecutive <= MAX_SUCCESSOR_N; n_consecutive++ { 99 if n_consecutive == rest { 100 break 101 } 102 103 if current_index = int32(chr_ids_by_chr[in[n_consecutive]]); current_index < 0 { 104 break 105 } 106 107 successor_index = int32(successor_ids_by_chr_id_and_chr_id[last_chr_index][current_index]) 108 if successor_index < 0 { 109 break 110 } 111 112 indices[n_consecutive] = int16(successor_index) 113 last_chr_index = current_index 114 } 115 116 if n_consecutive < 2 { 117 if r, v := last_resort(); r { 118 return v 119 } else { 120 continue 121 } 122 } 123 124 pack_n = find_best_encoding(indices, n_consecutive) 125 if pack_n >= 0 { 126 if current_out_index+int(packs[pack_n].bytes_packed) > len(out) { 127 return len(out) //+ 1 128 } 129 130 word := packs[pack_n].word 131 for i := uint32(0); i < packs[pack_n].bytes_unpacked; i++ { 132 word |= uint32(indices[i]) << packs[pack_n].offsets[i] 133 } 134 135 tmp := make([]byte, 4) 136 binary.BigEndian.PutUint32(tmp, word) 137 138 for i := uint32(0); i < packs[pack_n].bytes_packed; i++ { 139 out[i+uint32(current_out_index)] = tmp[i] 140 } 141 142 current_out_index += int(packs[pack_n].bytes_packed) 143 in = in[packs[pack_n].bytes_unpacked:] 144 } else { 145 if r, v := last_resort(); r { 146 return v 147 } 148 } 149 } 150 151 return current_out_index - 1 152 } 153 154 func shoco_decompress(in []byte, out []byte) int { 155 mark := 0 156 current_out_index := 0 157 offset := uint32(0) 158 mask := uint32(0) 159 var last_chr byte 160 161 for len(in) > 0 { 162 mark = decode_header(in[0]) 163 if mark >= len(packs) { 164 return 0 165 } 166 167 if mark < 0 { 168 if current_out_index > len(out) { 169 return len(out) 170 } 171 172 if in[0] == 0x00 { 173 in = in[1:] 174 if len(in) == 0 { 175 return SIZE_MAX 176 } 177 } 178 179 out[current_out_index] = in[0] 180 current_out_index++ 181 in = in[1:] 182 } else { 183 if current_out_index+int(packs[mark].bytes_unpacked) > len(out) { 184 return len(out) 185 } else if int(packs[mark].bytes_packed) > len(in) { 186 return SIZE_MAX 187 } 188 189 tmp := make([]byte, 4) 190 for i := uint32(0); i < packs[mark].bytes_packed; i++ { 191 tmp[i] = in[i] 192 } 193 194 word := binary.BigEndian.Uint32(tmp) 195 196 offset = packs[mark].offsets[0] 197 mask = uint32(packs[mark].masks[0]) 198 199 out[current_out_index] = chrs_by_chr_id[(word>>offset)&mask] 200 last_chr = out[current_out_index] 201 202 for i := uint32(1); i < packs[mark].bytes_unpacked; i++ { 203 offset = packs[mark].offsets[i] 204 mask = uint32(packs[mark].masks[i]) 205 206 chridx := uint8(last_chr) - uint8(MIN_CHR) 207 idx := (word >> offset) & mask 208 209 if last_chr >= MAX_CHR || last_chr < MIN_CHR || idx >= uint32(len(chrs_by_chr_and_successor_id[chridx])) { 210 return 0 211 } 212 213 last_chr = byte(chrs_by_chr_and_successor_id[chridx][idx]) 214 out[current_out_index+int(i)] = last_chr 215 } 216 217 current_out_index += int(packs[mark].bytes_unpacked) 218 in = in[packs[mark].bytes_packed:] 219 } 220 } 221 222 return current_out_index 223 } 224 225 func Compress(text string) []byte { 226 buf := make([]byte, len(text)*2) 227 x := shoco_compress([]byte(text), buf) 228 return buf[:x] 229 } 230 231 func Decompress(buf []byte) string { 232 out := make([]byte, len(buf)*2) 233 x := shoco_decompress(buf, out) 234 235 if x > 0 { 236 return string(out[:x]) 237 } else { 238 return "" 239 } 240 }