github.com/coyove/common@v0.0.0-20240403014525-f70e643f9de8/shoco/shoco.go (about)

     1  package shoco
     2  
     3  /*
 * this is a Go implementation of the shoco algorithm
     5   * shoco is a C library to compress and decompress short strings.
     6   * shoco is free software, distributed under the MIT license
     7   * original project address: https://github.com/Ed-von-Schleck/shoco
     8   */
     9  
    10  import (
    11  	"encoding/binary"
    12  	"fmt"
    13  )
    14  
// SIZE_MAX is the value shoco_decompress returns when the input ends in the
// middle of an encoded unit (a sentinel byte with nothing after it, or a pack
// header announcing more bytes than remain).
const SIZE_MAX = -1

// Keeps the fmt import referenced even when nothing else in the file uses it.
var _ = fmt.Println
    18  
    19  func decode_header(val byte) int {
    20  	i := -1
    21  	v := uint8(val)
    22  
    23  	for int8(v) < 0 {
    24  		v <<= 1
    25  		i++
    26  	}
    27  
    28  	return i
    29  }
    30  
    31  func check_indices(indices []int16, pack_n int) bool {
    32  	for i := 0; i < int(packs[pack_n].bytes_unpacked); i++ {
    33  		if indices[i] > packs[pack_n].masks[i] {
    34  			return false
    35  		}
    36  	}
    37  
    38  	return true
    39  }
    40  
    41  func find_best_encoding(indices []int16, n_consecutive int) int {
    42  	for p := PACK_COUNT - 1; p >= 0; p-- {
    43  		if uint32(n_consecutive) >= packs[p].bytes_unpacked && check_indices(indices, p) {
    44  			return p
    45  		}
    46  	}
    47  
    48  	return -1
    49  }
    50  
    51  func shoco_compress(in []byte, out []byte) int {
    52  	in = append(in, 0) // append NULL like C
    53  
    54  	indices := make([]int16, MAX_SUCCESSOR_N+1)
    55  	last_chr_index := int32(0)
    56  	current_index := int32(0)
    57  	current_out_index := 0
    58  	successor_index := int32(0)
    59  	n_consecutive := 0
    60  	pack_n := 0
    61  
    62  	last_resort := func() (bool, int) {
    63  		if (in[0] & 0x80) != 0 {
    64  			// non-ascii case
    65  			if current_out_index+2 > len(out) {
    66  				return true, len(out) // + 1
    67  			}
    68  			// put in a sentinel byte
    69  			// *o++ = 0x00;
    70  			out[current_out_index] = 0x00
    71  			current_out_index++
    72  		} else {
    73  			// an ascii byte
    74  			if current_out_index+1 > len(out) {
    75  				return true, len(out) // + 1
    76  			}
    77  		}
    78  		// *o++ = *in++;
    79  		out[current_out_index] = in[0]
    80  		current_out_index++
    81  		in = in[1:]
    82  
    83  		return false, 0
    84  	}
    85  
    86  	for len(in) > 0 {
    87  		indices[0] = int16(chr_ids_by_chr[in[0]])
    88  		last_chr_index = int32(indices[0])
    89  		if last_chr_index < 0 {
    90  			if r, v := last_resort(); r {
    91  				return v
    92  			} else {
    93  				continue
    94  			}
    95  		}
    96  
    97  		rest := len(in)
    98  		for n_consecutive = 1; n_consecutive <= MAX_SUCCESSOR_N; n_consecutive++ {
    99  			if n_consecutive == rest {
   100  				break
   101  			}
   102  
   103  			if current_index = int32(chr_ids_by_chr[in[n_consecutive]]); current_index < 0 {
   104  				break
   105  			}
   106  
   107  			successor_index = int32(successor_ids_by_chr_id_and_chr_id[last_chr_index][current_index])
   108  			if successor_index < 0 {
   109  				break
   110  			}
   111  
   112  			indices[n_consecutive] = int16(successor_index)
   113  			last_chr_index = current_index
   114  		}
   115  
   116  		if n_consecutive < 2 {
   117  			if r, v := last_resort(); r {
   118  				return v
   119  			} else {
   120  				continue
   121  			}
   122  		}
   123  
   124  		pack_n = find_best_encoding(indices, n_consecutive)
   125  		if pack_n >= 0 {
   126  			if current_out_index+int(packs[pack_n].bytes_packed) > len(out) {
   127  				return len(out) //+ 1
   128  			}
   129  
   130  			word := packs[pack_n].word
   131  			for i := uint32(0); i < packs[pack_n].bytes_unpacked; i++ {
   132  				word |= uint32(indices[i]) << packs[pack_n].offsets[i]
   133  			}
   134  
   135  			tmp := make([]byte, 4)
   136  			binary.BigEndian.PutUint32(tmp, word)
   137  
   138  			for i := uint32(0); i < packs[pack_n].bytes_packed; i++ {
   139  				out[i+uint32(current_out_index)] = tmp[i]
   140  			}
   141  
   142  			current_out_index += int(packs[pack_n].bytes_packed)
   143  			in = in[packs[pack_n].bytes_unpacked:]
   144  		} else {
   145  			if r, v := last_resort(); r {
   146  				return v
   147  			}
   148  		}
   149  	}
   150  
   151  	return current_out_index - 1
   152  }
   153  
   154  func shoco_decompress(in []byte, out []byte) int {
   155  	mark := 0
   156  	current_out_index := 0
   157  	offset := uint32(0)
   158  	mask := uint32(0)
   159  	var last_chr byte
   160  
   161  	for len(in) > 0 {
   162  		mark = decode_header(in[0])
   163  		if mark >= len(packs) {
   164  			return 0
   165  		}
   166  
   167  		if mark < 0 {
   168  			if current_out_index > len(out) {
   169  				return len(out)
   170  			}
   171  
   172  			if in[0] == 0x00 {
   173  				in = in[1:]
   174  				if len(in) == 0 {
   175  					return SIZE_MAX
   176  				}
   177  			}
   178  
   179  			out[current_out_index] = in[0]
   180  			current_out_index++
   181  			in = in[1:]
   182  		} else {
   183  			if current_out_index+int(packs[mark].bytes_unpacked) > len(out) {
   184  				return len(out)
   185  			} else if int(packs[mark].bytes_packed) > len(in) {
   186  				return SIZE_MAX
   187  			}
   188  
   189  			tmp := make([]byte, 4)
   190  			for i := uint32(0); i < packs[mark].bytes_packed; i++ {
   191  				tmp[i] = in[i]
   192  			}
   193  
   194  			word := binary.BigEndian.Uint32(tmp)
   195  
   196  			offset = packs[mark].offsets[0]
   197  			mask = uint32(packs[mark].masks[0])
   198  
   199  			out[current_out_index] = chrs_by_chr_id[(word>>offset)&mask]
   200  			last_chr = out[current_out_index]
   201  
   202  			for i := uint32(1); i < packs[mark].bytes_unpacked; i++ {
   203  				offset = packs[mark].offsets[i]
   204  				mask = uint32(packs[mark].masks[i])
   205  
   206  				chridx := uint8(last_chr) - uint8(MIN_CHR)
   207  				idx := (word >> offset) & mask
   208  
   209  				if last_chr >= MAX_CHR || last_chr < MIN_CHR || idx >= uint32(len(chrs_by_chr_and_successor_id[chridx])) {
   210  					return 0
   211  				}
   212  
   213  				last_chr = byte(chrs_by_chr_and_successor_id[chridx][idx])
   214  				out[current_out_index+int(i)] = last_chr
   215  			}
   216  
   217  			current_out_index += int(packs[mark].bytes_unpacked)
   218  			in = in[packs[mark].bytes_packed:]
   219  		}
   220  	}
   221  
   222  	return current_out_index
   223  }
   224  
   225  func Compress(text string) []byte {
   226  	buf := make([]byte, len(text)*2)
   227  	x := shoco_compress([]byte(text), buf)
   228  	return buf[:x]
   229  }
   230  
   231  func Decompress(buf []byte) string {
   232  	out := make([]byte, len(buf)*2)
   233  	x := shoco_decompress(buf, out)
   234  
   235  	if x > 0 {
   236  		return string(out[:x])
   237  	} else {
   238  		return ""
   239  	}
   240  }