github.com/pbberlin/tools@v0.0.0-20160910141205-7aa5421c2169/text/levenshtein/wordb/type-specific.go

// Package wordb does tokenization on the word level,
// using byte slices instead of strings, saving conversion cost.
package wordb

import (
	"bytes"
	"sort"

	ls_core "github.com/pbberlin/tools/text/levenshtein"
)

// Token is the word-level token type, backed by []byte
// instead of string to avoid conversion cost.
type Token []byte

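// Equal implements the type-specific comparison for ls_core.Equaler;
// it panics if compareTo is not a Token.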
func (tk1 Token) Equal(compareTo interface{}) bool {
	tk2, ok := compareTo.(Token)
	if !ok {
		panic("Not the same type")
	}
	return bytes.Equal(tk1, tk2) // bytes.EqualFold would make it case insensitive
}

// WrapAsEqualer splits sb into word tokens and wraps them as
// ls_core.Equaler; see word.WrapAsEqualer for the string-based
// counterpart. With sorted==true the tokens are sorted and deduplicated.
func WrapAsEqualer(sb []byte, sorted bool) []ls_core.Equaler {

	sbf := bytes.Fields(sb)
	if sorted {
		sort.Sort(sortBoB(sbf))

		// weed out duplicates; after sorting, equal tokens are adjacent
		su, prev := make([][]byte, 0, len(sbf)), []byte{}
		for _, v := range sbf {
			if bytes.Equal(v, prev) {
				continue
			}
			su = append(su, v)
			prev = v
		}
		sbf = su
	}

	ret := make([]ls_core.Equaler, 0, len(sbf))
	for _, v := range sbf {
		cnv := ls_core.Equaler(Token(v))
		ret = append(ret, cnv)
	}
	return ret
}
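
// A minimal usage sketch (an assumption, not part of the original file):
// wrap both inputs, then hand the resulting []ls_core.Equaler slices to
// the core levenshtein package. The exact core entry point is not shown
// in this file; see package ls_core.
//
//	src := WrapAsEqualer([]byte("the quick brown fox"), false)
//	dst := WrapAsEqualer([]byte("the quick red fox"), false)
//	// compute the word-level edit distance of src vs dst via ls_core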

// sortBoB implements sort.Interface for a slice of byte slices,
// since package sort has no helper for [][]byte.
type sortBoB [][]byte // slice of byte slices

func (sb sortBoB) Len() int {
	return len(sb)
}

func (sb sortBoB) Less(i, j int) bool {
	// note: this sorts in descending order; the deduplication in
	// WrapAsEqualer only needs equal elements to be adjacent, so
	// the direction does not matter there
	return bytes.Compare(sb[i], sb[j]) > 0
}

func (sb sortBoB) Swap(i, j int) {
	sb[i], sb[j] = sb[j], sb[i]
}
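
// exampleWrapAsEqualer is a hypothetical sketch (not in the original
// file) illustrating the sorted flag: "b a b" is split into fields,
// sorted descending by sortBoB, and deduplicated to ["b", "a"].
func exampleWrapAsEqualer() {
	eqs := WrapAsEqualer([]byte("b a b"), true)
	for _, eq := range eqs {
		tk := eq.(Token) // each element is a Token
		_ = tk
	}
	// len(eqs) == 2 here: the duplicate "b" was weeded out
}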