github.com/pbberlin/tools@v0.0.0-20160910141205-7aa5421c2169/text/levenshtein/wordb/type-specific.go

// Package wordb does tokenization on the word level,
// using byte slices instead of strings, saving conversion cost.
package wordb

import (
	"bytes"
	"sort"

	ls_core "github.com/pbberlin/tools/text/levenshtein"
)

// Token is a single word, stored as a byte slice instead of a string.
type Token []byte

// Equal implements ls_core.Equaler.
// It panics if compareTo is not a Token.
func (tk1 Token) Equal(compareTo interface{}) bool {
	tk2, ok := compareTo.(Token)
	if !ok {
		panic("not the same type")
	}
	return bytes.Equal(tk1, tk2) // bytes.EqualFold would make the comparison case-insensitive
}

// WrapAsEqualer splits sb into word tokens and wraps each one as an
// ls_core.Equaler; see word.WrapAsEqualer for the string-based variant.
// If sorted is true, the tokens are sorted and duplicates are removed.
func WrapAsEqualer(sb []byte, sorted bool) []ls_core.Equaler {

	sbf := bytes.Fields(sb)
	if sorted {
		sort.Sort(sortBoB(sbf))

		// weed out duplicates; after sorting they are adjacent
		su, prev := make([][]byte, 0, len(sbf)), []byte{}
		for _, v := range sbf {
			if bytes.Equal(v, prev) {
				continue
			}
			su = append(su, v)
			prev = v
		}
		sbf = su
	}

	ret := make([]ls_core.Equaler, 0, len(sbf))
	for _, v := range sbf {
		cnv := ls_core.Equaler(Token(v))
		ret = append(ret, cnv)
	}
	return ret
}

// sortBoB implements sort.Interface for a slice of byte slices,
// since the sort package has no built-in support for [][]byte.
type sortBoB [][]byte // slice of byte slices

func (sb sortBoB) Len() int {
	return len(sb)
}

// Less compares bytewise; ">" yields a descending order, which is
// sufficient for making duplicates adjacent before deduplication.
func (sb sortBoB) Less(i, j int) bool {
	return bytes.Compare(sb[i], sb[j]) > 0
}

func (sb sortBoB) Swap(i, j int) {
	sb[i], sb[j] = sb[j], sb[i]
}
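
// The sketch below is an illustrative addition, not part of the original
// file: it shows how WrapAsEqualer and Token.Equal fit together. The
// function name exampleTokenOverlap and the sample sentences are made up;
// only identifiers defined above are relied upon.

// exampleTokenOverlap counts how many tokens two sentences share.
// With sorted=true each side is deduplicated (in descending byte order),
// so every shared word is counted exactly once.
func exampleTokenOverlap() int {
	src := WrapAsEqualer([]byte("the quick brown fox fox"), true) // duplicate "fox" is removed
	dst := WrapAsEqualer([]byte("the lazy brown dog"), true)

	matches := 0
	for _, s := range src {
		for _, d := range dst {
			// d's dynamic type is Token, so the type assertion
			// inside Equal succeeds and it does not panic here.
			if s.Equal(d) {
				matches++
			}
		}
	}
	return matches // 2: "the" and "brown"
}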