github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/text/levenshtein/word/type-specific.go (about)

// Package word does tokenization on the word level.
     2  package word
     3  
     4  import (
     5  	"sort"
     6  
     7  	"github.com/pbberlin/tools/stringspb"
     8  	ls_core "github.com/pbberlin/tools/text/levenshtein"
     9  )
    10  
    11  type Token string // we could use []rune instead of string
    12  
    13  func (tk1 Token) Equal(compareTo interface{}) bool {
    14  	tk2, ok := compareTo.(Token)
    15  	if !ok {
    16  		panic("Not the same type")
    17  	}
    18  	return tk1 == tk2
    19  }
    20  
    21  // WrapAsEqualer breaks string into a slice of strings.
    22  // Each string is then converted to <Token> to <Equaler>.
    23  // []<Equaler> can then be pumped into the generic core.
    24  // We could as well create slices of Equalers in the first place
    25  // but first leads to a var farTooUglyLiteral =
    26  //   []ls_core.Equaler{ls_core.Equaler(Token("trink")), ls_core.Equaler(Token("nicht"))}
    27  func WrapAsEqualer(s string, sorted bool) []ls_core.Equaler {
    28  
    29  	ss := stringspb.SplitByWhitespace(s)
    30  	if sorted {
    31  		sort.Strings(ss)
    32  
    33  		// weed out doublettes
    34  		su, prev := make([]string, 0, len(ss)), ""
    35  		for _, v := range ss {
    36  			if v == prev {
    37  				continue
    38  			}
    39  			su = append(su, v)
    40  			prev = v
    41  		}
    42  		ss = su
    43  
    44  	}
    45  
    46  	ret := make([]ls_core.Equaler, 0, len(ss))
    47  	for _, v := range ss {
    48  		cnv := ls_core.Equaler(Token(v))
    49  		ret = append(ret, cnv)
    50  	}
    51  	return ret
    52  }