github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/tokenize.go (about) 1 package nlp 2 3 import ( 4 "github.com/agonopol/go-stem" 5 "regexp" 6 "strings" 7 ) 8 9 // Tokenizer splits text into tokens. This regexp represents a single word. 10 // Changing this regexp will affect the Tokenize function. 11 var Tokenizer = regexp.MustCompile("\\w([\\w']*\\w)?") 12 13 // Tokenize splits a given text to a slice of stemmed, lowercase words. If 14 // keepStopWords is false, will drop stop words. 15 func Tokenize(s string, keepStopWords bool) []string { 16 s = correctUtf8Punctuation(s) 17 s = strings.ToLower(s) 18 words := Tokenizer.FindAllString(s, -1) 19 var result []string 20 for _, word := range words { 21 if !keepStopWords && StopWords[word] { 22 continue 23 } 24 result = append(result, Stem(word)) 25 } 26 27 return result 28 } 29 30 // Stem porter-stems the given word. 31 func Stem(s string) string { 32 if strings.HasSuffix(s, "'s") { 33 s = s[:len(s)-2] 34 } 35 return string(stemmer.Stem([]byte(s))) 36 } 37 38 // correctUtf8Punctuation translates or removes non-ASCII punctuation characters. 39 func correctUtf8Punctuation(s string) string { 40 return strings.Replace(s, "’", "'", -1) 41 // TODO(amit): Improve this function with more characters. 42 }