github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/tokenize.go (about)

     1  package nlp
     2  
     3  import (
     4  	"github.com/agonopol/go-stem"
     5  	"regexp"
     6  	"strings"
     7  )
     8  
     9  // Tokenizer splits text into tokens. This regexp represents a single word.
    10  // Changing this regexp will affect the Tokenize function.
    11  // A raw string literal avoids double-escaping the regexp metacharacters.
    12  var Tokenizer = regexp.MustCompile(`\w([\w']*\w)?`)
    12  
    13  // Tokenize splits a given text to a slice of stemmed, lowercase words. If
    14  // keepStopWords is false, will drop stop words.
    15  func Tokenize(s string, keepStopWords bool) []string {
    16  	// Normalize punctuation first, then case, so the tokenizer sees
    17  	// plain ASCII apostrophes.
    18  	lowered := strings.ToLower(correctUtf8Punctuation(s))
    19  	var tokens []string
    20  	for _, w := range Tokenizer.FindAllString(lowered, -1) {
    21  		if keepStopWords || !StopWords[w] {
    22  			tokens = append(tokens, Stem(w))
    23  		}
    24  	}
    25  	return tokens
    26  }
    29  
    30  // Stem porter-stems the given word.
    31  func Stem(s string) string {
    32  	// Drop a possessive suffix before stemming; TrimSuffix is a no-op
    33  	// when the suffix is absent.
    34  	trimmed := strings.TrimSuffix(s, "'s")
    35  	return string(stemmer.Stem([]byte(trimmed)))
    36  }
    37  
    38  // correctUtf8Punctuation translates or removes non-ASCII punctuation characters.
    39  func correctUtf8Punctuation(s string) string {
    40  	// Map the UTF-8 right single quotation mark to an ASCII apostrophe.
    41  	// TODO(amit): Improve this function with more characters.
    42  	return strings.ReplaceAll(s, "’", "'")
    43  }