github.com/errata-ai/vale/v3@v3.4.2/internal/nlp/tokenize.go

package nlp

import (
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// TokenTester reports whether a token satisfies some condition (here,
// whether it should be left unsplit).
type TokenTester func(string) bool

// Tokenizer is implemented by anything that can split text into tokens.
type Tokenizer interface {
	Tokenize(string) []string
}

// IterTokenizer splits a sentence into words.
type IterTokenizer struct {
	specialRE      *regexp.Regexp
	sanitizer      *strings.Replacer
	contractions   []string
	splitCases     []string
	suffixes       []string
	prefixes       []string
	emoticons      map[string]int
	isUnsplittable TokenTester
}

// TokenizerOptFunc configures an IterTokenizer.
type TokenizerOptFunc func(*IterTokenizer)

// UsingIsUnsplittable sets the function used to decide whether a token
// should be kept whole rather than split further.
func UsingIsUnsplittable(x TokenTester) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.isUnsplittable = x
	}
}

// UsingSpecialRE sets the provided special regex for unsplittable tokens.
func UsingSpecialRE(x *regexp.Regexp) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.specialRE = x
	}
}

// UsingSanitizer sets the provided sanitizer.
func UsingSanitizer(x *strings.Replacer) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.sanitizer = x
	}
}

// UsingSuffixes sets the provided suffixes.
func UsingSuffixes(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.suffixes = x
	}
}

// UsingPrefixes sets the provided prefixes.
func UsingPrefixes(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.prefixes = x
	}
}

// UsingEmoticons sets the provided map of emoticons.
func UsingEmoticons(x map[string]int) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.emoticons = x
	}
}

// UsingContractions sets the provided contractions.
func UsingContractions(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.contractions = x
	}
}

// UsingSplitCases sets the provided splitCases.
func UsingSplitCases(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.splitCases = x
	}
}

// NewIterTokenizer creates a new IterTokenizer.
func NewIterTokenizer(opts ...TokenizerOptFunc) *IterTokenizer {
	tok := new(IterTokenizer)

	// Set default parameters.
	tok.emoticons = emoticons
	tok.isUnsplittable = func(_ string) bool { return false }
	tok.prefixes = prefixes
	tok.sanitizer = sanitizer
	tok.specialRE = internalRE
	tok.suffixes = suffixes

	// Apply options, if provided.
	for _, applyOpt := range opts {
		applyOpt(tok)
	}

	return tok
}
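
As an illustrative aside (not part of this file), the setters above compose in the usual functional-options style. The sketch below assumes the file's existing imports (regexp, strings); its regex and predicate are hypothetical.

// newTicketAwareTokenizer is a hypothetical in-package helper: it keeps
// issue IDs such as "ABC-123" and "@mentions" intact by routing them
// through isSpecial instead of doSplit's prefix/suffix stripping.
func newTicketAwareTokenizer() *IterTokenizer {
	ticketRE := regexp.MustCompile(`^[A-Z]+-[0-9]+$`) // hypothetical pattern; replaces the internalRE default
	return NewIterTokenizer(
		UsingSpecialRE(ticketRE),
		UsingIsUnsplittable(func(tok string) bool {
			return strings.HasPrefix(tok, "@") // hypothetical rule
		}),
	)
}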

// addToken appends s to toks, skipping strings made up entirely of
// non-letter characters (e.g., bare punctuation left over from splitting).
func addToken(s string, toks []string) []string {
	if !allNonLetter(s) {
		toks = append(toks, s)
	}
	return toks
}

// isSpecial reports whether token should be kept as-is: it is a known
// emoticon, matches the special-case regex, or is flagged as unsplittable.
func (t *IterTokenizer) isSpecial(token string) bool {
	_, found := t.emoticons[token]
	return found || t.specialRE.MatchString(token) || t.isUnsplittable(token)
}

// doSplit breaks a whitespace-delimited span into word-level tokens by
// repeatedly stripping prefixes and suffixes and applying the configured
// split cases until the token stops changing.
func (t *IterTokenizer) doSplit(token string) []string {
	var tokens []string

	last := 0
	for token != "" && StrLen(token) != last {
		if t.isSpecial(token) {
			// We've found a special case (e.g., an emoticon) -- so, we add it as a token without
			// any further processing.
			tokens = addToken(token, tokens)
			break
		}
		last = StrLen(token)
		lower := strings.ToLower(token)
		if hasAnyPrefix(token, t.prefixes) {
			// Remove prefixes -- e.g., $100 -> [$, 100].
			token = token[1:]
		} else if idx := hasAnyIndex(lower, t.splitCases); idx > -1 {
			// Handle "they'll", "I'll", "Don't", "won't", amount($).
			//
			// they'll -> [they, 'll].
			// don't -> [do, n't].
			// amount($) -> [amount, (, $, )].
			tokens = addToken(token[:idx], tokens)
			token = token[idx:]
		} else if hasAnySuffix(token, t.suffixes) {
			// Remove suffixes -- e.g., Well) -> [Well, )].
			token = token[:len(token)-1]
		} else {
			tokens = addToken(token, tokens)
		}
	}

	return tokens
}

// Tokenize splits a sentence into a slice of words.
func (t *IterTokenizer) Tokenize(text string) []string {
	var tokens []string

	clean, white := t.sanitizer.Replace(text), false
	length := len(clean)

	start, index := 0, 0
	cache := map[string][]string{}
	for index <= length {
		uc, size := utf8.DecodeRuneInString(clean[index:])
		if size == 0 {
			break
		} else if index == 0 {
			white = unicode.IsSpace(uc)
		}
		if unicode.IsSpace(uc) != white {
			// We've crossed a boundary between whitespace and non-whitespace:
			// split the span we just finished, caching the result so repeated
			// spans are only split once.
			if start < index {
				span := clean[start:index]
				if toks, found := cache[span]; found {
					tokens = append(tokens, toks...)
				} else {
					toks = t.doSplit(span)
					cache[span] = toks
					tokens = append(tokens, toks...)
				}
			}
			if uc == ' ' {
				start = index + 1
			} else {
				start = index
			}
			white = !white
		}
		index += size
	}

	// Flush the final span, if any.
	if start < index {
		tokens = append(tokens, t.doSplit(clean[start:index])...)
	}

	return tokens
}
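
A minimal end-to-end sketch of how this tokenizer might be driven from inside the package; the option values, input sentence, and expected output are illustrative placeholders, not the package defaults.

// tokenizeSketch is illustrative only and not part of the vale source.
func tokenizeSketch() []string {
	tok := NewIterTokenizer(
		UsingPrefixes([]string{"(", "$"}),                // leading characters stripped one at a time
		UsingSuffixes([]string{".", ",", ")", "!", "?"}), // trailing characters stripped one at a time
		UsingSplitCases([]string{"n't", "'ll"}),          // contraction-style splits
	)
	// Roughly: [They 'll film again do n't worry] -- punctuation-only
	// fragments are dropped by addToken.
	return tok.Tokenize("They'll film (again), don't worry.")
}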