github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/tfidf.go (about)

     1  package nlp
     2  
     3  // TF-IDF functionality.
     4  
     5  import (
     6  	"math"
     7  )
     8  
     9  // TfIdf returns the TF-IDF scores of the given corpus. For each documet,
    10  // returns a map from token to TF-IDF score.
    11  //
    12  // TF = count(token in document) / count(all tokens in document)
    13  //
    14  // IDF = log(count(documents) / count(documents with token))
    15  func TfIdf(docTokens [][]string) []map[string]float64 {
    16  	tf := make([]map[string]float64, len(docTokens))
    17  	idf := map[string]float64{}
    18  
    19  	// Collect TF and DF.
    20  	for i := range docTokens {
    21  		tf[i] = map[string]float64{}
    22  		for j := range docTokens[i] {
    23  			tf[i][docTokens[i][j]]++
    24  		}
    25  		for token := range tf[i] {
    26  			tf[i][token] /= float64(len(docTokens[i]))
    27  			idf[token]++
    28  		}
    29  	}
    30  
    31  	// Turn DF to IDF.
    32  	for token, df := range idf {
    33  		idf[token] = math.Log(float64(len(docTokens)) / df)
    34  	}
    35  
    36  	// Turn TF to TF-IDF.
    37  	for i := range tf {
    38  		for token := range tf[i] {
    39  			tf[i][token] *= idf[token]
    40  		}
    41  	}
    42  
    43  	return tf
    44  }