// TfIdf computes TF-IDF scores for a corpus of tokenized documents. Element i
// of the result maps every distinct token of document i to its score, which is
// the product of the two quantities below.
//
//	TF  = count(token in document) / count(all tokens in document)
//	IDF = log(count(documents) / count(documents with token))
//
// A document with no tokens yields an empty (non-nil) map.
func TfIdf(docTokens [][]string) []map[string]float64 {
	scores := make([]map[string]float64, len(docTokens))
	docFreq := map[string]float64{}

	// Per document: count tokens, normalize the counts into term frequencies,
	// and record one document-frequency hit for each distinct token.
	for i, doc := range docTokens {
		freqs := map[string]float64{}
		for _, token := range doc {
			freqs[token]++
		}
		total := float64(len(doc))
		for token, count := range freqs {
			freqs[token] = count / total
			docFreq[token]++
		}
		scores[i] = freqs
	}

	// Derive the inverse document frequency of every token in the corpus.
	numDocs := float64(len(docTokens))
	idf := make(map[string]float64, len(docFreq))
	for token, df := range docFreq {
		idf[token] = math.Log(numDocs / df)
	}

	// Scale each term frequency by its token's IDF, in place.
	for _, freqs := range scores {
		for token, tf := range freqs {
			freqs[token] = tf * idf[token]
		}
	}

	return scores
}