github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/lda-tool/main.go (about) 1 // Command lda-tool performs LDA on the input documents. 2 package main 3 4 import ( 5 "bufio" 6 "encoding/json" 7 "flag" 8 "fmt" 9 "io" 10 "os" 11 "regexp" 12 "runtime" 13 "strings" 14 15 "github.com/fluhus/gostuff/nlp" 16 "golang.org/x/exp/maps" 17 ) 18 19 var ( 20 k = flag.Int("k", 0, "Number of topics") 21 numThreads = flag.Int("t", 1, "Number of therads to use") 22 js = flag.Bool("j", false, "Output as JSON instead of default format") 23 ) 24 25 func main() { 26 parseArgs() 27 28 // Read input and perform LDA. 29 fmt.Fprintln(os.Stdout, "Run with no arguments for usage help.") 30 fmt.Fprintln(os.Stdout, "Reading documents from stdin...") 31 docs, err := readDocs(os.Stdin) 32 if err != nil { 33 die("Error: failed to read input:", err) 34 } 35 fmt.Fprintln(os.Stdout, "Found", len(docs), "documents.") 36 37 fmt.Fprintln(os.Stdout, "Performing LDA...") 38 lda, _ := nlp.LdaThreads(docs, *k, *numThreads) 39 40 // Print output. 41 if *js { 42 j, _ := json.MarshalIndent(lda, "", "\t") 43 fmt.Println(string(j)) 44 } else { 45 for _, w := range maps.Keys(lda) { 46 fmt.Print(w) 47 for _, x := range lda[w] { 48 fmt.Printf(" %v", x) 49 } 50 fmt.Println() 51 } 52 } 53 } 54 55 // readDocs reads documents, one per line, from the input reader. 56 // It splits and lowercases the documents, and returns them as a 2d slice. 57 func readDocs(r io.Reader) ([][]string, error) { 58 wordsRe := regexp.MustCompile(`\w+`) 59 scanner := bufio.NewScanner(r) 60 var result [][]string 61 for scanner.Scan() { 62 w := wordsRe.FindAllString(strings.ToLower(scanner.Text()), -1) 63 64 // Copy line to a lower capacity slice, to reduce memory usage. 65 result = append(result, make([]string, len(w))) 66 copy(result[len(result)-1], w) 67 } 68 if scanner.Err() != nil { 69 return nil, scanner.Err() 70 } 71 return result, nil 72 } 73 74 // die reports an error message and exits with error code 2. 75 // Arguments are treated like Println. 76 func die(a ...interface{}) { 77 fmt.Fprintln(os.Stderr, a...) 78 os.Exit(2) 79 } 80 81 // parseArgs parses the program's arguments and validates them. 82 // Exits with an error message upon validation error. 83 func parseArgs() { 84 flag.Parse() 85 if len(os.Args) == 1 { 86 fmt.Fprintln(os.Stderr, help) 87 flag.PrintDefaults() 88 os.Exit(1) 89 } 90 if *k < 1 { 91 die("Error: invalid k:", *k) 92 } 93 if *numThreads < 0 { 94 die("Error: invalid number of threads:", *numThreads) 95 } 96 if *numThreads == 0 { 97 *numThreads = runtime.NumCPU() 98 } 99 } 100 101 var help = `Performs LDA on the given documents. 102 103 Input is read from the standard input. Format is one document per line. 104 Documents will be lowercased and normalized (spaces and punctuation omitted). 105 106 Output is printed to the standard output. Format is one word per line. 107 Each word is followed by K numbers, the i'th number represents the likelihood 108 of the i'th topic to emit that word. 109 `