github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/benchmark_test.go (about) 1 package gpt_bpe 2 3 import ( 4 "bufio" 5 "os" 6 "runtime" 7 "runtime/pprof" 8 "strings" 9 "testing" 10 "time" 11 ) 12 13 func BenchmarkGPTEncoder_WordSplitterChan(b *testing.B) { 14 b.StopTimer() 15 gpt2Encoder.SplitterThreads = 8 16 corpusHandle := strings.NewReader(corpus) 17 nextWord := gpt2Encoder.WordSplitter( 18 bufio.NewReaderSize( 19 corpusHandle, 8*1024*1024, 20 ), 21 ) 22 23 start := time.Now() 24 b.StartTimer() 25 wordCount := 0 26 for { 27 word := nextWord() 28 if word == nil { 29 break 30 } 31 wordCount++ 32 } 33 b.StopTimer() 34 elapsed := time.Since(start) 35 b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec") 36 b.ReportMetric(float64(wordCount), "words") 37 } 38 39 func BenchmarkGPTEncoder_WordSplitter(b *testing.B) { 40 b.StopTimer() 41 corpusHandle := strings.NewReader(*largeCorpus) 42 gpt2Encoder.SplitterThreads = 8 43 wordCount := 0 44 runeReader := bufio.NewReaderSize(corpusHandle, 8*1024*1024) 45 profileHandle, _ := os.Create("wordsplitter.prof") 46 runtime.GC() 47 pprof.StartCPUProfile(profileHandle) 48 wordSplitter := gpt2Encoder.makeWordSplitter( 49 runeReader.ReadRune, 50 func(words []string) { 51 wordCount += len(words) 52 }, 53 func() {}, 54 ) 55 start := time.Now() 56 b.StartTimer() 57 wordSplitter() 58 b.StopTimer() 59 pprof.StopCPUProfile() 60 elapsed := time.Since(start) 61 numBytes := len(*largeCorpus) 62 b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec") 63 b.ReportMetric(float64(wordCount), "words") 64 b.ReportMetric(float64(numBytes)/elapsed.Seconds(), "bytes/sec") 65 b.ReportMetric(float64(numBytes), "bytes") 66 } 67 68 func BenchmarkGPTEncoder_ToBPE(b *testing.B) { 69 b.StopTimer() 70 71 // Pre-split words 72 words := *nerdstashV2Encoder.SplitWords(largeCorpus) 73 74 // Pre-calculate tokens for each word 75 tokenLengths := make([]int, len(words)) 76 totalTokens := 0 77 for i, word := range words { 78 tokens := nerdstashV2Encoder.ToBPE(word) 79 tokenLengths[i] = len(tokens) 80 totalTokens += tokenLengths[i] 81 } 82 profileHandle, _ := os.Create("tobpe.prof") 83 84 numBytes := len(*largeCorpus) 85 start := time.Now() 86 87 b.StartTimer() 88 runtime.GC() 89 pprof.StartCPUProfile(profileHandle) 90 for i := 0; i < b.N; i++ { 91 for idx := range words { 92 // Just do the ToBPE call without length calculation 93 nerdstashV2Encoder.ToBPE(words[idx]) 94 } 95 } 96 pprof.StopCPUProfile() 97 b.StopTimer() 98 99 elapsed := time.Since(start) 100 totalTokens *= b.N 101 102 // Use pre-calculated values for metrics 103 b.ReportMetric(float64(numBytes)/elapsed.Seconds(), "bytes/sec") 104 b.ReportMetric(float64(numBytes), "bytes") 105 b.ReportMetric(float64(totalTokens)/elapsed.Seconds(), "tokens/sec") 106 b.ReportMetric(float64(totalTokens), "tokens") 107 // Report on tokenizer LRU cache 108 b.ReportMetric(float64(nerdstashV2Encoder.LruHits), "lru_hits") 109 b.ReportMetric(float64(nerdstashV2Encoder.LruMisses), "lru_misses") 110 b.ReportMetric(float64(nerdstashV2Encoder.LruEvictions), "lru_evictions") 111 112 } 113 114 func BenchmarkGPTEncoder_WordSplitterTokens(b *testing.B) { 115 b.StopTimer() 116 wordCount := 0 117 tokensCount := 0 118 corpusHandle := strings.NewReader(corpus) 119 runeReader := bufio.NewReaderSize(corpusHandle, 8*1024*1024) 120 121 wordSplitter := nerdstashV2Encoder.makeWordSplitter( 122 runeReader.ReadRune, 123 func(words []string) { 124 if len(words) > 0 { 125 for _, word := range words { 126 tokensCount += len(nerdstashV2Encoder.ToBPE(word)) 127 } 128 } 129 wordCount++ 130 }, 131 func() {}, 132 ) 133 start := time.Now() 134 b.StartTimer() 135 wordSplitter() 136 b.StopTimer() 137 elapsed := time.Since(start) 138 //numBytes := int64(len(corpusText)) 139 b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec") 140 b.ReportMetric(float64(wordCount), "words") 141 b.ReportMetric(float64(tokensCount)/elapsed.Seconds(), "tokens/sec") 142 b.ReportMetric(float64(tokensCount), "tokens") 143 } 144 145 func BenchmarkGPTEncoder_Decode(b *testing.B) { 146 if gpt2Encoded == nil { 147 corpEncoded := gpt2Encoder.Encode(&corpus) 148 gpt2Encoded = corpEncoded 149 } 150 start := time.Now() 151 tokenNumBytes := len(gpt2Encoder.Decode(gpt2Encoded)) 152 duration := time.Since(start) 153 b.Logf( 154 "%v tokens into %v bytes over %v", 155 len(*gpt2Encoded), tokenNumBytes, duration, 156 ) 157 } 158 159 func BenchmarkGPTEncoder_Encode(b *testing.B) { 160 start := time.Now() 161 tokenCt := len(*gpt2Encoder.Encode(&corpus)) 162 duration := time.Since(start) 163 b.Logf( 164 "%v bytes into %v tokens over %v", 165 len(corpus), tokenCt, duration, 166 ) 167 } 168 169 func BenchmarkGPTEncoder_EncodeBuffer(b *testing.B) { 170 corpusBytes := []byte(corpus) 171 start := time.Now() 172 _, tokenCt := gpt2Encoder.EncodeBuffer(&corpusBytes) 173 duration := time.Since(start) 174 b.Logf( 175 "%v bytes into %v tokens over %v", 176 len(corpus), tokenCt, duration, 177 ) 178 } 179 180 //func BenchmarkGPTEncoder_WordSplitterTokensChan(b *testing.B) { 181 // b.StopTimer() 182 // corpusHandle, err := os.Open(largeCorpusPath) 183 // //corpusText, err := ioutil.ReadFile(largeCorpusPath) 184 // nerdstashEncoder.SplitterThreads = 1 185 // //defer corpusHandle.Close() 186 // if err != nil { 187 // b.Error(err) 188 // } 189 // wordCount := 0 190 // tokensCount := 0 191 // runeReader := bufio.NewReaderSize(corpusHandle, 8*1024*1024) 192 // wordsChan := make(chan *string, 1000) 193 // go nerdstashEncoder.splitWordsOntoChan(runeReader.ReadRune, 194 // wordsChan) 195 // start := time.Now() 196 // b.StartTimer() 197 // for { 198 // word := <-wordsChan 199 // if word == nil { 200 // break 201 // } 202 // tokensCount += len(gpt2Encoder.ToBPE(*word)) 203 // wordCount++ 204 // } 205 // b.StopTimer() 206 // elapsed := time.Since(start) 207 // //numBytes := int64(len(corpusText)) 208 // numBytes, _ := corpusHandle.Seek(0, io.SeekCurrent) 209 // b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec") 210 // b.ReportMetric(float64(wordCount), "words") 211 // b.ReportMetric(float64(numBytes)/elapsed.Seconds(), "bytes/sec") 212 // b.ReportMetric(float64(numBytes), "bytes") 213 // b.ReportMetric(float64(tokensCount)/elapsed.Seconds(), "tokens/sec") 214 // b.ReportMetric(float64(tokensCount), "tokens") 215 //}