github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/benchmark_test.go (about)

     1  package gpt_bpe
     2  
     3  import (
     4  	"bufio"
     5  	"os"
     6  	"runtime"
     7  	"runtime/pprof"
     8  	"strings"
     9  	"testing"
    10  	"time"
    11  )
    12  
    13  func BenchmarkGPTEncoder_WordSplitterChan(b *testing.B) {
    14  	b.StopTimer()
    15  	gpt2Encoder.SplitterThreads = 8
    16  	corpusHandle := strings.NewReader(corpus)
    17  	nextWord := gpt2Encoder.WordSplitter(
    18  		bufio.NewReaderSize(
    19  			corpusHandle, 8*1024*1024,
    20  		),
    21  	)
    22  
    23  	start := time.Now()
    24  	b.StartTimer()
    25  	wordCount := 0
    26  	for {
    27  		word := nextWord()
    28  		if word == nil {
    29  			break
    30  		}
    31  		wordCount++
    32  	}
    33  	b.StopTimer()
    34  	elapsed := time.Since(start)
    35  	b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec")
    36  	b.ReportMetric(float64(wordCount), "words")
    37  }
    38  
    39  func BenchmarkGPTEncoder_WordSplitter(b *testing.B) {
    40  	b.StopTimer()
    41  	corpusHandle := strings.NewReader(*largeCorpus)
    42  	gpt2Encoder.SplitterThreads = 8
    43  	wordCount := 0
    44  	runeReader := bufio.NewReaderSize(corpusHandle, 8*1024*1024)
    45  	profileHandle, _ := os.Create("wordsplitter.prof")
    46  	runtime.GC()
    47  	pprof.StartCPUProfile(profileHandle)
    48  	wordSplitter := gpt2Encoder.makeWordSplitter(
    49  		runeReader.ReadRune,
    50  		func(words []string) {
    51  			wordCount += len(words)
    52  		},
    53  		func() {},
    54  	)
    55  	start := time.Now()
    56  	b.StartTimer()
    57  	wordSplitter()
    58  	b.StopTimer()
    59  	pprof.StopCPUProfile()
    60  	elapsed := time.Since(start)
    61  	numBytes := len(*largeCorpus)
    62  	b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec")
    63  	b.ReportMetric(float64(wordCount), "words")
    64  	b.ReportMetric(float64(numBytes)/elapsed.Seconds(), "bytes/sec")
    65  	b.ReportMetric(float64(numBytes), "bytes")
    66  }
    67  
    68  func BenchmarkGPTEncoder_ToBPE(b *testing.B) {
    69  	b.StopTimer()
    70  
    71  	// Pre-split words
    72  	words := *nerdstashV2Encoder.SplitWords(largeCorpus)
    73  
    74  	// Pre-calculate tokens for each word
    75  	tokenLengths := make([]int, len(words))
    76  	totalTokens := 0
    77  	for i, word := range words {
    78  		tokens := nerdstashV2Encoder.ToBPE(word)
    79  		tokenLengths[i] = len(tokens)
    80  		totalTokens += tokenLengths[i]
    81  	}
    82  	profileHandle, _ := os.Create("tobpe.prof")
    83  
    84  	numBytes := len(*largeCorpus)
    85  	start := time.Now()
    86  
    87  	b.StartTimer()
    88  	runtime.GC()
    89  	pprof.StartCPUProfile(profileHandle)
    90  	for i := 0; i < b.N; i++ {
    91  		for idx := range words {
    92  			// Just do the ToBPE call without length calculation
    93  			nerdstashV2Encoder.ToBPE(words[idx])
    94  		}
    95  	}
    96  	pprof.StopCPUProfile()
    97  	b.StopTimer()
    98  
    99  	elapsed := time.Since(start)
   100  	totalTokens *= b.N
   101  
   102  	// Use pre-calculated values for metrics
   103  	b.ReportMetric(float64(numBytes)/elapsed.Seconds(), "bytes/sec")
   104  	b.ReportMetric(float64(numBytes), "bytes")
   105  	b.ReportMetric(float64(totalTokens)/elapsed.Seconds(), "tokens/sec")
   106  	b.ReportMetric(float64(totalTokens), "tokens")
   107  	// Report on tokenizer LRU cache
   108  	b.ReportMetric(float64(nerdstashV2Encoder.LruHits), "lru_hits")
   109  	b.ReportMetric(float64(nerdstashV2Encoder.LruMisses), "lru_misses")
   110  	b.ReportMetric(float64(nerdstashV2Encoder.LruEvictions), "lru_evictions")
   111  
   112  }
   113  
   114  func BenchmarkGPTEncoder_WordSplitterTokens(b *testing.B) {
   115  	b.StopTimer()
   116  	wordCount := 0
   117  	tokensCount := 0
   118  	corpusHandle := strings.NewReader(corpus)
   119  	runeReader := bufio.NewReaderSize(corpusHandle, 8*1024*1024)
   120  
   121  	wordSplitter := nerdstashV2Encoder.makeWordSplitter(
   122  		runeReader.ReadRune,
   123  		func(words []string) {
   124  			if len(words) > 0 {
   125  				for _, word := range words {
   126  					tokensCount += len(nerdstashV2Encoder.ToBPE(word))
   127  				}
   128  			}
   129  			wordCount++
   130  		},
   131  		func() {},
   132  	)
   133  	start := time.Now()
   134  	b.StartTimer()
   135  	wordSplitter()
   136  	b.StopTimer()
   137  	elapsed := time.Since(start)
   138  	//numBytes := int64(len(corpusText))
   139  	b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec")
   140  	b.ReportMetric(float64(wordCount), "words")
   141  	b.ReportMetric(float64(tokensCount)/elapsed.Seconds(), "tokens/sec")
   142  	b.ReportMetric(float64(tokensCount), "tokens")
   143  }
   144  
   145  func BenchmarkGPTEncoder_Decode(b *testing.B) {
   146  	if gpt2Encoded == nil {
   147  		corpEncoded := gpt2Encoder.Encode(&corpus)
   148  		gpt2Encoded = corpEncoded
   149  	}
   150  	start := time.Now()
   151  	tokenNumBytes := len(gpt2Encoder.Decode(gpt2Encoded))
   152  	duration := time.Since(start)
   153  	b.Logf(
   154  		"%v tokens into %v bytes over %v",
   155  		len(*gpt2Encoded), tokenNumBytes, duration,
   156  	)
   157  }
   158  
   159  func BenchmarkGPTEncoder_Encode(b *testing.B) {
   160  	start := time.Now()
   161  	tokenCt := len(*gpt2Encoder.Encode(&corpus))
   162  	duration := time.Since(start)
   163  	b.Logf(
   164  		"%v bytes into %v tokens over %v",
   165  		len(corpus), tokenCt, duration,
   166  	)
   167  }
   168  
   169  func BenchmarkGPTEncoder_EncodeBuffer(b *testing.B) {
   170  	corpusBytes := []byte(corpus)
   171  	start := time.Now()
   172  	_, tokenCt := gpt2Encoder.EncodeBuffer(&corpusBytes)
   173  	duration := time.Since(start)
   174  	b.Logf(
   175  		"%v bytes into %v tokens over %v",
   176  		len(corpus), tokenCt, duration,
   177  	)
   178  }
   179  
   180  //func BenchmarkGPTEncoder_WordSplitterTokensChan(b *testing.B) {
   181  //	b.StopTimer()
   182  //	corpusHandle, err := os.Open(largeCorpusPath)
   183  //	//corpusText, err := ioutil.ReadFile(largeCorpusPath)
   184  //	nerdstashEncoder.SplitterThreads = 1
   185  //	//defer corpusHandle.Close()
   186  //	if err != nil {
   187  //		b.Error(err)
   188  //	}
   189  //	wordCount := 0
   190  //	tokensCount := 0
   191  //	runeReader := bufio.NewReaderSize(corpusHandle, 8*1024*1024)
   192  //	wordsChan := make(chan *string, 1000)
   193  //	go nerdstashEncoder.splitWordsOntoChan(runeReader.ReadRune,
   194  //		wordsChan)
   195  //	start := time.Now()
   196  //	b.StartTimer()
   197  //	for {
   198  //		word := <-wordsChan
   199  //		if word == nil {
   200  //			break
   201  //		}
   202  //		tokensCount += len(gpt2Encoder.ToBPE(*word))
   203  //		wordCount++
   204  //	}
   205  //	b.StopTimer()
   206  //	elapsed := time.Since(start)
   207  //	//numBytes := int64(len(corpusText))
   208  //	numBytes, _ := corpusHandle.Seek(0, io.SeekCurrent)
   209  //	b.ReportMetric(float64(wordCount)/elapsed.Seconds(), "words/sec")
   210  //	b.ReportMetric(float64(wordCount), "words")
   211  //	b.ReportMetric(float64(numBytes)/elapsed.Seconds(), "bytes/sec")
   212  //	b.ReportMetric(float64(numBytes), "bytes")
   213  //	b.ReportMetric(float64(tokensCount)/elapsed.Seconds(), "tokens/sec")
   214  //	b.ReportMetric(float64(tokensCount), "tokens")
   215  //}