github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/lib/test/test.c (about)

#include "gpt_bpe.h"
#include "library.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
    10  
    11  
    12  const int NANO_SECONDS_IN_SEC = 1000000000;
    13  /* returns a static buffer of struct timespec with the time difference of ts1
    14   * and ts2, ts1 is assumed to be greater than ts2 */
    15  struct timespec *TimeSpecDiff(struct timespec *ts1, struct timespec *ts2)
    16  {
    17      static struct timespec ts;
    18      ts.tv_sec = ts1->tv_sec - ts2->tv_sec;
    19      ts.tv_nsec = ts1->tv_nsec - ts2->tv_nsec;
    20      if (ts.tv_nsec < 0) {
    21          ts.tv_sec--;
    22          ts.tv_nsec += NANO_SECONDS_IN_SEC;
    23      }
    24      return &ts;
    25  }
    26  
/* Short sample input string; nothing in the visible part of this file
 * references it. */
char *testString = "This is a test string";
    28  
    29  size_t mmap_file(char *path, char **mmap_addr) {
    30      int fd = open(path, O_RDONLY);
    31      assert (fd != -1);
    32  
    33      struct stat file_info;
    34      assert (fstat(fd, &file_info) != -1);
    35  
    36      *mmap_addr = mmap(NULL, file_info.st_size, PROT_READ, MAP_PRIVATE,
    37                       fd, 0);
    38      assert (mmap_addr != MAP_FAILED);
    39      return file_info.st_size;
    40  }
    41  
    42  size_t read_file(char *path, char **buf_ptr) {
    43      size_t bufsize = 0;
    44      FILE *fp = fopen(path, "r");
    45      if (fp != NULL) {
    46          /* Go to the end of the file. */
    47          if (fseek(fp, 0L, SEEK_END) == 0) {
    48              /* Get the size of the file. */
    49              bufsize = ftell(fp);
    50              if (bufsize == -1) { /* Error */ }
    51  
    52              /* Allocate our buffer to that size. */
    53              *buf_ptr = malloc(sizeof(char) * (bufsize + 1));
    54  
    55              /* Go back to the start of the file. */
    56              if (fseek(fp, 0L, SEEK_SET) != 0) { /* Error */ }
    57  
    58              /* Read the entire file into memory. */
    59              size_t newLen = fread(*buf_ptr, sizeof(char),
    60                                    bufsize, fp);
    61              if ( ferror( fp ) != 0 ) {
    62                  fputs("Error reading file", stderr);
    63              } else {
    64                  (*buf_ptr)[newLen++] = '\0'; /* Just to be safe. */
    65              }
    66          }
    67          fclose(fp);
    68      }
    69      return bufsize;
    70  }
    71  
    72  void benchmark_tokenize(char *input, size_t size) {
    73      uint64_t start_rdtsc, end_rdtsc, host_cpu_ticks;
    74      double host_cpu_ns, host_cpu_us, host_cpu_s, tokens_per_s;
    75      struct timespec begints, endts;
    76      uint64_t begin = 0, end = 0;
    77  
    78      clock_gettime(CLOCK_MONOTONIC, &begints);
    79      Tokens result = tokenizeBuffer("gpt2-tokenizer", input,
    80                                     size);
    81      clock_gettime(CLOCK_MONOTONIC, &endts);
    82      struct timespec *tmpts = TimeSpecDiff(&endts, &begints);
    83      uint64_t nsecElapsed = (unsigned long) tmpts->tv_sec * \
    84                                  1000000000LL + tmpts->tv_nsec;
    85  
    86      // Calculate rates
    87      host_cpu_s = (double) nsecElapsed / 1000000000;
    88      tokens_per_s = (double) result.len / host_cpu_s;
    89      free(result.tokens);
    90  
    91      printf("TOKENS: %lu in %0.2f seconds, %0.2f tokens/s\n", result.len,
    92             host_cpu_s, tokens_per_s);
    93  }
    94  
    95  void bench_loop(char *input, size_t size, size_t reps) {
    96      for (size_t i=0; i<reps; ++i) {
    97          benchmark_tokenize(input, size);
    98      }
    99  }
   100  
   101  
   102  int main() {
   103      initTokenizer("gpt2-tokenizer");
   104  
   105      char* input;
   106      printf("mmap:\n");
   107      size_t size = mmap_file("../../all.txt",
   108                             &input);
   109      bench_loop(input, size, 10);
   110  
   111      printf("read:\n");
   112      size = read_file("../../all.txt",
   113                       &input);
   114      bench_loop(input, size, 10);
   115  
   116      return 0;
   117  }