github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/lib/test/test.c (about) 1 #include "gpt_bpe.h" 2 #include "library.h" 3 #include <stdio.h> 4 #include <sys/mman.h> 5 #include <sys/fcntl.h> 6 #include <assert.h> 7 #include <sys/stat.h> 8 #include <stdlib.h> 9 #include <time.h> 10 11 12 const int NANO_SECONDS_IN_SEC = 1000000000; 13 /* returns a static buffer of struct timespec with the time difference of ts1 14 * and ts2, ts1 is assumed to be greater than ts2 */ 15 struct timespec *TimeSpecDiff(struct timespec *ts1, struct timespec *ts2) 16 { 17 static struct timespec ts; 18 ts.tv_sec = ts1->tv_sec - ts2->tv_sec; 19 ts.tv_nsec = ts1->tv_nsec - ts2->tv_nsec; 20 if (ts.tv_nsec < 0) { 21 ts.tv_sec--; 22 ts.tv_nsec += NANO_SECONDS_IN_SEC; 23 } 24 return &ts; 25 } 26 27 char *testString = "This is a test string"; 28 29 size_t mmap_file(char *path, char **mmap_addr) { 30 int fd = open(path, O_RDONLY); 31 assert (fd != -1); 32 33 struct stat file_info; 34 assert (fstat(fd, &file_info) != -1); 35 36 *mmap_addr = mmap(NULL, file_info.st_size, PROT_READ, MAP_PRIVATE, 37 fd, 0); 38 assert (mmap_addr != MAP_FAILED); 39 return file_info.st_size; 40 } 41 42 size_t read_file(char *path, char **buf_ptr) { 43 size_t bufsize = 0; 44 FILE *fp = fopen(path, "r"); 45 if (fp != NULL) { 46 /* Go to the end of the file. */ 47 if (fseek(fp, 0L, SEEK_END) == 0) { 48 /* Get the size of the file. */ 49 bufsize = ftell(fp); 50 if (bufsize == -1) { /* Error */ } 51 52 /* Allocate our buffer to that size. */ 53 *buf_ptr = malloc(sizeof(char) * (bufsize + 1)); 54 55 /* Go back to the start of the file. */ 56 if (fseek(fp, 0L, SEEK_SET) != 0) { /* Error */ } 57 58 /* Read the entire file into memory. */ 59 size_t newLen = fread(*buf_ptr, sizeof(char), 60 bufsize, fp); 61 if ( ferror( fp ) != 0 ) { 62 fputs("Error reading file", stderr); 63 } else { 64 (*buf_ptr)[newLen++] = '\0'; /* Just to be safe. */ 65 } 66 } 67 fclose(fp); 68 } 69 return bufsize; 70 } 71 72 void benchmark_tokenize(char *input, size_t size) { 73 uint64_t start_rdtsc, end_rdtsc, host_cpu_ticks; 74 double host_cpu_ns, host_cpu_us, host_cpu_s, tokens_per_s; 75 struct timespec begints, endts; 76 uint64_t begin = 0, end = 0; 77 78 clock_gettime(CLOCK_MONOTONIC, &begints); 79 Tokens result = tokenizeBuffer("gpt2-tokenizer", input, 80 size); 81 clock_gettime(CLOCK_MONOTONIC, &endts); 82 struct timespec *tmpts = TimeSpecDiff(&endts, &begints); 83 uint64_t nsecElapsed = (unsigned long) tmpts->tv_sec * \ 84 1000000000LL + tmpts->tv_nsec; 85 86 // Calculate rates 87 host_cpu_s = (double) nsecElapsed / 1000000000; 88 tokens_per_s = (double) result.len / host_cpu_s; 89 free(result.tokens); 90 91 printf("TOKENS: %lu in %0.2f seconds, %0.2f tokens/s\n", result.len, 92 host_cpu_s, tokens_per_s); 93 } 94 95 void bench_loop(char *input, size_t size, size_t reps) { 96 for (size_t i=0; i<reps; ++i) { 97 benchmark_tokenize(input, size); 98 } 99 } 100 101 102 int main() { 103 initTokenizer("gpt2-tokenizer"); 104 105 char* input; 106 printf("mmap:\n"); 107 size_t size = mmap_file("../../all.txt", 108 &input); 109 bench_loop(input, size, 10); 110 111 printf("read:\n"); 112 size = read_file("../../all.txt", 113 &input); 114 bench_loop(input, size, 10); 115 116 return 0; 117 }