github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/runetree_test.go (about) 1 package gpt_bpe 2 3 import ( 4 "fmt" 5 "io" 6 "regexp/syntax" 7 "strings" 8 "testing" 9 ) 10 11 var sanitizeTable = map[string]string{ 12 "€": "€", 13 "‚": "‚", 14 "Æ’": "ƒ", 15 "„": "„", 16 "…": "…", 17 "‡": "‡", 18 "ˆ": "ˆ", 19 "‰": "‰", 20 "‹": "‹", 21 "Å’": "Œ", 22 "Ž": "Ž", 23 "‘": "‘", 24 "’": "’", 25 "“": "“", 26 "•": "•", 27 "–": "–", 28 "—": "—", 29 "Ëœ": "˜", 30 "â„¢": "™", 31 "Å¡": "š", 32 "›": "›", 33 "Å“": "œ", 34 "ž": "ž", 35 "Ÿ": "Ÿ", 36 "¡": "¡", 37 "¢": "¢", 38 "£": "£", 39 "¤": "¤", 40 "Â¥": "¥", 41 "¦": "¦", 42 "§": "§", 43 "¨": "¨", 44 "©": "©", 45 "ª": "ª", 46 "«": "«", 47 "®": "®", 48 "¯": "¯", 49 "°": "°", 50 "±": "±", 51 "²": "²", 52 "³": "³", 53 "´": "´", 54 "µ": "µ", 55 "¶": "¶", 56 "·": "·", 57 "¸": "¸", 58 "¹": "¹", 59 "º": "º", 60 "»": "»", 61 "¼": "¼", 62 "½": "½", 63 "¾": "¾", 64 "¿": "¿", 65 "À": "À", 66 "Â": "Â", 67 "Ã": "Ã", 68 "Ä": "Ä", 69 "Ã…": "Å", 70 "Æ": "Æ", 71 "Ç": "Ç", 72 "È": "È", 73 "É": "É", 74 "Ê": "Ê", 75 "Ë": "Ë", 76 "ÃŒ": "Ì", 77 "ÃŽ": "Î", 78 "Ñ": "Ñ", 79 "Ã’": "Ò", 80 "Ó": "Ó", 81 "Ô": "Ô", 82 "Õ": "Õ", 83 "Ö": "Ö", 84 "×": "×", 85 "Ø": "Ø", 86 "Ù": "Ù", 87 "Ú": "Ú", 88 "Û": "Û", 89 "Ü": "Ü", 90 "Þ": "Þ", 91 "ß": "ß", 92 "á": "á", 93 "â": "â", 94 "ã": "ã", 95 "ä": "ä", 96 "Ã¥": "å", 97 "æ": "æ", 98 "ç": "ç", 99 "è": "è", 100 "é": "é", 101 "ê": "ê", 102 "ë": "ë", 103 "ì": "ì", 104 "î": "î", 105 "ï": "ï", 106 "ð": "ð", 107 "ñ": "ñ", 108 "ò": "ò", 109 "ó": "ó", 110 "ô": "ô", 111 "õ": "õ", 112 "ö": "ö", 113 "÷": "÷", 114 "ø": "ø", 115 "ù": "ù", 116 "ú": "ú", 117 "û": "û", 118 "ü": "ü", 119 "ý": "ý", 120 "þ": "þ", 121 "ÿ": "ÿ", 122 } 123 124 var encodingSanitzer = map[string]string{} 125 126 func TestRuneNode_String(t *testing.T) { 127 nerdstashV2Encoder = *CacheLoadEncoder("nerdstash_v2-tokenizer") 128 print(nerdstashV2Encoder.SpecialsTree.String()) 129 } 130 131 func TestRuneMatch(t *testing.T) { 132 s := "// TypeScript Version: 2.9" 133 rr := io.RuneReader(strings.NewReader(s)) 134 nerdstashV2Encoder = *CacheLoadEncoder("nerdstash_v2-tokenizer") 135 nextWord := nerdstashV2Encoder.WordSplitter(rr) 136 for { 137 word := nextWord() 138 if word == nil { 139 break 140 } 141 t.Log(*word) 142 } 143 } 144 145 func TestRuneReplacement(t *testing.T) { 146 s := "ù TypeScriptÖ" 147 rr := io.RuneReader(strings.NewReader(s)) 148 nerdstashV2Encoder = *CacheLoadEncoder("nerdstash_v2-tokenizer") 149 nerdstashV2Encoder.SpecialsTree.InsertReplacementsIntoRuneTree( 150 sanitizeTable, 151 ) 152 print(nerdstashV2Encoder.SpecialsTree.String()) 153 nextWord := nerdstashV2Encoder.WordSplitter(rr) 154 for { 155 word := nextWord() 156 if word == nil { 157 break 158 } 159 t.Log(*word) 160 } 161 } 162 163 func TestRegex(t *testing.T) { 164 // This test is to check if the regex is able to split the text correctly 165 testStr := "This is a test. This is another test. filler filler. fill'll fill't 1 12 123 1234 12345 123456 1234567\n The quick brown turtle did a backflip and won a marathon." 166 llama3Encoder = *CacheLoadEncoder("llama3-tokenizer") 167 regexStringLLama3 := llama3Encoder.pattern.String() 168 fmt.Printf("regexString: %v\n", regexStringLLama3) 169 regexASTLLama3, err := syntax.Parse(regexStringLLama3, syntax.Perl) 170 if err != nil { 171 t.Error(err) 172 } 173 regexASTLLama3.Simplify() 174 175 regexTree := CreateRegexTree(regexASTLLama3) 176 //regexTree.PrintTree() 177 runesTest := []rune(testStr) 178 pathMap := regexTree.GeneratePathMap() 179 returnedval := regexTree.EvaluateRegexTree(runesTest, pathMap) 180 fmt.Printf("returnedval: %v\n", returnedval) 181 }