github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/runetree_test.go (about)

     1  package gpt_bpe
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"regexp/syntax"
     7  	"strings"
     8  	"testing"
     9  )
    10  
    11  var sanitizeTable = map[string]string{
    12  	"€": "€",
    13  	"‚": "‚",
    14  	"Æ’":  "ƒ",
    15  	"„": "„",
    16  	"…": "…",
    17  	"‡": "‡",
    18  	"ˆ":  "ˆ",
    19  	"‰": "‰",
    20  	"‹": "‹",
    21  	"Å’":  "Œ",
    22  	"Ž":  "Ž",
    23  	"‘": "‘",
    24  	"’": "’",
    25  	"“": "“",
    26  	"•": "•",
    27  	"–": "–",
    28  	"—": "—",
    29  	"Ëœ":  "˜",
    30  	"â„¢": "™",
    31  	"Å¡":  "š",
    32  	"›": "›",
    33  	"Å“":  "œ",
    34  	"ž":  "ž",
    35  	"Ÿ":  "Ÿ",
    36  	"¡":  "¡",
    37  	"¢":  "¢",
    38  	"£":  "£",
    39  	"¤":  "¤",
    40  	"Â¥":  "¥",
    41  	"¦":  "¦",
    42  	"§":  "§",
    43  	"¨":  "¨",
    44  	"©":  "©",
    45  	"ª":  "ª",
    46  	"«":  "«",
    47  	"®":  "®",
    48  	"¯":  "¯",
    49  	"°":  "°",
    50  	"±":  "±",
    51  	"²":  "²",
    52  	"³":  "³",
    53  	"´":  "´",
    54  	"µ":  "µ",
    55  	"¶":  "¶",
    56  	"·":  "·",
    57  	"¸":  "¸",
    58  	"¹":  "¹",
    59  	"º":  "º",
    60  	"»":  "»",
    61  	"¼":  "¼",
    62  	"½":  "½",
    63  	"¾":  "¾",
    64  	"¿":  "¿",
    65  	"À":  "À",
    66  	"Â":  "Â",
    67  	"Ã":  "Ã",
    68  	"Ä":  "Ä",
    69  	"Ã…":  "Å",
    70  	"Æ":  "Æ",
    71  	"Ç":  "Ç",
    72  	"È":  "È",
    73  	"É":  "É",
    74  	"Ê":  "Ê",
    75  	"Ë":  "Ë",
    76  	"ÃŒ":  "Ì",
    77  	"ÃŽ":  "Î",
    78  	"Ñ":  "Ñ",
    79  	"Ã’":  "Ò",
    80  	"Ó":  "Ó",
    81  	"Ô":  "Ô",
    82  	"Õ":  "Õ",
    83  	"Ö":  "Ö",
    84  	"×":  "×",
    85  	"Ø":  "Ø",
    86  	"Ù":  "Ù",
    87  	"Ú":  "Ú",
    88  	"Û":  "Û",
    89  	"Ü":  "Ü",
    90  	"Þ":  "Þ",
    91  	"ß":  "ß",
    92  	"á":  "á",
    93  	"â":  "â",
    94  	"ã":  "ã",
    95  	"ä":  "ä",
    96  	"Ã¥":  "å",
    97  	"æ":  "æ",
    98  	"ç":  "ç",
    99  	"è":  "è",
   100  	"é":  "é",
   101  	"ê":  "ê",
   102  	"ë":  "ë",
   103  	"ì":  "ì",
   104  	"î":  "î",
   105  	"ï":  "ï",
   106  	"ð":  "ð",
   107  	"ñ":  "ñ",
   108  	"ò":  "ò",
   109  	"ó":  "ó",
   110  	"ô":  "ô",
   111  	"õ":  "õ",
   112  	"ö":  "ö",
   113  	"÷":  "÷",
   114  	"ø":  "ø",
   115  	"ù":  "ù",
   116  	"ú":  "ú",
   117  	"û":  "û",
   118  	"ü":  "ü",
   119  	"ý":  "ý",
   120  	"þ":  "þ",
   121  	"ÿ":  "ÿ",
   122  }
   123  
   124  var encodingSanitzer = map[string]string{}
   125  
   126  func TestRuneNode_String(t *testing.T) {
   127  	nerdstashV2Encoder = *CacheLoadEncoder("nerdstash_v2-tokenizer")
   128  	print(nerdstashV2Encoder.SpecialsTree.String())
   129  }
   130  
   131  func TestRuneMatch(t *testing.T) {
   132  	s := "// TypeScript Version: 2.9"
   133  	rr := io.RuneReader(strings.NewReader(s))
   134  	nerdstashV2Encoder = *CacheLoadEncoder("nerdstash_v2-tokenizer")
   135  	nextWord := nerdstashV2Encoder.WordSplitter(rr)
   136  	for {
   137  		word := nextWord()
   138  		if word == nil {
   139  			break
   140  		}
   141  		t.Log(*word)
   142  	}
   143  }
   144  
   145  func TestRuneReplacement(t *testing.T) {
   146  	s := "ù TypeScriptÖ"
   147  	rr := io.RuneReader(strings.NewReader(s))
   148  	nerdstashV2Encoder = *CacheLoadEncoder("nerdstash_v2-tokenizer")
   149  	nerdstashV2Encoder.SpecialsTree.InsertReplacementsIntoRuneTree(
   150  		sanitizeTable,
   151  	)
   152  	print(nerdstashV2Encoder.SpecialsTree.String())
   153  	nextWord := nerdstashV2Encoder.WordSplitter(rr)
   154  	for {
   155  		word := nextWord()
   156  		if word == nil {
   157  			break
   158  		}
   159  		t.Log(*word)
   160  	}
   161  }
   162  
   163  func TestRegex(t *testing.T) {
   164  	// This test is to check if the regex is able to split the text correctly
   165  	testStr := "This is a test.  This is another test. filler filler. fill'll fill't 1 12 123 1234 12345 123456 1234567\n The quick brown turtle did a backflip and won a marathon."
   166  	llama3Encoder = *CacheLoadEncoder("llama3-tokenizer")
   167  	regexStringLLama3 := llama3Encoder.pattern.String()
   168  	fmt.Printf("regexString: %v\n", regexStringLLama3)
   169  	regexASTLLama3, err := syntax.Parse(regexStringLLama3, syntax.Perl)
   170  	if err != nil {
   171  		t.Error(err)
   172  	}
   173  	regexASTLLama3.Simplify()
   174  
   175  	regexTree := CreateRegexTree(regexASTLLama3)
   176  	//regexTree.PrintTree()
   177  	runesTest := []rune(testStr)
   178  	pathMap := regexTree.GeneratePathMap()
   179  	returnedval := regexTree.EvaluateRegexTree(runesTest, pathMap)
   180  	fmt.Printf("returnedval: %v\n", returnedval)
   181  }