github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/resources/data/nerdstash_v1-tokenizer/specials.txt (about)

     1  3
     2  4
     3  9
     4  0
     5  1
     6  2
     7  5
     8  6
     9  7
    10  8
    11   "
    12    
    13   
    14   
    15  ─
    16  ***
    17  ⁂
    18  ----
    19      
    20  Type
    21   Tags
    22   Title
    23   Style
    24   Genre
    25   Rating
    26   Author
    27  <|pad|>
    28          
    29  <|fill|>
    30   Summary
    31  <|mask|>
    32  Glossary
    33  <|mtend|>
    34  Characters
    35   Knowledge
    36  <|fillend|>
    37  <|maskend|>
    38  <|rubyend|>
    39  <|mtvocab|>
    40  <|masklong|>
    41              
    42  <|spmspace|>
    43  <|reserved6|>
    44  <|reserved3|>
    45  <|reserved2|>
    46  <|rubystart|>
    47  <|reserved0|>
    48  <|reserved7|>
    49  <|reserved4|>
    50  <|rubycover|>
    51  <|endoftext|>
    52  <|reserved5|>
    53  <|reserved8|>
    54  <|maskshort|>
    55  <|reserved9|>
    56  <|reserved1|>
    57  <|mtsenglish|>
    58  <|reserved10|>
    59  <|maskmedium|>
    60  <|masksingle|>
    61  <|mtvocabend|>
    62  <|mtvenglish|>
    63  <|mtsentence|>
    64  <|startoftext|>
    65  <|mtvjapanese|>
    66  <|mtsjapanese|>
    67                  
    68  <|mtsentenceend|>
    69  <|maskparagraph|>
    70  <|spmspace|><|spmspace|>
    71  <|spmspace|><|spmspace|><|spmspace|><|spmspace|>
    72  <|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|>
    73  <|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|>
    74  <|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|>