github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/resources/data/nerdstash_v2-tokenizer/specials.txt (about)

     1  't
     2  'm
     3  'd
     4  'n
     5  's
     6   "
     7    
     8  've
     9  ─
    10  'll
    11  're
    12   
    13   
    14  ⁂
    15  ***
    16  Type
    17      
    18  ----
    19   Tags
    20  't've
    21   Title
    22   Style
    23   Genre
    24  'clock
    25  <|pad|>
    26   Author
    27   Rating
    28  <|fill|>
    29  <|mask|>
    30          
    31  Glossary
    32   Summary
    33  <|mtend|>
    34   Knowledge
    35  Characters
    36  <|maskend|>
    37  <|fillend|>
    38  <|rubyend|>
    39  <|mtvocab|>
    40  <|masklong|>
    41              
    42  <|spmspace|>
    43  <|reserved7|>
    44  <|reserved3|>
    45  <|reserved2|>
    46  <|rubystart|>
    47  <|reserved0|>
    48  <|reserved9|>
    49  <|reserved4|>
    50  <|rubycover|>
    51  <|endoftext|>
    52  <|reserved5|>
    53  <|reserved8|>
    54  <|maskshort|>
    55  <|reserved6|>
    56  <|reserved1|>
    57  <|mtsenglish|>
    58  <|reserved10|>
    59  <|maskmedium|>
    60  <|masksingle|>
    61  <|mtvocabend|>
    62  <|mtvenglish|>
    63  <|mtsentence|>
    64  <|startoftext|>
    65  <|mtvjapanese|>
    66  <|mtsjapanese|>
    67                  
    68  <|mtsentenceend|>
    69  <|maskparagraph|>
    70  <|spmspace|><|spmspace|>
    71  <|spmspace|><|spmspace|><|spmspace|><|spmspace|>
    72  <|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|>
    73  <|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|>
    74  <|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|><|spmspace|>