github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/resources/data/llama-tokenizer/duplicates.json (about)

     1  [
     2    {"old_id": 35, "new_id": 29871, "repr": " "},
     3    {"old_id": 104, "new_id": 29872, "repr": "e"},
     4    {"old_id": 119, "new_id": 29873, "repr": "t"},
     5    {"old_id": 100, "new_id": 29874, "repr": "a"},
     6    {"old_id": 108, "new_id": 29875, "repr": "i"},
     7    {"old_id": 113, "new_id": 29876, "repr": "n"},
     8    {"old_id": 114, "new_id": 29877, "repr": "o"},
     9    {"old_id": 117, "new_id": 29878, "repr": "r"},
    10    {"old_id": 118, "new_id": 29879, "repr": "s"},
    11    {"old_id": 111, "new_id": 29880, "repr": "l"},
    12    {"old_id": 103, "new_id": 29881, "repr": "d"},
    13    {"old_id": 107, "new_id": 29882, "repr": "h"},
    14    {"old_id": 102, "new_id": 29883, "repr": "c"},
    15    {"old_id": 120, "new_id": 29884, "repr": "u"},
    16    {"old_id": 112, "new_id": 29885, "repr": "m"},
    17    {"old_id": 115, "new_id": 29886, "repr": "p"},
    18    {"old_id": 106, "new_id": 29887, "repr": "g"},
    19    {"old_id": 105, "new_id": 29888, "repr": "f"},
    20    {"old_id": 49, "new_id": 29889, "repr": "."},
    21    {"old_id": 101, "new_id": 29890, "repr": "b"},
    22    {"old_id": 124, "new_id": 29891, "repr": "y"},
    23    {"old_id": 47, "new_id": 29892, "repr": ","},
    24    {"old_id": 122, "new_id": 29893, "repr": "w"},
    25    {"old_id": 121, "new_id": 29894, "repr": "v"},
    26    {"old_id": 110, "new_id": 29895, "repr": "k"},
    27    {"old_id": 52, "new_id": 29896, "repr": "1"},
    28    {"old_id": 44, "new_id": 29897, "repr": ")"},
    29    {"old_id": 43, "new_id": 29898, "repr": "("},
    30    {"old_id": 48, "new_id": 29899, "repr": "-"},
    31    {"old_id": 51, "new_id": 29900, "repr": "0"},
    32    {"old_id": 61, "new_id": 29901, "repr": ":"},
    33    {"old_id": 76, "new_id": 29902, "repr": "I"},
    34    {"old_id": 86, "new_id": 29903, "repr": "S"},
    35    {"old_id": 95, "new_id": 29905, "repr": "\\"},
    36    {"old_id": 53, "new_id": 29906, "repr": "2"},
    37    {"old_id": 70, "new_id": 29907, "repr": "C"},
    38    {"old_id": 37, "new_id": 29908, "repr": "\""},
    39    {"old_id": 68, "new_id": 29909, "repr": "A"},
    40    {"old_id": 87, "new_id": 29911, "repr": "T"},
    41    {"old_id": 126, "new_id": 29912, "repr": "{"},
    42    {"old_id": 128, "new_id": 29913, "repr": "}"},
    43    {"old_id": 50, "new_id": 29914, "repr": "/"},
    44    {"old_id": 42, "new_id": 29915, "repr": "'"},
    45    {"old_id": 123, "new_id": 29916, "repr": "x"},
    46    {"old_id": 98, "new_id": 29918, "repr": "_"},
    47    {"old_id": 125, "new_id": 29920, "repr": "z"},
    48    {"old_id": 64, "new_id": 29922, "repr": "="},
    49    {"old_id": 72, "new_id": 29923, "repr": "E"},
    50    {"old_id": 80, "new_id": 29924, "repr": "M"},
    51    {"old_id": 83, "new_id": 29925, "repr": "P"},
    52    {"old_id": 109, "new_id": 29926, "repr": "j"},
    53    {"old_id": 71, "new_id": 29928, "repr": "D"},
    54    {"old_id": 60, "new_id": 29929, "repr": "9"},
    55    {"old_id": 45, "new_id": 29930, "repr": "*"},
    56    {"old_id": 79, "new_id": 29931, "repr": "L"},
    57    {"old_id": 69, "new_id": 29933, "repr": "B"},
    58    {"old_id": 85, "new_id": 29934, "repr": "R"},
    59    {"old_id": 62, "new_id": 29936, "repr": ";"},
    60    {"old_id": 38, "new_id": 29937, "repr": "#"},
    61    {"old_id": 39, "new_id": 29938, "repr": "$"},
    62    {"old_id": 116, "new_id": 29939, "repr": "q"},
    63    {"old_id": 81, "new_id": 29940, "repr": "N"},
    64    {"old_id": 54, "new_id": 29941, "repr": "3"},
    65    {"old_id": 73, "new_id": 29943, "repr": "F"},
    66    {"old_id": 56, "new_id": 29945, "repr": "5"},
    67    {"old_id": 55, "new_id": 29946, "repr": "4"},
    68    {"old_id": 59, "new_id": 29947, "repr": "8"},
    69    {"old_id": 82, "new_id": 29949, "repr": "O"},
    70    {"old_id": 75, "new_id": 29950, "repr": "H"},
    71    {"old_id": 99, "new_id": 29952, "repr": "`"},
    72    {"old_id": 57, "new_id": 29953, "repr": "6"},
    73    {"old_id": 74, "new_id": 29954, "repr": "G"},
    74    {"old_id": 58, "new_id": 29955, "repr": "7"},
    75    {"old_id": 90, "new_id": 29956, "repr": "W"},
    76    {"old_id": 65, "new_id": 29958, "repr": ">"},
    77    {"old_id": 94, "new_id": 29961, "repr": "["},
    78    {"old_id": 96, "new_id": 29962, "repr": "]"},
    79    {"old_id": 89, "new_id": 29963, "repr": "V"},
    80    {"old_id": 88, "new_id": 29965, "repr": "U"},
    81    {"old_id": 63, "new_id": 29966, "repr": "<"},
    82    {"old_id": 77, "new_id": 29967, "repr": "J"},
    83    {"old_id": 78, "new_id": 29968, "repr": "K"},
    84    {"old_id": 66, "new_id": 29973, "repr": "?"},
    85    {"old_id": 46, "new_id": 29974, "repr": "+"},
    86    {"old_id": 92, "new_id": 29979, "repr": "Y"},
    87    {"old_id": 84, "new_id": 29984, "repr": "Q"},
    88    {"old_id": 97, "new_id": 29985, "repr": "^"},
    89    {"old_id": 41, "new_id": 29987, "repr": "&"},
    90    {"old_id": 127, "new_id": 29989, "repr": "|"},
    91    {"old_id": 91, "new_id": 29990, "repr": "X"},
    92    {"old_id": 36, "new_id": 29991, "repr": "!"},
    93    {"old_id": 67, "new_id": 29992, "repr": "@"},
    94    {"old_id": 40, "new_id": 29995, "repr": "%"},
    95    {"old_id": 93, "new_id": 29999, "repr": "Z"},
    96    {"old_id": 16, "new_id": 30004, "repr": "\r"},
    97    {"old_id": 129, "new_id": 30022, "repr": "~"}
    98  ]