github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/cmd/detokenizer/detokenizer.go (about) 1 package main 2 3 import ( 4 "flag" 5 "github.com/wbrown/gpt_bpe" 6 "log" 7 "os" 8 ) 9 10 func main() { 11 inputTokenizerId := flag.String("input_tokenizer", "gpt2", 12 "input tokenizer id [gpt2, pile, clip, huggingface-id]") 13 inputFile := flag.String("input", "", 14 "input file to retokenize") 15 in32 := flag.Bool("in32", false, "force input tokens to be read as 32-bit") 16 outputFile := flag.String("output", "detokenized.txt", 17 "output file to write retokenized data") 18 flag.Parse() 19 20 if *inputFile == "" { 21 flag.Usage() 22 log.Fatal("Must provide -input") 23 } 24 if *inputTokenizerId == "" { 25 flag.Usage() 26 log.Fatal("Must provide -input_tokenizer") 27 } 28 if *outputFile == "" { 29 flag.Usage() 30 log.Fatal("Must provide -output") 31 } 32 33 // check if input file exists 34 if _, err := os.Stat(*inputFile); os.IsNotExist(err) { 35 log.Fatal("Input file does not exist") 36 } 37 38 // Check if it's an internal reference. If not, it's a file path. 39 inputTokenizer, inputErr := gpt_bpe.NewEncoder( 40 *inputTokenizerId + "-tokenizer") 41 if inputErr != nil { 42 // Fall back to path-like. 43 inputTokenizer, inputErr = gpt_bpe.NewEncoder(*inputTokenizerId) 44 if inputErr != nil { 45 log.Fatal(inputErr) 46 } 47 } 48 input32Bit := *in32 || len(inputTokenizer.Encoder) > 65536 49 50 inputFileHandle, err := os.Open(*inputFile) 51 if err != nil { 52 log.Fatal(err) 53 } 54 defer inputFileHandle.Close() 55 56 outputFileHandle, err := os.Create(*outputFile) 57 if err != nil { 58 log.Fatal(err) 59 } 60 61 if input32Bit { 62 log.Println("Reading as 32-bit") 63 } else { 64 log.Println("Reading as 16-bit") 65 } 66 // Read 4096 bytes at a time from the input file. 67 // This is a bit arbitrary, but it's a good tradeoff 68 // between memory usage and speed. 69 bytes := make([]byte, 4096) 70 for { 71 bytesRead, err := inputFileHandle.Read(bytes) 72 if err != nil { 73 break 74 } 75 76 // Decode the bytes into a string. 77 var decoded string 78 if bytesRead == 4096 { 79 decoded = inputTokenizer.DecodeBuffer(&bytes, input32Bit) 80 } else { 81 filledBytes := bytes[:bytesRead] 82 decoded = inputTokenizer.DecodeBuffer(&filledBytes, input32Bit) 83 } 84 85 // Write the decoded string to the output file. 86 _, err = outputFileHandle.WriteString(decoded) 87 if err != nil { 88 log.Fatal(err) 89 } 90 } 91 }