github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/cmd/detokenizer/detokenizer.go (about)

     1  package main
     2  
import (
	"flag"
	"io"
	"log"
	"os"

	"github.com/wbrown/gpt_bpe"
)
     9  
    10  func main() {
    11  	inputTokenizerId := flag.String("input_tokenizer", "gpt2",
    12  		"input tokenizer id [gpt2, pile, clip, huggingface-id]")
    13  	inputFile := flag.String("input", "",
    14  		"input file to retokenize")
    15  	in32 := flag.Bool("in32", false, "force input tokens to be read as 32-bit")
    16  	outputFile := flag.String("output", "detokenized.txt",
    17  		"output file to write retokenized data")
    18  	flag.Parse()
    19  
    20  	if *inputFile == "" {
    21  		flag.Usage()
    22  		log.Fatal("Must provide -input")
    23  	}
    24  	if *inputTokenizerId == "" {
    25  		flag.Usage()
    26  		log.Fatal("Must provide -input_tokenizer")
    27  	}
    28  	if *outputFile == "" {
    29  		flag.Usage()
    30  		log.Fatal("Must provide -output")
    31  	}
    32  
    33  	// check if input file exists
    34  	if _, err := os.Stat(*inputFile); os.IsNotExist(err) {
    35  		log.Fatal("Input file does not exist")
    36  	}
    37  
    38  	// Check if it's an internal reference. If not, it's a file path.
    39  	inputTokenizer, inputErr := gpt_bpe.NewEncoder(
    40  		*inputTokenizerId + "-tokenizer")
    41  	if inputErr != nil {
    42  		// Fall back to path-like.
    43  		inputTokenizer, inputErr = gpt_bpe.NewEncoder(*inputTokenizerId)
    44  		if inputErr != nil {
    45  			log.Fatal(inputErr)
    46  		}
    47  	}
    48  	input32Bit := *in32 || len(inputTokenizer.Encoder) > 65536
    49  
    50  	inputFileHandle, err := os.Open(*inputFile)
    51  	if err != nil {
    52  		log.Fatal(err)
    53  	}
    54  	defer inputFileHandle.Close()
    55  
    56  	outputFileHandle, err := os.Create(*outputFile)
    57  	if err != nil {
    58  		log.Fatal(err)
    59  	}
    60  
    61  	if input32Bit {
    62  		log.Println("Reading as 32-bit")
    63  	} else {
    64  		log.Println("Reading as 16-bit")
    65  	}
    66  	// Read 4096 bytes at a time from the input file.
    67  	// This is a bit arbitrary, but it's a good tradeoff
    68  	// between memory usage and speed.
    69  	bytes := make([]byte, 4096)
    70  	for {
    71  		bytesRead, err := inputFileHandle.Read(bytes)
    72  		if err != nil {
    73  			break
    74  		}
    75  
    76  		// Decode the bytes into a string.
    77  		var decoded string
    78  		if bytesRead == 4096 {
    79  			decoded = inputTokenizer.DecodeBuffer(&bytes, input32Bit)
    80  		} else {
    81  			filledBytes := bytes[:bytesRead]
    82  			decoded = inputTokenizer.DecodeBuffer(&filledBytes, input32Bit)
    83  		}
    84  
    85  		// Write the decoded string to the output file.
    86  		_, err = outputFileHandle.WriteString(decoded)
    87  		if err != nil {
    88  			log.Fatal(err)
    89  		}
    90  	}
    91  }