github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/split.go (about) 1 package text 2 3 import ( 4 "github.com/pkoukk/tiktoken-go" 5 ) 6 7 const defaultChunkTokenSize = 500 8 9 // SplitByTokenInput defines the input for split by token task 10 type SplitByTokenInput struct { 11 // Text: Text to split 12 Text string `json:"text"` 13 // Model: ID of the model to use for tokenization 14 Model string `json:"model"` 15 // ChunkTokenSize: Number of tokens per text chunk 16 ChunkTokenSize *int `json:"chunk_token_size,omitempty"` 17 } 18 19 // SplitByTokenOutput defines the output for split by token task 20 type SplitByTokenOutput struct { 21 // TokenCount: Number of tokens in the text 22 TokenCount int `json:"token_count"` 23 // TextChunks: List of text chunks 24 TextChunks []string `json:"text_chunks"` 25 // ChunkNum: Number of text chunks 26 ChunkNum int `json:"chunk_num"` 27 } 28 29 // splitTextIntoChunks splits text into text chunks based on token size 30 func splitTextIntoChunks(input SplitByTokenInput) (SplitByTokenOutput, error) { 31 output := SplitByTokenOutput{} 32 33 if input.ChunkTokenSize == nil || *input.ChunkTokenSize <= 0 { 34 input.ChunkTokenSize = new(int) 35 *input.ChunkTokenSize = defaultChunkTokenSize 36 } 37 38 tkm, err := tiktoken.EncodingForModel(input.Model) 39 if err != nil { 40 return output, err 41 } 42 43 token := tkm.Encode(input.Text, nil, nil) 44 output.TokenCount = len(token) 45 output.TextChunks = []string{} 46 for start := 0; start < len(token); start += *input.ChunkTokenSize { 47 end := min(start+*input.ChunkTokenSize, len(token)) 48 output.TextChunks = append(output.TextChunks, tkm.Decode(token[start:end])) 49 } 50 output.ChunkNum = len(output.TextChunks) 51 return output, nil 52 }