github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/split.go

github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/split.go (about)

     1  package text
     2  
     3  import (
     4  	"github.com/pkoukk/tiktoken-go"
     5  )
     6  
// defaultChunkTokenSize is the chunk size, in tokens, that splitTextIntoChunks
// uses when the input's ChunkTokenSize is nil or not positive.
const defaultChunkTokenSize = 500
     8  
     9  // SplitByTokenInput defines the input for split by token task
    10  type SplitByTokenInput struct {
    11  	// Text: Text to split
    12  	Text string `json:"text"`
    13  	// Model: ID of the model to use for tokenization
    14  	Model string `json:"model"`
    15  	// ChunkTokenSize: Number of tokens per text chunk
    16  	ChunkTokenSize *int `json:"chunk_token_size,omitempty"`
    17  }
    18  
    19  // SplitByTokenOutput defines the output for split by token task
    20  type SplitByTokenOutput struct {
    21  	// TokenCount: Number of tokens in the text
    22  	TokenCount int `json:"token_count"`
    23  	// TextChunks: List of text chunks
    24  	TextChunks []string `json:"text_chunks"`
    25  	// ChunkNum: Number of text chunks
    26  	ChunkNum int `json:"chunk_num"`
    27  }
    28  
    29  // splitTextIntoChunks splits text into text chunks based on token size
    30  func splitTextIntoChunks(input SplitByTokenInput) (SplitByTokenOutput, error) {
    31  	output := SplitByTokenOutput{}
    32  
    33  	if input.ChunkTokenSize == nil || *input.ChunkTokenSize <= 0 {
    34  		input.ChunkTokenSize = new(int)
    35  		*input.ChunkTokenSize = defaultChunkTokenSize
    36  	}
    37  
    38  	tkm, err := tiktoken.EncodingForModel(input.Model)
    39  	if err != nil {
    40  		return output, err
    41  	}
    42  
    43  	token := tkm.Encode(input.Text, nil, nil)
    44  	output.TokenCount = len(token)
    45  	output.TextChunks = []string{}
    46  	for start := 0; start < len(token); start += *input.ChunkTokenSize {
    47  		end := min(start+*input.ChunkTokenSize, len(token))
    48  		output.TextChunks = append(output.TextChunks, tkm.Decode(token[start:end]))
    49  	}
    50  	output.ChunkNum = len(output.TextChunks)
    51  	return output, nil
    52  }