github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/convert.go (about)

     1  package text
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"strings"
     7  
     8  	"encoding/base64"
     9  
    10  	"code.sajari.com/docconv"
    11  
    12  	"github.com/instill-ai/component/pkg/base"
    13  )
    14  
    15  // ConvertToTextInput defines the input for convert to text task
    16  type ConvertToTextInput struct {
    17  	// Doc: Document to convert
    18  	Doc string `json:"doc"`
    19  }
    20  
    21  // ConvertToTextOutput defines the output for convert to text task
    22  type ConvertToTextOutput struct {
    23  	// Body: Plain text converted from the document
    24  	Body string `json:"body"`
    25  	// Meta: Metadata extracted from the document
    26  	Meta map[string]string `json:"meta"`
    27  	// MSecs: Time taken to convert the document
    28  	MSecs uint32 `json:"msecs"`
    29  	// Error: Error message if any during the conversion process
    30  	Error string `json:"error"`
    31  }
    32  
    33  func getContentTypeFromBase64(base64String string) (string, error) {
    34  	// Remove the "data:" prefix and split at the first semicolon
    35  	contentType := strings.TrimPrefix(base64String, "data:")
    36  
    37  	parts := strings.SplitN(contentType, ";", 2)
    38  	if len(parts) != 2 {
    39  		return "", fmt.Errorf("invalid format")
    40  	}
    41  
    42  	// The first part is the content type
    43  	return parts[0], nil
    44  }
    45  
    46  func convertToText(input ConvertToTextInput) (ConvertToTextOutput, error) {
    47  
    48  	contentType, err := getContentTypeFromBase64(input.Doc)
    49  	if err != nil {
    50  		return ConvertToTextOutput{}, err
    51  	}
    52  
    53  	b, err := base64.StdEncoding.DecodeString(base.TrimBase64Mime(input.Doc))
    54  	if err != nil {
    55  		return ConvertToTextOutput{}, err
    56  	}
    57  
    58  	res, err := docconv.Convert(bytes.NewReader(b), contentType, false)
    59  	if err != nil {
    60  		return ConvertToTextOutput{}, err
    61  	}
    62  
    63  	if res.Meta == nil {
    64  		res.Meta = map[string]string{}
    65  	}
    66  
    67  	return ConvertToTextOutput{
    68  		Body:  res.Body,
    69  		Meta:  res.Meta,
    70  		MSecs: res.MSecs,
    71  		Error: res.Error,
    72  	}, nil
    73  }