github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/convert.go (about) 1 package text 2 3 import ( 4 "bytes" 5 "fmt" 6 "strings" 7 8 "encoding/base64" 9 10 "code.sajari.com/docconv" 11 12 "github.com/instill-ai/component/pkg/base" 13 ) 14 15 // ConvertToTextInput defines the input for convert to text task 16 type ConvertToTextInput struct { 17 // Doc: Document to convert 18 Doc string `json:"doc"` 19 } 20 21 // ConvertToTextOutput defines the output for convert to text task 22 type ConvertToTextOutput struct { 23 // Body: Plain text converted from the document 24 Body string `json:"body"` 25 // Meta: Metadata extracted from the document 26 Meta map[string]string `json:"meta"` 27 // MSecs: Time taken to convert the document 28 MSecs uint32 `json:"msecs"` 29 // Error: Error message if any during the conversion process 30 Error string `json:"error"` 31 } 32 33 func getContentTypeFromBase64(base64String string) (string, error) { 34 // Remove the "data:" prefix and split at the first semicolon 35 contentType := strings.TrimPrefix(base64String, "data:") 36 37 parts := strings.SplitN(contentType, ";", 2) 38 if len(parts) != 2 { 39 return "", fmt.Errorf("invalid format") 40 } 41 42 // The first part is the content type 43 return parts[0], nil 44 } 45 46 func convertToText(input ConvertToTextInput) (ConvertToTextOutput, error) { 47 48 contentType, err := getContentTypeFromBase64(input.Doc) 49 if err != nil { 50 return ConvertToTextOutput{}, err 51 } 52 53 b, err := base64.StdEncoding.DecodeString(base.TrimBase64Mime(input.Doc)) 54 if err != nil { 55 return ConvertToTextOutput{}, err 56 } 57 58 res, err := docconv.Convert(bytes.NewReader(b), contentType, false) 59 if err != nil { 60 return ConvertToTextOutput{}, err 61 } 62 63 if res.Meta == nil { 64 res.Meta = map[string]string{} 65 } 66 67 return ConvertToTextOutput{ 68 Body: res.Body, 69 Meta: res.Meta, 70 MSecs: res.MSecs, 71 Error: res.Error, 72 }, nil 73 }