github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/config/tasks.json (about) 1 { 2 "TASK_CONVERT_TO_TEXT": { 3 "instillShortDescription": "Convert document to text.", 4 "input": { 5 "description": "Input", 6 "instillEditOnNodeFields": [ 7 "doc" 8 ], 9 "instillUIOrder": 0, 10 "properties": { 11 "doc": { 12 "description": "Base64-encoded document (PDF, DOC, DOCX, XML, HTML, RTF, etc.) to be converted to plain text", 13 "instillAcceptFormats": [ 14 "*/*" 15 ], 16 "instillUIMultiline": true, 17 "instillUIOrder": 0, 18 "instillUpstreamTypes": [ 19 "reference" 20 ], 21 "title": "Document", 22 "type": "string" 23 } 24 }, 25 "required": [ 26 "doc" 27 ], 28 "title": "Input", 29 "type": "object" 30 }, 31 "output": { 32 "description": "Output", 33 "instillUIOrder": 0, 34 "properties": { 35 "body": { 36 "description": "Plain text converted from the document", 37 "instillFormat": "string", 38 "instillUIMultiline": true, 39 "instillUIOrder": 0, 40 "title": "Body", 41 "type": "string" 42 }, 43 "error": { 44 "description": "Error message, if any, produced during the conversion process", 45 "instillFormat": "string", 46 "instillUIMultiline": true, 47 "instillUIOrder": 3, 48 "title": "Error", 49 "type": "string" 50 }, 51 "meta": { 52 "description": "Metadata extracted from the document", 53 "instillFormat": "semi-structured/object", 54 "instillUIOrder": 1, 55 "required": [], 56 "title": "Meta", 57 "type": "object" 58 }, 59 "msecs": { 60 "description": "Time taken to convert the document, in milliseconds", 61 "instillFormat": "number", 62 "instillUIOrder": 2, 63 "title": "MSecs", 64 "type": "number" 65 } 66 }, 67 "required": [ 68 "body", 69 "meta", 70 "msecs", 71 "error" 72 ], 73 "title": "Output", 74 "type": "object" 75 } 76 }, 77 "TASK_SPLIT_BY_TOKEN": { 78 "instillShortDescription": "Split text by token.", 79 "input": { 80 "description": "Input", 81 "instillEditOnNodeFields": [ 82 "text", 83 "model" 84 ], 85 "instillUIOrder": 0, 86 "properties": { 87 "chunk_token_size": { 88 "default": 500, 89 
"description": "Number of tokens per text chunk", 90 "instillAcceptFormats": [ 91 "integer" 92 ], 93 "instillUIOrder": 2, 94 "instillUpstreamTypes": [ 95 "value", 96 "reference" 97 ], 98 "minimum": 1, 99 "title": "Chunk Token Size", 100 "type": "integer" 101 }, 102 "model": { 103 "description": "ID of the model to use for tokenization", 104 "enum": [ 105 "gpt-4", 106 "gpt-3.5-turbo", 107 "text-davinci-003", 108 "text-davinci-002", 109 "text-davinci-001", 110 "text-curie-001", 111 "text-babbage-001", 112 "text-ada-001", 113 "davinci", 114 "curie", 115 "babbage", 116 "ada", 117 "code-davinci-002", 118 "code-davinci-001", 119 "code-cushman-002", 120 "code-cushman-001", 121 "davinci-codex", 122 "cushman-codex", 123 "text-davinci-edit-001", 124 "code-davinci-edit-001", 125 "text-embedding-ada-002", 126 "text-similarity-davinci-001", 127 "text-similarity-curie-001", 128 "text-similarity-babbage-001", 129 "text-similarity-ada-001", 130 "text-search-davinci-doc-001", 131 "text-search-curie-doc-001", 132 "text-search-babbage-doc-001", 133 "text-search-ada-doc-001", 134 "code-search-babbage-code-001", 135 "code-search-ada-code-001", 136 "gpt2" 137 ], 138 "instillAcceptFormats": [ 139 "string" 140 ], 141 "instillUIOrder": 1, 142 "instillUpstreamTypes": [ 143 "value", 144 "reference", 145 "template" 146 ], 147 "title": "Model", 148 "type": "string" 149 }, 150 "text": { 151 "description": "Text to be split", 152 "instillAcceptFormats": [ 153 "string" 154 ], 155 "instillUIMultiline": true, 156 "instillUIOrder": 0, 157 "instillUpstreamTypes": [ 158 "value", 159 "reference", 160 "template" 161 ], 162 "title": "Text", 163 "type": "string" 164 } 165 }, 166 "required": [ 167 "text", 168 "model" 169 ], 170 "title": "Input", 171 "type": "object" 172 }, 173 "output": { 174 "description": "Output", 175 "instillEditOnNodeFields": [ 176 "texts" 177 ], 178 "instillUIOrder": 0, 179 "properties": { 180 "chunk_num": { 181 "description": "Total number of output text chunks", 182 
"instillUIOrder": 2, 183 "instillFormat": "integer", 184 "title": "Number of Text Chunks", 185 "type": "integer" 186 }, 187 "text_chunks": { 188 "description": "Text chunks after splitting", 189 "instillUIOrder": 1, 190 "instillFormat": "array:string", 191 "items": { 192 "title": "Text Chunk", 193 "description": "Text chunk after splitting", 194 "instillFormat": "string", 195 "instillUIMultiline": true, 196 "type": "string" 197 }, 198 "title": "Text Chunks", 199 "type": "array" 200 }, 201 "token_count": { 202 "description": "Total count of tokens in the input text", 203 "instillUIOrder": 0, 204 "instillFormat": "integer", 205 "title": "Token Count", 206 "type": "integer" 207 } 208 }, 209 "required": [ 210 "token_count", 211 "text_chunks", 212 "chunk_num" 213 ], 214 "title": "Output", 215 "type": "object" 216 } 217 } 218 }