github.com/instill-ai/component@v0.16.0-beta/pkg/operator/text/v0/config/tasks.json (about)

     1  {
     2    "TASK_CONVERT_TO_TEXT": {
     3      "instillShortDescription": "Convert document to text.",
     4      "input": {
     5        "description": "Input",
     6        "instillEditOnNodeFields": [
     7          "doc"
     8        ],
     9        "instillUIOrder": 0,
    10        "properties": {
    11          "doc": {
    12            "description": "Base64-encoded document (PDF, DOC, DOCX, XML, HTML, RTF, etc.) to be converted to plain text",
    13            "instillAcceptFormats": [
    14              "*/*"
    15            ],
    16            "instillUIMultiline": true,
    17            "instillUIOrder": 0,
    18            "instillUpstreamTypes": [
    19              "reference"
    20            ],
    21            "title": "Document",
    22            "type": "string"
    23          }
    24        },
    25        "required": [
    26          "doc"
    27        ],
    28        "title": "Input",
    29        "type": "object"
    30      },
    31      "output": {
    32        "description": "Output",
    33        "instillUIOrder": 0,
    34        "properties": {
    35          "body": {
    36            "description": "Plain text converted from the document",
    37            "instillFormat": "string",
    38            "instillUIMultiline": true,
    39            "instillUIOrder": 0,
    40            "title": "Body",
    41            "type": "string"
    42          },
    43          "error": {
    44            "description": "Error message, if any, from the conversion process",
    45            "instillFormat": "string",
    46            "instillUIMultiline": true,
    47            "instillUIOrder": 3,
    48            "title": "Error",
    49            "type": "string"
    50          },
    51          "meta": {
    52            "description": "Metadata extracted from the document",
    53            "instillFormat": "semi-structured/object",
    54            "instillUIOrder": 1,
    55            "required": [],
    56            "title": "Meta",
    57            "type": "object"
    58          },
    59          "msecs": {
    60            "description": "Time taken to convert the document, in milliseconds",
    61            "instillFormat": "number",
    62            "instillUIOrder": 2,
    63            "title": "MSecs",
    64            "type": "number"
    65          }
    66        },
    67        "required": [
    68          "body",
    69          "meta",
    70          "msecs",
    71          "error"
    72        ],
    73        "title": "Output",
    74        "type": "object"
    75      }
    76    },
    77    "TASK_SPLIT_BY_TOKEN": {
    78      "instillShortDescription": "Split text by token.",
    79      "input": {
    80        "description": "Input",
    81        "instillEditOnNodeFields": [
    82          "text",
    83          "model"
    84        ],
    85        "instillUIOrder": 0,
    86        "properties": {
    87          "chunk_token_size": {
    88            "default": 500,
    89            "description": "Number of tokens per text chunk",
    90            "instillAcceptFormats": [
    91              "integer"
    92            ],
    93            "instillUIOrder": 2,
    94            "instillUpstreamTypes": [
    95              "value",
    96              "reference"
    97            ],
    98            "minimum": 1,
    99            "title": "Chunk Token Size",
   100            "type": "integer"
   101          },
   102          "model": {
   103            "description": "ID of the model to use for tokenization",
   104            "enum": [
   105              "gpt-4",
   106              "gpt-3.5-turbo",
   107              "text-davinci-003",
   108              "text-davinci-002",
   109              "text-davinci-001",
   110              "text-curie-001",
   111              "text-babbage-001",
   112              "text-ada-001",
   113              "davinci",
   114              "curie",
   115              "babbage",
   116              "ada",
   117              "code-davinci-002",
   118              "code-davinci-001",
   119              "code-cushman-002",
   120              "code-cushman-001",
   121              "davinci-codex",
   122              "cushman-codex",
   123              "text-davinci-edit-001",
   124              "code-davinci-edit-001",
   125              "text-embedding-ada-002",
   126              "text-similarity-davinci-001",
   127              "text-similarity-curie-001",
   128              "text-similarity-babbage-001",
   129              "text-similarity-ada-001",
   130              "text-search-davinci-doc-001",
   131              "text-search-curie-doc-001",
   132              "text-search-babbage-doc-001",
   133              "text-search-ada-doc-001",
   134              "code-search-babbage-code-001",
   135              "code-search-ada-code-001",
   136              "gpt2"
   137            ],
   138            "instillAcceptFormats": [
   139              "string"
   140            ],
   141            "instillUIOrder": 1,
   142            "instillUpstreamTypes": [
   143              "value",
   144              "reference",
   145              "template"
   146            ],
   147            "title": "Model",
   148            "type": "string"
   149          },
   150          "text": {
   151            "description": "Text to be split",
   152            "instillAcceptFormats": [
   153              "string"
   154            ],
   155            "instillUIMultiline": true,
   156            "instillUIOrder": 0,
   157            "instillUpstreamTypes": [
   158              "value",
   159              "reference",
   160              "template"
   161            ],
   162            "title": "Text",
   163            "type": "string"
   164          }
   165        },
   166        "required": [
   167          "text",
   168          "model"
   169        ],
   170        "title": "Input",
   171        "type": "object"
   172      },
   173      "output": {
   174        "description": "Output",
   175        "instillEditOnNodeFields": [
   176          "text_chunks"
   177        ],
   178        "instillUIOrder": 0,
   179        "properties": {
   180          "chunk_num": {
   181            "description": "Total number of output text chunks",
   182            "instillUIOrder": 2,
   183            "instillFormat": "integer",
   184            "title": "Number of Text Chunks",
   185            "type": "integer"
   186          },
   187          "text_chunks": {
   188            "description": "Text chunks after splitting",
   189            "instillUIOrder": 1,
   190            "instillFormat": "array:string",
   191            "items": {
   192              "title": "Text Chunk",
   193              "description": "Text chunk after splitting",
   194              "instillFormat": "string",
   195              "instillUIMultiline": true,
   196              "type": "string"
   197            },
   198            "title": "Text Chunks",
   199            "type": "array"
   200          },
   201          "token_count": {
   202            "description": "Total count of tokens in the input text",
   203            "instillUIOrder": 0,
   204            "instillFormat": "integer",
   205            "title": "Token Count",
   206            "type": "integer"
   207          }
   208        },
   209        "required": [
   210          "token_count",
   211          "text_chunks",
   212          "chunk_num"
   213        ],
   214        "title": "Output",
   215        "type": "object"
   216      }
   217    }
   218  }