github.com/instill-ai/component@v0.16.0-beta/pkg/connector/website/v0/config/tasks.json (about)

     1  {
     2    "$defs": {
     3      "page_info": {
     4        "properties": {
     5          "link": {
     6            "description": "The full URL to which the webpage link is pointing, e.g., http://www.example.com/foo/bar.",
     7            "instillFormat": "string",
     8            "instillUIOrder": 0,
     9            "title": "Link",
    10            "type": "string"
    11          },
    12          "link_html": {
    13            "description": "The scraped raw HTML of the link associated with this webpage link",
    14            "instillFormat": "string",
    15            "instillUIMultiline": true,
    16            "instillUIOrder": 3,
    17            "title": "Link HTML",
    18            "type": "string"
    19          },
    20          "link_text": {
    21            "description": "The scraped text of the link associated with this webpage link, in plain text",
    22            "instillFormat": "string",
    23            "instillUIMultiline": true,
    24            "instillUIOrder": 2,
    25            "title": "Link Text",
    26            "type": "string"
    27          },
    28          "title": {
    29            "description": "The title of a webpage link, in plain text",
    30            "instillFormat": "string",
    31            "instillUIMultiline": true,
    32            "instillUIOrder": 1,
    33            "title": "Title",
    34            "type": "string"
    35          }
    36        },
    37        "required": [
    38          "link"
    39        ],
    40        "title": "Page Information",
    41        "type": "object"
    42      }
    43    },
    44    "TASK_SCRAPE_WEBSITE": {
    45      "instillShortDescription": "Scrape the website contents.",
    46      "input": {
    47        "instillUIOrder": 0,
    48        "properties": {
    49          "allowed_domains": {
    50            "description": "A list of domains that are allowed to be scraped. If empty, all domains are allowed.",
    51            "instillAcceptFormats": [
    52              "array:string"
    53            ],
    54            "instillUIOrder": 1,
    55            "instillUpstreamTypes": [
    56              "value",
    57              "reference"
    58            ],
    59            "items": {
    60              "type": "string"
    61            },
    62            "title": "Allowed Domains",
    63            "type": "array"
    64          },
    65          "include_link_html": {
    66            "default": false,
    67            "description": "Indicate whether to scrape the link and include the raw HTML of the link associated with this page in the 'link_html' field",
    68            "instillAcceptFormats": [
    69              "boolean"
    70            ],
    71            "instillUIOrder": 4,
    72            "instillUpstreamTypes": [
    73              "value",
    74              "reference"
    75            ],
    76            "title": "Include Link HTML",
    77            "type": "boolean"
    78          },
    79          "include_link_text": {
    80            "default": false,
    81            "description": "Indicate whether to scrape the link and include the text of the link associated with this page in the 'link_text' field",
    82            "instillAcceptFormats": [
    83              "boolean"
    84            ],
    85            "instillUIOrder": 3,
    86            "instillUpstreamTypes": [
    87              "value",
    88              "reference"
    89            ],
    90            "title": "Include Link Text",
    91            "type": "boolean"
    92          },
    93          "max_k": {
    94            "default": 10,
    95            "description": "The maximum number of pages to return. If set to 0, all pages will be returned. If set to a positive integer, at most that many pages will be returned.",
    96            "instillAcceptFormats": [
    97              "integer"
    98            ],
    99            "instillUIOrder": 2,
   100            "instillUpstreamTypes": [
   101              "value",
   102              "reference"
   103            ],
   104            "maximum": 100,
   105            "minimum": 0,
   106            "title": "Max Number of Pages",
   107            "type": "integer"
   108          },
   109          "target_url": {
   110            "description": "The root URL to scrape. All links on this page will be scraped, and all links on those pages, and so on.",
   111            "instillAcceptFormats": [
   112              "string"
   113            ],
   114            "instillUIMultiline": true,
   115            "instillUIOrder": 0,
   116            "instillUpstreamTypes": [
   117              "value",
   118              "reference",
   119              "template"
   120            ],
   121            "title": "Target URL",
   122            "type": "string"
   123          }
   124        },
   125        "required": [
   126          "target_url",
   127          "max_k"
   128        ],
   129        "title": "Input",
   130        "type": "object"
   131      },
   132      "output": {
   133        "instillUIOrder": 0,
   134        "properties": {
   135          "pages": {
   136            "description": "The scraped webpages",
   137            "instillUIOrder": 0,
   138            "items": {
   139              "$ref": "#/$defs/page_info",
   140              "title": "Page"
   141            },
   142            "title": "Pages",
   143            "type": "array"
   144          }
   145        },
   146        "required": [
   147          "pages"
   148        ],
   149        "title": "Output",
   150        "type": "object"
   151      }
   152    }
   153  }