{
  "$defs": {
    "page_info": {
      "properties": {
        "link": {
          "description": "The full URL to which the webpage link is pointing, e.g., http://www.example.com/foo/bar.",
          "instillFormat": "string",
          "instillUIOrder": 0,
          "title": "Link",
          "type": "string"
        },
        "link_html": {
          "description": "The scraped raw html of the link associated with this webpage link",
          "instillFormat": "string",
          "instillUIMultiline": true,
          "instillUIOrder": 3,
          "title": "Link HTML",
          "type": "string"
        },
        "link_text": {
          "description": "The scraped text of the link associated with this webpage link, in plain text",
          "instillFormat": "string",
          "instillUIMultiline": true,
          "instillUIOrder": 2,
          "title": "Link Text",
          "type": "string"
        },
        "title": {
          "description": "The title of a webpage link, in plain text",
          "instillFormat": "string",
          "instillUIMultiline": true,
          "instillUIOrder": 1,
          "title": "Title",
          "type": "string"
        }
      },
      "required": [
        "link"
      ],
      "title": "Page Information",
      "type": "object"
    }
  },
  "TASK_SCRAPE_WEBSITE": {
    "instillShortDescription": "Scrape the website contents.",
    "input": {
      "instillUIOrder": 0,
      "properties": {
        "allowed_domains": {
          "description": "A list of domains that are allowed to be scraped. If empty, all domains are allowed.",
          "instillAcceptFormats": [
            "array:string"
          ],
          "instillUIOrder": 1,
          "instillUpstreamTypes": [
            "value",
            "reference"
          ],
          "items": {
            "type": "string"
          },
          "title": "Allowed Domains",
          "type": "array"
        },
        "include_link_html": {
          "default": false,
          "description": "Indicate whether to scrape the link and include the raw HTML of the link associated with this page in the 'link_html' field",
          "instillAcceptFormats": [
            "boolean"
          ],
          "instillUIOrder": 4,
          "instillUpstreamTypes": [
            "value",
            "reference"
          ],
          "title": "Include Link HTML",
          "type": "boolean"
        },
        "include_link_text": {
          "default": false,
          "description": "Indicate whether to scrape the link and include the text of the link associated with this page in the 'link_text' field",
          "instillAcceptFormats": [
            "boolean"
          ],
          "instillUIOrder": 3,
          "instillUpstreamTypes": [
            "value",
            "reference"
          ],
          "title": "Include Link Text",
          "type": "boolean"
        },
        "max_k": {
          "default": 10,
          "description": "The max number of pages to return. If the number is set to 0, all pages will be returned. If the number is set to a positive integer, at most max k pages will be returned.",
          "instillAcceptFormats": [
            "integer"
          ],
          "instillUIOrder": 2,
          "instillUpstreamTypes": [
            "value",
            "reference"
          ],
          "maximum": 100,
          "minimum": 0,
          "title": "Max Number of Pages",
          "type": "integer"
        },
        "target_url": {
          "description": "The root URL to scrape. All links on this page will be scraped, and all links on those pages, and so on.",
          "instillAcceptFormats": [
            "string"
          ],
          "instillUIMultiline": true,
          "instillUIOrder": 0,
          "instillUpstreamTypes": [
            "value",
            "reference",
            "template"
          ],
          "title": "Target URL",
          "type": "string"
        }
      },
      "required": [
        "target_url",
        "max_k"
      ],
      "title": "Input",
      "type": "object"
    },
    "output": {
      "instillUIOrder": 0,
      "properties": {
        "pages": {
          "description": "The scraped webpages",
          "instillUIOrder": 0,
          "items": {
            "$ref": "#/$defs/page_info",
            "title": "Page"
          },
          "title": "Pages",
          "type": "array"
        }
      },
      "required": [
        "pages"
      ],
      "title": "Output",
      "type": "object"
    }
  }
}