github.com/instill-ai/component@v0.16.0-beta/pkg/connector/util/helper.go (about) 1 package util 2 3 import ( 4 "encoding/base64" 5 "mime/multipart" 6 "net/http" 7 "strings" 8 9 md "github.com/JohannesKaufmann/html-to-markdown" 10 "github.com/PuerkitoBio/goquery" 11 "github.com/h2non/filetype" 12 "github.com/instill-ai/component/pkg/base" 13 ) 14 15 func GetFileExt(fileData []byte) string { 16 kind, _ := filetype.Match(fileData) 17 if kind != filetype.Unknown && kind.Extension != "" { 18 return kind.Extension 19 } 20 //fallback to DetectContentType 21 mimeType := http.DetectContentType(fileData) 22 return mimeType[strings.LastIndex(mimeType, "/")+1:] 23 } 24 25 func WriteFile(writer *multipart.Writer, fileName string, fileData []byte) error { 26 part, err := writer.CreateFormFile(fileName, "file."+GetFileExt(fileData)) 27 if err != nil { 28 return err 29 } 30 _, err = part.Write(fileData) 31 return err 32 } 33 34 func WriteField(writer *multipart.Writer, key string, value string) { 35 if key != "" && value != "" { 36 _ = writer.WriteField(key, value) 37 } 38 } 39 40 // ScrapeWebpageHTML scrape the HTML content of a webpage 41 func ScrapeWebpageHTML(doc *goquery.Document) (string, error) { 42 return doc.Selection.Html() 43 } 44 45 // ScrapeWebpageTitle extracts and returns the title from the *goquery.Document 46 func ScrapeWebpageTitle(doc *goquery.Document) string { 47 // Find the title tag and get its text content 48 title := doc.Find("title").Text() 49 50 // Return the trimmed title 51 return strings.TrimSpace(title) 52 } 53 54 // ScrapeWebpageHTMLToMarkdown converts an HTML string to Markdown format 55 func ScrapeWebpageHTMLToMarkdown(html string) (string, error) { 56 // Initialize the markdown converter 57 converter := md.NewConverter("", true, nil) 58 59 // Convert the HTML to Markdown 60 markdown, err := converter.ConvertString(html) 61 if err != nil { 62 return "", err 63 } 64 65 return markdown, nil 66 } 67 68 // DecodeBase64 takes a base64-encoded blob, trims the MIME type (if present) 69 // and decodes the remaining bytes. 70 func DecodeBase64(input string) ([]byte, error) { 71 return base64.StdEncoding.DecodeString(base.TrimBase64Mime(input)) 72 }