github.com/instill-ai/component@v0.16.0-beta/pkg/connector/util/helper.go (about)

     1  package util
     2  
     3  import (
     4  	"encoding/base64"
     5  	"mime/multipart"
     6  	"net/http"
     7  	"strings"
     8  
     9  	md "github.com/JohannesKaufmann/html-to-markdown"
    10  	"github.com/PuerkitoBio/goquery"
    11  	"github.com/h2non/filetype"
    12  	"github.com/instill-ai/component/pkg/base"
    13  )
    14  
    15  func GetFileExt(fileData []byte) string {
    16  	kind, _ := filetype.Match(fileData)
    17  	if kind != filetype.Unknown && kind.Extension != "" {
    18  		return kind.Extension
    19  	}
    20  	//fallback to DetectContentType
    21  	mimeType := http.DetectContentType(fileData)
    22  	return mimeType[strings.LastIndex(mimeType, "/")+1:]
    23  }
    24  
    25  func WriteFile(writer *multipart.Writer, fileName string, fileData []byte) error {
    26  	part, err := writer.CreateFormFile(fileName, "file."+GetFileExt(fileData))
    27  	if err != nil {
    28  		return err
    29  	}
    30  	_, err = part.Write(fileData)
    31  	return err
    32  }
    33  
    34  func WriteField(writer *multipart.Writer, key string, value string) {
    35  	if key != "" && value != "" {
    36  		_ = writer.WriteField(key, value)
    37  	}
    38  }
    39  
    40  // ScrapeWebpageHTML scrape the HTML content of a webpage
    41  func ScrapeWebpageHTML(doc *goquery.Document) (string, error) {
    42  	return doc.Selection.Html()
    43  }
    44  
    45  // ScrapeWebpageTitle extracts and returns the title from the *goquery.Document
    46  func ScrapeWebpageTitle(doc *goquery.Document) string {
    47  	// Find the title tag and get its text content
    48  	title := doc.Find("title").Text()
    49  
    50  	// Return the trimmed title
    51  	return strings.TrimSpace(title)
    52  }
    53  
    54  // ScrapeWebpageHTMLToMarkdown converts an HTML string to Markdown format
    55  func ScrapeWebpageHTMLToMarkdown(html string) (string, error) {
    56  	// Initialize the markdown converter
    57  	converter := md.NewConverter("", true, nil)
    58  
    59  	// Convert the HTML to Markdown
    60  	markdown, err := converter.ConvertString(html)
    61  	if err != nil {
    62  		return "", err
    63  	}
    64  
    65  	return markdown, nil
    66  }
    67  
    68  // DecodeBase64 takes a base64-encoded blob, trims the MIME type (if present)
    69  // and decodes the remaining bytes.
    70  func DecodeBase64(input string) ([]byte, error) {
    71  	return base64.StdEncoding.DecodeString(base.TrimBase64Mime(input))
    72  }