github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/pdf.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package docextractor
     5  
     6  import (
     7  	"bytes"
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  	"io/ioutil"
    12  	"os"
    13  	"path"
    14  	"strings"
    15  
    16  	"github.com/ledongthuc/pdf"
    17  )
    18  
// pdfExtractor extracts plain text from PDF documents. It holds no state; its
// Match method selects files by extension and its Extract method performs the
// text extraction.
type pdfExtractor struct{}
    20  
    21  func (pe *pdfExtractor) Match(filename string) bool {
    22  	supportedExtensions := map[string]bool{
    23  		"pdf": true,
    24  	}
    25  	extension := strings.TrimPrefix(path.Ext(filename), ".")
    26  	return supportedExtensions[extension]
    27  }
    28  
    29  func (pe *pdfExtractor) Extract(filename string, r io.ReadSeeker) (out string, outErr error) {
    30  	defer func() {
    31  		if r := recover(); r != nil {
    32  			out = ""
    33  			outErr = errors.New("error extracting pdf text")
    34  		}
    35  	}()
    36  	f, err := ioutil.TempFile(os.TempDir(), "pdflib")
    37  	if err != nil {
    38  		return "", fmt.Errorf("error creating temporary file: %v", err)
    39  	}
    40  	defer f.Close()
    41  	defer os.Remove(f.Name())
    42  	size, err := io.Copy(f, r)
    43  	if err != nil {
    44  		return "", fmt.Errorf("error copying data into temporary file: %v", err)
    45  	}
    46  
    47  	reader, err := pdf.NewReader(f, size)
    48  	if err != nil {
    49  		return "", err
    50  	}
    51  
    52  	var buf bytes.Buffer
    53  	b, err := reader.GetPlainText()
    54  	if err != nil {
    55  		return "", err
    56  	}
    57  	buf.ReadFrom(b)
    58  	return buf.String(), nil
    59  }