github.com/haalcala/mattermost-server-change-repo@v0.0.0-20210713015153-16753fbeee5f/services/docextractor/pdf.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package docextractor
     5  
     6  import (
     7  	"bytes"
     8  	"fmt"
     9  	"io"
    10  	"io/ioutil"
    11  	"os"
    12  	"path"
    13  	"strings"
    14  
    15  	"github.com/ledongthuc/pdf"
    16  )
    17  
    18  type pdfExtractor struct{}
    19  
    20  func (pe *pdfExtractor) Match(filename string) bool {
    21  	supportedExtensions := map[string]bool{
    22  		"pdf": true,
    23  	}
    24  	extension := strings.TrimPrefix(path.Ext(filename), ".")
    25  	return supportedExtensions[extension]
    26  }
    27  
    28  func (pe *pdfExtractor) Extract(filename string, r io.Reader) (string, error) {
    29  	f, err := ioutil.TempFile(os.TempDir(), "pdflib")
    30  	if err != nil {
    31  		return "", fmt.Errorf("error creating temporary file: %v", err)
    32  	}
    33  	defer f.Close()
    34  	defer os.Remove(f.Name())
    35  	size, err := io.Copy(f, r)
    36  	if err != nil {
    37  		return "", fmt.Errorf("error copying data into temporary file: %v", err)
    38  	}
    39  
    40  	reader, err := pdf.NewReader(f, size)
    41  	if err != nil {
    42  		return "", err
    43  	}
    44  
    45  	var buf bytes.Buffer
    46  	b, err := reader.GetPlainText()
    47  	if err != nil {
    48  		return "", err
    49  	}
    50  	buf.ReadFrom(b)
    51  	return buf.String(), nil
    52  }