github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/pdf.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package docextractor 5 6 import ( 7 "bytes" 8 "errors" 9 "fmt" 10 "io" 11 "io/ioutil" 12 "os" 13 "path" 14 "strings" 15 16 "github.com/ledongthuc/pdf" 17 ) 18 19 type pdfExtractor struct{} 20 21 func (pe *pdfExtractor) Match(filename string) bool { 22 supportedExtensions := map[string]bool{ 23 "pdf": true, 24 } 25 extension := strings.TrimPrefix(path.Ext(filename), ".") 26 return supportedExtensions[extension] 27 } 28 29 func (pe *pdfExtractor) Extract(filename string, r io.ReadSeeker) (out string, outErr error) { 30 defer func() { 31 if r := recover(); r != nil { 32 out = "" 33 outErr = errors.New("error extracting pdf text") 34 } 35 }() 36 f, err := ioutil.TempFile(os.TempDir(), "pdflib") 37 if err != nil { 38 return "", fmt.Errorf("error creating temporary file: %v", err) 39 } 40 defer f.Close() 41 defer os.Remove(f.Name()) 42 size, err := io.Copy(f, r) 43 if err != nil { 44 return "", fmt.Errorf("error copying data into temporary file: %v", err) 45 } 46 47 reader, err := pdf.NewReader(f, size) 48 if err != nil { 49 return "", err 50 } 51 52 var buf bytes.Buffer 53 b, err := reader.GetPlainText() 54 if err != nil { 55 return "", err 56 } 57 buf.ReadFrom(b) 58 return buf.String(), nil 59 }