github.com/mattermost/mattermost-server/v5@v5.39.3/services/docextractor/documents.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package docextractor
     5  
     6  import (
     7  	"errors"
     8  	"io"
     9  	"path"
    10  	"strings"
    11  
    12  	"code.sajari.com/docconv"
    13  )
    14  
// documentExtractor is a stateless extractor: it has no fields, and its Match
// and Extract methods select a converter from doconvConverterByExtensions
// based on the extension of the file name they are given.
type documentExtractor struct{}
    16  
    17  var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string]string, error){
    18  	"doc":   docconv.ConvertDoc,
    19  	"docx":  docconv.ConvertDocx,
    20  	"pptx":  docconv.ConvertPptx,
    21  	"odt":   docconv.ConvertODT,
    22  	"html":  func(r io.Reader) (string, map[string]string, error) { return docconv.ConvertHTML(r, true) },
    23  	"pages": docconv.ConvertPages,
    24  	"rtf":   docconv.ConvertRTF,
    25  	"pdf":   docconv.ConvertPDF,
    26  }
    27  
    28  func (de *documentExtractor) Match(filename string) bool {
    29  	extension := strings.TrimPrefix(path.Ext(filename), ".")
    30  	_, ok := doconvConverterByExtensions[extension]
    31  	return ok
    32  }
    33  
    34  func (de *documentExtractor) Extract(filename string, r io.ReadSeeker) (out string, outErr error) {
    35  	defer func() {
    36  		if r := recover(); r != nil {
    37  			out = ""
    38  			outErr = errors.New("error extracting document text")
    39  		}
    40  	}()
    41  
    42  	extension := strings.TrimPrefix(path.Ext(filename), ".")
    43  	converter, ok := doconvConverterByExtensions[extension]
    44  	if !ok {
    45  		return "", errors.New("unknown converter")
    46  	}
    47  
    48  	text, _, err := converter(r)
    49  	if err != nil {
    50  		return "", err
    51  	}
    52  
    53  	return text, nil
    54  }