github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/documents.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package docextractor
     5  
     6  import (
     7  	"errors"
     8  	"io"
     9  	"path"
    10  	"strings"
    11  
    12  	"code.sajari.com/docconv"
    13  )
    14  
// documentExtractor extracts text from document files by dispatching to a
// docconv converter chosen from the file's extension. It carries no state.
type documentExtractor struct{}
    16  
    17  var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string]string, error){
    18  	"doc":   docconv.ConvertDoc,
    19  	"docx":  docconv.ConvertDocx,
    20  	"pptx":  docconv.ConvertPptx,
    21  	"odt":   docconv.ConvertODT,
    22  	"html":  func(r io.Reader) (string, map[string]string, error) { return docconv.ConvertHTML(r, true) },
    23  	"pages": docconv.ConvertPages,
    24  	"rtf":   docconv.ConvertRTF,
    25  	"pdf":   docconv.ConvertPDF,
    26  }
    27  
    28  func (de *documentExtractor) Match(filename string) bool {
    29  	extension := strings.TrimPrefix(path.Ext(filename), ".")
    30  	_, ok := doconvConverterByExtensions[extension]
    31  	return ok
    32  }
    33  
    34  func (de *documentExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
    35  	extension := strings.TrimPrefix(path.Ext(filename), ".")
    36  	converter, ok := doconvConverterByExtensions[extension]
    37  	if !ok {
    38  		return "", errors.New("unknown converter")
    39  	}
    40  
    41  	text, _, err := converter(r)
    42  	if err != nil {
    43  		return "", err
    44  	}
    45  
    46  	return text, nil
    47  }