github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/docextractor_test.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package docextractor
     5  
     6  import (
     7  	"bytes"
     8  	"errors"
     9  	"io"
    10  	"strings"
    11  	"testing"
    12  
    13  	"github.com/stretchr/testify/assert"
    14  	"github.com/stretchr/testify/require"
    15  
    16  	"github.com/masterhung0112/hk_server/v5/utils/testutils"
    17  )
    18  
    19  func TestExtract(t *testing.T) {
    20  	testCases := []struct {
    21  		Name         string
    22  		TestFileName string
    23  		Settings     ExtractSettings
    24  		Contains     []string
    25  		NotContains  []string
    26  		ExpectError  bool
    27  	}{
    28  		{
    29  			"Plain text file",
    30  			"test-markdown-basics.md",
    31  			ExtractSettings{},
    32  			[]string{"followed", "separated", "Basic"},
    33  			[]string{},
    34  			false,
    35  		},
    36  		{
    37  			"Plain small text file",
    38  			"test-hashtags.md",
    39  			ExtractSettings{},
    40  			[]string{"should", "render", "strings"},
    41  			[]string{},
    42  			false,
    43  		},
    44  		{
    45  			"Zip file without recursion",
    46  			"Fake_Team_Import.zip",
    47  			ExtractSettings{},
    48  			[]string{"users", "channels", "general"},
    49  			[]string{"purpose", "announcements"},
    50  			false,
    51  		},
    52  		{
    53  			"Zip file with recursion",
    54  			"Fake_Team_Import.zip",
    55  			ExtractSettings{ArchiveRecursion: true},
    56  			[]string{"users", "channels", "general", "purpose", "announcements"},
    57  			[]string{},
    58  			false,
    59  		},
    60  		{
    61  			"Rar file without recursion",
    62  			"Fake_Team_Import.rar",
    63  			ExtractSettings{},
    64  			[]string{"users", "channels", "general"},
    65  			[]string{"purpose", "announcements"},
    66  			false,
    67  		},
    68  		{
    69  			"Rar file with recursion",
    70  			"Fake_Team_Import.rar",
    71  			ExtractSettings{ArchiveRecursion: true},
    72  			[]string{"users", "channels", "general", "purpose", "announcements"},
    73  			[]string{},
    74  			false,
    75  		},
    76  		{
    77  			"Tar.gz file without recursion",
    78  			"Fake_Team_Import.tar.gz",
    79  			ExtractSettings{},
    80  			[]string{"users", "channels", "general"},
    81  			[]string{"purpose", "announcements"},
    82  			false,
    83  		},
    84  		{
    85  			"Tar.gz file with recursion",
    86  			"Fake_Team_Import.tar.gz",
    87  			ExtractSettings{ArchiveRecursion: true},
    88  			[]string{"users", "channels", "general", "purpose", "announcements"},
    89  			[]string{},
    90  			false,
    91  		},
    92  		{
    93  			"Pdf file",
    94  			"sample-doc.pdf",
    95  			ExtractSettings{},
    96  			[]string{"simple", "document", "contains"},
    97  			[]string{},
    98  			false,
    99  		},
   100  		{
   101  			"Docx file",
   102  			"sample-doc.docx",
   103  			ExtractSettings{},
   104  			[]string{"simple", "document", "contains"},
   105  			[]string{},
   106  			false,
   107  		},
   108  		{
   109  			"Odt file",
   110  			"sample-doc.odt",
   111  			ExtractSettings{},
   112  			[]string{"simple", "document", "contains"},
   113  			[]string{},
   114  			false,
   115  		},
   116  		{
   117  			"Pptx file",
   118  			"sample-doc.pptx",
   119  			ExtractSettings{},
   120  			[]string{"simple", "document", "contains"},
   121  			[]string{},
   122  			false,
   123  		},
   124  	}
   125  
   126  	for _, tc := range testCases {
   127  		t.Run(tc.Name, func(t *testing.T) {
   128  			data, err := testutils.ReadTestFile(tc.TestFileName)
   129  			require.NoError(t, err)
   130  			text, err := Extract(tc.TestFileName, bytes.NewReader(data), tc.Settings)
   131  			if tc.ExpectError {
   132  				require.Error(t, err)
   133  			} else {
   134  				require.NoError(t, err)
   135  				for _, expectedString := range tc.Contains {
   136  					assert.Contains(t, text, expectedString)
   137  				}
   138  				for _, notExpectedString := range tc.NotContains {
   139  					assert.NotContains(t, text, notExpectedString)
   140  				}
   141  			}
   142  		})
   143  	}
   144  
   145  	t.Run("Unsupported binary file", func(t *testing.T) {
   146  		data, err := testutils.ReadTestFile("testjpg.jpg")
   147  		require.NoError(t, err)
   148  		text, err := Extract("testjpg.jpg", bytes.NewReader(data), ExtractSettings{})
   149  		require.NoError(t, err)
   150  		require.Equal(t, "", text)
   151  	})
   152  
   153  	t.Run("Wrong extension", func(t *testing.T) {
   154  		data, err := testutils.ReadTestFile("sample-doc.pdf")
   155  		require.NoError(t, err)
   156  		text, err := Extract("sample-doc.docx", bytes.NewReader(data), ExtractSettings{})
   157  		require.NoError(t, err)
   158  		require.Equal(t, "", text)
   159  	})
   160  }
   161  
   162  type customTestPdfExtractor struct{}
   163  
   164  func (te *customTestPdfExtractor) Match(filename string) bool {
   165  	return strings.HasSuffix(filename, ".pdf")
   166  }
   167  
   168  func (te *customTestPdfExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
   169  	return "this is a text generated content", nil
   170  }
   171  
   172  type failingExtractor struct{}
   173  
   174  func (te *failingExtractor) Match(filename string) bool {
   175  	return true
   176  }
   177  
   178  func (te *failingExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
   179  	return "", errors.New("this always fail")
   180  }
   181  
   182  func TestExtractWithExtraExtractors(t *testing.T) {
   183  	t.Run("overrite existing extractor", func(t *testing.T) {
   184  		data, err := testutils.ReadTestFile("sample-doc.pdf")
   185  		require.NoError(t, err)
   186  
   187  		text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}})
   188  		require.NoError(t, err)
   189  		require.Equal(t, text, "this is a text generated content")
   190  	})
   191  
   192  	t.Run("failing extractor", func(t *testing.T) {
   193  		data, err := testutils.ReadTestFile("sample-doc.pdf")
   194  		require.NoError(t, err)
   195  
   196  		text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}})
   197  		require.NoError(t, err)
   198  		assert.Contains(t, text, "simple")
   199  		assert.Contains(t, text, "document")
   200  		assert.Contains(t, text, "contains")
   201  	})
   202  }