github.com/haalcala/mattermost-server-change-repo@v0.0.0-20210713015153-16753fbeee5f/services/docextractor/docextractor_test.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package docextractor 5 6 import ( 7 "bytes" 8 "errors" 9 "io" 10 "strings" 11 "testing" 12 13 "github.com/stretchr/testify/assert" 14 "github.com/stretchr/testify/require" 15 16 "github.com/mattermost/mattermost-server/v5/utils/testutils" 17 ) 18 19 func TestExtract(t *testing.T) { 20 testCases := []struct { 21 Name string 22 TestFileName string 23 Settings ExtractSettings 24 Contains []string 25 NotContains []string 26 ExpectError bool 27 }{ 28 { 29 "Plain text file", 30 "test-markdown-basics.md", 31 ExtractSettings{}, 32 []string{"followed", "separated"}, 33 []string{}, 34 false, 35 }, 36 { 37 "Zip file without recursion", 38 "Fake_Team_Import.zip", 39 ExtractSettings{}, 40 []string{"users", "channels", "general"}, 41 []string{"purpose", "announcements"}, 42 false, 43 }, 44 { 45 "Zip file with recursion", 46 "Fake_Team_Import.zip", 47 ExtractSettings{ArchiveRecursion: true}, 48 []string{"users", "channels", "general", "purpose", "announcements"}, 49 []string{}, 50 false, 51 }, 52 { 53 "Rar file without recursion", 54 "Fake_Team_Import.rar", 55 ExtractSettings{}, 56 []string{"users", "channels", "general"}, 57 []string{"purpose", "announcements"}, 58 false, 59 }, 60 { 61 "Rar file with recursion", 62 "Fake_Team_Import.rar", 63 ExtractSettings{ArchiveRecursion: true}, 64 []string{"users", "channels", "general", "purpose", "announcements"}, 65 []string{}, 66 false, 67 }, 68 { 69 "Tar.gz file without recursion", 70 "Fake_Team_Import.tar.gz", 71 ExtractSettings{}, 72 []string{"users", "channels", "general"}, 73 []string{"purpose", "announcements"}, 74 false, 75 }, 76 { 77 "Tar.gz file with recursion", 78 "Fake_Team_Import.tar.gz", 79 ExtractSettings{ArchiveRecursion: true}, 80 []string{"users", "channels", "general", "purpose", "announcements"}, 81 []string{}, 82 false, 83 }, 84 { 85 "Pdf file", 86 "sample-doc.pdf", 87 ExtractSettings{}, 88 []string{"simple", "document", "contains"}, 89 []string{}, 90 false, 91 }, 92 { 93 "Docx file", 94 "sample-doc.docx", 95 ExtractSettings{}, 96 []string{"simple", "document", "contains"}, 97 []string{}, 98 false, 99 }, 100 { 101 "Pptx file", 102 "sample-doc.pptx", 103 ExtractSettings{}, 104 []string{"simple", "document", "contains"}, 105 []string{}, 106 false, 107 }, 108 } 109 110 for _, tc := range testCases { 111 t.Run(tc.Name, func(t *testing.T) { 112 data, err := testutils.ReadTestFile(tc.TestFileName) 113 require.NoError(t, err) 114 text, err := Extract(tc.TestFileName, bytes.NewReader(data), tc.Settings) 115 if tc.ExpectError { 116 require.Error(t, err) 117 } else { 118 require.NoError(t, err) 119 for _, expectedString := range tc.Contains { 120 assert.Contains(t, text, expectedString) 121 } 122 for _, notExpectedString := range tc.NotContains { 123 assert.NotContains(t, text, notExpectedString) 124 } 125 } 126 }) 127 } 128 129 t.Run("Unsupported binary file", func(t *testing.T) { 130 data, err := testutils.ReadTestFile("testjpg.jpg") 131 require.NoError(t, err) 132 text, err := Extract("testjpg.jpg", bytes.NewReader(data), ExtractSettings{}) 133 require.NoError(t, err) 134 require.Equal(t, "", text) 135 }) 136 137 t.Run("Wrong extension", func(t *testing.T) { 138 data, err := testutils.ReadTestFile("sample-doc.pdf") 139 require.NoError(t, err) 140 text, err := Extract("sample-doc.docx", bytes.NewReader(data), ExtractSettings{}) 141 require.NoError(t, err) 142 require.Equal(t, "", text) 143 }) 144 } 145 146 type customTestPdfExtractor struct{} 147 148 func (te *customTestPdfExtractor) Match(filename string) bool { 149 return strings.HasSuffix(filename, ".pdf") 150 } 151 152 func (te *customTestPdfExtractor) Extract(filename string, r io.Reader) (string, error) { 153 return "this is a text generated content", nil 154 } 155 156 type failingExtractor struct{} 157 158 func (te *failingExtractor) Match(filename string) bool { 159 return true 160 } 161 162 func (te *failingExtractor) Extract(filename string, r io.Reader) (string, error) { 163 return "", errors.New("this always fail") 164 } 165 166 func TestExtractWithExtraExtractors(t *testing.T) { 167 t.Run("overrite existing extractor", func(t *testing.T) { 168 data, err := testutils.ReadTestFile("sample-doc.pdf") 169 require.NoError(t, err) 170 171 text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}}) 172 require.NoError(t, err) 173 require.Equal(t, text, "this is a text generated content") 174 }) 175 176 t.Run("failing extractor", func(t *testing.T) { 177 data, err := testutils.ReadTestFile("sample-doc.pdf") 178 require.NoError(t, err) 179 180 text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}}) 181 require.NoError(t, err) 182 assert.Contains(t, text, "simple") 183 assert.Contains(t, text, "document") 184 assert.Contains(t, text, "contains") 185 }) 186 }