github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/docextractor_test.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package docextractor 5 6 import ( 7 "bytes" 8 "errors" 9 "io" 10 "strings" 11 "testing" 12 13 "github.com/stretchr/testify/assert" 14 "github.com/stretchr/testify/require" 15 16 "github.com/masterhung0112/hk_server/v5/utils/testutils" 17 ) 18 19 func TestExtract(t *testing.T) { 20 testCases := []struct { 21 Name string 22 TestFileName string 23 Settings ExtractSettings 24 Contains []string 25 NotContains []string 26 ExpectError bool 27 }{ 28 { 29 "Plain text file", 30 "test-markdown-basics.md", 31 ExtractSettings{}, 32 []string{"followed", "separated", "Basic"}, 33 []string{}, 34 false, 35 }, 36 { 37 "Plain small text file", 38 "test-hashtags.md", 39 ExtractSettings{}, 40 []string{"should", "render", "strings"}, 41 []string{}, 42 false, 43 }, 44 { 45 "Zip file without recursion", 46 "Fake_Team_Import.zip", 47 ExtractSettings{}, 48 []string{"users", "channels", "general"}, 49 []string{"purpose", "announcements"}, 50 false, 51 }, 52 { 53 "Zip file with recursion", 54 "Fake_Team_Import.zip", 55 ExtractSettings{ArchiveRecursion: true}, 56 []string{"users", "channels", "general", "purpose", "announcements"}, 57 []string{}, 58 false, 59 }, 60 { 61 "Rar file without recursion", 62 "Fake_Team_Import.rar", 63 ExtractSettings{}, 64 []string{"users", "channels", "general"}, 65 []string{"purpose", "announcements"}, 66 false, 67 }, 68 { 69 "Rar file with recursion", 70 "Fake_Team_Import.rar", 71 ExtractSettings{ArchiveRecursion: true}, 72 []string{"users", "channels", "general", "purpose", "announcements"}, 73 []string{}, 74 false, 75 }, 76 { 77 "Tar.gz file without recursion", 78 "Fake_Team_Import.tar.gz", 79 ExtractSettings{}, 80 []string{"users", "channels", "general"}, 81 []string{"purpose", "announcements"}, 82 false, 83 }, 84 { 85 "Tar.gz file with recursion", 86 "Fake_Team_Import.tar.gz", 87 ExtractSettings{ArchiveRecursion: true}, 88 []string{"users", "channels", "general", "purpose", "announcements"}, 89 []string{}, 90 false, 91 }, 92 { 93 "Pdf file", 94 "sample-doc.pdf", 95 ExtractSettings{}, 96 []string{"simple", "document", "contains"}, 97 []string{}, 98 false, 99 }, 100 { 101 "Docx file", 102 "sample-doc.docx", 103 ExtractSettings{}, 104 []string{"simple", "document", "contains"}, 105 []string{}, 106 false, 107 }, 108 { 109 "Odt file", 110 "sample-doc.odt", 111 ExtractSettings{}, 112 []string{"simple", "document", "contains"}, 113 []string{}, 114 false, 115 }, 116 { 117 "Pptx file", 118 "sample-doc.pptx", 119 ExtractSettings{}, 120 []string{"simple", "document", "contains"}, 121 []string{}, 122 false, 123 }, 124 } 125 126 for _, tc := range testCases { 127 t.Run(tc.Name, func(t *testing.T) { 128 data, err := testutils.ReadTestFile(tc.TestFileName) 129 require.NoError(t, err) 130 text, err := Extract(tc.TestFileName, bytes.NewReader(data), tc.Settings) 131 if tc.ExpectError { 132 require.Error(t, err) 133 } else { 134 require.NoError(t, err) 135 for _, expectedString := range tc.Contains { 136 assert.Contains(t, text, expectedString) 137 } 138 for _, notExpectedString := range tc.NotContains { 139 assert.NotContains(t, text, notExpectedString) 140 } 141 } 142 }) 143 } 144 145 t.Run("Unsupported binary file", func(t *testing.T) { 146 data, err := testutils.ReadTestFile("testjpg.jpg") 147 require.NoError(t, err) 148 text, err := Extract("testjpg.jpg", bytes.NewReader(data), ExtractSettings{}) 149 require.NoError(t, err) 150 require.Equal(t, "", text) 151 }) 152 153 t.Run("Wrong extension", func(t *testing.T) { 154 data, err := testutils.ReadTestFile("sample-doc.pdf") 155 require.NoError(t, err) 156 text, err := Extract("sample-doc.docx", bytes.NewReader(data), ExtractSettings{}) 157 require.NoError(t, err) 158 require.Equal(t, "", text) 159 }) 160 } 161 162 type customTestPdfExtractor struct{} 163 164 func (te *customTestPdfExtractor) Match(filename string) bool { 165 return strings.HasSuffix(filename, ".pdf") 166 } 167 168 func (te *customTestPdfExtractor) Extract(filename string, r io.ReadSeeker) (string, error) { 169 return "this is a text generated content", nil 170 } 171 172 type failingExtractor struct{} 173 174 func (te *failingExtractor) Match(filename string) bool { 175 return true 176 } 177 178 func (te *failingExtractor) Extract(filename string, r io.ReadSeeker) (string, error) { 179 return "", errors.New("this always fail") 180 } 181 182 func TestExtractWithExtraExtractors(t *testing.T) { 183 t.Run("overrite existing extractor", func(t *testing.T) { 184 data, err := testutils.ReadTestFile("sample-doc.pdf") 185 require.NoError(t, err) 186 187 text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}}) 188 require.NoError(t, err) 189 require.Equal(t, text, "this is a text generated content") 190 }) 191 192 t.Run("failing extractor", func(t *testing.T) { 193 data, err := testutils.ReadTestFile("sample-doc.pdf") 194 require.NoError(t, err) 195 196 text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}}) 197 require.NoError(t, err) 198 assert.Contains(t, text, "simple") 199 assert.Contains(t, text, "document") 200 assert.Contains(t, text, "contains") 201 }) 202 }