// binary is the set of (lower-case) file extensions that are treated as
// binary content without looking at the file itself.
//
// The number of possible binary formats is very large; these are
// items that might be checked into a repo or be an
// artifact of a build. Additions welcome.
//
// Golang's internal table is very small and can't be
// relied on. Even then things like ".js" have a mime
// type of "application/javascript" which isn't very helpful.
// "[x]" means we have sniff test and suffix test should be eliminated
var binary = map[string]bool{
	".a":     true, // [ ] archive
	".bin":   true, // [ ] binary
	".bz2":   true, // [ ] compression
	".class": true, // [x] Java class file
	".dll":   true, // [ ] shared library
	".exe":   true, // [ ] binary
	".gif":   true, // [ ] image
	".gpg":   true, // [x] text, but really all base64
	".gz":    true, // [ ] compression
	".ico":   true, // [ ] image
	".jar":   true, // [x] archive
	".jpeg":  true, // [ ] image
	".jpg":   true, // [ ] image
	".mp3":   true, // [ ] audio
	".mp4":   true, // [ ] video
	".mpeg":  true, // [ ] video
	".o":     true, // [ ] object file
	".pdf":   true, // [x] pdf
	".png":   true, // [x] image
	".pyc":   true, // [ ] Python bytecode
	".pyo":   true, // [ ] Python bytecode
	".so":    true, // [x] shared library
	".swp":   true, // [ ] vim swap file
	".tar":   true, // [ ] archive
	".tiff":  true, // [ ] image
	".woff":  true, // [ ] font
	".woff2": true, // [ ] font
	".xz":    true, // [ ] compression
	".z":     true, // [ ] compression
	".zip":   true, // [x] archive
}

// isBinaryFilename returns true if the file is likely to be binary,
// judged only by its extension (compared case-insensitively against the
// binary table).
//
// Better heuristics could be done here, in particular a binary
// file is unlikely to be UTF-8 encoded. However this is cheap
// and will solve the immediate need of making sure common
// binary formats are not corrupted by mistake.
func isBinaryFilename(s string) bool {
	return binary[strings.ToLower(filepath.Ext(s))]
}

// scm lists the directory names used by source-control systems for
// their private metadata.
var scm = map[string]bool{
	".bzr": true,
	".git": true,
	".hg":  true,
	".svn": true,
	"CVS":  true,
}

// isSCMPath returns true if the path is likely part of a (private) SCM
// directory, i.e. if any path element is one of the names in scm.
// E.g. .git/something = true
func isSCMPath(s string) bool {
	// Normalise to forward slashes first. On platforms whose separator is
	// already '/' this is a no-op; on Windows it makes both `\` and `/`
	// separated paths split correctly.
	for _, dir := range strings.Split(filepath.ToSlash(s), "/") {
		if scm[dir] {
			return true
		}
	}
	return false
}

// magicHeaders holds byte prefixes that identify content which must not
// be spell-checked, even when it would otherwise sniff as text.
var magicHeaders = [][]byte{
	// Issue #68
	// PGP messages and signatures are "text" but really just
	// blobs of base64-text and should not be misspell-checked
	[]byte("-----BEGIN PGP MESSAGE-----"),
	[]byte("-----BEGIN PGP SIGNATURE-----"),

	// ELF
	{0x7f, 0x45, 0x4c, 0x46},

	// Postscript
	{0x25, 0x21, 0x50, 0x53},

	// PDF
	{0x25, 0x50, 0x44, 0x46},

	// Java class file
	// https://en.wikipedia.org/wiki/Java_class_file
	{0xCA, 0xFE, 0xBA, 0xBE},

	// PNG
	// https://en.wikipedia.org/wiki/Portable_Network_Graphics
	{0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a},

	// ZIP, JAR, ODF, OOXML
	{0x50, 0x4B, 0x03, 0x04},
	{0x50, 0x4B, 0x05, 0x06},
	{0x50, 0x4B, 0x07, 0x08},
}

// isTextFile reports whether raw (a whole file, or its leading bytes)
// looks like UTF-8 text. Content starting with one of magicHeaders is
// rejected outright; everything else is decided by the content type
// sniffed by http.DetectContentType.
func isTextFile(raw []byte) bool {
	for _, magic := range magicHeaders {
		if bytes.HasPrefix(raw, magic) {
			return false
		}
	}

	// allow any text/ type with utf-8 encoding
	// DetectContentType sometimes returns charset=utf-16 for XML stuff
	// in which case ignore.
	mime := http.DetectContentType(raw)
	return strings.HasPrefix(mime, "text/") && strings.HasSuffix(mime, "charset=utf-8")
}

// sniffIsText opens filename, reads its first 512 bytes and reports
// whether they look like text according to isTextFile. The file handle
// is closed before returning, so it is never held open while the caller
// goes on to read the whole file.
func sniffIsText(filename string) (bool, error) {
	// Number of leading bytes handed to the sniffer.
	const sniffLen = 512

	fin, err := os.Open(filename)
	if err != nil {
		return false, fmt.Errorf("Unable to open large file %q: %w", filename, err)
	}
	defer fin.Close()

	buf := make([]byte, sniffLen)
	if _, err := io.ReadFull(fin, buf); err != nil {
		return false, fmt.Errorf("Unable to read 512 bytes from %q: %w", filename, err)
	}
	return isTextFile(buf), nil
}

// ReadTextFile returns the contents of a file, first testing if it is a text file
//  returns ("", nil) if not a text file
//  returns ("", error) if error
//  returns (string, nil) if text
//
// Files with a known binary extension, paths inside an SCM directory and
// directories are all reported as "not a text file". Returned errors wrap
// the underlying os/io error, so callers can inspect them with errors.Is
// and errors.As.
//
// unfortunately, in the worst case, this does
//   1 stat
//   1 open,read,close of 512 bytes
//   1 more stat,open, read everything, close (via ioutil.ReadFile)
// This could be kinder to the filesystem.
//
// This uses some heuristics of the file's extension (e.g. .zip, .txt) and
// uses a sniffer to determine if the file is text or not.
// Using file extensions isn't great, but probably
// good enough for real-world use.
// Golang's built in sniffer is problematic for different reasons. It's
// optimized for HTML, and is very limited in detection. It would be good
// to explicitly add some tests for ELF/DWARF formats to make sure we never
// corrupt binary files.
func ReadTextFile(filename string) (string, error) {
	// Files larger than this many bytes are sniffed from their first
	// bytes before being read in full.
	const largeFileSize = 50000

	if isBinaryFilename(filename) || isSCMPath(filename) {
		return "", nil
	}

	fstat, err := os.Stat(filename)
	if err != nil {
		return "", fmt.Errorf("Unable to stat %q: %w", filename, err)
	}

	// directory: nothing to do.
	if fstat.IsDir() {
		return "", nil
	}

	// avoid reading in multi-gig files
	// if input is large, read the first 512 bytes to sniff type
	// if not-text, then exit
	sniffed := false
	if fstat.Size() > largeFileSize {
		isText, err := sniffIsText(filename)
		if err != nil {
			return "", err
		}
		if !isText {
			return "", nil
		}

		// set so we don't double check this file
		sniffed = true
	}

	// read in whole file
	raw, err := ioutil.ReadFile(filename)
	if err != nil {
		return "", fmt.Errorf("Unable to read all %q: %w", filename, err)
	}

	if !sniffed && !isTextFile(raw) {
		return "", nil
	}
	return string(raw), nil
}