gopkg.in/alecthomas/gometalinter.v3@v3.0.0/_linters/src/github.com/client9/misspell/mime.go (about)

     1  package misspell
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"net/http"
     9  	"os"
    10  	"path/filepath"
    11  	"strings"
    12  )
    13  
    14  // The number of possible binary formats is very large
    15  // items that might be checked into a repo or be an
    16  // artifact of a build.  Additions welcome.
    17  //
    18  // Golang's internal table is very small and can't be
    19  // relied on.  Even then things like ".js" have a mime
    20  // type of "application/javascipt" which isn't very helpful.
    21  // "[x]" means we have  sniff test and suffix test should be eliminated
    22  var binary = map[string]bool{
    23  	".a":     true, // [ ] archive
    24  	".bin":   true, // [ ] binary
    25  	".bz2":   true, // [ ] compression
    26  	".class": true, // [x] Java class file
    27  	".dll":   true, // [ ] shared library
    28  	".exe":   true, // [ ] binary
    29  	".gif":   true, // [ ] image
    30  	".gpg":   true, // [x] text, but really all base64
    31  	".gz":    true, // [ ] compression
    32  	".ico":   true, // [ ] image
    33  	".jar":   true, // [x] archive
    34  	".jpeg":  true, // [ ] image
    35  	".jpg":   true, // [ ] image
    36  	".mp3":   true, // [ ] audio
    37  	".mp4":   true, // [ ] video
    38  	".mpeg":  true, // [ ] video
    39  	".o":     true, // [ ] object file
    40  	".pdf":   true, // [x] pdf
    41  	".png":   true, // [x] image
    42  	".pyc":   true, // [ ] Python bytecode
    43  	".pyo":   true, // [ ] Python bytecode
    44  	".so":    true, // [x] shared library
    45  	".swp":   true, // [ ] vim swap file
    46  	".tar":   true, // [ ] archive
    47  	".tiff":  true, // [ ] image
    48  	".woff":  true, // [ ] font
    49  	".woff2": true, // [ ] font
    50  	".xz":    true, // [ ] compression
    51  	".z":     true, // [ ] compression
    52  	".zip":   true, // [x] archive
    53  }
    54  
    55  // isBinaryFilename returns true if the file is likely to be binary
    56  //
    57  // Better heuristics could be done here, in particular a binary
    58  // file is unlikely to be UTF-8 encoded.  However this is cheap
    59  // and will solve the immediate need of making sure common
    60  // binary formats are not corrupted by mistake.
    61  func isBinaryFilename(s string) bool {
    62  	return binary[strings.ToLower(filepath.Ext(s))]
    63  }
    64  
    65  var scm = map[string]bool{
    66  	".bzr": true,
    67  	".git": true,
    68  	".hg":  true,
    69  	".svn": true,
    70  	"CVS":  true,
    71  }
    72  
    73  // isSCMPath returns true if the path is likely part of a (private) SCM
    74  //  directory.  E.g.  ./git/something  = true
    75  func isSCMPath(s string) bool {
    76  	// hack for .git/COMMIT_EDITMSG and .git/TAG_EDITMSG
    77  	// normally we don't look at anything in .git
    78  	// but COMMIT_EDITMSG and TAG_EDITMSG are used as
    79  	// temp files for git commits.  Allowing misspell to inspect
    80  	// these files allows for commit-msg hooks
    81  	// https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
    82  	if strings.Contains(filepath.Base(s), "EDITMSG") {
    83  		return false
    84  	}
    85  	parts := strings.Split(filepath.Clean(s), string(filepath.Separator))
    86  	for _, dir := range parts {
    87  		if scm[dir] {
    88  			return true
    89  		}
    90  	}
    91  	return false
    92  }
    93  
    94  var magicHeaders = [][]byte{
    95  	// Issue #68
    96  	// PGP messages and signatures are "text" but really just
    97  	// blobs of base64-text and should not be misspell-checked
    98  	[]byte("-----BEGIN PGP MESSAGE-----"),
    99  	[]byte("-----BEGIN PGP SIGNATURE-----"),
   100  
   101  	// ELF
   102  	{0x7f, 0x45, 0x4c, 0x46},
   103  
   104  	// Postscript
   105  	{0x25, 0x21, 0x50, 0x53},
   106  
   107  	// PDF
   108  	{0x25, 0x50, 0x44, 0x46},
   109  
   110  	// Java class file
   111  	// https://en.wikipedia.org/wiki/Java_class_file
   112  	{0xCA, 0xFE, 0xBA, 0xBE},
   113  
   114  	// PNG
   115  	// https://en.wikipedia.org/wiki/Portable_Network_Graphics
   116  	{0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a},
   117  
   118  	// ZIP, JAR, ODF, OOXML
   119  	{0x50, 0x4B, 0x03, 0x04},
   120  	{0x50, 0x4B, 0x05, 0x06},
   121  	{0x50, 0x4B, 0x07, 0x08},
   122  }
   123  
   124  func isTextFile(raw []byte) bool {
   125  	for _, magic := range magicHeaders {
   126  		if bytes.HasPrefix(raw, magic) {
   127  			return false
   128  		}
   129  	}
   130  
   131  	// allow any text/ type with utf-8 encoding
   132  	// DetectContentType sometimes returns charset=utf-16 for XML stuff
   133  	//  in which case ignore.
   134  	mime := http.DetectContentType(raw)
   135  	return strings.HasPrefix(mime, "text/") && strings.HasSuffix(mime, "charset=utf-8")
   136  }
   137  
   138  // ReadTextFile returns the contents of a file, first testing if it is a text file
   139  //  returns ("", nil) if not a text file
   140  //  returns ("", error) if error
   141  //  returns (string, nil) if text
   142  //
   143  // unfortunately, in worse case, this does
   144  //   1 stat
   145  //   1 open,read,close of 512 bytes
   146  //   1 more stat,open, read everything, close (via ioutil.ReadAll)
   147  //  This could be kinder to the filesystem.
   148  //
   149  // This uses some heuristics of the file's extension (e.g. .zip, .txt) and
   150  // uses a sniffer to determine if the file is text or not.
   151  // Using file extensions isn't great, but probably
   152  // good enough for real-world use.
   153  // Golang's built in sniffer is problematic for differnet reasons.  It's
   154  // optimized for HTML, and is very limited in detection.  It would be good
   155  // to explicitly add some tests for ELF/DWARF formats to make sure we never
   156  // corrupt binary files.
   157  func ReadTextFile(filename string) (string, error) {
   158  	if isBinaryFilename(filename) {
   159  		return "", nil
   160  	}
   161  
   162  	if isSCMPath(filename) {
   163  		return "", nil
   164  	}
   165  
   166  	fstat, err := os.Stat(filename)
   167  
   168  	if err != nil {
   169  		return "", fmt.Errorf("Unable to stat %q: %s", filename, err)
   170  	}
   171  
   172  	// directory: nothing to do.
   173  	if fstat.IsDir() {
   174  		return "", nil
   175  	}
   176  
   177  	// avoid reading in multi-gig files
   178  	// if input is large, read the first 512 bytes to sniff type
   179  	// if not-text, then exit
   180  	isText := false
   181  	if fstat.Size() > 50000 {
   182  		fin, err := os.Open(filename)
   183  		if err != nil {
   184  			return "", fmt.Errorf("Unable to open large file %q: %s", filename, err)
   185  		}
   186  		defer fin.Close()
   187  		buf := make([]byte, 512)
   188  		_, err = io.ReadFull(fin, buf)
   189  		if err != nil {
   190  			return "", fmt.Errorf("Unable to read 512 bytes from %q: %s", filename, err)
   191  		}
   192  		if !isTextFile(buf) {
   193  			return "", nil
   194  		}
   195  
   196  		// set so we don't double check this file
   197  		isText = true
   198  	}
   199  
   200  	// read in whole file
   201  	raw, err := ioutil.ReadFile(filename)
   202  	if err != nil {
   203  		return "", fmt.Errorf("Unable to read all %q: %s", filename, err)
   204  	}
   205  
   206  	if !isText && !isTextFile(raw) {
   207  		return "", nil
   208  	}
   209  	return string(raw), nil
   210  }