code.gitea.io/gitea@v1.19.3/modules/typesniffer/typesniffer.go (about)

     1  // Copyright 2021 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package typesniffer
     5  
     6  import (
     7  	"bytes"
     8  	"fmt"
     9  	"io"
    10  	"net/http"
    11  	"regexp"
    12  	"strings"
    13  
    14  	"code.gitea.io/gitea/modules/util"
    15  )
    16  
    17  // Use at most this many bytes to determine Content Type.
    18  const sniffLen = 1024
    19  
    20  const (
    21  	// SvgMimeType MIME type of SVG images.
    22  	SvgMimeType = "image/svg+xml"
    23  	// ApplicationOctetStream MIME type of binary files.
    24  	ApplicationOctetStream = "application/octet-stream"
    25  )
    26  
    27  var (
    28  	svgComment       = regexp.MustCompile(`(?s)<!--.*?-->`)
    29  	svgTagRegex      = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
    30  	svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
    31  )
    32  
    33  // SniffedType contains information about a blobs type.
    34  type SniffedType struct {
    35  	contentType string
    36  }
    37  
    38  // IsText etects if content format is plain text.
    39  func (ct SniffedType) IsText() bool {
    40  	return strings.Contains(ct.contentType, "text/")
    41  }
    42  
    43  // IsImage detects if data is an image format
    44  func (ct SniffedType) IsImage() bool {
    45  	return strings.Contains(ct.contentType, "image/")
    46  }
    47  
    48  // IsSvgImage detects if data is an SVG image format
    49  func (ct SniffedType) IsSvgImage() bool {
    50  	return strings.Contains(ct.contentType, SvgMimeType)
    51  }
    52  
    53  // IsPDF detects if data is a PDF format
    54  func (ct SniffedType) IsPDF() bool {
    55  	return strings.Contains(ct.contentType, "application/pdf")
    56  }
    57  
    58  // IsVideo detects if data is an video format
    59  func (ct SniffedType) IsVideo() bool {
    60  	return strings.Contains(ct.contentType, "video/")
    61  }
    62  
    63  // IsAudio detects if data is an video format
    64  func (ct SniffedType) IsAudio() bool {
    65  	return strings.Contains(ct.contentType, "audio/")
    66  }
    67  
    68  // IsRepresentableAsText returns true if file content can be represented as
    69  // plain text or is empty.
    70  func (ct SniffedType) IsRepresentableAsText() bool {
    71  	return ct.IsText() || ct.IsSvgImage()
    72  }
    73  
    74  // IsBrowsableType returns whether a non-text type can be displayed in a browser
    75  func (ct SniffedType) IsBrowsableBinaryType() bool {
    76  	return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
    77  }
    78  
    79  // GetMimeType returns the mime type
    80  func (ct SniffedType) GetMimeType() string {
    81  	return strings.SplitN(ct.contentType, ";", 2)[0]
    82  }
    83  
    84  // DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
    85  func DetectContentType(data []byte) SniffedType {
    86  	if len(data) == 0 {
    87  		return SniffedType{"text/unknown"}
    88  	}
    89  
    90  	ct := http.DetectContentType(data)
    91  
    92  	if len(data) > sniffLen {
    93  		data = data[:sniffLen]
    94  	}
    95  
    96  	// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
    97  
    98  	detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
    99  	detectByXML := strings.Contains(ct, "text/xml")
   100  	if detectByHTML || detectByXML {
   101  		dataProcessed := svgComment.ReplaceAll(data, nil)
   102  		dataProcessed = bytes.TrimSpace(dataProcessed)
   103  		if detectByHTML && svgTagRegex.Match(dataProcessed) ||
   104  			detectByXML && svgTagInXMLRegex.Match(dataProcessed) {
   105  			ct = SvgMimeType
   106  		}
   107  	}
   108  
   109  	if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
   110  		// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
   111  		// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
   112  		// This works especially because audio files contain many unprintable/invalid characters like `0x00`
   113  		ct2 := http.DetectContentType(data[3:])
   114  		if strings.HasPrefix(ct2, "text/") {
   115  			ct = ct2
   116  		}
   117  	}
   118  
   119  	return SniffedType{ct}
   120  }
   121  
   122  // DetectContentTypeFromReader guesses the content type contained in the reader.
   123  func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
   124  	buf := make([]byte, sniffLen)
   125  	n, err := util.ReadAtMost(r, buf)
   126  	if err != nil {
   127  		return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err)
   128  	}
   129  	buf = buf[:n]
   130  
   131  	return DetectContentType(buf), nil
   132  }