github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/cmn/archive/mime.go (about)

     1  // Package archive: write, read, copy, append, list primitives
     2  // across all supported formats
     3  /*
     4   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package archive
     7  
     8  import (
     9  	"bytes"
    10  	"fmt"
    11  	"io"
    12  	"os"
    13  	"strings"
    14  
    15  	"github.com/NVIDIA/aistore/cmn/cos"
    16  	"github.com/NVIDIA/aistore/cmn/debug"
    17  	"github.com/NVIDIA/aistore/cmn/nlog"
    18  	"github.com/NVIDIA/aistore/memsys"
    19  )
    20  
// supported archive types (file extensions); see also archExts in cmd/cli/cli/const.go
// NOTE: when adding/removing formats - update:
//   - FileExtensions
//   - allMagics
//   - ext/dsort/shard/rw.go
//
// Each value includes the leading dot; normalize() strips it (ext[1:]) for substring
// matching, byExt() uses it as is for suffix matching.
const (
	ExtTar    = ".tar"
	// short form of ExtTarGz; EqExt treats the two as equal
	ExtTgz    = ".tgz"
	// also the type reported for files that start with the gzip signature (magicGzip)
	ExtTarGz  = ".tar.gz"
	ExtZip    = ".zip"
	ExtTarLz4 = ".tar.lz4"
)
    33  
const (
	// size of the buffer allocated for signature ("magic") detection;
	// _detect rejects files that yield fewer bytes than this as "too short"
	sizeDetectMime = 512
)
    37  
    38  // - here and elsewhere, mime (string) is a "." + IANA mime
    39  // - for standard MIME types, see: cmn/cos/http_headers.go
    40  // - references:
    41  //   * https://en.wikipedia.org/wiki/List_of_file_signatures
    42  //   * https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
    43  
// detect describes one file signature: the bytes to look for, where to look
// for them, and the archive type to report upon a match.
type detect struct {
	mime   string // '.' + IANA mime
	// signature bytes
	sig    []byte
	// position in the file at which `sig` must start (zero when not specified)
	offset int
}
    49  
// FileExtensions lists all supported archive extensions.
// NOTE: the order matters - both normalize() and byExt() iterate this slice
// and return the first match.
var FileExtensions = []string{ExtTar, ExtTgz, ExtTarGz, ExtZip, ExtTarLz4}
    51  
// standard file signatures
// (used by _detect, which tries allMagics in order and returns the first match)
var (
	// "ustar" at byte offset 257
	magicTar  = detect{offset: 257, sig: []byte("ustar"), mime: ExtTar}
	// gzip signature at offset 0; reported as ExtTarGz
	magicGzip = detect{sig: []byte{0x1f, 0x8b}, mime: ExtTarGz}
	// "PK" at offset 0
	magicZip  = detect{sig: []byte{0x50, 0x4b}, mime: ExtZip}
	// lz4 signature at offset 0; reported as ExtTarLz4
	magicLz4  = detect{sig: []byte{0x04, 0x22, 0x4d, 0x18}, mime: ExtTarLz4}

	allMagics = []detect{magicTar, magicGzip, magicZip, magicLz4} // NOTE: must contain all
)
    61  
    62  // motivation: prevent from creating archives with non-standard extensions
    63  func Strict(mime, filename string) (m string, err error) {
    64  	if mime != "" {
    65  		if m, err = normalize(mime); err != nil {
    66  			return
    67  		}
    68  	}
    69  	m, err = byExt(filename)
    70  	if err != nil || mime == "" {
    71  		return
    72  	}
    73  	if mime != m {
    74  		// user-defined (non-empty) MIME must correspond
    75  		err = fmt.Errorf("mime mismatch %q vs %q", mime, m)
    76  	}
    77  	return
    78  }
    79  
    80  func Mime(mime, filename string) (string, error) {
    81  	if mime != "" {
    82  		return normalize(mime)
    83  	}
    84  	return byExt(filename)
    85  }
    86  
    87  // e.g. MIME: "application/zip"
    88  func normalize(mime string) (string, error) {
    89  	switch {
    90  	case strings.Contains(mime, ExtTarGz[1:]): // ExtTarGz contains ExtTar
    91  		return ExtTarGz, nil
    92  	case strings.Contains(mime, ExtTarLz4[1:]): // ditto
    93  		return ExtTarLz4, nil
    94  	default:
    95  		for _, ext := range FileExtensions {
    96  			if strings.Contains(mime, ext[1:]) {
    97  				return ext, nil
    98  			}
    99  		}
   100  	}
   101  	return "", NewErrUnknownMime(mime)
   102  }
   103  
   104  // by filename extension
   105  func byExt(filename string) (string, error) {
   106  	for _, ext := range FileExtensions {
   107  		if strings.HasSuffix(filename, ext) {
   108  			return ext, nil
   109  		}
   110  	}
   111  	return "", NewErrUnknownFileExt(filename, "")
   112  }
   113  
   114  // NOTE convention: caller may pass nil `smm` _not_ to spend time (usage: listing and reading)
   115  func MimeFile(file *os.File, smm *memsys.MMSA, mime, archname string) (m string, err error) {
   116  	m, err = Mime(mime, archname)
   117  	if err == nil || IsErrUnknownMime(err) {
   118  		return
   119  	}
   120  	if smm == nil {
   121  		err = NewErrUnknownFileExt(archname, "not reading file magic")
   122  		return
   123  	}
   124  	// by magic
   125  	var (
   126  		n         int
   127  		buf, slab = smm.AllocSize(sizeDetectMime)
   128  	)
   129  	m, n, err = _detect(file, archname, buf)
   130  	if n > 0 {
   131  		_, errV := file.Seek(0, io.SeekStart)
   132  		debug.AssertNoErr(errV)
   133  		if err == nil {
   134  			err = errV
   135  		}
   136  		if err == nil {
   137  			nlog.Infoln("archname", archname, "is in fact", m, "(via magic sign)")
   138  		}
   139  	}
   140  	slab.Free(buf)
   141  	return
   142  }
   143  
   144  // NOTE:
   145  // - on purpose redundant vs the above - not to open file if can be avoided
   146  // - convention: caller may pass nil `smm` _not_ to spend time (usage: listing and reading)
   147  func MimeFQN(smm *memsys.MMSA, mime, archname string) (m string, err error) {
   148  	m, err = Mime(mime, archname)
   149  	if err == nil || IsErrUnknownMime(err) {
   150  		return
   151  	}
   152  	if smm == nil {
   153  		err = NewErrUnknownFileExt(archname, "not reading file magic")
   154  		return
   155  	}
   156  	fh, err := os.Open(archname)
   157  	if err != nil {
   158  		return "", err
   159  	}
   160  	buf, slab := smm.AllocSize(sizeDetectMime)
   161  	m, _, err = _detect(fh, archname, buf)
   162  	slab.Free(buf)
   163  	cos.Close(fh)
   164  	return
   165  }
   166  
   167  func _detect(file *os.File, archname string, buf []byte) (m string, n int, err error) {
   168  	n, err = file.Read(buf)
   169  	if err != nil {
   170  		return
   171  	}
   172  	if n < sizeDetectMime {
   173  		err = NewErrUnknownFileExt(archname, "file is too short")
   174  		return
   175  	}
   176  	for _, magic := range allMagics {
   177  		if n > magic.offset && bytes.HasPrefix(buf[magic.offset:], magic.sig) {
   178  			return magic.mime, n, nil
   179  		}
   180  	}
   181  	err = fmt.Errorf("failed to detect supported file signatures in %q", archname)
   182  	return
   183  }
   184  
   185  func EqExt(ext1, ext2 string) bool {
   186  	switch {
   187  	case ext1 == ext2:
   188  		return true
   189  	case ext1 == ExtTarGz && ext2 == ExtTgz:
   190  		return true
   191  	case ext2 == ExtTarGz && ext1 == ExtTgz:
   192  		return true
   193  	}
   194  	return false
   195  }