github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/untar.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"archive/tar"
	"bufio"
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"os"
	"path"
	"runtime"
	"sync"
	"time"

	"github.com/cosnicolaou/pbzip2"
	"github.com/klauspost/compress/s2"
	"github.com/klauspost/compress/zstd"
	gzip "github.com/klauspost/pgzip"
	"github.com/minio/minio/internal/logger"
	"github.com/pierrec/lz4"
)

// Max bzip2 concurrency across calls. 50% of GOMAXPROCS.
var bz2Limiter = pbzip2.CreateConcurrencyPool((runtime.GOMAXPROCS(0) + 1) / 2)

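// detect peeks at up to the first four bytes of r and matches them against
// the known magic headers, returning the corresponding compression format.
// It returns formatUnknown if the peek fails or no header matches.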
func detect(r *bufio.Reader) format {
	z, err := r.Peek(4)
	if err != nil {
		return formatUnknown
	}
	for _, f := range magicHeaders {
		if bytes.Equal(f.header, z[:len(f.header)]) {
			return f.f
		}
	}
	return formatUnknown
}

//go:generate stringer -type=format -trimprefix=format $GOFILE
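// format identifies the compression applied to the incoming stream,
// as reported by detect.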
type format int

const (
	formatUnknown format = iota
	formatGzip
	formatZstd
	formatLZ4
	formatS2
	formatBZ2
)

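// magicHeaders lists the file signatures that detect matches, in order,
// against the peeked bytes to identify the stream compression.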
var magicHeaders = []struct {
	header []byte
	f      format
}{
	{
		// Gzip, deflate compressed.
		header: []byte{0x1f, 0x8b, 8},
		f:      formatGzip,
	},
	{
		// Zstd default header.
		header: []byte{0x28, 0xb5, 0x2f, 0xfd},
		f:      formatZstd,
	},
	{
		// Zstd skippable frame header.
		header: []byte{0x2a, 0x4d, 0x18},
		f:      formatZstd,
	},
	{
		// LZ4 frame header.
		header: []byte{0x4, 0x22, 0x4d, 0x18},
		f:      formatLZ4,
	},
	{
		// Snappy/S2 stream.
		header: []byte{0xff, 0x06, 0x00, 0x00},
		f:      formatS2,
	},
	{
		// Bzip2 ("BZh").
		header: []byte{0x42, 0x5a, 'h'},
		f:      formatBZ2,
	},
}

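// untarOptions controls how untar treats the entries of an archive.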
type untarOptions struct {
	ignoreDirs bool   // skip directory entries entirely
	ignoreErrs bool   // log per-object errors and continue instead of aborting
	prefixAll  string // prefix prepended to every extracted object name
}

// disconnectReader will ensure that no reads can take place on
// the upstream reader after Close has been called.
type disconnectReader struct {
	r  io.Reader
	mu sync.Mutex
}

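// Read forwards to the wrapped reader until Close is called, after which it
// returns an error for every call.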
func (d *disconnectReader) Read(p []byte) (n int, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	if d.r != nil {
		return d.r.Read(p)
	}
	return 0, errors.New("reader closed")
}

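// Close detaches the wrapped reader so that no further reads can reach it.
// It always returns nil.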
func (d *disconnectReader) Close() error {
	d.mu.Lock()
	d.r = nil
	d.mu.Unlock()
	return nil
}

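// untar extracts the (possibly compressed) tar stream r and calls putObject
// once per extracted entry. Compression is auto-detected via detect; an
// unrecognized stream is treated as an uncompressed tar. Entries of
// smallFileThreshold bytes or less are buffered and uploaded concurrently,
// larger entries are uploaded inline from the stream.
//
// A minimal usage sketch (the callback below is hypothetical, not part of
// this file):
//
//	err := untar(ctx, archive, func(r io.Reader, fi os.FileInfo, name string) error {
//		// persist r under name, e.g. by calling an object-layer PutObject
//		return nil
//	}, untarOptions{ignoreDirs: true})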
func untar(ctx context.Context, r io.Reader, putObject func(reader io.Reader, info os.FileInfo, name string) error, o untarOptions) error {
	bf := bufio.NewReader(r)
	switch f := detect(bf); f {
	case formatGzip:
		gz, err := gzip.NewReader(bf)
		if err != nil {
			return err
		}
		defer gz.Close()
		r = gz
	case formatS2:
		r = s2.NewReader(bf)
	case formatZstd:
		// Limit the decompression window to 16 MiB per stream.
		dec, err := zstd.NewReader(bf, zstd.WithDecoderMaxWindow(16<<20))
		if err != nil {
			return err
		}
		defer dec.Close()
		r = dec
	case formatBZ2:
		ctx, cancel := context.WithCancel(ctx)
		defer cancel()
		r = pbzip2.NewReader(ctx, bf, pbzip2.DecompressionOptions(
			pbzip2.BZConcurrency((runtime.GOMAXPROCS(0)+1)/2),
			pbzip2.BZConcurrencyPool(bz2Limiter)))
	case formatLZ4:
		r = lz4.NewReader(bf)
	case formatUnknown:
		r = bf
	default:
		return fmt.Errorf("Unsupported format %s", f)
	}
	tarReader := tar.NewReader(r)
	n := 0
	asyncWriters := make(chan struct{}, 16) // limits concurrent small-object uploads
	var wg sync.WaitGroup

	var asyncErr error
	var asyncErrMu sync.Mutex
	for {
		if !o.ignoreErrs {
			asyncErrMu.Lock()
			err := asyncErr
			asyncErrMu.Unlock()
			if err != nil {
				return err
			}
		}

		header, err := tarReader.Next()
		switch {

		// no more entries: wait for pending uploads and return any async error
		case err == io.EOF:
			wg.Wait()
			return asyncErr

		// return any other error
		case err != nil:
			wg.Wait()
			extra := ""
			if n > 0 {
				extra = fmt.Sprintf(" after %d successful object(s)", n)
			}
			return fmt.Errorf("tar file error: %w%s", err, extra)

		// if the header is nil, just skip it (not sure how this happens)
		case header == nil:
			continue
		}

		name := header.Name
		switch path.Clean(name) {
		case ".", slashSeparator:
			continue
		}

		switch header.Typeflag {
		case tar.TypeDir: // = directory
			if o.ignoreDirs {
				continue
			}
			name = trimLeadingSlash(pathJoin(name, slashSeparator))
		case tar.TypeReg, tar.TypeChar, tar.TypeBlock, tar.TypeFifo, tar.TypeGNUSparse: // = regular
			name = trimLeadingSlash(path.Clean(name))
		default:
			// ignore symlink'ed
			continue
		}
		if o.prefixAll != "" {
			name = pathJoin(o.prefixAll, name)
		}

		// Upload small entries asynchronously: buffer the data so the tar
		// stream can advance while the upload runs in the background.
		n++
		if header.Size <= smallFileThreshold {
			asyncWriters <- struct{}{}
			b := poolBuf128k.Get().([]byte)
			if cap(b) < int(header.Size) {
				b = make([]byte, smallFileThreshold)
			}
			b = b[:header.Size]
			if _, err := io.ReadFull(tarReader, b); err != nil {
				return err
			}
			wg.Add(1)
			go func(name string, fi fs.FileInfo, b []byte) {
				rc := disconnectReader{r: bytes.NewReader(b)}
				defer func() {
					rc.Close()
					<-asyncWriters
					wg.Done()
					//nolint:staticcheck // SA6002 we are fine with the tiny alloc
					poolBuf128k.Put(b)
				}()
				if err := putObject(&rc, fi, name); err != nil {
					if o.ignoreErrs {
						logger.LogIf(ctx, err)
						return
					}
					asyncErrMu.Lock()
					if asyncErr == nil {
						asyncErr = err
					}
					asyncErrMu.Unlock()
				}
			}(name, header.FileInfo(), b)
			continue
		}

		// If zero or earlier modtime, set to current.
		// Otherwise the resulting objects will be invalid.
		if header.ModTime.UnixNano() <= 0 {
			header.ModTime = time.Now()
		}

		// Sync upload.
		rc := disconnectReader{r: tarReader}
		if err := putObject(&rc, header.FileInfo(), name); err != nil {
			rc.Close()
			if o.ignoreErrs {
				logger.LogIf(ctx, err)
				continue
			}
			return err
		}
		rc.Close()
	}
}