github.com/weaviate/weaviate@v1.24.6/usecases/backup/zip.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package backup
    13  
    14  import (
    15  	"archive/tar"
    16  	"bytes"
    17  	"compress/gzip"
    18  	"context"
    19  	"fmt"
    20  	"io"
    21  	"io/fs"
    22  	"os"
    23  	"path/filepath"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	"github.com/weaviate/weaviate/entities/backup"
    28  )
    29  
// CompressionLevel represents a supported compression level.
//
// The numeric value of a level is what callers hand (as a plain int) to
// NewZip; zipLevel translates it into the matching compress/gzip constant.
type CompressionLevel int

const (
	// DefaultCompression (0) is translated to gzip.DefaultCompression.
	DefaultCompression CompressionLevel = iota
	// BestSpeed (1) is translated to gzip.BestSpeed.
	BestSpeed
	// BestCompression (2) is translated to gzip.BestCompression.
	BestCompression
)
    38  
// zip writes files as a gzip-compressed tar stream into the write end of an
// in-memory pipe. Values are created by NewZip, which also hands out the read
// end of that pipe.
type zip struct {
	// sourcePath is the directory that relative paths are joined to when
	// regular files are read from disk (see WriteRegular).
	sourcePath string
	// w is the tar layer; it writes into gzw.
	w          *tar.Writer
	// gzw is the gzip layer; it writes into pipeWriter.
	gzw        *gzip.Writer
	// pipeWriter is the write end of the pipe created in NewZip.
	pipeWriter *io.PipeWriter
	// counter reports how many bytes have been read so far from the reader
	// returned by NewZip.
	counter    func() int64
}
    46  
    47  func NewZip(sourcePath string, level int) (zip, io.ReadCloser) {
    48  	pr, pw := io.Pipe()
    49  	gzw, _ := gzip.NewWriterLevel(pw, zipLevel(level))
    50  	reader := &readCloser{src: pr, n: 0}
    51  
    52  	return zip{
    53  		sourcePath: sourcePath,
    54  		gzw:        gzw,
    55  		w:          tar.NewWriter(gzw),
    56  		pipeWriter: pw,
    57  		counter:    reader.counter(),
    58  	}, reader
    59  }
    60  
    61  func (z *zip) Close() error {
    62  	var err1, err2, err3 error
    63  	err1 = z.w.Close()
    64  	err2 = z.gzw.Close()
    65  	if err := z.pipeWriter.Close(); err != nil && err != io.ErrClosedPipe {
    66  		err3 = err
    67  	}
    68  	if err1 != nil || err2 != nil || err3 != nil {
    69  		return fmt.Errorf("tar: %w, gzip: %w, pw: %w", err1, err2, err3)
    70  	}
    71  	return nil
    72  }
    73  
    74  // WriteShard writes shard internal files including in memory files stored in sd
    75  func (z *zip) WriteShard(ctx context.Context, sd *backup.ShardDescriptor) (written int64, err error) {
    76  	var n int64 // temporary written bytes
    77  	for _, x := range [3]struct {
    78  		relPath string
    79  		data    []byte
    80  		modTime time.Time
    81  	}{
    82  		{relPath: sd.DocIDCounterPath, data: sd.DocIDCounter},
    83  		{relPath: sd.PropLengthTrackerPath, data: sd.PropLengthTracker},
    84  		{relPath: sd.ShardVersionPath, data: sd.Version},
    85  	} {
    86  		info := vFileInfo{
    87  			name: filepath.Base(x.relPath),
    88  			size: len(x.data),
    89  		}
    90  		if n, err = z.writeOne(info, x.relPath, bytes.NewReader(x.data)); err != nil {
    91  			return written, err
    92  		}
    93  		written += n
    94  
    95  	}
    96  
    97  	n, err = z.WriteRegulars(ctx, sd.Files)
    98  	written += n
    99  
   100  	return
   101  }
   102  
   103  func (z *zip) WriteRegulars(ctx context.Context, relPaths []string) (written int64, err error) {
   104  	for _, relPath := range relPaths {
   105  		if filepath.Base(relPath) == ".DS_Store" {
   106  			continue
   107  		}
   108  		if err := ctx.Err(); err != nil {
   109  			return written, err
   110  		}
   111  		n, err := z.WriteRegular(relPath)
   112  		if err != nil {
   113  			return written, err
   114  		}
   115  		written += n
   116  	}
   117  	return written, nil
   118  }
   119  
   120  func (z *zip) WriteRegular(relPath string) (written int64, err error) {
   121  	// open file for read
   122  	absPath := filepath.Join(z.sourcePath, relPath)
   123  	info, err := os.Stat(absPath)
   124  	if err != nil {
   125  		return written, fmt.Errorf("stat: %w", err)
   126  	}
   127  	if !info.Mode().IsRegular() {
   128  		return 0, nil // ignore directories
   129  	}
   130  	f, err := os.Open(absPath)
   131  	if err != nil {
   132  		return written, fmt.Errorf("open: %w", err)
   133  	}
   134  	defer f.Close()
   135  
   136  	return z.writeOne(info, relPath, f)
   137  }
   138  
   139  func (z *zip) writeOne(info fs.FileInfo, relPath string, r io.Reader) (written int64, err error) {
   140  	// write info header
   141  	header, err := tar.FileInfoHeader(info, info.Name())
   142  	if err != nil {
   143  		return written, fmt.Errorf("file header: %w", err)
   144  	}
   145  	header.Name = relPath
   146  	header.ChangeTime = info.ModTime()
   147  	if err := z.w.WriteHeader(header); err != nil {
   148  		return written, fmt.Errorf("write header %s: %w", relPath, err)
   149  	}
   150  	// write bytes
   151  	written, err = io.Copy(z.w, r)
   152  	if err != nil {
   153  		return written, fmt.Errorf("copy: %s %w", relPath, err)
   154  	}
   155  	return
   156  }
   157  
// lastWritten returns the current value of the byte counter installed by
// NewZip, i.e. the total number of bytes read so far from the reader that
// NewZip returned.
func (z *zip) lastWritten() int64 {
	return z.counter()
}
   162  
// unzip extracts a gzip-compressed tar stream, fed through an in-memory pipe,
// into a destination directory. Values are created by NewUnzip, which also
// hands out the write end of the pipe.
type unzip struct {
	// destPath is the directory that archive entry names are joined to.
	destPath   string
	// gzr is the gzip layer; it stays nil until init has run.
	gzr        *gzip.Reader
	// r is the tar layer reading from gzr; set together with gzr by init.
	r          *tar.Reader
	// pipeReader is the read end of the pipe created in NewUnzip.
	pipeReader *io.PipeReader
}
   169  
   170  func NewUnzip(dst string) (unzip, io.WriteCloser) {
   171  	pr, pw := io.Pipe()
   172  	return unzip{
   173  		destPath:   dst,
   174  		pipeReader: pr,
   175  	}, pw
   176  }
   177  
   178  func (u *unzip) init() error {
   179  	if u.gzr != nil {
   180  		return nil
   181  	}
   182  	gz, err := gzip.NewReader(u.pipeReader)
   183  	if err != nil {
   184  		return fmt.Errorf("gzip.NewReader: %w", err)
   185  	}
   186  	u.gzr = gz
   187  	u.r = tar.NewReader(gz)
   188  	return nil
   189  }
   190  
   191  func (u *unzip) Close() (err error) {
   192  	var err1, err2 error
   193  	if err := u.pipeReader.Close(); err != nil && err != io.ErrClosedPipe {
   194  		err1 = err
   195  	}
   196  	if u.gzr != nil {
   197  		err2 = u.gzr.Close()
   198  	}
   199  	if err1 != nil || err2 != nil {
   200  		return fmt.Errorf("close pr: %w, gunzip: %w", err1, err2)
   201  	}
   202  
   203  	return nil
   204  }
   205  
   206  func (u *unzip) ReadChunk() (written int64, err error) {
   207  	if err := u.init(); err != nil {
   208  		return 0, err
   209  	}
   210  	parentPath := ""
   211  	for {
   212  		header, err := u.r.Next()
   213  		if err != nil {
   214  			if err == io.EOF { // end of the loop
   215  				return written, nil
   216  			}
   217  			return written, fmt.Errorf("fetch next: %w", err)
   218  		}
   219  		if header == nil {
   220  			continue
   221  		}
   222  
   223  		// target file
   224  		target := filepath.Join(u.destPath, header.Name)
   225  		switch header.Typeflag {
   226  		case tar.TypeDir:
   227  			if err := os.MkdirAll(target, 0o755); err != nil {
   228  				return written, fmt.Errorf("crateDir %s: %w", target, err)
   229  			}
   230  		case tar.TypeReg:
   231  			if pp := filepath.Dir(target); pp != parentPath {
   232  				parentPath = pp
   233  				if err := os.MkdirAll(parentPath, 0o755); err != nil {
   234  					return written, fmt.Errorf("crateDir %s: %w", target, err)
   235  				}
   236  			}
   237  			n, err := copyFile(target, header, u.r)
   238  			if err != nil {
   239  				return written, fmt.Errorf("copy file %s: %w", target, err)
   240  			}
   241  			written += n
   242  		}
   243  	}
   244  }
   245  
   246  func copyFile(target string, h *tar.Header, r io.Reader) (written int64, err error) {
   247  	f, err := os.OpenFile(target, os.O_CREATE|os.O_RDWR, os.FileMode(h.Mode))
   248  	if err != nil {
   249  		return written, fmt.Errorf("create: %w", err)
   250  	}
   251  	defer f.Close()
   252  	written, err = io.Copy(f, r)
   253  	if err != nil {
   254  		return written, fmt.Errorf("copy: %w", err)
   255  	}
   256  	return written, nil
   257  }
   258  
   259  type vFileInfo struct {
   260  	name    string
   261  	size    int
   262  	modTime time.Time // TODO: get it when parsing source files
   263  }
   264  
   265  func (v vFileInfo) Name() string       { return v.name }
   266  func (v vFileInfo) Size() int64        { return int64(v.size) }
   267  func (v vFileInfo) Mode() os.FileMode  { return 0o644 }
   268  func (v vFileInfo) ModTime() time.Time { return v.modTime }
   269  func (v vFileInfo) IsDir() bool        { return false }
   270  func (v vFileInfo) Sys() interface{}   { return nil }
   271  
   272  type readCloser struct {
   273  	src io.ReadCloser
   274  	n   int64
   275  }
   276  
   277  func (r *readCloser) Read(p []byte) (n int, err error) {
   278  	n, err = r.src.Read(p)
   279  	atomic.AddInt64(&r.n, int64(n))
   280  	return
   281  }
   282  
   283  func (r *readCloser) Close() error { return r.src.Close() }
   284  
   285  func (r *readCloser) counter() func() int64 {
   286  	return func() int64 {
   287  		return atomic.LoadInt64(&r.n)
   288  	}
   289  }
   290  
   291  func zipLevel(level int) int {
   292  	if level < 0 || level > 3 {
   293  		return gzip.DefaultCompression
   294  	}
   295  	switch CompressionLevel(level) {
   296  	case BestSpeed:
   297  		return gzip.BestSpeed
   298  	case BestCompression:
   299  		return gzip.BestCompression
   300  	default:
   301  		return gzip.DefaultCompression
   302  	}
   303  }
   304  
// zipConfig holds the compression settings derived from a Compression value
// by newZipConfig.
type zipConfig struct {
	// Level is the requested compression level as a plain int.
	Level      int
	// GoPoolSize is the value routinePoolSize returned for the requested CPU
	// percentage.
	GoPoolSize int
	// ChunkSize is the chunk size in bytes, after defaulting and clamping.
	ChunkSize  int
}
   310  
   311  func newZipConfig(c Compression) zipConfig {
   312  	// convert from MB to byte because input already
   313  	// in MB and validated against min:2 max:512
   314  	switch c.ChunkSize = c.ChunkSize * 1024 * 1024; {
   315  	case c.ChunkSize == 0:
   316  		c.ChunkSize = DefaultChunkSize
   317  	case c.ChunkSize > maxChunkSize:
   318  		c.ChunkSize = maxChunkSize
   319  	case c.ChunkSize < minChunkSize:
   320  		c.ChunkSize = minChunkSize
   321  	}
   322  
   323  	return zipConfig{
   324  		Level:      int(c.Level),
   325  		GoPoolSize: routinePoolSize(c.CPUPercentage),
   326  		ChunkSize:  c.ChunkSize,
   327  	}
   328  }