github.com/weaviate/weaviate@v1.24.6/usecases/backup/zip.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package backup

import (
	"archive/tar"
	"bytes"
	"compress/gzip"
	"context"
	"fmt"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"sync/atomic"
	"time"

	"github.com/weaviate/weaviate/entities/backup"
)

// CompressionLevel represents a supported compression level
type CompressionLevel int

const (
	DefaultCompression CompressionLevel = iota
	BestSpeed
	BestCompression
)

// zip tars and gzip-compresses files located under sourcePath into a pipe.
type zip struct {
	sourcePath string
	w          *tar.Writer
	gzw        *gzip.Writer
	pipeWriter *io.PipeWriter
	counter    func() int64
}

// NewZip creates a zip writing into an internal pipe and returns it together
// with a ReadCloser that streams the compressed bytes and counts how many of
// them have been read.
func NewZip(sourcePath string, level int) (zip, io.ReadCloser) {
	pr, pw := io.Pipe()
	gzw, _ := gzip.NewWriterLevel(pw, zipLevel(level))
	reader := &readCloser{src: pr, n: 0}

	return zip{
		sourcePath: sourcePath,
		gzw:        gzw,
		w:          tar.NewWriter(gzw),
		pipeWriter: pw,
		counter:    reader.counter(),
	}, reader
}

// Close closes the tar writer, the gzip writer, and the pipe writer,
// flushing any remaining buffered data.
func (z *zip) Close() error {
	var err1, err2, err3 error
	err1 = z.w.Close()
	err2 = z.gzw.Close()
	if err := z.pipeWriter.Close(); err != nil && err != io.ErrClosedPipe {
		err3 = err
	}
	if err1 != nil || err2 != nil || err3 != nil {
		return fmt.Errorf("tar: %w, gzip: %w, pw: %w", err1, err2, err3)
	}
	return nil
}

// WriteShard writes the shard's internal files, including the in-memory files stored in sd
func (z *zip) WriteShard(ctx context.Context, sd *backup.ShardDescriptor) (written int64, err error) {
	var n int64 // bytes written for the current file
	for _, x := range [3]struct {
		relPath string
		data    []byte
		modTime time.Time
	}{
		{relPath: sd.DocIDCounterPath, data: sd.DocIDCounter},
		{relPath: sd.PropLengthTrackerPath, data: sd.PropLengthTracker},
		{relPath: sd.ShardVersionPath, data: sd.Version},
	} {
		info := vFileInfo{
			name: filepath.Base(x.relPath),
			size: len(x.data),
		}
		if n, err = z.writeOne(info, x.relPath, bytes.NewReader(x.data)); err != nil {
			return written, err
		}
		written += n
	}

	n, err = z.WriteRegulars(ctx, sd.Files)
	written += n

	return
}

// WriteRegulars writes the regular files at the given paths (relative to
// sourcePath), skipping .DS_Store entries.
func (z *zip) WriteRegulars(ctx context.Context, relPaths []string) (written int64, err error) {
	for _, relPath := range relPaths {
		if filepath.Base(relPath) == ".DS_Store" {
			continue
		}
		if err := ctx.Err(); err != nil {
			return written, err
		}
		n, err := z.WriteRegular(relPath)
		if err != nil {
			return written, err
		}
		written += n
	}
	return written, nil
}

// WriteRegular writes the single regular file at relPath (relative to
// sourcePath); non-regular files such as directories are ignored.
func (z *zip) WriteRegular(relPath string) (written int64, err error) {
	// open the file for reading
	absPath := filepath.Join(z.sourcePath, relPath)
	info, err := os.Stat(absPath)
	if err != nil {
		return written, fmt.Errorf("stat: %w", err)
	}
	if !info.Mode().IsRegular() {
		return 0, nil // ignore directories
	}
	f, err := os.Open(absPath)
	if err != nil {
		return written, fmt.Errorf("open: %w", err)
	}
	defer f.Close()

	return z.writeOne(info, relPath, f)
}
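
// Editor's illustration (not part of the original file): a minimal sketch of
// how NewZip, WriteRegulars, and Close are expected to be wired together. The
// io.Pipe inside NewZip blocks until the returned ReadCloser is drained, so
// the tar/gzip writes have to run in their own goroutine. The function name
// and the dst writer are hypothetical.
func exampleStreamRegulars(ctx context.Context, sourcePath string, relPaths []string, dst io.Writer) error {
	z, rc := NewZip(sourcePath, int(BestSpeed))
	defer rc.Close()

	errc := make(chan error, 1)
	go func() {
		// produce the tar.gz stream; Close flushes the tar and gzip footers
		_, err := z.WriteRegulars(ctx, relPaths)
		if cerr := z.Close(); err == nil {
			err = cerr
		}
		errc <- err
	}()

	// consume the compressed stream, e.g. copy it to a backup backend
	if _, err := io.Copy(dst, rc); err != nil {
		return err
	}
	return <-errc
}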

func (z *zip) writeOne(info fs.FileInfo, relPath string, r io.Reader) (written int64, err error) {
	// write the info header
	header, err := tar.FileInfoHeader(info, info.Name())
	if err != nil {
		return written, fmt.Errorf("file header: %w", err)
	}
	header.Name = relPath
	header.ChangeTime = info.ModTime()
	if err := z.w.WriteHeader(header); err != nil {
		return written, fmt.Errorf("write header %s: %w", relPath, err)
	}
	// write the file contents
	written, err = io.Copy(z.w, r)
	if err != nil {
		return written, fmt.Errorf("copy: %s %w", relPath, err)
	}
	return
}

// lastWritten reports the number of compressed bytes read from the stream so far
func (z *zip) lastWritten() int64 {
	return z.counter()
}

// unzip extracts a gzip'd tar stream received through a pipe into destPath.
type unzip struct {
	destPath   string
	gzr        *gzip.Reader
	r          *tar.Reader
	pipeReader *io.PipeReader
}

// NewUnzip creates an unzip extracting into dst and returns it together with
// the WriteCloser the compressed stream must be written to.
func NewUnzip(dst string) (unzip, io.WriteCloser) {
	pr, pw := io.Pipe()
	return unzip{
		destPath:   dst,
		pipeReader: pr,
	}, pw
}

// init lazily sets up the gzip and tar readers on first use.
func (u *unzip) init() error {
	if u.gzr != nil {
		return nil
	}
	gz, err := gzip.NewReader(u.pipeReader)
	if err != nil {
		return fmt.Errorf("gzip.NewReader: %w", err)
	}
	u.gzr = gz
	u.r = tar.NewReader(gz)
	return nil
}

func (u *unzip) Close() (err error) {
	var err1, err2 error
	if err := u.pipeReader.Close(); err != nil && err != io.ErrClosedPipe {
		err1 = err
	}
	if u.gzr != nil {
		err2 = u.gzr.Close()
	}
	if err1 != nil || err2 != nil {
		return fmt.Errorf("close pr: %w, gunzip: %w", err1, err2)
	}

	return nil
}

// ReadChunk extracts the streamed archive into destPath and returns the
// number of file bytes written.
func (u *unzip) ReadChunk() (written int64, err error) {
	if err := u.init(); err != nil {
		return 0, err
	}
	parentPath := ""
	for {
		header, err := u.r.Next()
		if err != nil {
			if err == io.EOF { // end of the archive
				return written, nil
			}
			return written, fmt.Errorf("fetch next: %w", err)
		}
		if header == nil {
			continue
		}

		// target file
		target := filepath.Join(u.destPath, header.Name)
		switch header.Typeflag {
		case tar.TypeDir:
			if err := os.MkdirAll(target, 0o755); err != nil {
				return written, fmt.Errorf("createDir %s: %w", target, err)
			}
		case tar.TypeReg:
			if pp := filepath.Dir(target); pp != parentPath {
				parentPath = pp
				if err := os.MkdirAll(parentPath, 0o755); err != nil {
					return written, fmt.Errorf("createDir %s: %w", target, err)
				}
			}
			n, err := copyFile(target, header, u.r)
			if err != nil {
				return written, fmt.Errorf("copy file %s: %w", target, err)
			}
			written += n
		}
	}
}

// copyFile creates target with the mode recorded in h and copies the current
// tar entry's contents into it.
func copyFile(target string, h *tar.Header, r io.Reader) (written int64, err error) {
	f, err := os.OpenFile(target, os.O_CREATE|os.O_RDWR, os.FileMode(h.Mode))
	if err != nil {
		return written, fmt.Errorf("create: %w", err)
	}
	defer f.Close()
	written, err = io.Copy(f, r)
	if err != nil {
		return written, fmt.Errorf("copy: %w", err)
	}
	return written, nil
}
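
// Editor's illustration (not part of the original file): the restore
// direction. NewUnzip returns the extractor and the write end of an io.Pipe;
// the gzip'd tar chunk is pumped into that WriteCloser from one goroutine
// while ReadChunk untars it into destPath. The function name and the chunk
// reader are hypothetical.
func exampleRestoreChunk(destPath string, chunk io.Reader) (int64, error) {
	uz, wc := NewUnzip(destPath)
	defer uz.Close()

	errc := make(chan error, 1)
	go func() {
		// feed the compressed stream; closing wc delivers EOF to the tar reader
		_, err := io.Copy(wc, chunk)
		if cerr := wc.Close(); err == nil {
			err = cerr
		}
		errc <- err
	}()

	written, err := uz.ReadChunk()
	if err != nil {
		return written, err
	}
	return written, <-errc
}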

// vFileInfo is an in-memory fs.FileInfo for shard files that exist only as
// byte slices in the shard descriptor.
type vFileInfo struct {
	name    string
	size    int
	modTime time.Time // TODO: get it when parsing source files
}

func (v vFileInfo) Name() string       { return v.name }
func (v vFileInfo) Size() int64        { return int64(v.size) }
func (v vFileInfo) Mode() os.FileMode  { return 0o644 }
func (v vFileInfo) ModTime() time.Time { return v.modTime }
func (v vFileInfo) IsDir() bool        { return false }
func (v vFileInfo) Sys() interface{}   { return nil }

// readCloser wraps a ReadCloser and atomically counts the bytes read through it.
type readCloser struct {
	src io.ReadCloser
	n   int64
}

func (r *readCloser) Read(p []byte) (n int, err error) {
	n, err = r.src.Read(p)
	atomic.AddInt64(&r.n, int64(n))
	return
}

func (r *readCloser) Close() error { return r.src.Close() }

func (r *readCloser) counter() func() int64 {
	return func() int64 {
		return atomic.LoadInt64(&r.n)
	}
}

// zipLevel maps a CompressionLevel to the corresponding gzip level, falling
// back to gzip.DefaultCompression for out-of-range values.
func zipLevel(level int) int {
	if level < 0 || level > 3 {
		return gzip.DefaultCompression
	}
	switch CompressionLevel(level) {
	case BestSpeed:
		return gzip.BestSpeed
	case BestCompression:
		return gzip.BestCompression
	default:
		return gzip.DefaultCompression
	}
}

type zipConfig struct {
	Level      int
	GoPoolSize int
	ChunkSize  int
}

func newZipConfig(c Compression) zipConfig {
	// convert ChunkSize from MB to bytes; the input is given in MB
	// and validated against min:2 max:512
	switch c.ChunkSize = c.ChunkSize * 1024 * 1024; {
	case c.ChunkSize == 0:
		c.ChunkSize = DefaultChunkSize
	case c.ChunkSize > maxChunkSize:
		c.ChunkSize = maxChunkSize
	case c.ChunkSize < minChunkSize:
		c.ChunkSize = minChunkSize
	}

	return zipConfig{
		Level:      int(c.Level),
		GoPoolSize: routinePoolSize(c.CPUPercentage),
		ChunkSize:  c.ChunkSize,
	}
}
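
// Editor's illustration (not part of the original file): a rough sketch of
// how the user-facing Compression settings map onto a zipConfig. ChunkSize is
// given in MB, converted to bytes, and clamped by newZipConfig; the zero
// Level falls through zipLevel to gzip.DefaultCompression. The function name
// and the concrete values below are hypothetical; Compression and the chunk
// size constants are defined elsewhere in this package.
func exampleZipConfig() zipConfig {
	c := Compression{
		CPUPercentage: 50,  // translated into a goroutine pool size by routinePoolSize
		ChunkSize:     128, // MB; becomes 128 * 1024 * 1024 bytes after clamping
	}
	return newZipConfig(c)
}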