github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/shard/zip.go (about)

     1  // Package shard provides Extract(shard), Create(shard), and associated methods
     2  // across all supported archival formats (see cmn/archive/mime.go)
     3  /*
     4   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package shard
     7  
     8  import (
     9  	"archive/zip"
    10  	"io"
    11  
    12  	"github.com/NVIDIA/aistore/cmn/archive"
    13  	"github.com/NVIDIA/aistore/cmn/cos"
    14  	"github.com/NVIDIA/aistore/cmn/debug"
    15  	"github.com/NVIDIA/aistore/core"
    16  	"github.com/NVIDIA/aistore/memsys"
    17  	jsoniter "github.com/json-iterator/go"
    18  )
    19  
    20  type (
    21  	zipRW struct {
    22  		ext string
    23  	}
    24  
    25  	zipFileHeader struct {
    26  		Name    string `json:"name"`
    27  		Comment string `json:"comment"`
    28  	}
    29  
    30  	// zipRecordDataReader is used for writing metadata as well as data to the buffer.
    31  	zipRecordDataReader struct {
    32  		slab *memsys.Slab
    33  
    34  		metadataSize int64
    35  		size         int64
    36  		written      int64
    37  		metadataBuf  []byte
    38  		header       zipFileHeader
    39  		zipWriter    *zip.Writer
    40  
    41  		writer io.Writer
    42  	}
    43  )
    44  
// interface guard: compile-time check that *zipRW satisfies RW
var _ RW = (*zipRW)(nil)
    47  
    48  ///////////
    49  // zipRW //
    50  ///////////
    51  
    52  func NewZipRW() RW { return &zipRW{ext: archive.ExtZip} }
    53  
    54  func (*zipRW) IsCompressed() bool   { return true }
    55  func (*zipRW) SupportsOffset() bool { return false }
    56  func (*zipRW) MetadataSize() int64  { return 0 } // zip does not have header size
    57  
    58  // Extract reads the tarball f and extracts its metadata.
    59  func (zrw *zipRW) Extract(lom *core.LOM, r cos.ReadReaderAt, extractor RecordExtractor, toDisk bool) (int64, int, error) {
    60  	ar, err := archive.NewReader(zrw.ext, r, lom.SizeBytes())
    61  	if err != nil {
    62  		return 0, 0, err
    63  	}
    64  	c := &rcbCtx{parent: zrw, extractor: extractor, shardName: lom.ObjName, toDisk: toDisk, fromTar: false}
    65  	buf, slab := core.T.PageMM().AllocSize(lom.SizeBytes())
    66  	c.buf = buf
    67  
    68  	err = ar.ReadUntil(c, cos.EmptyMatchAll, "")
    69  
    70  	slab.Free(buf)
    71  	return c.extractedSize, c.extractedCount, err
    72  }
    73  
    74  // Create creates a new shard locally based on the Shard.
    75  // Note that the order of closing must be trw, gzw, then finally tarball.
    76  func (*zipRW) Create(s *Shard, w io.Writer, loader ContentLoader) (written int64, err error) {
    77  	var n int64
    78  	zw := zip.NewWriter(w)
    79  	defer cos.Close(zw)
    80  
    81  	rdReader := newZipRecordDataReader()
    82  	for _, rec := range s.Records.All() {
    83  		for _, obj := range rec.Objects {
    84  			rdReader.reinit(zw, obj.Size, obj.MetadataSize)
    85  			if n, err = loader.Load(rdReader, rec, obj); err != nil {
    86  				return written + n, err
    87  			}
    88  
    89  			written += n
    90  		}
    91  	}
    92  	rdReader.free()
    93  	return written, nil
    94  }
    95  
    96  /////////////////////////
    97  // zipRecordDataReader //
    98  /////////////////////////
    99  
   100  func newZipRecordDataReader() *zipRecordDataReader {
   101  	rd := &zipRecordDataReader{}
   102  	rd.metadataBuf, rd.slab = core.T.ByteMM().Alloc()
   103  	return rd
   104  }
   105  
   106  func (rd *zipRecordDataReader) reinit(zw *zip.Writer, size, metadataSize int64) {
   107  	rd.zipWriter = zw
   108  	rd.written = 0
   109  	rd.size = size
   110  	rd.metadataSize = metadataSize
   111  }
   112  
   113  func (rd *zipRecordDataReader) free() {
   114  	rd.slab.Free(rd.metadataBuf)
   115  }
   116  
   117  func (rd *zipRecordDataReader) Write(p []byte) (int, error) {
   118  	// Read header and initialize file writer
   119  	remainingMetadataSize := rd.metadataSize - rd.written
   120  	if remainingMetadataSize > 0 {
   121  		writeN := int64(len(p))
   122  		if writeN < remainingMetadataSize {
   123  			debug.Assert(int64(len(rd.metadataBuf))-rd.written >= writeN)
   124  			copy(rd.metadataBuf[rd.written:], p)
   125  			rd.written += writeN
   126  			return len(p), nil
   127  		}
   128  		debug.Assert(int64(len(rd.metadataBuf))-rd.written >= remainingMetadataSize)
   129  
   130  		copy(rd.metadataBuf[rd.written:], p[:remainingMetadataSize])
   131  		rd.written += remainingMetadataSize
   132  		p = p[remainingMetadataSize:]
   133  		var metadata zipFileHeader
   134  		if err := jsoniter.Unmarshal(rd.metadataBuf[:rd.metadataSize], &metadata); err != nil {
   135  			return int(remainingMetadataSize), err
   136  		}
   137  
   138  		rd.header = metadata
   139  		writer, err := rd.zipWriter.Create(rd.header.Name)
   140  		if err != nil {
   141  			return int(remainingMetadataSize), err
   142  		}
   143  		if err := rd.zipWriter.SetComment(rd.header.Comment); err != nil {
   144  			return int(remainingMetadataSize), err
   145  		}
   146  		rd.writer = writer
   147  	} else {
   148  		remainingMetadataSize = 0
   149  	}
   150  
   151  	n, err := rd.writer.Write(p)
   152  	rd.written += int64(n)
   153  	return n + int(remainingMetadataSize), err
   154  }