github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/shard/tar.go (about)

     1  // Package shard provides Extract(shard), Create(shard), and associated methods
     2  // across all suppported archival formats (see cmn/archive/mime.go)
     3  /*
     4   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package shard
     7  
     8  import (
     9  	"archive/tar"
    10  	"io"
    11  	"strconv"
    12  
    13  	"github.com/NVIDIA/aistore/cmn/archive"
    14  	"github.com/NVIDIA/aistore/cmn/cos"
    15  	"github.com/NVIDIA/aistore/cmn/debug"
    16  	"github.com/NVIDIA/aistore/core"
    17  	"github.com/NVIDIA/aistore/memsys"
    18  	jsoniter "github.com/json-iterator/go"
    19  )
    20  
type (
	// tarRW implements the RW interface for plain (uncompressed) tar shards.
	tarRW struct {
		ext string // shard filename extension; always archive.ExtTar (see NewTarRW)
	}
	// tarRecordW is an io.Writer that splits an incoming record stream into a
	// JSON-serialized tar.Header prefix (buffered in metadataBuf) followed by
	// payload bytes forwarded to tarWriter - see its Write method.
	tarRecordW struct {
		tarWriter    *tar.Writer
		slab         *memsys.Slab // slab backing metadataBuf; released via free()
		metadataSize int64        // length of the serialized-header prefix
		size         int64        // payload size set by reinit (not read in the visible code)
		written      int64        // bytes consumed so far (metadata first, then payload)
		metadataBuf  []byte       // scratch buffer accumulating the serialized header
	}
)

// interface guard
var _ RW = (*tarRW)(nil)
    37  
    38  ////////////////
    39  // tarRecordW //
    40  ////////////////
    41  
    42  func newTarRecordDataReader() *tarRecordW {
    43  	rd := &tarRecordW{}
    44  	rd.metadataBuf, rd.slab = core.T.ByteMM().Alloc()
    45  	return rd
    46  }
    47  
    48  func (rd *tarRecordW) reinit(tw *tar.Writer, size, metadataSize int64) {
    49  	rd.tarWriter = tw
    50  	rd.written = 0
    51  	rd.size = size
    52  	rd.metadataSize = metadataSize
    53  }
    54  
// free releases the metadata scratch buffer back to its slab.
func (rd *tarRecordW) free() {
	rd.slab.Free(rd.metadataBuf)
}
    58  
// Write accumulates the first rd.metadataSize bytes of the record stream into
// rd.metadataBuf, JSON-unmarshals them into a tar.Header and writes that
// header to the underlying tar writer; every byte after the prefix is
// forwarded as payload. The returned count covers both metadata and payload
// bytes, so on success it equals len(p).
func (rd *tarRecordW) Write(p []byte) (int, error) {
	// Write header
	remainingMetadataSize := rd.metadataSize - rd.written
	if remainingMetadataSize > 0 {
		writeN := int64(len(p))
		if writeN < remainingMetadataSize {
			// p ends inside the metadata prefix - buffer it and wait for the next call
			debug.Assert(int64(len(rd.metadataBuf))-rd.written >= writeN)
			copy(rd.metadataBuf[rd.written:], p)
			rd.written += writeN
			return len(p), nil
		}

		// complete the metadata prefix, then treat the rest of p as payload
		debug.Assert(int64(len(rd.metadataBuf))-rd.written >= remainingMetadataSize)
		copy(rd.metadataBuf[rd.written:], p[:remainingMetadataSize])
		rd.written += remainingMetadataSize
		p = p[remainingMetadataSize:]
		var header tar.Header
		if err := jsoniter.Unmarshal(rd.metadataBuf[:rd.metadataSize], &header); err != nil {
			return int(remainingMetadataSize), err
		}

		if err := rd.tarWriter.WriteHeader(&header); err != nil {
			return int(remainingMetadataSize), err
		}
	} else {
		// header already written by a previous call; nothing of p is metadata
		remainingMetadataSize = 0
	}

	n, err := rd.tarWriter.Write(p)
	rd.written += int64(n)
	return n + int(remainingMetadataSize), err
}
    91  
    92  ///////////
    93  // tarRW //
    94  ///////////
    95  
    96  func NewTarRW() RW { return &tarRW{ext: archive.ExtTar} }
    97  
// static properties of the plain-tar format
func (*tarRW) IsCompressed() bool   { return false }
func (*tarRW) SupportsOffset() bool { return true }
func (*tarRW) MetadataSize() int64  { return archive.TarBlockSize } // size of tar header with padding
   101  
   102  func (trw *tarRW) Extract(lom *core.LOM, r cos.ReadReaderAt, extractor RecordExtractor, toDisk bool) (int64, int, error) {
   103  	ar, err := archive.NewReader(trw.ext, r)
   104  	if err != nil {
   105  		return 0, 0, err
   106  	}
   107  	c := &rcbCtx{parent: trw, tw: nil, extractor: extractor, shardName: lom.ObjName, toDisk: toDisk, fromTar: true}
   108  	buf, slab := core.T.PageMM().AllocSize(lom.SizeBytes())
   109  	c.buf = buf
   110  
   111  	err = ar.ReadUntil(c, cos.EmptyMatchAll, "")
   112  
   113  	slab.Free(buf)
   114  	return c.extractedSize, c.extractedCount, err
   115  }
   116  
   117  // Note that the order of closing must be trw, gzw, then finally tarball.
   118  func (*tarRW) Create(s *Shard, tarball io.Writer, loader ContentLoader) (written int64, err error) {
   119  	var (
   120  		n         int64
   121  		needFlush bool
   122  		tw        = tar.NewWriter(tarball)
   123  		rdReader  = newTarRecordDataReader()
   124  	)
   125  	defer func() {
   126  		rdReader.free()
   127  		cos.Close(tw)
   128  	}()
   129  
   130  	for _, rec := range s.Records.All() {
   131  		for _, obj := range rec.Objects {
   132  			switch obj.StoreType {
   133  			case OffsetStoreType:
   134  				if needFlush {
   135  					// We now will write directly to the tarball file so we need
   136  					// to flush everything what we have written so far.
   137  					if err := tw.Flush(); err != nil {
   138  						return written, err
   139  					}
   140  					needFlush = false
   141  				}
   142  				if n, err = loader.Load(tarball, rec, obj); err != nil {
   143  					return written + n, err
   144  				}
   145  				// pad to 512 bytes
   146  				diff := cos.CeilAlignInt64(n, archive.TarBlockSize) - n
   147  				if diff > 0 {
   148  					if _, err = tarball.Write(padBuf[:diff]); err != nil {
   149  						return written + n, err
   150  					}
   151  					n += diff
   152  				}
   153  				debug.Assert(diff >= 0 && diff < archive.TarBlockSize)
   154  			case SGLStoreType, DiskStoreType:
   155  				rdReader.reinit(tw, obj.Size, obj.MetadataSize)
   156  				if n, err = loader.Load(rdReader, rec, obj); err != nil {
   157  					return written + n, err
   158  				}
   159  				written += n
   160  
   161  				needFlush = true
   162  			default:
   163  				debug.Assert(false, obj.StoreType)
   164  			}
   165  
   166  			written += n
   167  		}
   168  	}
   169  
   170  	return written, nil
   171  }
   172  
   173  // mostly follows `tar.formatPAXRecord`
   174  func estimateXHeaderSize(paxRecords map[string]string) int64 {
   175  	const padding = 3 // Extra padding for ' ', '=', and '\n'
   176  	totalSize := 0
   177  	for k, v := range paxRecords {
   178  		size := len(k) + len(v) + padding
   179  		size += len(strconv.Itoa(size))
   180  		record := strconv.Itoa(size) + " " + k + "=" + v + "\n"
   181  
   182  		// Final adjustment if adding size field increased the record size.
   183  		if len(record) != size {
   184  			record = strconv.Itoa(len(record)) + " " + k + "=" + v + "\n"
   185  		}
   186  		totalSize += len(record)
   187  	}
   188  	return int64(totalSize)
   189  }