github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/shard/tar.go (about) 1 // Package shard provides Extract(shard), Create(shard), and associated methods 2 // across all suppported archival formats (see cmn/archive/mime.go) 3 /* 4 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 5 */ 6 package shard 7 8 import ( 9 "archive/tar" 10 "io" 11 "strconv" 12 13 "github.com/NVIDIA/aistore/cmn/archive" 14 "github.com/NVIDIA/aistore/cmn/cos" 15 "github.com/NVIDIA/aistore/cmn/debug" 16 "github.com/NVIDIA/aistore/core" 17 "github.com/NVIDIA/aistore/memsys" 18 jsoniter "github.com/json-iterator/go" 19 ) 20 21 type ( 22 tarRW struct { 23 ext string 24 } 25 tarRecordW struct { 26 tarWriter *tar.Writer 27 slab *memsys.Slab 28 metadataSize int64 29 size int64 30 written int64 31 metadataBuf []byte 32 } 33 ) 34 35 // interface guard 36 var _ RW = (*tarRW)(nil) 37 38 //////////////// 39 // tarRecordW // 40 //////////////// 41 42 func newTarRecordDataReader() *tarRecordW { 43 rd := &tarRecordW{} 44 rd.metadataBuf, rd.slab = core.T.ByteMM().Alloc() 45 return rd 46 } 47 48 func (rd *tarRecordW) reinit(tw *tar.Writer, size, metadataSize int64) { 49 rd.tarWriter = tw 50 rd.written = 0 51 rd.size = size 52 rd.metadataSize = metadataSize 53 } 54 55 func (rd *tarRecordW) free() { 56 rd.slab.Free(rd.metadataBuf) 57 } 58 59 func (rd *tarRecordW) Write(p []byte) (int, error) { 60 // Write header 61 remainingMetadataSize := rd.metadataSize - rd.written 62 if remainingMetadataSize > 0 { 63 writeN := int64(len(p)) 64 if writeN < remainingMetadataSize { 65 debug.Assert(int64(len(rd.metadataBuf))-rd.written >= writeN) 66 copy(rd.metadataBuf[rd.written:], p) 67 rd.written += writeN 68 return len(p), nil 69 } 70 71 debug.Assert(int64(len(rd.metadataBuf))-rd.written >= remainingMetadataSize) 72 copy(rd.metadataBuf[rd.written:], p[:remainingMetadataSize]) 73 rd.written += remainingMetadataSize 74 p = p[remainingMetadataSize:] 75 var header tar.Header 76 if err := jsoniter.Unmarshal(rd.metadataBuf[:rd.metadataSize], &header); err != nil { 77 return int(remainingMetadataSize), err 78 } 79 80 if err := rd.tarWriter.WriteHeader(&header); err != nil { 81 return int(remainingMetadataSize), err 82 } 83 } else { 84 remainingMetadataSize = 0 85 } 86 87 n, err := rd.tarWriter.Write(p) 88 rd.written += int64(n) 89 return n + int(remainingMetadataSize), err 90 } 91 92 /////////// 93 // tarRW // 94 /////////// 95 96 func NewTarRW() RW { return &tarRW{ext: archive.ExtTar} } 97 98 func (*tarRW) IsCompressed() bool { return false } 99 func (*tarRW) SupportsOffset() bool { return true } 100 func (*tarRW) MetadataSize() int64 { return archive.TarBlockSize } // size of tar header with padding 101 102 func (trw *tarRW) Extract(lom *core.LOM, r cos.ReadReaderAt, extractor RecordExtractor, toDisk bool) (int64, int, error) { 103 ar, err := archive.NewReader(trw.ext, r) 104 if err != nil { 105 return 0, 0, err 106 } 107 c := &rcbCtx{parent: trw, tw: nil, extractor: extractor, shardName: lom.ObjName, toDisk: toDisk, fromTar: true} 108 buf, slab := core.T.PageMM().AllocSize(lom.SizeBytes()) 109 c.buf = buf 110 111 err = ar.ReadUntil(c, cos.EmptyMatchAll, "") 112 113 slab.Free(buf) 114 return c.extractedSize, c.extractedCount, err 115 } 116 117 // Note that the order of closing must be trw, gzw, then finally tarball. 118 func (*tarRW) Create(s *Shard, tarball io.Writer, loader ContentLoader) (written int64, err error) { 119 var ( 120 n int64 121 needFlush bool 122 tw = tar.NewWriter(tarball) 123 rdReader = newTarRecordDataReader() 124 ) 125 defer func() { 126 rdReader.free() 127 cos.Close(tw) 128 }() 129 130 for _, rec := range s.Records.All() { 131 for _, obj := range rec.Objects { 132 switch obj.StoreType { 133 case OffsetStoreType: 134 if needFlush { 135 // We now will write directly to the tarball file so we need 136 // to flush everything what we have written so far. 137 if err := tw.Flush(); err != nil { 138 return written, err 139 } 140 needFlush = false 141 } 142 if n, err = loader.Load(tarball, rec, obj); err != nil { 143 return written + n, err 144 } 145 // pad to 512 bytes 146 diff := cos.CeilAlignInt64(n, archive.TarBlockSize) - n 147 if diff > 0 { 148 if _, err = tarball.Write(padBuf[:diff]); err != nil { 149 return written + n, err 150 } 151 n += diff 152 } 153 debug.Assert(diff >= 0 && diff < archive.TarBlockSize) 154 case SGLStoreType, DiskStoreType: 155 rdReader.reinit(tw, obj.Size, obj.MetadataSize) 156 if n, err = loader.Load(rdReader, rec, obj); err != nil { 157 return written + n, err 158 } 159 written += n 160 161 needFlush = true 162 default: 163 debug.Assert(false, obj.StoreType) 164 } 165 166 written += n 167 } 168 } 169 170 return written, nil 171 } 172 173 // mostly follows `tar.formatPAXRecord` 174 func estimateXHeaderSize(paxRecords map[string]string) int64 { 175 const padding = 3 // Extra padding for ' ', '=', and '\n' 176 totalSize := 0 177 for k, v := range paxRecords { 178 size := len(k) + len(v) + padding 179 size += len(strconv.Itoa(size)) 180 record := strconv.Itoa(size) + " " + k + "=" + v + "\n" 181 182 // Final adjustment if adding size field increased the record size. 183 if len(record) != size { 184 record = strconv.Itoa(len(record)) + " " + k + "=" + v + "\n" 185 } 186 totalSize += len(record) 187 } 188 return int64(totalSize) 189 }