github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/tools/readers/readers.go (about)

     1  // Package readers provides implementation for common reader types
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package readers
     6  
     7  import (
     8  	"archive/tar"
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"math/rand"
    14  	"os"
    15  	"path"
    16  
    17  	"github.com/NVIDIA/aistore/cmn/archive"
    18  	"github.com/NVIDIA/aistore/cmn/cos"
    19  	"github.com/NVIDIA/aistore/cmn/debug"
    20  	"github.com/NVIDIA/aistore/cmn/mono"
    21  	"github.com/NVIDIA/aistore/ext/dsort/shard"
    22  	"github.com/NVIDIA/aistore/memsys"
    23  	"github.com/NVIDIA/aistore/tools/tarch"
    24  )
    25  
    26  const (
    27  	// TypeFile defines the name for file reader
    28  	TypeFile = "file"
    29  	// TypeSG defines the name for sg reader
    30  	TypeSG = "sg"
    31  	// TypeRand defines the name for rand reader
    32  	TypeRand = "rand"
    33  	// TypeTar defines the name for random TAR reader
    34  	TypeTar = "tar"
    35  )
    36  
type (
	// Reader is the interface implemented by every reader type in this package:
	// cos.ReadOpenCloser plus io.Seeker plus access to the content checksum.
	Reader interface {
		cos.ReadOpenCloser
		io.Seeker
		// Cksum returns the checksum associated with the reader's content, or nil if there is none.
		Cksum() *cos.Cksum
	}
	// randReader yields `size` pseudo-random bytes generated on the fly.
	// Re-creating `rnd` from `seed` replays the same byte stream (see Open and Seek).
	randReader struct {
		// seed used to (re)create rnd
		seed int64
		// pseudo-random source of the content
		rnd *rand.Rand
		// total number of bytes the reader yields
		size int64
		// current read position, in bytes
		offset int64
		// checksum of the entire stream; nil when created with cos.ChecksumNone
		cksum *cos.Cksum
	}
	// tarReader serves an in-memory TAR archive (see newTarReader).
	tarReader struct {
		// the serialized archive; retained so that Open can start a new reader over it
		b []byte
		bytes.Reader
		cksum *cos.Cksum
	}
	// rrLimited caps a pseudo-random source at `size` bytes; NewRand uses it to
	// checksum the stream up front.
	// NOTE: instantiated positionally (see NewRand) - do not reorder fields.
	rrLimited struct {
		random *rand.Rand
		size   int64
		off    int64
	}
	// fileReader is a Reader on top of an open file.
	// NOTE: instantiated positionally (see NewRandFile) - do not reorder fields.
	fileReader struct {
		*os.File
		filePath string // Example: "/dir/ais/"
		name     string // Example: "smoke/bGzhWKWoxHDSePnELftx"
		cksum    *cos.Cksum
	}
	// sgReader is a Reader on top of memsys.Reader, which NewSG creates from an SGL.
	// NOTE: instantiated positionally (see NewSG) - do not reorder fields.
	sgReader struct {
		memsys.Reader
		cksum *cos.Cksum
	}
	// bytesReader is a Reader over a caller-provided byte slice; it carries no checksum.
	// NOTE: instantiated positionally (see NewBytes) - do not reorder fields.
	bytesReader struct {
		*bytes.Buffer
		// the original slice; retained so that Open can start over from the beginning
		buf []byte
	}

	// (aisloader only)
	// Params is the argument of New; Type selects which constructor is called.
	Params struct {
		Type       string      // file | sg | rand | tar (see the Type* constants)
		SGL        *memsys.SGL // When Type == sg
		Path, Name string      // When Type == file; path and name of file to be created (if not already existing)
		Size       int64
	}
)
    83  
    84  // interface guard
    85  var (
    86  	_ Reader = (*randReader)(nil)
    87  	_ Reader = (*tarReader)(nil)
    88  	_ Reader = (*fileReader)(nil)
    89  	_ Reader = (*sgReader)(nil)
    90  )
    91  
    92  ////////////////
    93  // randReader //
    94  ////////////////
    95  
    96  func NewRand(size int64, cksumType string) (Reader, error) {
    97  	var (
    98  		cksum *cos.Cksum
    99  		seed  = mono.NanoTime()
   100  	)
   101  	rand1 := rand.New(rand.NewSource(seed))
   102  	if cksumType != cos.ChecksumNone {
   103  		rr := &rrLimited{rand1, size, 0}
   104  		_, cksumHash, err := cos.CopyAndChecksum(io.Discard, rr, nil, cksumType)
   105  		if err != nil {
   106  			return nil, err
   107  		}
   108  		cksum = cksumHash.Clone()
   109  	}
   110  	rand1dup := rand.New(rand.NewSource(seed))
   111  	return &randReader{
   112  		seed:  seed,
   113  		rnd:   rand1dup,
   114  		size:  size,
   115  		cksum: cksum,
   116  	}, nil
   117  }
   118  
   119  func (r *randReader) Read(buf []byte) (int, error) {
   120  	available := r.size - r.offset
   121  	if available == 0 {
   122  		return 0, io.EOF
   123  	}
   124  
   125  	want := int64(len(buf))
   126  	n := min(want, available)
   127  	actual, err := r.rnd.Read(buf[:n])
   128  	if err != nil {
   129  		return 0, nil
   130  	}
   131  
   132  	r.offset += int64(actual)
   133  	return actual, nil
   134  }
   135  
   136  // Open implements the Reader interface.
   137  // Returns a new rand reader using the same seed.
   138  func (r *randReader) Open() (cos.ReadOpenCloser, error) {
   139  	return &randReader{
   140  		seed:  r.seed,
   141  		rnd:   rand.New(rand.NewSource(r.seed)),
   142  		size:  r.size,
   143  		cksum: r.cksum,
   144  	}, nil
   145  }
   146  
   147  // Close implements the Reader interface.
   148  func (*randReader) Close() error { return nil }
   149  
   150  // Seek implements the Reader interface.
   151  func (r *randReader) Seek(offset int64, whence int) (int64, error) {
   152  	var abs int64
   153  
   154  	switch whence {
   155  	case io.SeekStart:
   156  		abs = offset
   157  	case io.SeekCurrent:
   158  		abs = r.offset + offset
   159  	case io.SeekEnd:
   160  		abs = r.size + offset
   161  	default:
   162  		return 0, errors.New("invalid whence")
   163  	}
   164  
   165  	if abs < 0 {
   166  		return 0, errors.New("negative offset position")
   167  	}
   168  
   169  	if abs >= r.size {
   170  		r.offset = r.size
   171  		return r.offset, nil
   172  	}
   173  
   174  	r.rnd = rand.New(rand.NewSource(r.seed))
   175  	r.offset = 0
   176  	actual, err := io.CopyN(io.Discard, r, abs)
   177  	if err != nil {
   178  		return 0, err
   179  	}
   180  
   181  	if actual != abs {
   182  		err := fmt.Errorf("failed to seek to %d, seeked to %d instead", offset, actual)
   183  		return 0, err
   184  	}
   185  
   186  	return abs, nil
   187  }
   188  
// Cksum implements the Reader interface.
// Returns the checksum stored in the reader; nil if NewRand was called with cos.ChecksumNone.
func (r *randReader) Cksum() *cos.Cksum {
	return r.cksum
}
   193  
   194  func (rr *rrLimited) Read(p []byte) (n int, err error) {
   195  	rem := int(min(rr.size-rr.off, int64(len(p))))
   196  	n, _ = rr.random.Read(p[:rem]) // never fails
   197  	rr.off += int64(n)
   198  	if rem < len(p) {
   199  		err = io.EOF
   200  	}
   201  	return
   202  }
   203  
   204  ////////////////
   205  // fileReader //
   206  ////////////////
   207  
   208  // creates/opens the file, populates it with random data, and returns a new fileReader
   209  // NOTE: Caller is responsible for closing.
   210  func NewRandFile(filepath, name string, size int64, cksumType string) (Reader, error) {
   211  	var (
   212  		cksum     *cos.Cksum
   213  		cksumHash *cos.CksumHash
   214  		fn        = path.Join(filepath, name)
   215  		f, err    = os.OpenFile(fn, os.O_RDWR|os.O_CREATE, cos.PermRWR)
   216  		exists    bool
   217  	)
   218  	if err != nil {
   219  		return nil, err
   220  	}
   221  	if size == -1 {
   222  		// checksum existing file
   223  		exists = true
   224  		if cksumType != cos.ChecksumNone {
   225  			debug.Assert(cksumType != "")
   226  			_, cksumHash, err = cos.CopyAndChecksum(io.Discard, f, nil, cksumType)
   227  		}
   228  	} else {
   229  		// Write random file
   230  		cksumHash, err = copyRandWithHash(f, size, cksumType, cos.NowRand())
   231  	}
   232  	if err == nil {
   233  		_, err = f.Seek(0, io.SeekStart)
   234  	}
   235  
   236  	if err != nil {
   237  		// cleanup and ret
   238  		f.Close()
   239  		if !exists {
   240  			os.Remove(fn)
   241  		}
   242  		return nil, err
   243  	}
   244  
   245  	if cksumType != cos.ChecksumNone {
   246  		cksum = cksumHash.Clone()
   247  	}
   248  	return &fileReader{f, filepath, name, cksum}, nil
   249  }
   250  
   251  // NewExistingFile opens an existing file, reads it to compute checksum, and returns a new reader.
   252  // NOTE: Caller responsible for closing.
   253  func NewExistingFile(fn, cksumType string) (Reader, error) {
   254  	return NewRandFile(fn, "", -1, cksumType)
   255  }
   256  
   257  func (r *fileReader) Open() (cos.ReadOpenCloser, error) {
   258  	cksumType := cos.ChecksumNone
   259  	if r.cksum != nil {
   260  		cksumType = r.cksum.Type()
   261  	}
   262  	return NewRandFile(r.filePath, r.name, -1, cksumType)
   263  }
   264  
// Cksum implements the Reader interface.
// Returns the checksum stored in the reader; nil if NewRandFile was called with cos.ChecksumNone.
func (r *fileReader) Cksum() *cos.Cksum {
	return r.cksum
}
   269  
   270  //////////////
   271  // sgReader //
   272  //////////////
   273  
   274  func NewSG(sgl *memsys.SGL, size int64, cksumType string) (Reader, error) {
   275  	var cksum *cos.Cksum
   276  	if size > 0 {
   277  		cksumHash, err := copyRandWithHash(sgl, size, cksumType, cos.NowRand())
   278  		if err != nil {
   279  			return nil, err
   280  		}
   281  		if cksumType != cos.ChecksumNone {
   282  			cksum = cksumHash.Clone()
   283  		}
   284  	}
   285  
   286  	r := memsys.NewReader(sgl)
   287  	return &sgReader{*r, cksum}, nil
   288  }
   289  
// Cksum implements the Reader interface.
// Returns the checksum stored in the reader; NewSG leaves it nil when size <= 0 or
// when called with cos.ChecksumNone.
func (r *sgReader) Cksum() *cos.Cksum {
	return r.cksum
}
   293  
   294  /////////////////
   295  // bytesReader //
   296  /////////////////
   297  
   298  func NewBytes(buf []byte) Reader                    { return &bytesReader{bytes.NewBuffer(buf), buf} }
   299  func (*bytesReader) Close() error                   { return nil }
   300  func (*bytesReader) Cksum() *cos.Cksum              { return nil }
   301  func (*bytesReader) Seek(int64, int) (int64, error) { return 0, nil }
   302  
   303  func (r *bytesReader) Open() (cos.ReadOpenCloser, error) {
   304  	return &bytesReader{bytes.NewBuffer(r.buf), r.buf}, nil
   305  }
   306  
   307  ///////////////
   308  // tarReader //
   309  ///////////////
   310  
   311  func newTarReader(size int64, cksumType string) (r Reader, err error) {
   312  	var (
   313  		singleFileSize = min(size, int64(cos.KiB))
   314  		buff           = bytes.NewBuffer(nil)
   315  	)
   316  	err = tarch.CreateArchCustomFilesToW(buff, tar.FormatUnknown, archive.ExtTar, max(int(size/singleFileSize), 1),
   317  		int(singleFileSize), shard.ContentKeyInt, ".cls", true)
   318  	if err != nil {
   319  		return nil, err
   320  	}
   321  	cksum, err := cos.ChecksumBytes(buff.Bytes(), cksumType)
   322  	if err != nil {
   323  		return nil, err
   324  	}
   325  	return &tarReader{
   326  		b:      buff.Bytes(),
   327  		Reader: *bytes.NewReader(buff.Bytes()),
   328  		cksum:  cksum,
   329  	}, err
   330  }
   331  
   332  func (*tarReader) Close() error        { return nil }
   333  func (r *tarReader) Cksum() *cos.Cksum { return r.cksum }
   334  
   335  func (r *tarReader) Open() (cos.ReadOpenCloser, error) {
   336  	return &tarReader{
   337  		Reader: *bytes.NewReader(r.b),
   338  		cksum:  r.cksum,
   339  		b:      r.b,
   340  	}, nil
   341  }
   342  
   343  //
   344  // for convenience
   345  //
   346  
   347  func New(p Params, cksumType string) (Reader, error) {
   348  	switch p.Type {
   349  	case TypeSG:
   350  		debug.Assert(p.SGL != nil)
   351  		return NewSG(p.SGL, p.Size, cksumType)
   352  	case TypeRand:
   353  		return NewRand(p.Size, cksumType)
   354  	case TypeFile:
   355  		return NewRandFile(p.Path, p.Name, p.Size, cksumType)
   356  	case TypeTar:
   357  		return newTarReader(p.Size, cksumType)
   358  	default:
   359  		return nil, errors.New("unknown memory type for creating inmem reader")
   360  	}
   361  }
   362  
   363  // copyRandWithHash reads data from random source and writes it to a writer while
   364  // optionally computing xxhash
   365  // See related: memsys_test.copyRand
   366  func copyRandWithHash(w io.Writer, size int64, cksumType string, rnd *rand.Rand) (*cos.CksumHash, error) {
   367  	var (
   368  		cksum   *cos.CksumHash
   369  		rem     = size
   370  		buf, s  = memsys.PageMM().Alloc()
   371  		blkSize = int64(len(buf))
   372  	)
   373  	defer s.Free(buf)
   374  
   375  	if cksumType != cos.ChecksumNone {
   376  		cksum = cos.NewCksumHash(cksumType)
   377  	}
   378  	for i := int64(0); i <= size/blkSize; i++ {
   379  		n := int(min(blkSize, rem))
   380  		rnd.Read(buf[:n])
   381  		m, err := w.Write(buf[:n])
   382  		if err != nil {
   383  			return nil, err
   384  		}
   385  		if cksumType != cos.ChecksumNone {
   386  			cksum.H.Write(buf[:m])
   387  		}
   388  		debug.Assert(m == n)
   389  		rem -= int64(m)
   390  	}
   391  	if cksumType != cos.ChecksumNone {
   392  		cksum.Finalize()
   393  	}
   394  	return cksum, nil
   395  }