github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/go/nbs/benchmarks/data_source.go

// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package main

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"io"
	"os"

	"github.com/attic-labs/kingpin"
	"github.com/attic-labs/noms/go/chunks"
	"github.com/attic-labs/noms/go/d"
	"github.com/attic-labs/noms/go/hash"
	"github.com/attic-labs/noms/go/nbs/benchmarks/gen"
	"github.com/dustin/go-humanize"
)

var readFile = kingpin.Flag("input-file", "A file full of test data. Creates and saves an associated .chunks file at runtime if one doesn't yet exist. If no file is specified, the data and .chunks files will be generated and saved.").String()

// averageChunkSize is the expected average chunk size, in bytes.
const averageChunkSize = 4 * 1024

// hashSlice is a sortable slice of chunk hashes; it implements sort.Interface.
type hashSlice []hash.Hash

func (s hashSlice) Len() int {
	return len(s)
}

func (s hashSlice) Less(i, j int) bool {
	return s[i].Less(s[j])
}

func (s hashSlice) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// dataSource pairs a benchmark data file with its precomputed chunk-map
// (.chunks) file, tracking how much of the data has been read and the hashes
// of all chunks it contains.
type dataSource struct {
	data, cm            *os.File
	totalData, dataRead uint64
	hashes              hashSlice
}

// getInput opens (or generates) the test data file and its .chunks chunk map,
// then scans the chunk map once to collect the hash of every chunk.
func getInput(dataSize uint64) (src *dataSource, err error) {
	filename := *readFile
	if filename == "" {
		filename = humanize.IBytes(dataSize) + ".bin"
	}
	data, err := gen.OpenOrGenerateDataFile(filename, dataSize)
	if err != nil {
		return nil, err
	}
	chunkFile := data.Name() + ".chunks"
	cm := gen.OpenOrBuildChunkMap(chunkFile, data)
	fmt.Println("Reading from", filename, "with chunks", chunkFile)
	src = &dataSource{data: data, cm: cm, totalData: dataSize}
	tuples := make(chan offsetTuple, 1024)
	go func() {
		src.readTuples(tuples)
		close(tuples)
	}()
	for ot := range tuples {
		src.hashes = append(src.hashes, ot.h)
	}
	return src, nil
}

// offsetTuple is one entry in the chunk-map file: the hash of a chunk and its
// length in bytes.
type offsetTuple struct {
	h hash.Hash
	l uint64
}

// PrimeFilesystemCache reads the entire data file once, chunk by chunk, so
// that subsequent benchmark reads are likely to be served from the OS page
// cache.
func (src *dataSource) PrimeFilesystemCache() {
	bufData := bufio.NewReaderSize(src.data, 10*humanize.MiByte)
	tuples := make(chan offsetTuple, 16)
	go func() {
		src.readTuples(tuples)
		close(tuples)
	}()

	for ot := range tuples {
		buff := make([]byte, ot.l)
		n, err := io.ReadFull(bufData, buff)
		d.Chk.NoError(err)
		d.Chk.True(uint64(n) == ot.l)
	}
}

// ReadChunks re-reads the data file and sends each chunk, paired with its
// precomputed hash, on chunkChan. It does not close the channel; that is the
// caller's responsibility.
func (src *dataSource) ReadChunks(chunkChan chan<- *chunks.Chunk) {
	bufData := bufio.NewReaderSize(src.data, humanize.MiByte)
	tuples := make(chan offsetTuple, 1024)
	go func() {
		src.readTuples(tuples)
		close(tuples)
	}()

	for ot := range tuples {
		buff := make([]byte, ot.l)
		n, err := io.ReadFull(bufData, buff)
		d.Chk.NoError(err)
		d.Chk.True(uint64(n) == ot.l)
		c := chunks.NewChunkWithHash(ot.h, buff)
		chunkChan <- &c
	}
}

// GetHashes returns a copy of the chunk hashes collected when the dataSource
// was created.
func (src *dataSource) GetHashes() hashSlice {
	out := make(hashSlice, len(src.hashes))
	copy(out, src.hashes)
	return out
}

// readTuples rewinds both files and sends one offsetTuple per chunk-map entry
// until totalData bytes have been accounted for (or the chunk map is
// exhausted). Each entry is gen.OffsetTupleLen bytes: a 20-byte chunk hash
// followed by a 4-byte big-endian chunk length.
func (src *dataSource) readTuples(tuples chan<- offsetTuple) {
	src.reset()

	otBuf := [gen.OffsetTupleLen]byte{}
	cm := bufio.NewReaderSize(src.cm, humanize.MiByte)
	ot := offsetTuple{}

	for src.dataRead < src.totalData {
		n, err := io.ReadFull(cm, otBuf[:])
		if err != nil {
			d.Chk.True(err == io.EOF)
			return
		}
		d.Chk.True(n == gen.OffsetTupleLen)
		ot.h = hash.New(otBuf[:20])
		ot.l = uint64(binary.BigEndian.Uint32(otBuf[20:]))
		src.dataRead += ot.l
		tuples <- ot
	}
}

// reset rewinds both the data file and the chunk-map file to the beginning so
// they can be scanned again.
func (src *dataSource) reset() {
	_, err := src.data.Seek(0, io.SeekStart)
	d.Chk.NoError(err)
	_, err = src.cm.Seek(0, io.SeekStart)
	d.Chk.NoError(err)
	src.dataRead = 0
}

// Close closes the underlying data and chunk-map files.
func (src *dataSource) Close() {
	d.Chk.NoError(src.data.Close())
	d.Chk.NoError(src.cm.Close())
}
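
// The sketch below is illustrative and not part of the original benchmark: it
// shows how a caller might wire the pieces above together. The 128 MiB data
// size, the channel capacity, and the name exampleReadAll are arbitrary
// choices made for the example.
func exampleReadAll() {
	src, err := getInput(128 * humanize.MiByte)
	d.Chk.NoError(err)
	defer src.Close()

	// Warm the OS page cache before timing anything.
	src.PrimeFilesystemCache()

	// Stream every chunk; a real benchmark would hand each one to the store
	// under test instead of discarding it.
	chunkChan := make(chan *chunks.Chunk, 1024)
	go func() {
		src.ReadChunks(chunkChan)
		close(chunkChan)
	}()
	for c := range chunkChan {
		_ = c
	}
}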