github.com/m3db/m3@v1.5.0/src/dbnode/persist/fs/index_write.go (about)

     1  // Copyright (c) 2020 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package fs
    22  
    23  import (
    24  	"bufio"
    25  	"errors"
    26  	"fmt"
    27  	"io/fs"
    28  	"io/ioutil"
    29  	"os"
    30  	"time"
    31  
    32  	"github.com/m3db/m3/src/dbnode/digest"
    33  	"github.com/m3db/m3/src/dbnode/generated/proto/index"
    34  	"github.com/m3db/m3/src/dbnode/persist"
    35  	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
    36  	xerrors "github.com/m3db/m3/src/x/errors"
    37  	xos "github.com/m3db/m3/src/x/os"
    38  	xtime "github.com/m3db/m3/src/x/time"
    39  
    40  	protobuftypes "github.com/gogo/protobuf/types"
    41  )
    42  
    43  const (
    44  	indexFileSetMajorVersion = 1
    45  
    46  	// indexWriteBufferSize is set to 250kb to avoid very frequent
    47  	// syscall overhead using the default buffer size (lot of large
    48  	// files written when writing the index).
    49  	indexWriteBufferSize = 2 << 17 // ~250kb
    50  )
    51  
    52  var (
    53  	errIndexFileSetWriterReturnsNoFiles = errors.New(
    54  		"index file set writer returned zero file types")
    55  	errIndexFileSetWriterOpenWithNoShards = errors.New(
    56  		"index file set writer opened with no shards specified")
    57  )
    58  
// indexWriter writes a single index volume to disk: an info file, the files
// of each segment, a digests file and finally a checkpoint file.
//
// A writer is created once with NewIndexWriter and reused across volumes:
// every call to Open resets the per-volume state and recomputes the paths.
type indexWriter struct {
	// Fixed at construction time by NewIndexWriter. fdWithDigest is reset
	// onto each segment file as it is written by WriteSegmentFileSet.
	opts             Options
	filePathPrefix   string
	newFileMode      os.FileMode
	newDirectoryMode os.FileMode
	fdWithDigest     digest.FdWithDigestWriter

	// Per-volume state, (re)initialized by Open. err holds the last segment
	// write error recorded by markSegmentWriteError; while it is non-nil,
	// WriteSegmentFileSet and Close return it without doing any work.
	// segments accumulates one entry per successful WriteSegmentFileSet call.
	err             error
	blockSize       time.Duration
	start           xtime.UnixNano
	fileSetType     persist.FileSetType
	snapshotTime    xtime.UnixNano
	volumeIndex     int
	indexVolumeType idxpersist.IndexVolumeType
	shards          map[uint32]struct{}
	segments        []writtenIndexSegment

	// Paths derived in Open from the fileset type, namespace, block start
	// and volume index.
	namespaceDir       string
	checkpointFilePath string
	infoFilePath       string
	digestFilePath     string
}
    81  
// writtenIndexSegment records what WriteSegmentFileSet captured for one
// segment: the type, versions and metadata reported by the segment file set
// writer, plus one entry per file written. infoFileData and digestsFileData
// serialize these records into the info and digests files.
type writtenIndexSegment struct {
	segmentType  idxpersist.IndexSegmentType
	majorVersion int
	minorVersion int
	metadata     []byte
	files        []writtenIndexSegmentFile
}
    89  
// writtenIndexSegmentFile records a single segment file written to disk: its
// file type and the digest read from the digest writer (Sum32) once the file
// was flushed and closed.
type writtenIndexSegmentFile struct {
	segmentFileType idxpersist.IndexSegmentFileType
	digest          uint32
}
    94  
    95  // NewIndexWriter returns a new index writer with options.
    96  func NewIndexWriter(opts Options) (IndexFileSetWriter, error) {
    97  	if err := opts.Validate(); err != nil {
    98  		return nil, err
    99  	}
   100  	return &indexWriter{
   101  		opts:             opts,
   102  		filePathPrefix:   opts.FilePathPrefix(),
   103  		newFileMode:      opts.NewFileMode(),
   104  		newDirectoryMode: opts.NewDirectoryMode(),
   105  		fdWithDigest:     digest.NewFdWithDigestWriter(indexWriteBufferSize),
   106  	}, nil
   107  }
   108  
   109  func (w *indexWriter) Open(opts IndexWriterOpenOptions) error {
   110  	if len(opts.Shards) == 0 {
   111  		return errIndexFileSetWriterOpenWithNoShards
   112  	}
   113  
   114  	var (
   115  		namespace  = opts.Identifier.Namespace
   116  		blockStart = opts.Identifier.BlockStart
   117  	)
   118  	w.err = nil
   119  	w.blockSize = opts.BlockSize
   120  	w.start = blockStart
   121  	w.fileSetType = opts.FileSetType
   122  	w.volumeIndex = opts.Identifier.VolumeIndex
   123  	w.shards = opts.Shards
   124  	w.snapshotTime = opts.Snapshot.SnapshotTime
   125  	w.indexVolumeType = opts.IndexVolumeType
   126  	if w.indexVolumeType == "" {
   127  		w.indexVolumeType = idxpersist.DefaultIndexVolumeType
   128  	}
   129  	w.segments = nil
   130  
   131  	switch opts.FileSetType {
   132  	case persist.FileSetSnapshotType:
   133  		w.namespaceDir = NamespaceIndexSnapshotDirPath(w.filePathPrefix, namespace)
   134  	case persist.FileSetFlushType:
   135  		w.namespaceDir = NamespaceIndexDataDirPath(w.filePathPrefix, namespace)
   136  	default:
   137  		return fmt.Errorf("cannot open index writer for fileset type: %s", opts.FileSetType)
   138  	}
   139  	if err := os.MkdirAll(w.namespaceDir, w.newDirectoryMode); err != nil {
   140  		return err
   141  	}
   142  	w.infoFilePath = FilesetPathFromTimeAndIndex(w.namespaceDir, blockStart, w.volumeIndex, InfoFileSuffix)
   143  	w.digestFilePath = FilesetPathFromTimeAndIndex(w.namespaceDir, blockStart, w.volumeIndex, DigestFileSuffix)
   144  	w.checkpointFilePath = FilesetPathFromTimeAndIndex(w.namespaceDir, blockStart, w.volumeIndex, CheckpointFileSuffix)
   145  
   146  	exists, err := CompleteCheckpointFileExists(w.checkpointFilePath)
   147  	if err != nil {
   148  		return err
   149  	}
   150  	if exists {
   151  		return xerrors.Wrapf(fs.ErrExist,
   152  			"checkpoint already exists for volume: %s",
   153  			w.checkpointFilePath)
   154  	}
   155  
   156  	// NB: Write out an incomplete index info file when we start writing a volume,
   157  	// this is later used in the cleanup of corrupted/incomplete index filesets.
   158  	infoFileData, err := w.infoFileData()
   159  	if err != nil {
   160  		return err
   161  	}
   162  
   163  	return w.writeInfoFile(infoFileData)
   164  }
   165  
   166  func (w *indexWriter) WriteSegmentFileSet(
   167  	segmentFileSet idxpersist.IndexSegmentFileSetWriter,
   168  ) error {
   169  	if w.err != nil {
   170  		return w.err
   171  	}
   172  
   173  	segType := segmentFileSet.SegmentType()
   174  	if err := segType.Validate(); err != nil {
   175  		return w.markSegmentWriteError(segType, "", err)
   176  	}
   177  
   178  	seg := writtenIndexSegment{
   179  		segmentType:  segType,
   180  		majorVersion: segmentFileSet.MajorVersion(),
   181  		minorVersion: segmentFileSet.MinorVersion(),
   182  		metadata:     segmentFileSet.SegmentMetadata(),
   183  	}
   184  
   185  	files := segmentFileSet.Files()
   186  	if len(files) == 0 {
   187  		return w.markSegmentWriteError(segType, "",
   188  			errIndexFileSetWriterReturnsNoFiles)
   189  	}
   190  
   191  	idx := len(w.segments)
   192  	for _, segFileType := range files {
   193  		if err := segFileType.Validate(); err != nil {
   194  			return w.markSegmentWriteError(segType, segFileType, err)
   195  		}
   196  
   197  		var filePath string
   198  		switch w.fileSetType {
   199  		case persist.FileSetSnapshotType:
   200  			filePath = snapshotIndexSegmentFilePathFromTimeAndIndex(w.namespaceDir, w.start, w.volumeIndex,
   201  				idx, segFileType)
   202  		case persist.FileSetFlushType:
   203  			filePath = filesetIndexSegmentFilePathFromTime(w.namespaceDir, w.start, w.volumeIndex,
   204  				idx, segFileType)
   205  		default:
   206  			err := fmt.Errorf("unknown fileset type: %s", w.fileSetType)
   207  			return w.markSegmentWriteError(segType, segFileType, err)
   208  		}
   209  
   210  		fd, err := OpenWritable(filePath, w.newFileMode)
   211  		if err != nil {
   212  			return w.markSegmentWriteError(segType, segFileType, err)
   213  		}
   214  
   215  		// Use buffered IO writer to write the file in case the reader
   216  		// returns small chunks of data
   217  		w.fdWithDigest.Reset(fd)
   218  		digest := w.fdWithDigest.Digest()
   219  		writer := bufio.NewWriter(w.fdWithDigest)
   220  		writeErr := segmentFileSet.WriteFile(segFileType, writer)
   221  		err = xerrors.FirstError(writeErr, writer.Flush(), w.fdWithDigest.Close())
   222  		if err != nil {
   223  			return w.markSegmentWriteError(segType, segFileType, err)
   224  		}
   225  
   226  		seg.files = append(seg.files, writtenIndexSegmentFile{
   227  			segmentFileType: segFileType,
   228  			digest:          digest.Sum32(),
   229  		})
   230  	}
   231  
   232  	w.segments = append(w.segments, seg)
   233  	return nil
   234  }
   235  
   236  func (w *indexWriter) markSegmentWriteError(
   237  	segType idxpersist.IndexSegmentType,
   238  	segFileType idxpersist.IndexSegmentFileType,
   239  	err error,
   240  ) error {
   241  	w.err = fmt.Errorf("failed to write segment_type=%s, segment_file_type=%s: %v",
   242  		segType, segFileType, err)
   243  	return w.err
   244  }
   245  
   246  func (w *indexWriter) infoFileData() ([]byte, error) {
   247  	shards := make([]uint32, 0, len(w.shards))
   248  	for shard := range w.shards {
   249  		shards = append(shards, shard)
   250  	}
   251  	info := &index.IndexVolumeInfo{
   252  		MajorVersion: indexFileSetMajorVersion,
   253  		BlockStart:   int64(w.start),
   254  		BlockSize:    int64(w.blockSize),
   255  		FileType:     int64(w.fileSetType),
   256  		Shards:       shards,
   257  		SnapshotTime: int64(w.snapshotTime),
   258  		IndexVolumeType: &protobuftypes.StringValue{
   259  			Value: string(w.indexVolumeType),
   260  		},
   261  	}
   262  	for _, segment := range w.segments {
   263  		segmentInfo := &index.SegmentInfo{
   264  			SegmentType:  string(segment.segmentType),
   265  			MajorVersion: int64(segment.majorVersion),
   266  			MinorVersion: int64(segment.minorVersion),
   267  			Metadata:     segment.metadata,
   268  		}
   269  		for _, file := range segment.files {
   270  			fileInfo := &index.SegmentFileInfo{
   271  				SegmentFileType: string(file.segmentFileType),
   272  			}
   273  			segmentInfo.Files = append(segmentInfo.Files, fileInfo)
   274  		}
   275  		info.Segments = append(info.Segments, segmentInfo)
   276  	}
   277  	return info.Marshal()
   278  }
   279  
   280  func (w *indexWriter) digestsFileData(infoFileData []byte) ([]byte, error) {
   281  	digests := &index.IndexDigests{
   282  		InfoDigest: digest.Checksum(infoFileData),
   283  	}
   284  	for _, segment := range w.segments {
   285  		segmentDigest := &index.SegmentDigest{
   286  			SegmentType: string(segment.segmentType),
   287  		}
   288  		for _, file := range segment.files {
   289  			fileDigest := &index.SegmentFileDigest{
   290  				SegmentFileType: string(file.segmentFileType),
   291  				Digest:          file.digest,
   292  			}
   293  			segmentDigest.Files = append(segmentDigest.Files, fileDigest)
   294  		}
   295  		digests.SegmentDigests = append(digests.SegmentDigests, segmentDigest)
   296  	}
   297  	return digests.Marshal()
   298  }
   299  
   300  func (w *indexWriter) Close() error {
   301  	if w.err != nil {
   302  		// If a write error occurred don't even bother trying to write out file set
   303  		return w.err
   304  	}
   305  
   306  	// Write info file
   307  	infoFileData, err := w.infoFileData()
   308  	if err != nil {
   309  		return err
   310  	}
   311  
   312  	if err := w.writeInfoFile(infoFileData); err != nil {
   313  		return err
   314  	}
   315  
   316  	// Write digests file
   317  	digestsFileData, err := w.digestsFileData(infoFileData)
   318  	if err != nil {
   319  		return err
   320  	}
   321  	err = ioutil.WriteFile(w.digestFilePath, digestsFileData, w.newFileMode)
   322  	if err != nil {
   323  		return err
   324  	}
   325  
   326  	// Write checkpoint file
   327  	digestBuffer := digest.NewBuffer()
   328  	digestBuffer.WriteDigest(digest.Checksum(digestsFileData))
   329  	return ioutil.WriteFile(w.checkpointFilePath, digestBuffer, w.newFileMode)
   330  }
   331  
   332  func (w *indexWriter) writeInfoFile(infoFileData []byte) error {
   333  	// NB: corrupted index fileset cleanup logic depends on info files being written ahead of
   334  	// all the other files. To avoid cases where writes could be observed in a different order,
   335  	// info files are being fsync'ed immediately after being written.
   336  	return xos.WriteFileSync(w.infoFilePath, infoFileData, w.newFileMode)
   337  }