github.com/m3db/m3@v1.5.0/src/cmd/tools/verify_data_files/main/main.go (about)

     1  // Copyright (c) 2019 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package main
    22  
    23  import (
    24  	"fmt"
    25  	"io"
    26  	golog "log"
    27  	"os"
    28  	"path"
    29  	"sort"
    30  	"strconv"
    31  	"unicode/utf8"
    32  
    33  	"github.com/m3db/m3/src/cmd/tools"
    34  	"github.com/m3db/m3/src/dbnode/digest"
    35  	"github.com/m3db/m3/src/dbnode/persist"
    36  	"github.com/m3db/m3/src/dbnode/persist/fs"
    37  	"github.com/m3db/m3/src/dbnode/storage/index/convert"
    38  	"github.com/m3db/m3/src/x/checked"
    39  	"github.com/m3db/m3/src/x/ident"
    40  	"github.com/m3db/m3/src/x/pool"
    41  
    42  	"github.com/pborman/getopt"
    43  	"go.uber.org/zap"
    44  	"go.uber.org/zap/zapcore"
    45  )
    46  
    47  func main() {
    48  	var (
    49  		optPathPrefix          = getopt.StringLong("path-prefix", 'p', "/var/lib/m3db", "Path prefix [e.g. /var/lib/m3db]")
    50  		optFailFast            = getopt.BoolLong("fail-fast", 'f', "Fail fast will bail on first failure")
    51  		optFixDir              = getopt.StringLong("fix-path-prefix", 'o', "/tmp/m3db", "Fix output path file prefix for fixed files [e.g. /tmp/m3db]")
    52  		optFixInvalidIDs       = getopt.BoolLong("fix-invalid-ids", 'i', "Fix invalid IDs will remove entries with IDs that have non-UTF8 chars")
    53  		optFixInvalidTags      = getopt.BoolLong("fix-invalid-tags", 't', "Fix invalid tags will remove entries with tags that have name/values non-UTF8 chars")
    54  		optFixInvalidChecksums = getopt.BoolLong("fix-invalid-checksums", 'c', "Fix invalid checksums will remove entries with bad checksums")
    55  		optDebugLog            = getopt.BoolLong("debug", 'd', "Enable debug log level")
    56  	)
    57  	getopt.Parse()
    58  
    59  	logLevel := zapcore.InfoLevel
    60  	if *optDebugLog {
    61  		logLevel = zapcore.DebugLevel
    62  	}
    63  
    64  	logConfig := zap.NewDevelopmentConfig()
    65  	logConfig.Level = zap.NewAtomicLevelAt(logLevel)
    66  	log, err := logConfig.Build()
    67  	if err != nil {
    68  		golog.Fatalf("unable to create logger: %+v", err)
    69  	}
    70  
    71  	if *optPathPrefix == "" {
    72  		getopt.Usage()
    73  		os.Exit(1)
    74  	}
    75  
    76  	log.Info("creating bytes pool")
    77  	bytesPool := tools.NewCheckedBytesPool()
    78  	bytesPool.Init()
    79  
    80  	run(runOptions{
    81  		filePathPrefix:      *optPathPrefix,
    82  		failFast:            *optFailFast,
    83  		fixDir:              *optFixDir,
    84  		fixInvalidIDs:       *optFixInvalidIDs,
    85  		fixInvalidTags:      *optFixInvalidTags,
    86  		fixInvalidChecksums: *optFixInvalidChecksums,
    87  		bytesPool:           bytesPool,
    88  		log:                 log,
    89  	})
    90  }
    91  
// runOptions bundles everything run needs: the settings parsed from the
// command line in main plus the shared bytes pool and logger.
type runOptions struct {
	// filePathPrefix is the path prefix under which the data directory and
	// its file sets are looked up ("path-prefix" flag).
	filePathPrefix      string
	// failFast aborts the run on the first file set that fails verification
	// ("fail-fast" flag).
	failFast            bool
	// fixDir is the path prefix that fixed file sets are written under
	// ("fix-path-prefix" flag).
	fixDir              string
	// fixInvalidIDs enables removing entries whose ID fails validation
	// ("fix-invalid-ids" flag).
	fixInvalidIDs       bool
	// fixInvalidTags enables removing entries whose tags fail validation
	// ("fix-invalid-tags" flag).
	fixInvalidTags      bool
	// fixInvalidChecksums enables removing entries whose data checksum does
	// not match ("fix-invalid-checksums" flag).
	fixInvalidChecksums bool
	// bytesPool is handed to the file set readers created during the run.
	bytesPool           pool.CheckedBytesPool
	// log receives all progress and error output.
	log                 *zap.Logger
}
   102  
   103  func run(opts runOptions) {
   104  	filePathPrefix := opts.filePathPrefix
   105  	bytesPool := opts.bytesPool
   106  	log := opts.log
   107  
   108  	dataDirPath := fs.DataDirPath(filePathPrefix)
   109  
   110  	namespaces, err := dirFiles(dataDirPath)
   111  	if err != nil {
   112  		log.Fatal("could not read namespaces", zap.Error(err))
   113  	}
   114  
   115  	// Get all fileset files.
   116  	log.Info("discovering file sets",
   117  		zap.Strings("namespaces", namespaces))
   118  	var fileSetFiles []fs.FileSetFile
   119  	for _, namespace := range namespaces {
   120  		namespacePath := path.Join(dataDirPath, namespace)
   121  		shards, err := dirFiles(namespacePath)
   122  		if err != nil {
   123  			log.Fatal("could not read shards for namespace",
   124  				zap.String("namespacePath", namespacePath),
   125  				zap.Error(err))
   126  		}
   127  
   128  		log.Debug("discovered shards",
   129  			zap.String("namespace", namespace),
   130  			zap.String("namespacePath", namespacePath),
   131  			zap.Strings("shards", shards))
   132  		for _, shard := range shards {
   133  			shardPath := path.Join(namespacePath, shard)
   134  			shardID, err := strconv.Atoi(shard)
   135  			if err != nil {
   136  				log.Fatal("could not parse shard dir as int",
   137  					zap.String("shardPath", shardPath), zap.Error(err))
   138  			}
   139  
   140  			shardFileSets, err := fs.DataFiles(filePathPrefix,
   141  				ident.StringID(namespace), uint32(shardID))
   142  			if err != nil {
   143  				log.Fatal("could not list shard dir file setes",
   144  					zap.String("shardPath", shardPath), zap.Error(err))
   145  			}
   146  
   147  			log.Debug("discovered shard file sets",
   148  				zap.String("namespace", namespace),
   149  				zap.String("namespacePath", namespacePath),
   150  				zap.Int("shardID", shardID),
   151  				zap.Any("fileSets", shardFileSets))
   152  			fileSetFiles = append(fileSetFiles, shardFileSets...)
   153  		}
   154  	}
   155  
   156  	// Sort by time in reverse (usually want to fix latest files first and
   157  	// can stop once done with fail-fast).
   158  	log.Info("sorting file sets", zap.Int("numFileSets", len(fileSetFiles)))
   159  	sort.Slice(fileSetFiles, func(i, j int) bool {
   160  		return fileSetFiles[i].ID.BlockStart.After(fileSetFiles[j].ID.BlockStart)
   161  	})
   162  
   163  	log.Info("verifying file sets", zap.Int("numFileSets", len(fileSetFiles)))
   164  	for _, fileSet := range fileSetFiles {
   165  		if !fileSet.HasCompleteCheckpointFile() {
   166  			continue // Don't validate file sets without checkpoint file.
   167  		}
   168  
   169  		log.Info("verifying file set file", zap.Any("fileSet", fileSet))
   170  		if err := verifyFileSet(verifyFileSetOptions{
   171  			filePathPrefix:      filePathPrefix,
   172  			bytesPool:           bytesPool,
   173  			fileSet:             fileSet,
   174  			fixDir:              opts.fixDir,
   175  			fixInvalidIDs:       opts.fixInvalidIDs,
   176  			fixInvalidTags:      opts.fixInvalidTags,
   177  			fixInvalidChecksums: opts.fixInvalidChecksums,
   178  		}, log); err != nil {
   179  			log.Error("file set file failed verification",
   180  				zap.Error(err),
   181  				zap.Any("fileSet", fileSet))
   182  
   183  			if opts.failFast {
   184  				log.Fatal("aborting due to fail fast set")
   185  			}
   186  		}
   187  	}
   188  }
   189  
   190  func dirFiles(dirPath string) ([]string, error) {
   191  	dir, err := os.Open(dirPath)
   192  	if err != nil {
   193  		return nil, fmt.Errorf("could not open dir: %v", err)
   194  	}
   195  
   196  	defer dir.Close()
   197  
   198  	stat, err := dir.Stat()
   199  	if err != nil {
   200  		return nil, fmt.Errorf("could not stat dir: %v", err)
   201  	}
   202  	if !stat.IsDir() {
   203  		return nil, fmt.Errorf("path is not a directory: %s", dirPath)
   204  	}
   205  
   206  	entries, err := dir.Readdirnames(-1)
   207  	if err != nil {
   208  		return nil, fmt.Errorf("could not read dir names: %v", err)
   209  	}
   210  
   211  	results := entries[:0]
   212  	for _, p := range entries {
   213  		if p == "." || p == ".." || p == "./.." || p == "./" || p == "../" || p == "./../" {
   214  			continue
   215  		}
   216  		results = append(results, p)
   217  	}
   218  	return results, nil
   219  }
   220  
// verifyFileSetOptions carries the inputs for verifying, and optionally
// fixing, a single file set.
type verifyFileSetOptions struct {
	// filePathPrefix is the path prefix the file set is read from.
	filePathPrefix string
	// bytesPool is passed to the file set reader.
	bytesPool      pool.CheckedBytesPool
	// fileSet identifies the file set to verify.
	fileSet        fs.FileSetFile

	// fixDir is the path prefix the fixed copy of the file set is written
	// under, so that the original files are not clobbered.
	fixDir              string
	// fixInvalidIDs allows dropping entries flagged with an invalid ID.
	fixInvalidIDs       bool
	// fixInvalidTags allows dropping entries flagged with invalid tags.
	fixInvalidTags      bool
	// fixInvalidChecksums allows dropping entries flagged with an invalid
	// data checksum.
	fixInvalidChecksums bool
}
   231  
   232  func verifyFileSet(
   233  	opts verifyFileSetOptions,
   234  	log *zap.Logger,
   235  ) error {
   236  	fsOpts := fs.NewOptions().SetFilePathPrefix(opts.filePathPrefix)
   237  	reader, err := fs.NewReader(opts.bytesPool, fsOpts)
   238  	if err != nil {
   239  		return err
   240  	}
   241  
   242  	fileSet := opts.fileSet
   243  
   244  	openOpts := fs.DataReaderOpenOptions{
   245  		Identifier:  fileSet.ID,
   246  		FileSetType: persist.FileSetFlushType,
   247  	}
   248  
   249  	err = reader.Open(openOpts)
   250  	if err != nil {
   251  		return err
   252  	}
   253  
   254  	defer reader.Close()
   255  
   256  	for {
   257  		id, tags, data, checksum, err := reader.Read()
   258  		if err == io.EOF {
   259  			break
   260  		}
   261  		if err != nil {
   262  			return err
   263  		}
   264  
   265  		check, err := readEntry(id, tags, data, checksum)
   266  		data.Finalize() // Always finalize data.
   267  		if err == nil {
   268  			continue
   269  		}
   270  
   271  		shouldFixInvalidID := check.invalidID && opts.fixInvalidIDs
   272  		shouldFixInvalidTags := check.invalidTags && opts.fixInvalidTags
   273  		shouldFixInvalidChecksum := check.invalidChecksum && opts.fixInvalidChecksums
   274  		if !shouldFixInvalidID && !shouldFixInvalidTags && !shouldFixInvalidChecksum {
   275  			return err
   276  		}
   277  
   278  		log.Info("starting to fix file set", zap.Any("fileSet", fileSet))
   279  		fixErr := fixFileSet(opts, log)
   280  		if fixErr != nil {
   281  			log.Error("could not fix file set",
   282  				zap.Any("fileSet", fileSet), zap.Error(fixErr))
   283  			return err
   284  		}
   285  
   286  		log.Info("fixed file set", zap.Any("fileSet", fileSet))
   287  		return err
   288  	}
   289  
   290  	return nil
   291  }
   292  
// readEntryResult reports which validation readEntry found to be failing
// for an entry. readEntry sets at most one field, and all fields are false
// when the entry is valid.
type readEntryResult struct {
	// invalidID is set when the ID is empty or not valid UTF-8.
	invalidID       bool
	// invalidTags is set when a tag fails series tag validation.
	invalidTags     bool
	// invalidChecksum is set when the checksum calculated over the data does
	// not match the expected checksum.
	invalidChecksum bool
}
   298  
   299  func readEntry(
   300  	id ident.ID,
   301  	tags ident.TagIterator,
   302  	data checked.Bytes,
   303  	checksum uint32,
   304  ) (readEntryResult, error) {
   305  	idValue := id.Bytes()
   306  	if len(idValue) == 0 {
   307  		return readEntryResult{invalidID: true},
   308  			fmt.Errorf("invalid id: err=%s, as_string=%s, as_hex=%x",
   309  				"empty", idValue, idValue)
   310  	}
   311  	if !utf8.Valid(idValue) {
   312  		return readEntryResult{invalidID: true},
   313  			fmt.Errorf("invalid id: err=%s, as_string=%s, as_hex=%x",
   314  				"non-utf8", idValue, idValue)
   315  	}
   316  
   317  	for tags.Next() {
   318  		tag := tags.Current()
   319  		if err := convert.ValidateSeriesTag(tag); err != nil {
   320  			return readEntryResult{invalidTags: true},
   321  				fmt.Errorf("invalid tag: err=%v, "+
   322  					"name_as_string=%s, name_as_hex=%s"+
   323  					"value_as_string=%s, value_as_hex=%s",
   324  					err,
   325  					tag.Name.Bytes(), tag.Name.Bytes(),
   326  					tag.Value.Bytes(), tag.Value.Bytes())
   327  		}
   328  	}
   329  
   330  	data.IncRef()
   331  	calculatedChecksum := digest.Checksum(data.Bytes())
   332  	data.DecRef()
   333  
   334  	if calculatedChecksum != checksum {
   335  		return readEntryResult{invalidChecksum: true},
   336  			fmt.Errorf("data checksum invalid: actual=%v, expected=%v",
   337  				calculatedChecksum, checksum)
   338  	}
   339  	return readEntryResult{}, nil
   340  }
   341  
   342  func fixFileSet(
   343  	opts verifyFileSetOptions,
   344  	log *zap.Logger,
   345  ) error {
   346  	fsOpts := fs.NewOptions().SetFilePathPrefix(opts.filePathPrefix)
   347  	reader, err := fs.NewReader(opts.bytesPool, fsOpts)
   348  	if err != nil {
   349  		return err
   350  	}
   351  
   352  	fileSet := opts.fileSet
   353  
   354  	openOpts := fs.DataReaderOpenOptions{
   355  		Identifier:  fileSet.ID,
   356  		FileSetType: persist.FileSetFlushType,
   357  	}
   358  
   359  	err = reader.Open(openOpts)
   360  	if err != nil {
   361  		return err
   362  	}
   363  
   364  	defer reader.Close()
   365  
   366  	// NOTE: we output to a new directory so that we don't clobber files.
   367  	writeFsOpts := fsOpts.SetFilePathPrefix(opts.fixDir)
   368  	writer, err := fs.NewWriter(writeFsOpts)
   369  	if err != nil {
   370  		return err
   371  	}
   372  
   373  	err = writer.Open(fs.DataWriterOpenOptions{
   374  		FileSetType:        persist.FileSetFlushType,
   375  		FileSetContentType: fileSet.ID.FileSetContentType,
   376  		Identifier:         fileSet.ID,
   377  		BlockSize:          reader.Status().BlockSize,
   378  	})
   379  	if err != nil {
   380  		return err
   381  	}
   382  
   383  	success := false
   384  	defer func() {
   385  		if !success {
   386  			writer.Close()
   387  		}
   388  	}()
   389  
   390  	var (
   391  		removedIDs  int
   392  		removedTags int
   393  		copies      []checked.Bytes
   394  	)
   395  	for {
   396  		id, tags, data, checksum, err := reader.Read()
   397  		if err == io.EOF {
   398  			break
   399  		}
   400  		if err != nil {
   401  			return err
   402  		}
   403  
   404  		tagsCopy := tags.Duplicate()
   405  
   406  		check, err := readEntry(id, tags, data, checksum)
   407  		if err != nil {
   408  			shouldFixInvalidID := check.invalidID && opts.fixInvalidIDs
   409  			shouldFixInvalidTags := check.invalidTags && opts.fixInvalidTags
   410  			shouldFixInvalidChecksum := check.invalidChecksum && opts.fixInvalidChecksums
   411  			log.Info("read entry for fix",
   412  				zap.Bool("shouldFixInvalidID", shouldFixInvalidID),
   413  				zap.Bool("shouldFixInvalidTags", shouldFixInvalidTags),
   414  				zap.Bool("shouldFixInvalidChecksum", shouldFixInvalidChecksum))
   415  
   416  			if shouldFixInvalidID || shouldFixInvalidTags || shouldFixInvalidChecksum {
   417  				// Skip this entry being written to the target volume.
   418  				removedIDs++
   419  				continue
   420  			}
   421  
   422  			return fmt.Errorf("encountered an error not enabled to fix: %v", err)
   423  		}
   424  
   425  		metadata := persist.NewMetadataFromIDAndTagIterator(id, tagsCopy,
   426  			persist.MetadataOptions{
   427  				FinalizeID:          true,
   428  				FinalizeTagIterator: true,
   429  			})
   430  
   431  		data.IncRef()
   432  		err = writer.Write(metadata, data, checksum)
   433  		data.DecRef()
   434  		if err != nil {
   435  			return fmt.Errorf("could not write fixed file set entry: %v", err)
   436  		}
   437  
   438  		// Finalize data to release back to pool.
   439  		data.Finalize()
   440  
   441  		// Release our copies back to pool.
   442  		for _, copy := range copies {
   443  			copy.DecRef()
   444  			copy.Finalize()
   445  		}
   446  		copies = copies[:0]
   447  	}
   448  
   449  	log.Info("finished fixing file set",
   450  		zap.Any("fileSet", fileSet),
   451  		zap.Int("removedIDs", removedIDs),
   452  		zap.Int("removedTags", removedTags))
   453  
   454  	success = true
   455  	return writer.Close()
   456  }