github.com/m3db/m3@v1.5.0/src/cmd/tools/read_index_segments/main/main.go (about)

     1  // Copyright (c) 2020 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package main
    22  
    23  import (
    24  	"fmt"
    25  	"io"
    26  	"io/ioutil"
    27  	golog "log"
    28  	"math"
    29  	"os"
    30  	"runtime"
    31  	"sync"
    32  	"time"
    33  
    34  	"github.com/m3db/m3/src/dbnode/persist"
    35  	"github.com/m3db/m3/src/dbnode/persist/fs"
    36  	"github.com/m3db/m3/src/query/util/json"
    37  	"github.com/m3db/m3/src/x/ident"
    38  	xsync "github.com/m3db/m3/src/x/sync"
    39  	"github.com/m3db/m3/src/x/unsafe"
    40  
    41  	"github.com/pborman/getopt"
    42  	"go.uber.org/zap"
    43  )
    44  
    45  var (
    46  	halfCPUs     = int(math.Max(float64(runtime.GOMAXPROCS(0)/2), 1))
    47  	endlineBytes = []byte("\n")
    48  )
    49  
    50  func main() {
    51  	var (
    52  		optPathPrefix          = getopt.StringLong("path-prefix", 'p', "/var/lib/m3db", "Path prefix [e.g. /var/lib/m3db]")
    53  		optOutputFile          = getopt.StringLong("output-file", 'o', "", "Output JSON file of line delimited JSON objects for each segment")
    54  		optValidate            = getopt.BoolLong("validate", 'v', "Validate the segments, do not print out metadata")
    55  		optValidateConcurrency = getopt.IntLong("validate-concurrency", 'c', halfCPUs, "Validation concurrency")
    56  	)
    57  	getopt.Parse()
    58  
    59  	logConfig := zap.NewDevelopmentConfig()
    60  	log, err := logConfig.Build()
    61  	if err != nil {
    62  		golog.Fatalf("unable to create logger: %+v", err)
    63  	}
    64  
    65  	if *optOutputFile != "" && *optValidate {
    66  		log.Error("cannot write output and validate, do not set output file if validating")
    67  		getopt.Usage()
    68  		os.Exit(1)
    69  	}
    70  
    71  	if *optPathPrefix == "" || (*optOutputFile == "" && !*optValidate) {
    72  		getopt.Usage()
    73  		os.Exit(1)
    74  	}
    75  
    76  	run(runOptions{
    77  		filePathPrefix:      *optPathPrefix,
    78  		outputFilePath:      *optOutputFile,
    79  		validate:            *optValidate,
    80  		validateConcurrency: *optValidateConcurrency,
    81  		log:                 log,
    82  	})
    83  }
    84  
    85  type runOptions struct {
    86  	filePathPrefix      string
    87  	outputFilePath      string
    88  	validate            bool
    89  	validateConcurrency int
    90  	log                 *zap.Logger
    91  }
    92  
    93  func run(opts runOptions) {
    94  	log := opts.log
    95  
    96  	fsOpts := fs.NewOptions().
    97  		SetFilePathPrefix(opts.filePathPrefix).
    98  		// Always validate checksums before reading and/or validating contents
    99  		// regardless of whether this is a validation run or just reading
   100  		// the raw files.
   101  		SetIndexReaderAutovalidateIndexSegments(true)
   102  
   103  	indexDirPath := fs.IndexDataDirPath(opts.filePathPrefix)
   104  
   105  	namespaces, err := dirFiles(indexDirPath)
   106  	if err != nil {
   107  		log.Fatal("could not read namespaces", zap.Error(err))
   108  	}
   109  
   110  	// Get all fileset files.
   111  	log.Info("discovered namespaces", zap.Strings("namespaces", namespaces))
   112  
   113  	var (
   114  		out                io.Writer
   115  		validateWorkerPool xsync.WorkerPool
   116  	)
   117  	if opts.validate {
   118  		// Only validating, output to dev null.
   119  		out = ioutil.Discard
   120  		validateWorkerPool = xsync.NewWorkerPool(opts.validateConcurrency)
   121  		validateWorkerPool.Init()
   122  		log.Info("validating segment files",
   123  			zap.Int("concurrency", opts.validateConcurrency))
   124  	} else {
   125  		// Output to file.
   126  		out, err = os.Create(opts.outputFilePath)
   127  		if err != nil {
   128  			log.Fatal("unable to create output file",
   129  				zap.String("file", opts.outputFilePath),
   130  				zap.Error(err))
   131  		}
   132  		log.Info("writing output JSON line delimited",
   133  			zap.String("path", opts.outputFilePath))
   134  	}
   135  
   136  	for _, namespace := range namespaces {
   137  		log.Info("reading segments", zap.String("namespace", namespace))
   138  		ns := ident.StringID(namespace)
   139  
   140  		readNamespaceSegments(out, opts.validate, validateWorkerPool,
   141  			ns, fsOpts, log)
   142  
   143  		// Separate by endline.
   144  		if _, err := out.Write(endlineBytes); err != nil {
   145  			log.Fatal("could not write endline", zap.Error(err))
   146  		}
   147  	}
   148  }
   149  
   150  func readNamespaceSegments(
   151  	out io.Writer,
   152  	validate bool,
   153  	validateWorkerPool xsync.WorkerPool,
   154  	nsID ident.ID,
   155  	fsOpts fs.Options,
   156  	log *zap.Logger,
   157  ) {
   158  	var (
   159  		infoFiles = fs.ReadIndexInfoFiles(fs.ReadIndexInfoFilesOptions{
   160  			FilePathPrefix:   fsOpts.FilePathPrefix(),
   161  			Namespace:        nsID,
   162  			ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
   163  		})
   164  		wg sync.WaitGroup
   165  	)
   166  
   167  	for _, infoFile := range infoFiles {
   168  		if err := infoFile.Err.Error(); err != nil {
   169  			log.Error("unable to read index info file",
   170  				zap.Stringer("namespace", nsID),
   171  				zap.Error(err),
   172  				zap.String("filepath", infoFile.Err.Filepath()),
   173  			)
   174  			continue
   175  		}
   176  
   177  		if !validate {
   178  			readBlockSegments(out, nsID, infoFile, fsOpts, log)
   179  			continue
   180  		}
   181  
   182  		// Validating, so use validation concurrency.
   183  		wg.Add(1)
   184  		validateWorkerPool.Go(func() {
   185  			defer wg.Done()
   186  			readBlockSegments(out, nsID, infoFile, fsOpts, log)
   187  		})
   188  	}
   189  
   190  	// Wait for any concurrent validation.
   191  	wg.Wait()
   192  }
   193  
   194  func readBlockSegments(
   195  	out io.Writer,
   196  	nsID ident.ID,
   197  	infoFile fs.ReadIndexInfoFileResult,
   198  	fsOpts fs.Options,
   199  	log *zap.Logger,
   200  ) {
   201  	// Make sure if we fatal or error out the exact block is known.
   202  	log = log.With(
   203  		zap.String("namespace", nsID.String()),
   204  		zap.String("blockStart", infoFile.ID.BlockStart.String()),
   205  		zap.Int64("blockStartUnixNano", int64(infoFile.ID.BlockStart)),
   206  		zap.Int("volumeIndex", infoFile.ID.VolumeIndex),
   207  		zap.Strings("files", infoFile.AbsoluteFilePaths))
   208  
   209  	log.Info("reading block segments")
   210  
   211  	readResult, err := fs.ReadIndexSegments(fs.ReadIndexSegmentsOptions{
   212  		ReaderOptions: fs.IndexReaderOpenOptions{
   213  			Identifier:  infoFile.ID,
   214  			FileSetType: persist.FileSetFlushType,
   215  		},
   216  		FilesystemOptions: fsOpts,
   217  	})
   218  	if err != nil {
   219  		log.Error("unable to read segments from index fileset", zap.Error(err))
   220  		return
   221  	}
   222  
   223  	if readResult.Validated {
   224  		log.Info("validated segments")
   225  	} else {
   226  		log.Error("expected to validate segments but did not validate")
   227  	}
   228  
   229  	for i, seg := range readResult.Segments {
   230  		jw := json.NewWriter(out)
   231  		jw.BeginObject()
   232  
   233  		jw.BeginObjectField("namespace")
   234  		jw.WriteString(nsID.String())
   235  
   236  		jw.BeginObjectField("blockStart")
   237  		jw.WriteString(time.Unix(0, infoFile.Info.BlockStart).Format(time.RFC3339))
   238  
   239  		jw.BeginObjectField("volumeIndex")
   240  		jw.WriteInt(infoFile.ID.VolumeIndex)
   241  
   242  		jw.BeginObjectField("segmentIndex")
   243  		jw.WriteInt(i)
   244  
   245  		reader, err := seg.Reader()
   246  		if err != nil {
   247  			log.Fatal("unable to create segment reader", zap.Error(err))
   248  		}
   249  
   250  		iter, err := reader.AllDocs()
   251  		if err != nil {
   252  			log.Fatal("unable to iterate segment docs", zap.Error(err))
   253  		}
   254  
   255  		jw.BeginObjectField("documents")
   256  		jw.BeginArray()
   257  		for postingsID := 0; iter.Next(); postingsID++ {
   258  			d := iter.Current()
   259  			jw.BeginObject()
   260  
   261  			jw.BeginObjectField("postingsID")
   262  			jw.WriteInt(postingsID)
   263  
   264  			jw.BeginObjectField("id")
   265  			unsafe.WithString(d.ID, func(str string) {
   266  				jw.WriteString(str)
   267  			})
   268  
   269  			jw.BeginObjectField("fields")
   270  
   271  			jw.BeginArray()
   272  			for _, field := range d.Fields {
   273  				jw.BeginObject()
   274  
   275  				jw.BeginObjectField("name")
   276  				unsafe.WithString(field.Name, func(str string) {
   277  					jw.WriteString(str)
   278  				})
   279  
   280  				jw.BeginObjectField("value")
   281  				unsafe.WithString(field.Name, func(str string) {
   282  					jw.WriteString(str)
   283  				})
   284  
   285  				jw.EndObject()
   286  			}
   287  			jw.EndArray()
   288  
   289  			jw.EndObject()
   290  		}
   291  		jw.EndArray()
   292  
   293  		if err := iter.Err(); err != nil {
   294  			log.Fatal("doc iterator error", zap.Error(err))
   295  		}
   296  		if err := iter.Close(); err != nil {
   297  			log.Fatal("doc iterator close error", zap.Error(err))
   298  		}
   299  
   300  		fieldsIter, err := seg.FieldsIterable().Fields()
   301  		if err != nil {
   302  			log.Fatal("could not create fields iterator", zap.Error(err))
   303  		}
   304  
   305  		jw.BeginObjectField("fields")
   306  		jw.BeginArray()
   307  		for fieldsIter.Next() {
   308  			field := fieldsIter.Current()
   309  
   310  			jw.BeginObject()
   311  			jw.BeginObjectField("field")
   312  			unsafe.WithString(field, func(str string) {
   313  				jw.WriteString(str)
   314  			})
   315  
   316  			termsIter, err := seg.TermsIterable().Terms(field)
   317  			if err != nil {
   318  				log.Fatal("could not create terms iterator", zap.Error(err))
   319  			}
   320  
   321  			jw.BeginObjectField("terms")
   322  			jw.BeginArray()
   323  			for termsIter.Next() {
   324  				term, postingsList := termsIter.Current()
   325  
   326  				jw.BeginObject()
   327  				jw.BeginObjectField("term")
   328  				unsafe.WithString(term, func(str string) {
   329  					jw.WriteString(str)
   330  				})
   331  
   332  				postingsIter := postingsList.Iterator()
   333  
   334  				jw.BeginObjectField("postings")
   335  				jw.BeginArray()
   336  				for postingsIter.Next() {
   337  					postingsID := postingsIter.Current()
   338  					jw.WriteInt(int(postingsID))
   339  				}
   340  				jw.EndArray()
   341  				jw.EndObject()
   342  
   343  				if err := postingsIter.Err(); err != nil {
   344  					log.Fatal("postings iterator error", zap.Error(err))
   345  				}
   346  
   347  				if err := postingsIter.Close(); err != nil {
   348  					log.Fatal("postings iterator close error", zap.Error(err))
   349  				}
   350  			}
   351  			jw.EndArray()
   352  			jw.EndObject()
   353  
   354  			if err := termsIter.Err(); err != nil {
   355  				log.Fatal("field iterator error", zap.Error(err))
   356  			}
   357  
   358  			if err := termsIter.Close(); err != nil {
   359  				log.Fatal("field iterator close error", zap.Error(err))
   360  			}
   361  		}
   362  		jw.EndArray()
   363  
   364  		if err := fieldsIter.Err(); err != nil {
   365  			log.Fatal("field iterator error", zap.Error(err))
   366  		}
   367  
   368  		if err := fieldsIter.Close(); err != nil {
   369  			log.Fatal("field iterator close error", zap.Error(err))
   370  		}
   371  
   372  		jw.EndObject()
   373  
   374  		if err := jw.Flush(); err != nil {
   375  			log.Fatal("could not flush JSON writer", zap.Error(err))
   376  		}
   377  		if err := jw.Close(); err != nil {
   378  			log.Fatal("could not close JSON writer", zap.Error(err))
   379  		}
   380  	}
   381  }
   382  
   383  func dirFiles(dirPath string) ([]string, error) {
   384  	dir, err := os.Open(dirPath)
   385  	if err != nil {
   386  		return nil, fmt.Errorf("could not open dir: %v", err)
   387  	}
   388  
   389  	defer dir.Close()
   390  
   391  	stat, err := dir.Stat()
   392  	if err != nil {
   393  		return nil, fmt.Errorf("could not stat dir: %v", err)
   394  	}
   395  	if !stat.IsDir() {
   396  		return nil, fmt.Errorf("path is not a directory: %s", dirPath)
   397  	}
   398  
   399  	entries, err := dir.Readdirnames(-1)
   400  	if err != nil {
   401  		return nil, fmt.Errorf("could not read dir names: %v", err)
   402  	}
   403  
   404  	results := entries[:0]
   405  	for _, p := range entries {
   406  		if p == "." || p == ".." || p == "./.." || p == "./" || p == "../" || p == "./../" {
   407  			continue
   408  		}
   409  		results = append(results, p)
   410  	}
   411  	return results, nil
   412  }