github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/nomdex/nomdex_update.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package main
     6  
     7  import (
     8  	"fmt"
     9  	"regexp"
    10  	"strconv"
    11  	"sync"
    12  	"sync/atomic"
    13  
    14  	"github.com/attic-labs/kingpin"
    15  	humanize "github.com/dustin/go-humanize"
    16  
    17  	"github.com/attic-labs/noms/go/config"
    18  	"github.com/attic-labs/noms/go/d"
    19  	"github.com/attic-labs/noms/go/datas"
    20  	"github.com/attic-labs/noms/go/hash"
    21  	"github.com/attic-labs/noms/go/types"
    22  	"github.com/attic-labs/noms/go/util/profile"
    23  	"github.com/attic-labs/noms/go/util/status"
    24  )
    25  
    26  var (
    27  	inPathArg    = ""
    28  	outDsArg     = ""
    29  	relPathArg   = ""
    30  	txRegexArg   = ""
    31  	txReplaceArg = ""
    32  	txConvertArg = ""
    33  )
    34  
    35  var longUpHelp = `'nomdex up' builds indexes that are useful for rapidly accessing objects.
    36  
    37  This sample tool can index objects based on any string or number attribute of that
    38  object. The 'up' command works by scanning all the objects reachable from the --in-path
    39  command line argument. It tests the object to determine if there is a string or number
    40  value reachable by applying the --by path argument to the object. If so, the object is
    41  added to the index under that value.
    42  
    43  For example, if there are objects in the database that contain a personId and a
    44  gender field, 'nomdex up' can scan all the objects in a given dataset and build
    45  an index on the specified field with the following commands:
    46     nomdex up --in-path <dsSpec>.value --by .gender --out-ds gender-index
    47     nomdex up --in-path <dsSpec>.value --by .address.city --out-ds personId-index
    48  
    49  The previous commands can be understood as follows. The first command updates or
    50  builds an index by scanning all the objects that are reachable from |in-path| that
    51  have a string or number value reachable using |by| and stores the root of the
    52  resulting index in a dataset specified by |out-ds|.
    53  
    54  Notice that the --in-path argument has a value of '<dsSpec>.value'. The '.value'
    55  is not strictly necessary but it's normally useful when indexing. Since datasets
    56  generally point to Commit objects in Noms, they usually have parents which are
    57  previous versions of the data. If you add .value to the end of the dataset, only
    58  the most recent version of the data will be indexed. Without the '.value' all
    59  objects in all previous commits will also be indexed which is most often not what
    60  is expected.
    61  
    62  There are three additional commands that can be useful for transforming the value
    63  being indexed:
    64      * tx-replace: used to modify behavior of tx-regex, see below
    65      * tx-regex: the behavior for this argument depends on whether a tx-replace argument
    66          is present. If so, the go routine "regexp.ReplaceAllString() is called:
    67              txRe := regex.MustCompile(|tx-regex|)
    68              txRe.ReplaceAllString(|index value|, |tx-replace|
    69          If tx-replace is not present then the following call is made on each value:
    70              txRe := regex.MustCompile(|tx-regex|)
    71              regex.FindStringSubmatch(|index value|)
    72      *tx-convert: attempts to convert the index value to the type specified.
    73          Currently the only value accepted for this arg is 'number'
    74  
    75  The resulting indexes can be used by the 'nomdex find command' for help on that
    76  see: nomdex find -h
    77  `
    78  
    79  func registerUpdate() {
    80  	cmd := kingpin.Command("up", "Build/update an index.")
    81  	cmd.Flag("in-path", "a value to search for items to index within").Required().StringVar(&inPathArg)
    82  	cmd.Flag("out-ds", "name of dataset to save the results to").Required().StringVar(&outDsArg)
    83  	cmd.Flag("by", "a path relative to all the items in <in-path> to index by").Required().StringVar(&relPathArg)
    84  	cmd.Flag("tx-regex", "perform a string transformation on value before putting it in index").StringVar(&txRegexArg)
    85  	cmd.Flag("tx-replace", "replace values matched by tx-regex").StringVar(&txReplaceArg)
    86  	cmd.Flag("tx-convert", "convert the result of a tx regex/replace to this type (only does 'number' currently)").StringVar(&txConvertArg)
    87  }
    88  
// StreamingSetEntry pairs a send-only channel of values with a receive-only
// channel of sets.
// NOTE(review): presumably values pushed into valChan are accumulated into a
// types.Set that is later delivered on setChan; nothing in this file's visible
// code sends or receives on either channel, so confirm against other users.
type StreamingSetEntry struct {
	// valChan accepts values (send-only from the holder's point of view).
	valChan chan<- types.Value
	// setChan yields sets (receive-only from the holder's point of view).
	setChan <-chan types.Set
}
    93  
// IndexMap associates a noms value used as a key with a StreamingSetEntry.
type IndexMap map[types.Value]StreamingSetEntry
    95  
    96  type Index struct {
    97  	m          IndexMap
    98  	indexedCnt int64
    99  	seenCnt    int64
   100  	mutex      sync.Mutex
   101  }
   102  
   103  func runUpdate() int {
   104  	defer profile.MaybeStartProfile().Stop()
   105  
   106  	cfg := config.NewResolver()
   107  	db, rootObject, err := cfg.GetPath(inPathArg)
   108  	d.Chk.NoError(err)
   109  
   110  	if rootObject == nil {
   111  		fmt.Printf("Object not found: %s\n", inPathArg)
   112  		return 1
   113  	}
   114  
   115  	outDs := db.GetDataset(outDsArg)
   116  	relPath, err := types.ParsePath(relPathArg)
   117  	if printError(err, "Error parsing -by value\n\t") {
   118  		return 1
   119  	}
   120  
   121  	gb := types.NewGraphBuilder(db, types.MapKind)
   122  	addElementsToGraphBuilder(gb, db, rootObject, relPath)
   123  	indexMap := gb.Build().(types.Map)
   124  
   125  	outDs, err = db.Commit(outDs, indexMap, datas.CommitOptions{})
   126  	d.Chk.NoError(err)
   127  	fmt.Printf("Committed index with %d entries to dataset: %s\n", indexMap.Len(), outDsArg)
   128  
   129  	return 0
   130  }
   131  
   132  func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootObject types.Value, relPath types.Path) {
   133  	typeCacheMutex := sync.Mutex{}
   134  	typeCache := map[hash.Hash]bool{}
   135  
   136  	var txRe *regexp.Regexp
   137  	if txRegexArg != "" {
   138  		var err error
   139  		txRe, err = regexp.Compile(txRegexArg)
   140  		d.CheckError(err)
   141  	}
   142  
   143  	index := Index{m: IndexMap{}}
   144  	types.WalkValues(rootObject, db, func(v types.Value) bool {
   145  		typ := types.TypeOf(v)
   146  		typeCacheMutex.Lock()
   147  		hasPath, ok := typeCache[typ.Hash()]
   148  		typeCacheMutex.Unlock()
   149  		if !ok || hasPath {
   150  			pathResolved := false
   151  			tv := relPath.Resolve(v, db)
   152  			if tv != nil {
   153  				index.addToGraphBuilder(gb, tv, v, txRe)
   154  				pathResolved = true
   155  			}
   156  			if !ok {
   157  				typeCacheMutex.Lock()
   158  				typeCache[typ.Hash()] = pathResolved
   159  				typeCacheMutex.Unlock()
   160  			}
   161  		}
   162  		return false
   163  	})
   164  
   165  	status.Done()
   166  }
   167  
   168  func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value, txRe *regexp.Regexp) {
   169  	atomic.AddInt64(&idx.seenCnt, 1)
   170  	if txRe != nil {
   171  		k1 := types.EncodedValue(k)
   172  		k2 := ""
   173  		if txReplaceArg != "" {
   174  			k2 = txRe.ReplaceAllString(string(k1), txReplaceArg)
   175  		} else {
   176  			matches := txRe.FindStringSubmatch(string(k1))
   177  			if len(matches) > 0 {
   178  				k2 = matches[len(matches)-1]
   179  			}
   180  		}
   181  		if txConvertArg == "number" {
   182  			if k2 == "" {
   183  				return
   184  			}
   185  			n, err := strconv.ParseFloat(k2, 64)
   186  			if err != nil {
   187  				fmt.Println("error converting to number: ", err)
   188  				return
   189  			}
   190  			k = types.Number(n)
   191  		} else {
   192  			k = types.String(k2)
   193  		}
   194  	}
   195  	atomic.AddInt64(&idx.indexedCnt, 1)
   196  	gb.SetInsert(types.ValueSlice{k}, v)
   197  	status.Printf("Found %s objects, Indexed %s objects", humanize.Comma(idx.seenCnt), humanize.Comma(idx.indexedCnt))
   198  }