github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/nomdex/nomdex_update.go (about) 1 // Copyright 2016 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package main 6 7 import ( 8 "fmt" 9 "regexp" 10 "strconv" 11 "sync" 12 "sync/atomic" 13 14 "github.com/attic-labs/kingpin" 15 humanize "github.com/dustin/go-humanize" 16 17 "github.com/attic-labs/noms/go/config" 18 "github.com/attic-labs/noms/go/d" 19 "github.com/attic-labs/noms/go/datas" 20 "github.com/attic-labs/noms/go/hash" 21 "github.com/attic-labs/noms/go/types" 22 "github.com/attic-labs/noms/go/util/profile" 23 "github.com/attic-labs/noms/go/util/status" 24 ) 25 26 var ( 27 inPathArg = "" 28 outDsArg = "" 29 relPathArg = "" 30 txRegexArg = "" 31 txReplaceArg = "" 32 txConvertArg = "" 33 ) 34 35 var longUpHelp = `'nomdex up' builds indexes that are useful for rapidly accessing objects. 36 37 This sample tool can index objects based on any string or number attribute of that 38 object. The 'up' command works by scanning all the objects reachable from the --in-path 39 command line argument. It tests the object to determine if there is a string or number 40 value reachable by applying the --by path argument to the object. If so, the object is 41 added to the index under that value. 42 43 For example, if there are objects in the database that contain a personId and a 44 gender field, 'nomdex up' can scan all the objects in a given dataset and build 45 an index on the specified field with the following commands: 46 nomdex up --in-path <dsSpec>.value --by .gender --out-ds gender-index 47 nomdex up --in-path <dsSpec>.value --by .address.city --out-ds personId-index 48 49 The previous commands can be understood as follows. The first command updates or 50 builds an index by scanning all the objects that are reachable from |in-path| that 51 have a string or number value reachable using |by| and stores the root of the 52 resulting index in a dataset specified by |out-ds|. 53 54 Notice that the --in-path argument has a value of '<dsSpec>.value'. The '.value' 55 is not strictly necessary but it's normally useful when indexing. Since datasets 56 generally point to Commit objects in Noms, they usually have parents which are 57 previous versions of the data. If you add .value to the end of the dataset, only 58 the most recent version of the data will be indexed. Without the '.value' all 59 objects in all previous commits will also be indexed which is most often not what 60 is expected. 61 62 There are three additional commands that can be useful for transforming the value 63 being indexed: 64 * tx-replace: used to modify behavior of tx-regex, see below 65 * tx-regex: the behavior for this argument depends on whether a tx-replace argument 66 is present. If so, the go routine "regexp.ReplaceAllString() is called: 67 txRe := regex.MustCompile(|tx-regex|) 68 txRe.ReplaceAllString(|index value|, |tx-replace| 69 If tx-replace is not present then the following call is made on each value: 70 txRe := regex.MustCompile(|tx-regex|) 71 regex.FindStringSubmatch(|index value|) 72 *tx-convert: attempts to convert the index value to the type specified. 73 Currently the only value accepted for this arg is 'number' 74 75 The resulting indexes can be used by the 'nomdex find command' for help on that 76 see: nomdex find -h 77 ` 78 79 func registerUpdate() { 80 cmd := kingpin.Command("up", "Build/update an index.") 81 cmd.Flag("in-path", "a value to search for items to index within").Required().StringVar(&inPathArg) 82 cmd.Flag("out-ds", "name of dataset to save the results to").Required().StringVar(&outDsArg) 83 cmd.Flag("by", "a path relative to all the items in <in-path> to index by").Required().StringVar(&relPathArg) 84 cmd.Flag("tx-regex", "perform a string transformation on value before putting it in index").StringVar(&txRegexArg) 85 cmd.Flag("tx-replace", "replace values matched by tx-regex").StringVar(&txReplaceArg) 86 cmd.Flag("tx-convert", "convert the result of a tx regex/replace to this type (only does 'number' currently)").StringVar(&txConvertArg) 87 } 88 89 type StreamingSetEntry struct { 90 valChan chan<- types.Value 91 setChan <-chan types.Set 92 } 93 94 type IndexMap map[types.Value]StreamingSetEntry 95 96 type Index struct { 97 m IndexMap 98 indexedCnt int64 99 seenCnt int64 100 mutex sync.Mutex 101 } 102 103 func runUpdate() int { 104 defer profile.MaybeStartProfile().Stop() 105 106 cfg := config.NewResolver() 107 db, rootObject, err := cfg.GetPath(inPathArg) 108 d.Chk.NoError(err) 109 110 if rootObject == nil { 111 fmt.Printf("Object not found: %s\n", inPathArg) 112 return 1 113 } 114 115 outDs := db.GetDataset(outDsArg) 116 relPath, err := types.ParsePath(relPathArg) 117 if printError(err, "Error parsing -by value\n\t") { 118 return 1 119 } 120 121 gb := types.NewGraphBuilder(db, types.MapKind) 122 addElementsToGraphBuilder(gb, db, rootObject, relPath) 123 indexMap := gb.Build().(types.Map) 124 125 outDs, err = db.Commit(outDs, indexMap, datas.CommitOptions{}) 126 d.Chk.NoError(err) 127 fmt.Printf("Committed index with %d entries to dataset: %s\n", indexMap.Len(), outDsArg) 128 129 return 0 130 } 131 132 func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootObject types.Value, relPath types.Path) { 133 typeCacheMutex := sync.Mutex{} 134 typeCache := map[hash.Hash]bool{} 135 136 var txRe *regexp.Regexp 137 if txRegexArg != "" { 138 var err error 139 txRe, err = regexp.Compile(txRegexArg) 140 d.CheckError(err) 141 } 142 143 index := Index{m: IndexMap{}} 144 types.WalkValues(rootObject, db, func(v types.Value) bool { 145 typ := types.TypeOf(v) 146 typeCacheMutex.Lock() 147 hasPath, ok := typeCache[typ.Hash()] 148 typeCacheMutex.Unlock() 149 if !ok || hasPath { 150 pathResolved := false 151 tv := relPath.Resolve(v, db) 152 if tv != nil { 153 index.addToGraphBuilder(gb, tv, v, txRe) 154 pathResolved = true 155 } 156 if !ok { 157 typeCacheMutex.Lock() 158 typeCache[typ.Hash()] = pathResolved 159 typeCacheMutex.Unlock() 160 } 161 } 162 return false 163 }) 164 165 status.Done() 166 } 167 168 func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value, txRe *regexp.Regexp) { 169 atomic.AddInt64(&idx.seenCnt, 1) 170 if txRe != nil { 171 k1 := types.EncodedValue(k) 172 k2 := "" 173 if txReplaceArg != "" { 174 k2 = txRe.ReplaceAllString(string(k1), txReplaceArg) 175 } else { 176 matches := txRe.FindStringSubmatch(string(k1)) 177 if len(matches) > 0 { 178 k2 = matches[len(matches)-1] 179 } 180 } 181 if txConvertArg == "number" { 182 if k2 == "" { 183 return 184 } 185 n, err := strconv.ParseFloat(k2, 64) 186 if err != nil { 187 fmt.Println("error converting to number: ", err) 188 return 189 } 190 k = types.Number(n) 191 } else { 192 k = types.String(k2) 193 } 194 } 195 atomic.AddInt64(&idx.indexedCnt, 1) 196 gb.SetInsert(types.ValueSlice{k}, v) 197 status.Printf("Found %s objects, Indexed %s objects", humanize.Comma(idx.seenCnt), humanize.Comma(idx.indexedCnt)) 198 }