github.com/cayleygraph/cayley@v0.7.7/cmd/cayley/command/dedup.go (about) 1 package command 2 3 import ( 4 "context" 5 "crypto/sha1" 6 "errors" 7 "fmt" 8 "hash" 9 "sort" 10 "time" 11 12 "github.com/spf13/cobra" 13 "github.com/spf13/viper" 14 15 "github.com/cayleygraph/cayley/clog" 16 "github.com/cayleygraph/cayley/graph" 17 "github.com/cayleygraph/cayley/graph/path" 18 "github.com/cayleygraph/quad" 19 "github.com/cayleygraph/quad/voc/rdf" 20 ) 21 22 func iriFlag(s string, err error) (quad.IRI, error) { 23 if err != nil { 24 return "", err 25 } 26 return quad.IRI(s), nil 27 } 28 29 func NewDedupCommand() *cobra.Command { 30 cmd := &cobra.Command{ 31 Use: "dedup", 32 Short: "Deduplicate bnode values", 33 RunE: func(cmd *cobra.Command, args []string) error { 34 ctx := context.Background() 35 printBackendInfo() 36 h, err := openDatabase() 37 if err != nil { 38 return err 39 } 40 defer h.Close() 41 42 pred, _ := iriFlag(cmd.Flags().GetString("pred")) 43 typ, _ := iriFlag(cmd.Flags().GetString("type")) 44 if typ == "" { 45 return errors.New("no type is specified") 46 } 47 return dedupProperties(ctx, h, pred, typ) 48 }, 49 } 50 cmd.Flags().String("pred", rdf.Type, "type predicate to use to find nodes") 51 cmd.Flags().String("type", "", "type value to use to find nodes") 52 return cmd 53 } 54 55 func valueLess(a, b graph.Ref) bool { 56 // TODO(dennwc): more effective way 57 s1, s2 := fmt.Sprint(a), fmt.Sprint(b) 58 return s1 < s2 59 } 60 61 type sortVals []graph.Ref 62 63 func (a sortVals) Len() int { return len(a) } 64 func (a sortVals) Less(i, j int) bool { return valueLess(a[i], a[j]) } 65 func (a sortVals) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 66 67 type sortProp []property 68 69 func (a sortProp) Len() int { return len(a) } 70 func (a sortProp) Less(i, j int) bool { return valueLess(a[i].Pred, a[j].Pred) } 71 func (a sortProp) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 72 73 func hashProperties(h hash.Hash, m map[interface{}]property) string { 74 props := make([]property, 0, len(m)) 75 for _, p := range m { 76 if len(p.Values) > 1 { 77 sort.Sort(sortVals(p.Values)) 78 } 79 props = append(props, p) 80 } 81 sort.Sort(sortProp(props)) 82 h.Reset() 83 for _, p := range props { 84 fmt.Fprint(h, p.Pred) 85 h.Write([]byte{0}) 86 for _, v := range p.Values { 87 fmt.Fprint(h, v) 88 h.Write([]byte{1}) 89 } 90 } 91 res := make([]byte, 0, h.Size()) 92 res = h.Sum(res) 93 return string(res) 94 } 95 96 type property struct { 97 Pred graph.Ref 98 Values []graph.Ref 99 } 100 101 func dedupProperties(ctx context.Context, h *graph.Handle, pred, typ quad.IRI) error { 102 batch := viper.GetInt(KeyLoadBatch) 103 if batch == 0 { 104 batch = quad.DefaultBatch 105 } 106 107 qs := h.QuadStore 108 p := path.StartPath(qs).Has(pred, typ) 109 ictx, cancel := context.WithCancel(ctx) 110 defer cancel() 111 var gerr error 112 113 seen := make(map[string]graph.Ref) 114 cnt, dedup := 0, 0 115 start := time.Now() 116 last := start 117 hh := sha1.New() 118 119 tx := graph.NewTransaction() 120 txn := 0 121 flush := func() { 122 if txn == 0 { 123 return 124 } 125 err := h.ApplyTransaction(tx) 126 if err == nil { 127 tx = graph.NewTransaction() 128 dedup += txn 129 txn = 0 130 } else { 131 gerr = err 132 cancel() 133 } 134 if now := time.Now(); now.Sub(last) > time.Second*5 { 135 last = now 136 clog.Infof("deduplicated %d/%d nodes (%.1f nodes/sec)", 137 dedup, cnt, float64(cnt)/now.Sub(start).Seconds(), 138 ) 139 } 140 } 141 err := p.Iterate(ictx).Each(func(s graph.Ref) { 142 cnt++ 143 it := qs.QuadIterator(quad.Subject, s) 144 defer it.Close() 145 m := make(map[interface{}]property) 146 for it.Next(ictx) { 147 q := it.Result() 148 p := qs.QuadDirection(q, quad.Predicate) 149 o := qs.QuadDirection(q, quad.Object) 150 k := graph.ToKey(p) 151 prop := m[k] 152 prop.Pred = p 153 prop.Values = append(prop.Values, o) 154 m[k] = prop 155 } 156 if gerr = it.Err(); gerr != nil { 157 cancel() 158 } 159 ph := hashProperties(hh, m) 160 id, ok := seen[ph] 161 if !ok { 162 seen[ph] = s 163 return 164 } 165 if gerr = dedupValueTx(ictx, h, tx, s, id); gerr != nil { 166 cancel() 167 } 168 txn++ 169 if txn >= batch { // TODO(dennwc): flag 170 flush() 171 } 172 }) 173 flush() 174 clog.Infof("deduplicated %d/%d nodes in %v", dedup, cnt, time.Since(start)) 175 if gerr != nil { 176 err = gerr 177 } 178 return err 179 } 180 181 func dedupValueTx(ctx context.Context, h *graph.Handle, tx *graph.Transaction, a, b graph.Ref) error { 182 v := h.NameOf(b) 183 it := h.QuadIterator(quad.Object, a) 184 defer it.Close() 185 for it.Next(ctx) { 186 // TODO(dennwc): we should be able to add "raw" quads without getting values for directions 187 q := h.Quad(it.Result()) 188 tx.RemoveQuad(q) 189 q.Object = v 190 tx.AddQuad(q) 191 } 192 if err := it.Err(); err != nil { 193 return err 194 } 195 it.Close() 196 197 it = h.QuadIterator(quad.Subject, a) 198 defer it.Close() 199 for it.Next(ctx) { 200 q := h.Quad(it.Result()) 201 tx.RemoveQuad(q) 202 } 203 if err := it.Err(); err != nil { 204 return err 205 } 206 return nil 207 }