github.com/cayleygraph/cayley@v0.7.7/cmd/cayley/command/dedup.go (about)

     1  package command
     2  
     3  import (
     4  	"context"
     5  	"crypto/sha1"
     6  	"errors"
     7  	"fmt"
     8  	"hash"
     9  	"sort"
    10  	"time"
    11  
    12  	"github.com/spf13/cobra"
    13  	"github.com/spf13/viper"
    14  
    15  	"github.com/cayleygraph/cayley/clog"
    16  	"github.com/cayleygraph/cayley/graph"
    17  	"github.com/cayleygraph/cayley/graph/path"
    18  	"github.com/cayleygraph/quad"
    19  	"github.com/cayleygraph/quad/voc/rdf"
    20  )
    21  
    22  func iriFlag(s string, err error) (quad.IRI, error) {
    23  	if err != nil {
    24  		return "", err
    25  	}
    26  	return quad.IRI(s), nil
    27  }
    28  
    29  func NewDedupCommand() *cobra.Command {
    30  	cmd := &cobra.Command{
    31  		Use:   "dedup",
    32  		Short: "Deduplicate bnode values",
    33  		RunE: func(cmd *cobra.Command, args []string) error {
    34  			ctx := context.Background()
    35  			printBackendInfo()
    36  			h, err := openDatabase()
    37  			if err != nil {
    38  				return err
    39  			}
    40  			defer h.Close()
    41  
    42  			pred, _ := iriFlag(cmd.Flags().GetString("pred"))
    43  			typ, _ := iriFlag(cmd.Flags().GetString("type"))
    44  			if typ == "" {
    45  				return errors.New("no type is specified")
    46  			}
    47  			return dedupProperties(ctx, h, pred, typ)
    48  		},
    49  	}
    50  	cmd.Flags().String("pred", rdf.Type, "type predicate to use to find nodes")
    51  	cmd.Flags().String("type", "", "type value to use to find nodes")
    52  	return cmd
    53  }
    54  
    55  func valueLess(a, b graph.Ref) bool {
    56  	// TODO(dennwc): more effective way
    57  	s1, s2 := fmt.Sprint(a), fmt.Sprint(b)
    58  	return s1 < s2
    59  }
    60  
    61  type sortVals []graph.Ref
    62  
    63  func (a sortVals) Len() int           { return len(a) }
    64  func (a sortVals) Less(i, j int) bool { return valueLess(a[i], a[j]) }
    65  func (a sortVals) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
    66  
    67  type sortProp []property
    68  
    69  func (a sortProp) Len() int           { return len(a) }
    70  func (a sortProp) Less(i, j int) bool { return valueLess(a[i].Pred, a[j].Pred) }
    71  func (a sortProp) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
    72  
    73  func hashProperties(h hash.Hash, m map[interface{}]property) string {
    74  	props := make([]property, 0, len(m))
    75  	for _, p := range m {
    76  		if len(p.Values) > 1 {
    77  			sort.Sort(sortVals(p.Values))
    78  		}
    79  		props = append(props, p)
    80  	}
    81  	sort.Sort(sortProp(props))
    82  	h.Reset()
    83  	for _, p := range props {
    84  		fmt.Fprint(h, p.Pred)
    85  		h.Write([]byte{0})
    86  		for _, v := range p.Values {
    87  			fmt.Fprint(h, v)
    88  			h.Write([]byte{1})
    89  		}
    90  	}
    91  	res := make([]byte, 0, h.Size())
    92  	res = h.Sum(res)
    93  	return string(res)
    94  }
    95  
    96  type property struct {
    97  	Pred   graph.Ref
    98  	Values []graph.Ref
    99  }
   100  
   101  func dedupProperties(ctx context.Context, h *graph.Handle, pred, typ quad.IRI) error {
   102  	batch := viper.GetInt(KeyLoadBatch)
   103  	if batch == 0 {
   104  		batch = quad.DefaultBatch
   105  	}
   106  
   107  	qs := h.QuadStore
   108  	p := path.StartPath(qs).Has(pred, typ)
   109  	ictx, cancel := context.WithCancel(ctx)
   110  	defer cancel()
   111  	var gerr error
   112  
   113  	seen := make(map[string]graph.Ref)
   114  	cnt, dedup := 0, 0
   115  	start := time.Now()
   116  	last := start
   117  	hh := sha1.New()
   118  
   119  	tx := graph.NewTransaction()
   120  	txn := 0
   121  	flush := func() {
   122  		if txn == 0 {
   123  			return
   124  		}
   125  		err := h.ApplyTransaction(tx)
   126  		if err == nil {
   127  			tx = graph.NewTransaction()
   128  			dedup += txn
   129  			txn = 0
   130  		} else {
   131  			gerr = err
   132  			cancel()
   133  		}
   134  		if now := time.Now(); now.Sub(last) > time.Second*5 {
   135  			last = now
   136  			clog.Infof("deduplicated %d/%d nodes (%.1f nodes/sec)",
   137  				dedup, cnt, float64(cnt)/now.Sub(start).Seconds(),
   138  			)
   139  		}
   140  	}
   141  	err := p.Iterate(ictx).Each(func(s graph.Ref) {
   142  		cnt++
   143  		it := qs.QuadIterator(quad.Subject, s)
   144  		defer it.Close()
   145  		m := make(map[interface{}]property)
   146  		for it.Next(ictx) {
   147  			q := it.Result()
   148  			p := qs.QuadDirection(q, quad.Predicate)
   149  			o := qs.QuadDirection(q, quad.Object)
   150  			k := graph.ToKey(p)
   151  			prop := m[k]
   152  			prop.Pred = p
   153  			prop.Values = append(prop.Values, o)
   154  			m[k] = prop
   155  		}
   156  		if gerr = it.Err(); gerr != nil {
   157  			cancel()
   158  		}
   159  		ph := hashProperties(hh, m)
   160  		id, ok := seen[ph]
   161  		if !ok {
   162  			seen[ph] = s
   163  			return
   164  		}
   165  		if gerr = dedupValueTx(ictx, h, tx, s, id); gerr != nil {
   166  			cancel()
   167  		}
   168  		txn++
   169  		if txn >= batch { // TODO(dennwc): flag
   170  			flush()
   171  		}
   172  	})
   173  	flush()
   174  	clog.Infof("deduplicated %d/%d nodes in %v", dedup, cnt, time.Since(start))
   175  	if gerr != nil {
   176  		err = gerr
   177  	}
   178  	return err
   179  }
   180  
   181  func dedupValueTx(ctx context.Context, h *graph.Handle, tx *graph.Transaction, a, b graph.Ref) error {
   182  	v := h.NameOf(b)
   183  	it := h.QuadIterator(quad.Object, a)
   184  	defer it.Close()
   185  	for it.Next(ctx) {
   186  		// TODO(dennwc): we should be able to add "raw" quads without getting values for directions
   187  		q := h.Quad(it.Result())
   188  		tx.RemoveQuad(q)
   189  		q.Object = v
   190  		tx.AddQuad(q)
   191  	}
   192  	if err := it.Err(); err != nil {
   193  		return err
   194  	}
   195  	it.Close()
   196  
   197  	it = h.QuadIterator(quad.Subject, a)
   198  	defer it.Close()
   199  	for it.Next(ctx) {
   200  		q := h.Quad(it.Result())
   201  		tx.RemoveQuad(q)
   202  	}
   203  	if err := it.Err(); err != nil {
   204  		return err
   205  	}
   206  	return nil
   207  }