github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/iso_canonical.go (about)

     1  // Copyright ©2020 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package rdf
     6  
     7  import (
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"hash"
    12  	"sort"
    13  )
    14  
    15  // See "Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms
    16  // for Leaning and Labelling Blank Nodes" by Aiden Hogan for description of
    17  // the algorithm, https://doi.org/10.1145/3068333 and available free from
    18  // the author's web page http://aidanhogan.com/docs/rdf-canonicalisation.pdf.
    19  //
    20  // Aspects of implementation from discussion in v1.0 of the readme of the PoC
    21  // at https://doi.org/10.5281/zenodo.3154322
    22  
    23  // Isomorphic returns whether the RDF graph datasets a and b are isomorphic,
    24  // where there is a bijective mapping between blank nodes in a and b using
    25  // the given hash function. If decomp is true, the graphs are decomposed
    26  // before canonicalization.
    27  func Isomorphic(a, b []*Statement, decomp bool, h hash.Hash) bool {
    28  	if len(a) != len(b) {
    29  		return false
    30  	}
    31  
    32  	zero := make([]byte, h.Size())
    33  	ah, _ := IsoCanonicalHashes(a, decomp, true, h, zero)
    34  	bh, _ := IsoCanonicalHashes(b, decomp, true, h, zero)
    35  	if len(ah) != len(bh) {
    36  		return false
    37  	}
    38  
    39  	work := make([][]byte, 2*len(ah))
    40  	lexicalHashes(work[:len(ah)], ah)
    41  	lexicalHashes(work[len(ah):], bh)
    42  	for i := range work[:len(ah)] {
    43  		if !bytes.Equal(work[i], work[i+len(ah)]) {
    44  			return false
    45  		}
    46  	}
    47  	return true
    48  }
    49  
    50  func lexicalHashes(dst [][]byte, hashes map[string][]byte) {
    51  	i := 0
    52  	for _, s := range hashes {
    53  		dst[i] = s
    54  		i++
    55  	}
    56  	sort.Sort(lexical(dst))
    57  }
    58  
// IsoCanonicalHashes returns a mapping between the nodes of the RDF graph
// dataset described by the given statements using the provided hash
// function. If decomp is true, the graphs are decomposed before hashing.
// If dist is true the input graph is decomposed into identical splits, the
// entire graph will be hashed to distinguish nodes. If decomp is false,
// dist has no effect.
// Blank node hashes are initially set to the value of zero. Hash values
// are provided for literal and IRI nodes as well as for blank node. The
// hash input for literal nodes includes the quotes and the input for IRI
// nodes first removes the angle quotes around the IRI, although these are
// included in the map keys.
//
// Note that hashes returned by IsoCanonicalHashes with decomp=true are not
// comparable with hashes returned by IsoCanonicalHashes with decomp=false.
//
// See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
// the hashing algorithm.
func IsoCanonicalHashes(statements []*Statement, decomp, dist bool, h hash.Hash, zero []byte) (hashes map[string][]byte, terms map[string]map[string]bool) {
	if len(statements) == 0 {
		return nil, nil
	}

	if debug {
		debug.log(0, "Statements:")
		for _, s := range statements {
			debug.log(0, s)
		}
		debug.log(0)
	}

	// First pass: colour the (possibly decomposed) graph by
	// iterative hash refinement.
	hash, parts, ok := hashBNodesPerSplit(statements, decomp, h, zero)

	if debug {
		debug.log(0, "Blanks:")
		if len(hash.blanks) != 0 {
			for _, b := range hash.blanks {
				debug.log(0, b)
			}
		} else {
			debug.log(0, "none")
		}
		debug.log(0)

		debug.log(0, "Parts:")
		debug.logParts(0, parts)

		debug.logf(0, "Hashes from hashBNodesPerSplit (splitting=%t):\n", decomp)
		debug.logHashes(0, hash.hashOf, h.Size())
	}

	// Every blank node received a unique hash, so no further
	// distinguishing work is needed.
	if ok {
		return hash.hashOf, hash.termsFor
	}

	// TODO: remove the triviality exception in distinguish and return
	// the original hashes if this result is nil. Make the triviality
	// exception optional.
	hashes = distinguish(statements, dist, h, zero, hash, parts, nil, 0)

	if hashes == nil {
		// distinguish was given trivial parts and
		// we did not ask it to try to merge them.
		return hash.hashOf, hash.termsFor
	}

	if debug {
		debug.log(0, "Final resolved Hashes:")
		debug.logHashes(0, hashes, h.Size())
	}

	// Rebuild the terms mapping from the final hashes; each hash
	// now labels exactly one term.
	terms = make(map[string]map[string]bool, len(hashes))
	for k, h := range hashes {
		terms[string(h)] = map[string]bool{k: true}
	}

	return hashes, terms
}
   136  
   137  // C14n performs a relabeling of the statements in src based on the terms
   138  // obtained from IsoCanonicalHashes, placing the results in dst and returning
   139  // them. The relabeling scheme is the same as for the Universal RDF Dataset
   140  // Normalization Algorithm, blank terms are ordered lexically by their hash
   141  // value and then given a blank label with the prefix "_:c14n" and an
   142  // identifier counter corresponding to the label's sort rank.
   143  //
   144  // If dst is nil, it is allocated, otherwise the length of dst must match the
   145  // length of src.
   146  func C14n(dst, src []*Statement, terms map[string]map[string]bool) ([]*Statement, error) {
   147  	if dst == nil {
   148  		dst = make([]*Statement, len(src))
   149  	}
   150  
   151  	if len(dst) != len(src) {
   152  		return dst, errors.New("rdf: slice length mismatch")
   153  	}
   154  
   155  	need := make(map[string]bool)
   156  	for _, s := range src {
   157  		for _, t := range []string{
   158  			s.Subject.Value,
   159  			s.Object.Value,
   160  			s.Label.Value,
   161  		} {
   162  			if !isBlank(t) {
   163  				continue
   164  			}
   165  			need[t] = true
   166  		}
   167  	}
   168  
   169  	blanks := make([]string, len(need))
   170  	i := 0
   171  	for h, m := range terms {
   172  		var ok bool
   173  		for t := range m {
   174  			if isBlank(t) {
   175  				ok = true
   176  				break
   177  			}
   178  		}
   179  		if !ok {
   180  			continue
   181  		}
   182  		if i == len(blanks) {
   183  			return dst, errors.New("rdf: too many blanks in terms")
   184  		}
   185  		blanks[i] = h
   186  		i++
   187  	}
   188  	sort.Strings(blanks)
   189  
   190  	c14n := make(map[string]string)
   191  	for i, b := range blanks {
   192  		if len(terms[b]) == 0 {
   193  			return nil, fmt.Errorf("rdf: no term for blank with hash %x", b)
   194  		}
   195  		for t := range terms[b] {
   196  			if !isBlank(t) {
   197  				continue
   198  			}
   199  			if _, exists := c14n[t]; exists {
   200  				continue
   201  			}
   202  			delete(need, t)
   203  			c14n[t] = fmt.Sprintf("_:c14n%d", i)
   204  		}
   205  	}
   206  
   207  	if len(need) != 0 {
   208  		return dst, fmt.Errorf("rdf: missing term hashes for %d terms", len(need))
   209  	}
   210  
   211  	for i, s := range src {
   212  		if dst[i] == nil {
   213  			dst[i] = &Statement{}
   214  		}
   215  		n := dst[i]
   216  		n.Subject = Term{Value: translate(s.Subject.Value, c14n)}
   217  		n.Predicate = s.Predicate
   218  		n.Object = Term{Value: translate(s.Object.Value, c14n)}
   219  		n.Label = Term{Value: translate(s.Label.Value, c14n)}
   220  	}
   221  	sort.Sort(c14nStatements(dst))
   222  
   223  	return dst, nil
   224  }
   225  
   226  func translate(term string, mapping map[string]string) string {
   227  	if term, ok := mapping[term]; ok {
   228  		return term
   229  	}
   230  	return term
   231  }
   232  
   233  type c14nStatements []*Statement
   234  
   235  func (s c14nStatements) Len() int { return len(s) }
   236  func (s c14nStatements) Less(i, j int) bool {
   237  	si := s[i]
   238  	sj := s[j]
   239  	switch {
   240  	case si.Subject.Value < sj.Subject.Value:
   241  		return true
   242  	case si.Subject.Value > sj.Subject.Value:
   243  		return false
   244  	}
   245  	switch { // Always IRI.
   246  	case si.Predicate.Value < sj.Predicate.Value:
   247  		return true
   248  	case si.Predicate.Value > sj.Predicate.Value:
   249  		return false
   250  	}
   251  	switch {
   252  	case si.Object.Value < sj.Object.Value:
   253  		return true
   254  	case si.Object.Value > sj.Object.Value:
   255  		return false
   256  	}
   257  	return si.Label.Value < sj.Label.Value
   258  }
   259  func (s c14nStatements) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
   260  
// hashBNodes returns the hashed blank nodes of the graph described by statements
// using the provided hash function. Hashes are initialised with zero, or with
// the values in hash0 when it is non-nil. The returned disjoint value reports
// whether every blank node received a unique hash.
//
// This is algorithm 1 in doi:10.1145/3068333.
func hashBNodes(statements []*Statement, h hash.Hash, zero []byte, hash0 map[string][]byte) (hash *table, disjoint bool) {
	curr := newTable()
	// Assign initial hashes to every term in the dataset.
	for _, s := range statements {
		for i, t := range []string{
			s.Subject.Value,
			s.Predicate.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			switch {
			case i == 3 && t == "":
				// Unlabeled statements have an empty Label
				// value; there is nothing to hash.
				continue
			case isBlank(t):
				// Blank nodes start at zero, or at their hash
				// from a previous round when hash0 is supplied
				// (as done by distinguish).
				if hash0 == nil {
					curr.set(t, zero)
				} else {
					curr.set(t, hash0[t])
				}
			case isIRI(t):
				// IRIs are hashed with their angle quotes removed.
				h.Reset()
				h.Write([]byte(t[1 : len(t)-1])) //nolint:errcheck
				curr.set(t, h.Sum(nil))
			default:
				// Literals are hashed verbatim, including quotes.
				h.Reset()
				h.Write([]byte(t)) //nolint:errcheck
				curr.set(t, h.Sum(nil))
			}
		}
	}

	// Iteratively refine blank node hashes from the hashes of their
	// neighbouring terms until the hashes are all unique or the
	// partition stops changing.
	bag := newHashBag(h, curr)
	last := curr.clone()
	for {
		curr, last = last, curr
		for _, s := range statements {
			// '+' marks the blank node as subject of the triple.
			if isBlank(s.Subject.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Object.Value], last.hashOf[s.Predicate.Value], lab, []byte{'+'})
				bag.add(s.Subject.Value, c)
			}

			// '-' marks the blank node as object of the triple.
			if isBlank(s.Object.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], lab, []byte{'-'})
				bag.add(s.Object.Value, c)
			}

			// This and the lab value above implement the label hashing
			// required for RDF dataset hashing as described in
			// https://doi.org/10.5281/zenodo.3154322 v1.0
			// Readme.md#adaptation-of-the-algorithms-to-handle-datasets.
			if isBlank(s.Label.Value) {
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], last.hashOf[s.Object.Value], []byte{'.'})
				bag.add(s.Label.Value, c)
			}
		}

		// Fold the accumulated neighbourhood hashes into the blank
		// node hashes for this iteration.
		for t := range bag.hashesFor {
			curr.set(t, bag.sum(t))
		}

		disjoint = curr.allUnique()
		if disjoint || !curr.changedFrom(last) {
			return curr, disjoint
		}
	}
}
   338  
// table is a collision aware hash collection for RDF terms. Keys of
// hashOf include blank nodes, IRIs and literals.
type table struct {
	// hashOf holds the hash for each term.
	hashOf map[string][]byte
	// termsFor holds the set of nodes in
	// the second key for terms that share
	// the hash in the first key.
	termsFor map[string]map[string]bool

	// isBlank and blanks are the set of blank
	// nodes.
	// isBlank is nil for cloned tables.
	isBlank map[string]bool
	// blanks is nil for tables created
	// with newTable.
	blanks []string
}
   356  
   357  // newTable returns a new hash table.
   358  func newTable() *table {
   359  	return &table{
   360  		hashOf:   make(map[string][]byte),
   361  		termsFor: make(map[string]map[string]bool),
   362  		isBlank:  make(map[string]bool),
   363  	}
   364  }
   365  
// wasCloned returns whether t is a parent or child of a cloning operation.
// clone nils out isBlank on the parent and never sets it on the child, so
// its absence marks participation in a clone.
func (t *table) wasCloned() bool { return t.isBlank == nil }

// isNew returns whether t is a new table. blanks is only populated when
// a table created by newTable is first cloned.
func (t *table) isNew() bool { return t.blanks == nil }
   371  
// clone returns a clone of the receiver. The clone does not share the
// hashOf or termsFor maps with the receiver, but does share the blanks
// slice, which is frozen from isBlank on the first clone.
func (t *table) clone() *table {
	new := &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
	}
	for term, hash := range t.hashOf {
		new.hashOf[term] = hash
	}
	for hash, coll := range t.termsFor {
		if len(coll) == 0 {
			continue
		}
		// Deep-copy each term set so updates to the clone do
		// not alias the receiver's sets.
		terms := make(map[string]bool)
		for term := range coll {
			terms[term] = true
		}
		new.termsFor[hash] = terms
	}
	// On the first clone, convert the receiver's blank node set into
	// the blanks slice; isBlank is then no longer needed.
	if t.isNew() {
		t.blanks = make([]string, len(t.isBlank))
		i := 0
		for n := range t.isBlank {
			t.blanks[i] = n
			i++
		}
		t.isBlank = nil
	}
	// All clones share the same blanks slice.
	new.blanks = t.blanks
	return new
}
   403  
// TODO(kortschak): Make hash table in table.hashOf reuse the []byte on update.
// This is not trivial since we need to check for changes, so we can't just get
// the current hash buffer and write into it. So if this is done we probably
// a pair of buffers, a current and a waiting.

// set sets the hash of the term, removing any previously set hash.
func (t *table) set(term string, hash []byte) {
	prev := t.hashOf[term]
	if bytes.Equal(prev, hash) {
		// No change; the termsFor sets are already correct.
		return
	}
	t.hashOf[term] = hash

	// Delete any existing hashes for this term.
	switch terms := t.termsFor[string(prev)]; {
	case len(terms) == 1:
		// term was the only holder of prev, so drop the whole set.
		delete(t.termsFor, string(prev))
	case len(terms) > 1:
		delete(terms, term)
	}

	// Record term as a holder of the new hash.
	terms, ok := t.termsFor[string(hash)]
	if ok {
		terms[term] = true
	} else {
		t.termsFor[string(hash)] = map[string]bool{term: true}
	}

	if !t.wasCloned() && isBlank(term) {
		// We are in the original table, so note
		// any blank node label that we see.
		t.isBlank[term] = true
	}
}
   438  
   439  // allUnique returns whether every term has an unique hash. allUnique
   440  // can only be called on a table that was returned by clone.
   441  func (t *table) allUnique() bool {
   442  	if t.isNew() {
   443  		panic("checked hash bag from uncloned table")
   444  	}
   445  	for _, term := range t.blanks {
   446  		if len(t.termsFor[string(t.hashOf[term])]) > 1 {
   447  			return false
   448  		}
   449  	}
   450  	return true
   451  }
   452  
   453  // changedFrom returns whether the receiver has been updated from last.
   454  // changedFrom can only be called on a table that was returned by clone.
   455  func (t *table) changedFrom(last *table) bool {
   456  	if t.isNew() {
   457  		panic("checked hash bag from uncloned table")
   458  	}
   459  	for i, x := range t.blanks {
   460  		for _, y := range t.blanks[i+1:] {
   461  			if bytes.Equal(t.hashOf[x], t.hashOf[y]) != bytes.Equal(last.hashOf[x], last.hashOf[y]) {
   462  				return true
   463  			}
   464  		}
   465  	}
   466  	return false
   467  }
   468  
// hashBag implements a commutative and associative hash.
// See notes in https://doi.org/10.5281/zenodo.3154322 v1.0
// Readme.md#what-is-the-precise-specification-of-hashbag.
type hashBag struct {
	hash hash.Hash
	// hashesFor holds, for each blank node, the collection of
	// hashes waiting to be combined by sum.
	hashesFor map[string][][]byte
}
   476  
   477  // newHashBag returns a new hashBag using the provided hash function for
   478  // the given hash table. newHashBag can only take a table parameter that
   479  // was returned by newTable.
   480  func newHashBag(h hash.Hash, t *table) hashBag {
   481  	if t.wasCloned() {
   482  		panic("made hash bag from cloned table")
   483  	}
   484  	b := hashBag{hash: h, hashesFor: make(map[string][][]byte, len(t.isBlank))}
   485  	for n := range t.isBlank {
   486  		b.hashesFor[n] = [][]byte{t.hashOf[n]}
   487  	}
   488  	return b
   489  }
   490  
   491  // add adds the hash to the hash bag for the term.
   492  func (b hashBag) add(term string, hash []byte) {
   493  	b.hashesFor[term] = append(b.hashesFor[term], hash)
   494  }
   495  
// sum calculates the hash sum for the given term, updates the hash bag
// state and returns the hash. After sum returns, the term's bag holds
// only the combined hash.
func (b hashBag) sum(term string) []byte {
	p := b.hashesFor[term]
	// Sort so the combined hash is independent of the order in
	// which the component hashes were added.
	sort.Sort(lexical(p))
	h := hashTuple(b.hash, p...)
	// Retain only the combined hash for the next round.
	b.hashesFor[term] = b.hashesFor[term][:1]
	b.hashesFor[term][0] = h
	return h
}
   506  
   507  // lexical implements lexical sorting of [][]byte.
   508  type lexical [][]byte
   509  
   510  func (b lexical) Len() int           { return len(b) }
   511  func (b lexical) Less(i, j int) bool { return string(b[i]) < string(b[j]) }
   512  func (b lexical) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   513  
   514  // hashTuple returns the h hash of the concatenation of t.
   515  func hashTuple(h hash.Hash, t ...[]byte) []byte {
   516  	h.Reset()
   517  	for _, b := range t {
   518  		h.Write(b) //nolint:errcheck
   519  	}
   520  	return h.Sum(nil)
   521  }
   522  
// hashBNodesPerSplit returns the independently hashed blank nodes of the
// graph described by statements using the provided hash function. Hashes
// are initialised with zero. parts holds the blank node partitions and
// disjoint reports whether all blank nodes received unique hashes.
//
// This is algorithm 2 in doi:10.1145/3068333.
func hashBNodesPerSplit(statements []*Statement, decomp bool, h hash.Hash, zero []byte) (hash *table, parts byLengthHash, disjoint bool) {
	// Without decomposition, hash the graph as a whole.
	if !decomp {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	splits := split(statements)

	// Avoid recombination work if there is only one split.
	if len(splits) == 1 {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	// Hash each split independently and merge the results.
	hash = &table{hashOf: make(map[string][]byte)}
	disjoint = true
	for _, g := range splits {
		part, ok := hashBNodes(g, h, zero, nil)
		// Each split is guaranteed to be disjoint in its
		// set of blank nodes, so we can just append to our
		// collection of blanks.
		hash.blanks = append(hash.blanks, part.blanks...)
		if !ok {
			// Allow a short-circuit of the allUnique check.
			disjoint = false
		}
		for k, v := range part.hashOf {
			hash.hashOf[k] = v
		}
		parts = appendOrdered(parts, part.termsFor)
	}
	sort.Sort(parts)
	// Hashes may coincide between splits, so confirm that the
	// merged hashes are globally unique.
	return hash, parts, disjoint && allUnique(hash.hashOf)
}
   566  
   567  // appendOrdered adds parts (labels stored in the second key) for each
   568  // hash (stored in the first key) to parts.
   569  func appendOrdered(parts byLengthHash, partSets map[string]map[string]bool) byLengthHash {
   570  	for h, s := range partSets {
   571  		var p []string
   572  		for e := range s {
   573  			if isBlank(e) {
   574  				p = append(p, e)
   575  			}
   576  		}
   577  		if p != nil {
   578  			parts.nodes = append(parts.nodes, p)
   579  			parts.hashes = append(parts.hashes, h)
   580  		}
   581  	}
   582  	return parts
   583  }
   584  
   585  // byLengthHash implements ascending length sort of a set of blank RDF
   586  // term partitions with ties broken by lexical ordering of the partitions'
   587  // hashes.
   588  type byLengthHash struct {
   589  	// nodes holds the blank nodes of a part.
   590  	nodes [][]string
   591  	// hashes holds the hashes corresponding
   592  	// to the nodes in the nodes field, using
   593  	// the same index.
   594  	hashes []string
   595  }
   596  
   597  func (s byLengthHash) Len() int { return len(s.nodes) }
   598  func (s byLengthHash) Less(i, j int) bool {
   599  	switch {
   600  	case len(s.nodes[i]) < len(s.nodes[j]):
   601  		return true
   602  	case len(s.nodes[i]) > len(s.nodes[j]):
   603  		return false
   604  	}
   605  	return s.hashes[i] < s.hashes[j]
   606  }
   607  func (s byLengthHash) Swap(i, j int) {
   608  	s.nodes[i], s.nodes[j] = s.nodes[j], s.nodes[i]
   609  	s.hashes[i], s.hashes[j] = s.hashes[j], s.hashes[i]
   610  }
   611  
   612  // allUnique returns whether the []byte hash values in hashes are all unique.
   613  func allUnique(hashes map[string][]byte) bool {
   614  	set := make(map[string]bool)
   615  	for _, h := range hashes {
   616  		if set[string(h)] {
   617  			return false
   618  		}
   619  		set[string(h)] = true
   620  	}
   621  	return true
   622  }
   623  
// split returns the statements forming connected components in the graph
// described by statements. Statements with no blank nodes are grouped
// into a single final "ground" split.
//
// This is split in algorithm 2 in doi:10.1145/3068333.
func split(statements []*Statement) [][]*Statement {
	// Union blank subject and object nodes into connected
	// components using a disjoint set.
	ds := make(djSet)
	for _, s := range statements {
		ds.add(s.Subject.Value)
		ds.add(s.Object.Value)
		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
			ds.union(ds.find(s.Subject.Value), ds.find(s.Object.Value))
		}
	}

	var (
		splits [][]*Statement
		ground []*Statement
	)
	// idxOf maps a component's disjoint-set root to its index in splits.
	idxOf := make(map[*dsNode]int)
	for _, s := range statements {
		// Pick a blank term to identify the statement's component;
		// statements with no blank terms are ground.
		var t string
		switch {
		case isBlank(s.Subject.Value):
			t = s.Subject.Value
		case isBlank(s.Object.Value):
			t = s.Object.Value
		default:
			ground = append(ground, s)
			continue
		}
		r := ds.find(t)
		if r == nil {
			// All blank terms were added above, so this is unreachable
			// unless the disjoint set is broken.
			panic(fmt.Sprintf("term not found: %q", t))
		}
		i, ok := idxOf[r]
		if !ok {
			i = len(splits)
			idxOf[r] = i
			splits = append(splits, []*Statement{s})
		} else {
			splits[i] = append(splits[i], s)
		}
	}
	// Ground statements form their own split.
	if ground != nil {
		splits = append(splits, ground)
	}

	if debug {
		debug.log(0, "Splits:")
		for i, s := range splits {
			for j, t := range s {
				if j == 0 {
					debug.logf(0, "%d.\t%s\n", i+1, t)
				} else {
					debug.logf(0, "\t%s\n", t)
				}
			}
			debug.log(0)
		}
	}

	return splits
}
   687  
// distinguish returns G⊥: smallest hash-labelled graph found thus far.
// The graph is returned as a node to hash lookup.
//
// This is part of algorithm 3 in doi:10.1145/3068333.
//
// The correspondence between the parameters for the function in the paper
// with the implementation here is as follows:
//  - G = statements
//  - hash = hash
//  - P = parts (already sorted by hashBNodesPerSplit)
//  - G⊥ = lowest
//  - B = hash.blanks
// The additional parameter dist specifies that distinguish should treat
// coequal trivial parts as a coarse of intermediate part and distinguish
// the nodes in that merged part.
func distinguish(statements []*Statement, dist bool, h hash.Hash, zero []byte, hash *table, parts byLengthHash, lowest map[string][]byte, depth int) map[string][]byte {
	if debug {
		debug.log(depth, "Running Distinguish")
	}

	// Find the first non-trivial part — one with more than one node
	// sharing a hash. parts is sorted by size then hash.
	var small []string
	var k int
	for k, small = range parts.nodes {
		if len(small) > 1 {
			break
		}
	}
	if len(small) < 2 {
		if lowest != nil || !dist {
			if debug {
				debug.log(depth, "Return lowest (no non-trivial parts):")
				debug.logHashes(depth, lowest, h.Size())
			}

			return lowest
		}

		// We have been given a set of fine parts,
		// but to reach here they must have been
		// non-uniquely labeled, so treat them
		// as a single coarse part.
		k, small = 0, parts.nodes[0]
	}

	if debug {
		debug.logf(depth, "Part: %v %x\n\n", small, parts.hashes[k])
		debug.log(depth, "Orig hash:")
		debug.logHashes(depth, hash.hashOf, h.Size())
	}

	// Visit every part sharing the chosen part's hash and tentatively
	// distinguish each of its nodes in turn.
	smallHash := hash.hashOf[small[0]]
	for _, p := range parts.nodes[k:] {
		if !bytes.Equal(smallHash, hash.hashOf[p[0]]) {

			if debug {
				debug.logf(depth, "End of co-equal hashes: %x != %x\n\n", smallHash, hash.hashOf[p[0]])
			}

			break
		}
		for i, b := range p {

			if debug {
				debug.logf(depth, "Iter: %d — B = %q\n\n", i, b)

				if depth == 0 {
					debug.log(depth, "Current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}
			}

			// Mark b with '@' to break the tie, then rerun the hash
			// refinement with the marked value as the initial state.
			hashP := hash.clone()
			hashP.set(b, hashTuple(h, hashP.hashOf[b], []byte{'@'}))
			hashPP, ok := hashBNodes(statements, h, zero, hashP.hashOf)
			if ok {

				if debug {
					debug.log(depth, "hashPP is trivial")
					debug.log(depth, "comparing hashPP\n")
					debug.logHashes(depth, hashPP.hashOf, h.Size())
					debug.log(depth, "with previous\n")
					debug.logHashes(depth, lowest, h.Size())
				}

				// The labelling is now unique; keep it if it gives
				// the lowest graph seen so far.
				if lowest == nil || graphLess(statements, hashPP.hashOf, lowest) {
					lowest = hashPP.hashOf
					debug.log(depth, "choose hashPP\n")
				}
			} else {
				// Still non-unique; recurse with the refined parts.
				partsP := appendOrdered(byLengthHash{}, hashPP.termsFor)
				sort.Sort(partsP)

				if debug {
					debug.log(depth, "Parts':")
					debug.logParts(depth, partsP)
					debug.log(depth, "Recursive distinguish")
					debug.log(depth, "Called with current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}

				lowest = distinguish(statements, dist, h, zero, hashPP, partsP, lowest, depth+1)
			}
		}
	}

	if debug {
		debug.log(depth, "Return lowest:")
		debug.logHashes(depth, lowest, h.Size())
	}

	return lowest
}
   800  
// graphLess returns whether the graph relabeled with a orders before the
// graph relabeled with b under the following ordering:
// terms ordered syntactically, triples ordered lexicographically, and graphs
// ordered such that G < H if and only if G ⊂ H or there exists a triple
// t ∈ G \ H such that no triple t' ∈ H \ G exists where t' < t.
// p9 https://doi.org/10.1145/3068333
func graphLess(statements []*Statement, a, b map[string][]byte) bool {
	g := newLexicalStatements(statements, a)
	sort.Sort(g)
	h := newLexicalStatements(statements, b)
	sort.Sort(h)

	// G \ H empty means G ⊆ H, so G is not greater than H.
	gSubH := sub(g, h, len(g.statements))
	if len(gSubH) == 0 {
		return true
	}

	// Only the smallest element of H \ G is needed for the
	// comparison below, so cap the difference at one element.
	hSubG := sub(h, g, 1)
	if len(hSubG) == 0 {
		return true
	}
	lowestH := relabeledStatement{hSubG[0], h.hashes}

	// G < H if some triple in G \ H is below every triple in H \ G.
	for _, s := range gSubH {
		rs := relabeledStatement{s, g.hashes}
		if rs.less(lowestH) {
			return true
		}
	}
	return false
}
   830  
   831  // lexicalStatements is a sort implementation for Statements with blank
   832  // node labels replaced with their hash.
   833  type lexicalStatements struct {
   834  	statements []*Statement
   835  	hashes     map[string][]byte
   836  }
   837  
   838  func newLexicalStatements(statements []*Statement, hash map[string][]byte) lexicalStatements {
   839  	s := lexicalStatements{
   840  		statements: make([]*Statement, len(statements)),
   841  		hashes:     hash,
   842  	}
   843  	copy(s.statements, statements)
   844  	return s
   845  }
   846  
// sub returns the difference between a and b up to max elements long.
// Both a and b must already be sorted under relabeledStatement ordering.
func sub(a, b lexicalStatements, max int) []*Statement {
	var d []*Statement
	var i, j int
	// Merge-style scan over both sorted statement lists, collecting
	// elements of a that are absent from b.
	for i < len(a.statements) && j < len(b.statements) && len(d) < max {
		ra := relabeledStatement{a.statements[i], a.hashes}
		rb := relabeledStatement{b.statements[j], b.hashes}
		switch {
		case ra.less(rb):
			// ra is not in b; keep it.
			d = append(d, a.statements[i])
			i++
		case rb.less(ra):
			// rb is not in a; skip it.
			j++
		default:
			// ra matches an element of b; do not keep it.
			i++
		}
	}
	// Any remaining elements of a cannot be in b.
	if len(d) < max {
		d = append(d, a.statements[i:min(len(a.statements), i+max-len(d))]...)
	}
	return d
}
   869  
   870  func min(a, b int) int {
   871  	if a < b {
   872  		return a
   873  	}
   874  	return b
   875  }
   876  
   877  func (s lexicalStatements) Len() int { return len(s.statements) }
   878  func (s lexicalStatements) Less(i, j int) bool {
   879  	return relabeledStatement{s.statements[i], s.hashes}.less(relabeledStatement{s.statements[j], s.hashes})
   880  }
   881  func (s lexicalStatements) Swap(i, j int) {
   882  	s.statements[i], s.statements[j] = s.statements[j], s.statements[i]
   883  }
   884  
   885  // relabeledStatement is a statement that is orderable by its blank node
   886  // hash relabeling.
   887  type relabeledStatement struct {
   888  	statement *Statement
   889  	labels    map[string][]byte
   890  }
   891  
   892  func (a relabeledStatement) less(b relabeledStatement) bool {
   893  	switch {
   894  	case relabeledTerm{a.statement.Subject, a.labels}.less(relabeledTerm{b.statement.Subject, b.labels}):
   895  		return true
   896  	case relabeledTerm{b.statement.Subject, b.labels}.less(relabeledTerm{a.statement.Subject, a.labels}):
   897  		return false
   898  	}
   899  	switch { // Always IRI.
   900  	case a.statement.Predicate.Value < b.statement.Predicate.Value:
   901  		return true
   902  	case a.statement.Predicate.Value > b.statement.Predicate.Value:
   903  		return false
   904  	}
   905  	switch {
   906  	case relabeledTerm{a.statement.Object, a.labels}.less(relabeledTerm{b.statement.Object, b.labels}):
   907  		return true
   908  	case relabeledTerm{b.statement.Object, b.labels}.less(relabeledTerm{a.statement.Object, a.labels}):
   909  		return false
   910  	}
   911  	return relabeledTerm{a.statement.Label, a.labels}.less(relabeledTerm{b.statement.Label, b.labels})
   912  }
   913  
   914  func (s relabeledStatement) String() string {
   915  	subj := relabeledTerm{term: s.statement.Subject, labels: s.labels}
   916  	obj := relabeledTerm{term: s.statement.Object, labels: s.labels}
   917  	if s.statement.Label.Value == "" {
   918  		return fmt.Sprintf("%s %s %s .", subj, s.statement.Predicate.Value, obj)
   919  	}
   920  	lab := relabeledTerm{term: s.statement.Label, labels: s.labels}
   921  	return fmt.Sprintf("%s %s %s %s .", subj, s.statement.Predicate.Value, obj, lab)
   922  }
   923  
   924  // relabeledTerm is a term that is orderable by its blank node hash relabeling.
   925  type relabeledTerm struct {
   926  	term   Term
   927  	labels map[string][]byte
   928  }
   929  
   930  func (a relabeledTerm) less(b relabeledTerm) bool {
   931  	aIsBlank := isBlank(a.term.Value)
   932  	bIsBlank := isBlank(b.term.Value)
   933  	switch {
   934  	case aIsBlank && bIsBlank:
   935  		return bytes.Compare(a.labels[a.term.Value], b.labels[b.term.Value]) < 0
   936  	case aIsBlank:
   937  		return blankPrefix < unquoteIRI(b.term.Value)
   938  	case bIsBlank:
   939  		return unquoteIRI(a.term.Value) < blankPrefix
   940  	default:
   941  		return unquoteIRI(a.term.Value) < unquoteIRI(b.term.Value)
   942  	}
   943  }
   944  
   945  func unquoteIRI(s string) string {
   946  	if len(s) > 1 && s[0] == '<' && s[len(s)-1] == '>' {
   947  		s = s[1 : len(s)-1]
   948  	}
   949  	return s
   950  }
   951  
   952  func (t relabeledTerm) String() string {
   953  	if !isBlank(t.term.Value) {
   954  		return t.term.Value
   955  	}
   956  	h, ok := t.labels[t.term.Value]
   957  	if !ok {
   958  		return t.term.Value + "_missing_hash"
   959  	}
   960  	return fmt.Sprintf("_:%0x", h)
   961  }