github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/equi_canonical.go (about)

     1  // Copyright ©2021 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package rdf
     6  
     7  import (
     8  	"errors"
     9  	"sort"
    10  )
    11  
    12  // Throughout, the comments refer to doi:10.1145/3068333 which should be
    13  // understood as a synonym for http://aidanhogan.com/docs/rdf-canonicalisation.pdf
    14  // although there are differences between the two, see http://aidanhogan.com/#errataH17.
    15  // Where there are differences, the document at http://aidanhogan.com/ is the
    16  // canonical truth. The DOI reference is referred to for persistence.
    17  
    18  // Lean returns an RDF core of g that entails g. If g contains any non-zero
    19  // labels, Lean will return a non-nil error and a core of g assuming no graph
    20  // labels exist.
    21  //
    22  // See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
    23  // the algorithm.
    24  func Lean(g []*Statement) ([]*Statement, error) {
    25  	// BUG(kortschak): Graph leaning does not take into account graph label terms
    26  	// since the formal semantics for a multiple graph data model have not been
    27  	// defined. See https://www.w3.org/TR/rdf11-datasets/#declaring.
    28  
    29  	var (
    30  		hasBlanks bool
    31  		err       error
    32  	)
    33  	for _, s := range g {
    34  		if isBlank(s.Subject.Value) || isBlank(s.Object.Value) {
    35  			hasBlanks = true
    36  			if err != nil {
    37  				break
    38  			}
    39  		}
    40  		if s.Label.Value != "" && err == nil {
    41  			err = errors.New("rdf: data-set contains graph names")
    42  			if hasBlanks {
    43  				break
    44  			}
    45  		}
    46  	}
    47  	if hasBlanks {
    48  		g = lean(&dfs{}, g)
    49  	}
    50  	return g, err
    51  }
    52  
    53  // removeRedundantBnodes removes blank nodes whose edges are a subset of
    54  // another term in the RDF graph.
    55  //
    56  // This is algorithm 4 in doi:10.1145/3068333.
    57  func removeRedundantBnodes(g []*Statement) []*Statement {
    58  	g = append(g[:0:0], g...)
    59  	for {
    60  		edges := make(map[string]map[triple]bool)
    61  		for _, s := range g {
    62  			for i, t := range []string{
    63  				s.Subject.Value,
    64  				s.Object.Value,
    65  			} {
    66  				e, ok := edges[t]
    67  				if !ok {
    68  					e = make(map[triple]bool)
    69  					edges[t] = e
    70  				}
    71  				switch i {
    72  				case 0:
    73  					e[triple{s.Predicate.Value, s.Object.Value, "+"}] = true
    74  				case 1:
    75  					e[triple{s.Predicate.Value, s.Subject.Value, "-"}] = true
    76  				}
    77  			}
    78  		}
    79  
    80  		seen := make(map[string]bool)
    81  		bNodes := make(map[string]bool)
    82  		terms := make(map[string]bool)
    83  		for _, s := range g {
    84  			for _, t := range []string{
    85  				s.Subject.Value,
    86  				s.Predicate.Value,
    87  				s.Object.Value,
    88  			} {
    89  				terms[t] = true
    90  				if isBlank(t) {
    91  					bNodes[t] = true
    92  				} else {
    93  					seen[t] = true
    94  				}
    95  			}
    96  		}
    97  
    98  		redundant := make(map[string]bool)
    99  		for x := range bNodes {
   100  			for xp := range terms {
   101  				if isProperSubset(edges[x], edges[xp]) || (seen[xp] && isEqualEdges(edges[x], edges[xp])) {
   102  					redundant[x] = true
   103  					break
   104  				}
   105  			}
   106  			seen[x] = true
   107  		}
   108  
   109  		n := len(g)
   110  		for i := 0; i < len(g); {
   111  			if !redundant[g[i].Subject.Value] && !redundant[g[i].Object.Value] {
   112  				i++
   113  				continue
   114  			}
   115  			g[i], g = g[len(g)-1], g[:len(g)-1]
   116  		}
   117  		if n == len(g) {
   118  			return g
   119  		}
   120  	}
   121  }
   122  
// triple is a fixed-size group of three strings used as a map key, both
// for the edges incident on a term and for statement patterns.
type triple [3]string
   124  
   125  func isProperSubset(a, b map[triple]bool) bool {
   126  	for k := range a {
   127  		if !b[k] {
   128  			return false
   129  		}
   130  	}
   131  	return len(a) < len(b)
   132  }
   133  
   134  func isEqualEdges(a, b map[triple]bool) bool {
   135  	if len(a) != len(b) {
   136  		return false
   137  	}
   138  	for k := range a {
   139  		if !b[k] {
   140  			return false
   141  		}
   142  	}
   143  	return true
   144  }
   145  
   146  // findCandidates finds candidates for blank nodes and blank nodes that are fixed.
   147  //
   148  // This is algorithm 5 in doi:10.1145/3068333.
   149  func findCandidates(g []*Statement) ([]*Statement, map[string]bool, map[string]map[string]bool, bool) {
   150  	g = removeRedundantBnodes(g)
   151  
   152  	edges := make(map[triple]bool)
   153  	f := make(map[string]bool)
   154  	for _, s := range g {
   155  		sub := s.Subject.Value
   156  		prd := s.Predicate.Value
   157  		obj := s.Object.Value
   158  
   159  		edges[triple{sub, prd, obj}] = true
   160  		edges[triple{sub, prd, "*"}] = true
   161  		edges[triple{"*", prd, obj}] = true
   162  		switch {
   163  		case isBlank(sub) && isBlank(obj):
   164  			f[sub] = false
   165  			f[obj] = false
   166  		case isBlank(sub):
   167  			if _, ok := f[sub]; !ok {
   168  				f[sub] = true
   169  			}
   170  		case isBlank(obj):
   171  			if _, ok := f[obj]; !ok {
   172  				f[obj] = true
   173  			}
   174  		}
   175  	}
   176  	for k, v := range f {
   177  		if !v {
   178  			delete(f, k)
   179  		}
   180  	}
   181  	if len(f) == 0 {
   182  		f = nil
   183  	}
   184  
   185  	cands := make(map[string]map[string]bool)
   186  	bnodes := make(map[string]bool)
   187  	for _, s := range g {
   188  		for _, b := range []string{
   189  			s.Subject.Value,
   190  			s.Object.Value,
   191  		} {
   192  			if !isBlank(b) {
   193  				continue
   194  			}
   195  			bnodes[b] = true
   196  			if f[b] {
   197  				cands[b] = map[string]bool{b: true}
   198  			} else {
   199  				terms := make(map[string]bool)
   200  				for _, s := range g {
   201  					for _, t := range []string{
   202  						s.Subject.Value,
   203  						s.Predicate.Value,
   204  						s.Object.Value,
   205  					} {
   206  						terms[t] = true
   207  					}
   208  				}
   209  				cands[b] = terms
   210  			}
   211  		}
   212  	}
   213  	if isEqualTerms(f, bnodes) {
   214  		return g, f, cands, true
   215  	}
   216  
   217  	for {
   218  		bb := make(map[string]bool)
   219  		for b := range bnodes {
   220  			if !f[b] {
   221  				bb[b] = true
   222  			}
   223  		}
   224  		for b := range bb {
   225  			for x := range cands[b] {
   226  				if x == b {
   227  					continue
   228  				}
   229  				for _, s := range g {
   230  					if s.Subject.Value != b {
   231  						continue
   232  					}
   233  					prd := s.Predicate.Value
   234  					obj := s.Object.Value
   235  					if (inILF(obj, f) && !edges[triple{x, prd, obj}]) || (bb[obj] && !edges[triple{x, prd, "*"}]) {
   236  						delete(cands[b], x)
   237  						break
   238  					}
   239  				}
   240  				if !cands[b][x] {
   241  					continue
   242  				}
   243  				for _, s := range g {
   244  					if s.Object.Value != b {
   245  						continue
   246  					}
   247  					sub := s.Subject.Value
   248  					prd := s.Predicate.Value
   249  					if (inIF(sub, f) && !edges[triple{sub, prd, x}]) || (bb[sub] && !edges[triple{"*", prd, x}]) {
   250  						delete(cands[b], x)
   251  						break
   252  					}
   253  				}
   254  			}
   255  		}
   256  
   257  		fp := f
   258  		f = make(map[string]bool)
   259  		for b := range fp {
   260  			f[b] = true
   261  		}
   262  		for b := range bb { // Mark newly fixed blank nodes.
   263  			if len(cands[b]) == 1 && cands[b][b] {
   264  				f[b] = true
   265  			}
   266  		}
   267  		allFixed := isEqualTerms(f, bnodes)
   268  		if isEqualTerms(fp, f) || allFixed {
   269  			if len(f) == 0 {
   270  				f = nil
   271  			}
   272  			return g, f, cands, allFixed
   273  		}
   274  	}
   275  }
   276  
   277  // inILF returns whether t is in IL or F.
   278  func inILF(t string, f map[string]bool) bool {
   279  	return isIRI(t) || isLiteral(t) || f[t]
   280  }
   281  
   282  // inIF returns whether t is in I or F.
   283  func inIF(t string, f map[string]bool) bool {
   284  	return isIRI(t) || f[t]
   285  }
   286  
// dfs is a depth-first search strategy. It holds no state; its methods
// evaluate, search and join implement the search.
type dfs struct{}
   289  
   290  // lean returns a core of the RDF graph g using the given strategy.
   291  //
   292  // This is lines 1-9 of algorithm 6 in doi:10.1145/3068333.
   293  func lean(strategy *dfs, g []*Statement) []*Statement {
   294  	foundBnode := false
   295  search:
   296  	for _, s := range g {
   297  		for _, t := range []string{
   298  			s.Subject.Value,
   299  			s.Object.Value,
   300  		} {
   301  			if isBlank(t) {
   302  				foundBnode = true
   303  				break search
   304  			}
   305  		}
   306  	}
   307  	if !foundBnode {
   308  		return g
   309  	}
   310  	g, fixed, cands, allFixed := findCandidates(g)
   311  	if allFixed {
   312  		return g
   313  	}
   314  	for _, s := range g {
   315  		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
   316  			mu := make(map[string]string, len(fixed))
   317  			for b := range fixed {
   318  				mu[b] = b
   319  			}
   320  			mu = findCoreEndomorphism(strategy, g, cands, mu)
   321  			return applyMu(g, mu)
   322  		}
   323  	}
   324  	return g
   325  }
   326  
   327  // findCoreEndomorphism returns a core solution using the given strategy.
   328  //
   329  // This is lines 10-14 of algorithm 6 in doi:10.1145/3068333.
   330  func findCoreEndomorphism(strategy *dfs, g []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
   331  	var q []*Statement
   332  	preds := make(map[string]int)
   333  	seen := make(map[triple]bool)
   334  	for _, s := range g {
   335  		preds[s.Predicate.Value]++
   336  		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
   337  			if seen[triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}] {
   338  				continue
   339  			}
   340  			seen[triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}] = true
   341  			q = append(q, s)
   342  		}
   343  	}
   344  	sort.Slice(q, func(i, j int) bool {
   345  		return selectivity(q[i], cands, preds) < selectivity(q[j], cands, preds)
   346  	})
   347  	return strategy.evaluate(g, q, cands, mu)
   348  }
   349  
   350  // selectivity returns the selectivity heuristic score for s. Lower scores
   351  // are more selective.
   352  func selectivity(s *Statement, cands map[string]map[string]bool, preds map[string]int) int {
   353  	return min(len(cands[s.Subject.Value])*len(cands[s.Object.Value]), preds[s.Predicate.Value])
   354  }
   355  
   356  // evaluate returns an endomorphism using a DFS strategy.
   357  //
   358  // This is lines 25-32 of algorithm 6 in doi:10.1145/3068333.
   359  func (st *dfs) evaluate(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
   360  	mu = st.search(g, q, cands, mu)
   361  	for len(mu) != len(codom(mu)) {
   362  		mupp := fixedFrom(cands)
   363  		mup := findCoreEndomorphism(st, applyMu(g, mu), cands, mupp)
   364  		if isAutomorphism(mup) {
   365  			return mu
   366  		}
   367  		for b, x := range mu {
   368  			if _, ok := mup[b]; !ok {
   369  				mup[b] = x
   370  			}
   371  		}
   372  		mu = mup
   373  	}
   374  	return mu
   375  }
   376  
   377  func fixedFrom(cands map[string]map[string]bool) map[string]string {
   378  	fixed := make(map[string]string)
   379  	for b, m := range cands {
   380  		if len(m) == 1 && m[b] {
   381  			fixed[b] = b
   382  		}
   383  	}
   384  	return fixed
   385  }
   386  
   387  // applyMu applies mu to g returning the result.
   388  func applyMu(g []*Statement, mu map[string]string) []*Statement {
   389  	back := make([]Statement, 0, len(g))
   390  	dst := make([]*Statement, 0, len(g))
   391  	seen := make(map[Statement]bool)
   392  	for _, s := range g {
   393  		n := Statement{
   394  			Subject:   Term{Value: translate(s.Subject.Value, mu)},
   395  			Predicate: Term{Value: s.Predicate.Value},
   396  			Object:    Term{Value: translate(s.Object.Value, mu)},
   397  			Label:     Term{Value: s.Label.Value},
   398  		}
   399  		if seen[n] {
   400  			continue
   401  		}
   402  		seen[n] = true
   403  		back = append(back, n)
   404  		dst = append(dst, &back[len(back)-1])
   405  	}
   406  	return dst
   407  }
   408  
   409  // search returns a minimum endomorphism using a DFS strategy.
   410  //
   411  // This is lines 33-46 of algorithm 6 in doi:10.1145/3068333.
   412  func (st *dfs) search(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
   413  	qMin := q[0]
   414  	m := st.join(qMin, g, cands, mu)
   415  	if len(m) == 0 {
   416  		// Early exit if no mapping found.
   417  		return nil
   418  	}
   419  	sortByCodom(m)
   420  	mMin := m[0]
   421  	qp := q[1:]
   422  	if len(qp) != 0 {
   423  		for len(m) != 0 {
   424  			mMin = m[0]
   425  			mup := st.search(g, qp, cands, mMin)
   426  			if !isAutomorphism(mup) {
   427  				return mup
   428  			}
   429  			m = m[1:]
   430  		}
   431  	}
   432  	return mMin
   433  }
   434  
   435  // isAutomorphism returns whether mu is an automorphism, this is equivalent to
   436  // dom(mu) == codom(mu).
   437  func isAutomorphism(mu map[string]string) bool {
   438  	return isEqualTerms(dom(mu), codom(mu))
   439  }
   440  
   441  // dom returns the domain of mu.
   442  func dom(mu map[string]string) map[string]bool {
   443  	d := make(map[string]bool, len(mu))
   444  	for v := range mu {
   445  		d[v] = true
   446  	}
   447  	return d
   448  }
   449  
   450  // codom returns the codomain of mu.
   451  func codom(mu map[string]string) map[string]bool {
   452  	cd := make(map[string]bool, len(mu))
   453  	for _, v := range mu {
   454  		cd[v] = true
   455  	}
   456  	return cd
   457  }
   458  
   459  // isEqualTerms returns whether a and b are identical.
   460  func isEqualTerms(a, b map[string]bool) bool {
   461  	if len(a) != len(b) {
   462  		return false
   463  	}
   464  	for k := range a {
   465  		if !b[k] {
   466  			return false
   467  		}
   468  	}
   469  	return true
   470  }
   471  
   472  // sortByCodom performs a sort of maps ordered by fewest blank nodes in
   473  // codomain, then fewest self mappings.
   474  func sortByCodom(maps []map[string]string) {
   475  	m := orderedByCodom{
   476  		maps:  maps,
   477  		attrs: make([]attrs, len(maps)),
   478  	}
   479  	for i, mu := range maps {
   480  		m.attrs[i].blanks = make(map[string]bool)
   481  		for x, y := range mu {
   482  			if isBlank(y) {
   483  				m.attrs[i].blanks[y] = true
   484  			}
   485  			if x == y {
   486  				m.attrs[i].selfs++
   487  			}
   488  		}
   489  	}
   490  	sort.Sort(m)
   491  }
   492  
   493  type orderedByCodom struct {
   494  	maps  []map[string]string
   495  	attrs []attrs
   496  }
   497  
   498  type attrs struct {
   499  	blanks map[string]bool
   500  	selfs  int
   501  }
   502  
   503  func (m orderedByCodom) Len() int { return len(m.maps) }
   504  func (m orderedByCodom) Less(i, j int) bool {
   505  	attrI := m.attrs[i]
   506  	attrJ := m.attrs[j]
   507  	switch {
   508  	case len(attrI.blanks) < len(attrJ.blanks):
   509  		return true
   510  	case len(attrI.blanks) > len(attrJ.blanks):
   511  		return false
   512  	default:
   513  		return attrI.selfs < attrJ.selfs
   514  	}
   515  }
   516  func (m orderedByCodom) Swap(i, j int) {
   517  	m.maps[i], m.maps[j] = m.maps[j], m.maps[i]
   518  	m.attrs[i], m.attrs[j] = m.attrs[j], m.attrs[i]
   519  }
   520  
   521  // join evaluates the given pattern, q, joining with solutions in m.
   522  // This takes only a single mapping and so only works for the DFS strategy.
   523  //
   524  // This is lines 47-51 of algorithm 6 in doi:10.1145/3068333.
   525  func (st *dfs) join(q *Statement, g []*Statement, cands map[string]map[string]bool, m map[string]string) []map[string]string {
   526  	var mp []map[string]string
   527  	isLoop := q.Subject.Value == q.Object.Value
   528  	for _, s := range g {
   529  		// Line 45: M_q ← {µ | µ(q) ∈ G}
   530  		//  | µ(q) ∈ G
   531  		//
   532  		//    µ(q) ∈ G ↔ (µ(q_s),q_p,µ(q_o)) ∈ G
   533  		if q.Predicate.Value != s.Predicate.Value {
   534  			continue
   535  		}
   536  		//    q_s = q_o ↔ µ(q_s) =_µ(q_o)
   537  		if isLoop && s.Subject.Value != s.Object.Value {
   538  			continue
   539  		}
   540  
   541  		// Line 46: M_q' ← {µ ∈ M_q | for all b ∈ bnodes({q}), µ(b) ∈ cands[b]}
   542  		//  | for all b ∈ bnodes({q}), µ(b) ∈ cands[b]
   543  		if !cands[q.Subject.Value][s.Subject.Value] || !cands[q.Object.Value][s.Object.Value] {
   544  			continue
   545  		}
   546  
   547  		// Line 47: M' ← M_q' ⋈ M
   548  		// M₁ ⋈ M₂ = {μ₁ ∪ μ₂ | μ₁ ∈ M₁, μ₂ ∈ M₂ and μ₁, μ₂ are compatible mappings}
   549  		//  | μ₁ ∈ M₁, μ₂ ∈ M₂ and μ₁, μ₂ are compatible mappings
   550  		if mq, ok := m[q.Subject.Value]; ok && mq != s.Subject.Value {
   551  			continue
   552  		}
   553  		if !isLoop {
   554  			if mq, ok := m[q.Object.Value]; ok && mq != s.Object.Value {
   555  				continue
   556  			}
   557  		}
   558  		// Line 47: μ₁ ∪ μ₂
   559  		var mu map[string]string
   560  		if isLoop {
   561  			mu = map[string]string{
   562  				q.Subject.Value: s.Subject.Value,
   563  			}
   564  		} else {
   565  			mu = map[string]string{
   566  				q.Subject.Value: s.Subject.Value,
   567  				q.Object.Value:  s.Object.Value,
   568  			}
   569  		}
   570  		for b, mb := range m {
   571  			mu[b] = mb
   572  		}
   573  		mp = append(mp, mu)
   574  	}
   575  	return mp
   576  }