// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rdf

import (
	"bytes"
	"crypto/sha1"
	"crypto/sha256"
	"errors"
	"fmt"
	"hash"
	"sort"

	"gonum.org/v1/gonum/stat/combin"
)

// Deduplicate removes duplicate statements in s, working in place, and returns
// the deduplicated slice with statements sorted in lexical order. Term UID
// fields are not considered and their values may be lost during deduplication.
func Deduplicate(s []*Statement) []*Statement {
	if len(s) < 2 {
		return s
	}
	sort.Sort(c14nStatements(s))
	// After sorting, equal statements are adjacent; compact runs in place.
	// curr is the index of the last retained (unique) statement.
	curr := 0
	for i, e := range s {
		if isSameStatement(e, s[curr]) {
			continue
		}
		curr++
		if curr < i {
			// Move the first statement of the new run down and nil out
			// the vacated slot so the dropped pointer can be collected.
			s[curr], s[i] = s[i], nil
		}
	}
	return s[:curr+1]
}

// isSameStatement reports whether a and b have equal subject, predicate,
// object and label values. UID fields are intentionally ignored.
func isSameStatement(a, b *Statement) bool {
	if a == b {
		return true
	}
	return a.Subject.Value == b.Subject.Value &&
		a.Predicate.Value == b.Predicate.Value &&
		a.Object.Value == b.Object.Value &&
		a.Label.Value == b.Label.Value
}

// Note on implementation details: The comment numbering in the code relates the
// implementation to the steps of the algorithm described in the specification.

// URGNA2012 applies the Universal RDF Graph Normalization Algorithm 2012
// to the statements in src, placing the result in dst and returning it.
// If dst is nil a slice of statements will be allocated. If dst is not
// nil and not the same length as src, URGNA2012 will return an error.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details.
func URGNA2012(dst, src []*Statement) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	} else if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}
	// 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm
	//
	// URGNA2012 differs from URDNA2015 in using SHA-1, the special "_:g"
	// graph-position label, and its own related-blank-node hashing routine.
	u := &urna{
		canon:         newIssuer("_:c14n"),
		hashes:        make(map[string]string),
		statementsFor: make(map[string][]*Statement),
		hash:          sha1.New(),
		label:         "_:g",
	}
	u.hashToRelated = u.hashToRelatedURGNA2012
	return u.relabel(dst, src)
}

// URDNA2015 applies the Universal RDF Dataset Normalization Algorithm 2015
// to the statements in src, placing the result in dst and returning it.
// If dst is nil a slice of statements will be allocated. If dst is not
// nil and not the same length as src, URDNA2015 will return an error.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details.
func URDNA2015(dst, src []*Statement) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	} else if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}
	// 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm
	u := &urna{
		canon:         newIssuer("_:c14n"),
		hashes:        make(map[string]string),
		statementsFor: make(map[string][]*Statement),
		hash:          sha256.New(),
	}
	u.hashToRelated = u.hashToRelatedURDNA2015
	return u.relabel(dst, src)
}

// urna is the canonicalization state for the URGNA2012 and URDNA2015
// algorithms. The urna type implements both algorithms through the state
// of the label and hashToRelated fields.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#canonicalization-state
// for details.
type urna struct {
	// canon is the canonical issuer.
	canon *issuer

	// hashes holds already calculated hashes
	// for hashing first degree quads.
	hashes map[string]string

	// statementsFor is the blank node to quads map.
	statementsFor map[string][]*Statement

	// hash is the hash function used by the
	// canonicalization function.
	hash hash.Hash
	// hashToRelated holds URGNA2012 and URDNA2015-
	// specific hashing routines.
	hashToRelated relatedHashCreator
	// label holds "_:g" when running URGNA2012.
	// Otherwise it is empty.
	label string
}

// relabel is the algorithm described in section 4.4.2 of the spec at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm.
// It rewrites src into dst with all blank node labels replaced by their
// canonical "_:c14n"-prefixed identifiers and sorts dst lexically.
func (u *urna) relabel(dst, src []*Statement) ([]*Statement, error) {
	// termsFor is the hash to blank nodes map.
	// It is not held in the urna struct, but is
	// part of the canonicalization state.
	//
	// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#dfn-hash-to-blank-nodes-map
	var termsFor map[string][]string // 1.

	for _, s := range src { // 2.
	terms:
		for _, t := range []string{
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(t) {
				continue
			}
			// Record s under each blank term at most once, even when
			// the same blank node occupies several positions in s.
			for _, e := range u.statementsFor[t] {
				if e == s {
					continue terms
				}
			}
			u.statementsFor[t] = append(u.statementsFor[t], s)
		}
	}

	// todo is the list of non-normalized blank node identifiers.
	todo := make(map[string]bool) // 3.
	for b := range u.statementsFor {
		todo[b] = true
	}

	simple := true // 4.
	for simple { // 5.
		simple = false // 5.1

		termsFor = make(map[string][]string) // 5.2

		for b := range todo { // 5.3
			hash := u.hashFirstDegreeQuads(b)           // 5.3.1
			termsFor[hash] = append(termsFor[hash], b) // 5.3.2
		}

		for _, h := range lexicallySortedTermHashes(termsFor) { // 5.4
			terms := termsFor[h]
			if len(terms) > 1 { // 5.4.1
				continue
			}
			u.canon.issueFor(terms[0]) // 5.4.2
			delete(todo, terms[0])     // 5.4.3
			delete(termsFor, h)        // 5.4.4
			simple = true              // 5.4.5
		}
	}

	// Any hash now shared by more than one blank node requires the
	// N-degree disambiguation pass.
	for _, hash := range lexicallySortedTermHashes(termsFor) { // 6.
		paths := make(map[string][]*issuer) // 6.1
		for _, b := range termsFor[hash] {  // 6.2
			if u.canon.has(b) { // 6.2.1
				continue
			}
			names := newIssuer("_:b") // 6.2.2
			names.issueFor(b)         // 6.2.3

			// 6.2.4
			hash, issuer := u.hashNDegreeQuads(b, names)
			paths[string(hash)] = append(paths[string(hash)], issuer)
		}

		for _, hash := range lexicallySortedPathHashes(paths) { // 6.3
			for _, i := range paths[hash] {
				for _, existing := range i.ordered { // 6.3.1
					u.canon.issueFor(existing)
				}
			}
		}
	}

	// 7.
	for i, s := range src {
		if dst[i] == nil {
			dst[i] = &Statement{}
		}
		n := dst[i]
		n.Subject = Term{Value: translateURNA(s.Subject.Value, u.canon.issued), UID: s.Subject.UID}
		n.Predicate = s.Predicate
		n.Object = Term{Value: translateURNA(s.Object.Value, u.canon.issued), UID: s.Object.UID}
		n.Label = Term{Value: translateURNA(s.Label.Value, u.canon.issued), UID: s.Label.UID}
	}
	sort.Sort(c14nStatements(dst))

	return dst, nil
}

// lexicallySortedPathHashes returns the lexically sorted hashes of paths.
func lexicallySortedPathHashes(paths map[string][]*issuer) []string {
	lexicalHashPaths := make([]string, len(paths))
	i := 0
	for h := range paths {
		lexicalHashPaths[i] = h
		i++
	}
	sort.Strings(lexicalHashPaths)
	return lexicalHashPaths
}

// translateURNA maps term through the canonical blank node labelling in
// mapping, re-rendering IRI and literal terms into their canonical text
// form. Blank terms are returned as translated. It panics if a non-blank
// term cannot be re-rendered, since that indicates invalid input made it
// past earlier parsing.
func translateURNA(term string, mapping map[string]string) string {
	term = translate(term, mapping)
	if term == "" {
		return ""
	}
	text, qual, kind, err := extract([]rune(term))
	var t Term
	switch kind {
	case Blank:
		// Blank node labels are already canonical; any extract error
		// is irrelevant for this kind.
		return term
	case IRI:
		t, err = NewIRITerm(text)
	case Literal:
		t, err = NewLiteralTerm(text, qual)
	}
	if err != nil {
		panic(fmt.Errorf("rdf: invalid term %q: %w", term, err))
	}
	return t.Value
}

// hashFirstDegreeQuads is the algorithm described in section 4.6 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1.
// Results are memoized in u.hashes keyed on the blank node identifier b.
func (u *urna) hashFirstDegreeQuads(b string) string {
	if h, ok := u.hashes[b]; ok {
		return h
	}

	var statements []*Statement // 1.

	for _, s := range u.statementsFor[b] { // 2. and 3.
		// Copy the quad with blank nodes replaced by the positional
		// placeholders "_:a"/"_:z" (or "_:g" for URGNA2012 labels).
		var n Statement
		n.Subject.Value = replaceBlank(s.Subject.Value, b, "")
		n.Predicate.Value = s.Predicate.Value
		n.Object.Value = replaceBlank(s.Object.Value, b, "")
		n.Label.Value = replaceBlank(s.Label.Value, b, u.label)
		statements = append(statements, &n)
	}

	sort.Sort(c14nStatements(statements)) // 4.

	// 5.
	u.hash.Reset()
	for _, s := range statements {
		fmt.Fprintln(u.hash, s)
	}
	u.hashes[b] = string(hex(u.hash.Sum(nil)))

	return u.hashes[b]
}

// replaceBlank implements 3.1 of the algorithm described at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1.
func replaceBlank(b, matching, label string) string {
	if !isBlank(b) { // 3.1
		return b
	}
	if label != "" { // URGNA2012 modification.
		// When running in URGNA2012 mode, label is "_:g" for Label fields.
		//
		// If any blank node was used in the graph name position in the quad,
		// then the value was serialized using the special blank node identifier,
		// "_:g", instead of "_:z".
		// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012
		return label
	}
	// 3.1.1.1
	if b == matching {
		return "_:a"
	}
	return "_:z"
}

// hashNDegreeQuads is the algorithm described in section 4.8 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads.
// It returns the hex-encoded hash for blank node b together with the
// identifier issuer chosen by the minimal-path search.
func (u *urna) hashNDegreeQuads(b string, names *issuer) ([]byte, *issuer) {
	// termsFor is the hash to related blank nodes map.
	termsFor := u.hashToRelated(b, names) // 1., 2. and 3.
	var final []byte                      // 4.

	for _, hash := range lexicallySortedTermHashes(termsFor) { // 5.
		terms := termsFor[hash]
		final = append(final, hash...) // 5.1
		var chosenPath []byte          // 5.2
		var chosenIssuer *issuer       // 5.3
		p := newPermutations(terms)    // 5.4
	permutations:
		for p.next() {
			namesCopy := names.clone() // 5.4.1
			var path []byte            // 5.4.2
			var work []string          // 5.4.3
			for _, b := range p.permutation() { // 5.4.4
				if u.canon.has(b) { // 5.4.4.1
					path = append(path, u.canon.issueFor(b)...)
				} else { // 5.4.4.2
					if !namesCopy.has(b) { // 5.4.4.2.1
						work = append(work, b)
					}

					path = append(path, namesCopy.issueFor(b)...) // 5.4.4.2.2
				}

				// 5.4.4.3
				// Abandon this permutation early once path can no
				// longer beat the best path found so far.
				if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 {
					continue permutations
				}
			}

			for _, b := range work { // 5.4.5
				hash, issuer := u.hashNDegreeQuads(b, namesCopy) // 5.4.5.1
				path = append(path, namesCopy.issueFor(b)...)    // 5.4.5.2

				// 5.4.5.3
				path = append(path, '<')
				path = append(path, hash...)
				path = append(path, '>')

				namesCopy = issuer // 5.4.5.4

				// 5.4.5.5
				if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 {
					continue permutations
				}
			}

			if len(chosenPath) == 0 || bytes.Compare(path, chosenPath) < 0 { // 5.4.6
				chosenPath = path
				chosenIssuer = namesCopy
			}

		}
		// 5.5
		final = append(final, chosenPath...)
		u.hash.Reset()
		u.hash.Write(final)

		names = chosenIssuer // 5.6
	}

	return hex(u.hash.Sum(nil)), names
}

// lexicallySortedTermHashes returns the lexically sorted hashes of termsFor.
func lexicallySortedTermHashes(termsFor map[string][]string) []string {
	lexicalHashes := make([]string, len(termsFor))
	i := 0
	for h := range termsFor {
		lexicalHashes[i] = h
		i++
	}
	sort.Strings(lexicalHashes)
	return lexicalHashes
}

// relatedHashCreator is the signature shared by the URGNA2012 and
// URDNA2015 related-blank-node hashing routines.
type relatedHashCreator func(b string, names *issuer) map[string][]string

// hashToRelatedURDNA2015 is the section 1. 2. and 3. of 4.8.2 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads.
func (u *urna) hashToRelatedURDNA2015(b string, names *issuer) map[string][]string {
	// termsFor is the hash to related blank nodes map.
	termsFor := make(map[string][]string) // 1.

	for _, s := range u.statementsFor[b] { // 2. and 3.
		for i, term := range []string{ // 3.1
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(term) || term == b {
				continue
			}

			// 3.1.1
			// Position codes: 's'ubject, 'o'bject, 'g'raph, indexed
			// in step with the term slice above.
			const position = "sog"
			hash := u.hashRelatedBlank(term, s, names, position[i])

			// 3.1.2
			termsFor[string(hash)] = append(termsFor[string(hash)], term)
		}
	}

	return termsFor
}

// hashToRelatedURGNA2012 is the section 1., 2. and 3. of 4.8.2 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads
// with changes made for URGNA2012 shown in the appendix for 4.8 at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012.
// The numbering of steps here corresponds to the spec's numbering in the
// appendix.
func (u *urna) hashToRelatedURGNA2012(b string, names *issuer) map[string][]string {
	// termsFor is the hash to related blank nodes map.
	termsFor := make(map[string][]string)

	for _, s := range u.statementsFor[b] { // 1.
		var (
			term string
			pos  byte
		)
		switch {
		case isBlank(s.Subject.Value) && s.Subject.Value != b: // 1.1
			term = s.Subject.Value
			pos = 'p'
		case isBlank(s.Object.Value) && s.Object.Value != b: // 1.2
			term = s.Object.Value
			pos = 'r'
		default:
			continue // 1.3
		}

		// 1.4
		hash := u.hashRelatedBlank(term, s, names, pos)
		termsFor[string(hash)] = append(termsFor[string(hash)], term)
	}

	return termsFor
}

// hashRelatedBlank is the algorithm described in section 4.7 of the spec
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-related-blank-node.
func (u *urna) hashRelatedBlank(term string, s *Statement, names *issuer, pos byte) []byte {
	// 1.
	// Prefer an already-issued canonical identifier, then a temporary
	// identifier, falling back to the first degree hash of term.
	var b string
	switch {
	case u.canon.has(term):
		b = u.canon.issueFor(term)
	case names.has(term):
		b = names.issueFor(term)
	default:
		b = u.hashFirstDegreeQuads(term)
	}

	// 2.
	u.hash.Reset()
	u.hash.Write([]byte{pos})

	if pos != 'g' { // 3.
		if u.label == "" {
			// URDNA2015: Term.Value retained the angle quotes
			// so we don't need to add them.
			u.hash.Write([]byte(s.Predicate.Value))
		} else {
			// URGNA2012 does not delimit predicate by < and >.
			// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012
			// with reference to 4.7.
			u.hash.Write([]byte(unquoteIRI(s.Predicate.Value)))
		}
	}

	// 4. and 5.
	u.hash.Write([]byte(b))
	return hex(u.hash.Sum(nil))
}

// issuer is an identifier issuer.
type issuer struct {
	prefix string
	issued map[string]string
	// ordered records issue order so identifiers can be re-issued
	// deterministically (see relabel step 6.3.1).
	ordered []string
}

// newIssuer returns a new identifier issuer with the given prefix.
func newIssuer(prefix string) *issuer {
	return &issuer{prefix: prefix, issued: make(map[string]string)}
}

// issueFor implements the issue identifier algorithm.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#issue-identifier-algorithm
func (i *issuer) issueFor(b string) string {
	c, ok := i.issued[b]
	if ok {
		return c
	}
	c = fmt.Sprintf("%s%d", i.prefix, len(i.issued))
	i.issued[b] = c
	i.ordered = append(i.ordered, b)
	return c
}

// has reports whether an identifier has been issued for id.
func (i *issuer) has(id string) bool {
	_, ok := i.issued[id]
	return ok
}

// clone returns a deep copy of the issuer.
func (i *issuer) clone() *issuer {
	new := issuer{
		prefix:  i.prefix,
		issued:  make(map[string]string, len(i.issued)),
		ordered: make([]string, len(i.ordered)),
	}
	copy(new.ordered, i.ordered)
	for k, v := range i.issued {
		new.issued[k] = v
	}
	return &new
}

// hex returns the lowercase hexadecimal encoding of data.
func hex(data []byte) []byte {
	const digit = "0123456789abcdef"
	buf := make([]byte, 0, len(data)*2)
	for _, b := range data {
		buf = append(buf, digit[b>>4], digit[b&0xf])
	}
	return buf
}

// permutations is a string permutation generator.
type permutations struct {
	src  []string
	dst  []string
	idx  []int
	perm *combin.PermutationGenerator
}

// newPermutations returns a new permutations over src.
553 func newPermutations(src []string) *permutations { 554 return &permutations{ 555 src: src, 556 dst: make([]string, len(src)), 557 perm: combin.NewPermutationGenerator(len(src), len(src)), 558 idx: make([]int, len(src)), 559 } 560 } 561 562 // next returns whether there is another permutation available. 563 func (p *permutations) next() bool { 564 return p.perm.Next() 565 } 566 567 // permutation returns the permutation. The caller may not retain the 568 // returned slice between iterations. 569 func (p *permutations) permutation() []string { 570 p.perm.Permutation(p.idx) 571 for i, j := range p.idx { 572 p.dst[j] = p.src[i] 573 } 574 return p.dst 575 }