github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/urna.go (about) 1 // Copyright ©2020 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package rdf 6 7 import ( 8 "bytes" 9 "crypto/sha1" 10 "crypto/sha256" 11 "errors" 12 "fmt" 13 "hash" 14 "sort" 15 16 "github.com/gopherd/gonum/stat/combin" 17 ) 18 19 // Deduplicate removes duplicate statements in s, working in place, and returns 20 // the deduplicated slice with statements sorted in lexical order. Term UID 21 // fields are not considered and their values may be lost during deduplication. 22 func Deduplicate(s []*Statement) []*Statement { 23 if len(s) < 2 { 24 return s 25 } 26 sort.Sort(c14nStatements(s)) 27 curr := 0 28 for i, e := range s { 29 if isSameStatement(e, s[curr]) { 30 continue 31 } 32 curr++ 33 if curr < i { 34 s[curr], s[i] = s[i], nil 35 } 36 } 37 return s[:curr+1] 38 } 39 40 func isSameStatement(a, b *Statement) bool { 41 if a == b { 42 return true 43 } 44 return a.Subject.Value == b.Subject.Value && 45 a.Predicate.Value == b.Predicate.Value && 46 a.Object.Value == b.Object.Value && 47 a.Label.Value == b.Label.Value 48 } 49 50 // Note on implementation details: The comment numbering in the code relates the 51 // implementation to the steps of the algorithm described in the specification. 52 53 // URGNA2012 applies the Universal RDF Graph Normalization Algorithm 2012 54 // to the statements in src, placing the result in dst and returning it. 55 // If dst is nil a slice of statements will be allocated. If dst is not 56 // nil and not the same length as src, URGNA2012 will return an error. 57 // 58 // See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details. 59 func URGNA2012(dst, src []*Statement) ([]*Statement, error) { 60 if dst == nil { 61 dst = make([]*Statement, len(src)) 62 } else if len(dst) != len(src) { 63 return dst, errors.New("rdf: slice length mismatch") 64 } 65 // 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm 66 u := &urna{ 67 canon: newIssuer("_:c14n"), 68 hashes: make(map[string]string), 69 statementsFor: make(map[string][]*Statement), 70 hash: sha1.New(), 71 label: "_:g", 72 } 73 u.hashToRelated = u.hashToRelatedURGNA2012 74 return u.relabel(dst, src) 75 } 76 77 // URDNA2015 applies the Universal RDF Dataset Normalization Algorithm 2015 78 // to the statements in src, placing the result in dst and returning it. 79 // If dst is nil a slice of statements will be allocated. If dst is not 80 // nil and not the same length as src, URDNA2015 will return an error. 81 // 82 // See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details. 83 func URDNA2015(dst, src []*Statement) ([]*Statement, error) { 84 if dst == nil { 85 dst = make([]*Statement, len(src)) 86 } else if len(dst) != len(src) { 87 return dst, errors.New("rdf: slice length mismatch") 88 } 89 // 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm 90 u := &urna{ 91 canon: newIssuer("_:c14n"), 92 hashes: make(map[string]string), 93 statementsFor: make(map[string][]*Statement), 94 hash: sha256.New(), 95 } 96 u.hashToRelated = u.hashToRelatedURDNA2015 97 return u.relabel(dst, src) 98 } 99 100 // urna is the canonicalization state for the URGNA2012 and URDNA2015 101 // algorithms. The urna type implements both algorithms through the state 102 // of the label and hashToRelated fields. 103 // 104 // See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#canonicalization-state 105 // for details. 106 type urna struct { 107 // canon is the canonical issuer. 108 canon *issuer 109 110 // hashes holds already calculated hashes 111 // for hashing first degree quads. 112 hashes map[string]string 113 114 // statementsFor is the blank node to quads map. 115 statementsFor map[string][]*Statement 116 117 // hash is the hash function used by the 118 // canonicalization function. 119 hash hash.Hash 120 // hashToRelated holds URGNA2012 and URDNA2015- 121 // specific hashing routines. 122 hashToRelated relatedHashCreator 123 // label holds "_:g" when running URGNA2012. 124 // Otherwise it is empty. 125 label string 126 } 127 128 // relabel is the algorithm described in section 4.4.2 of the spec at 129 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm. 130 func (u *urna) relabel(dst, src []*Statement) ([]*Statement, error) { 131 // termsFor is the hash to blank nodes map. 132 // It is not held in the urna struct, but is 133 // part of the canonicalization state. 134 // 135 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#dfn-hash-to-blank-nodes-map 136 var termsFor map[string][]string // 1. 137 138 for _, s := range src { // 2. 139 terms: 140 for _, t := range []string{ 141 s.Subject.Value, 142 s.Object.Value, 143 s.Label.Value, 144 } { 145 if !isBlank(t) { 146 continue 147 } 148 for _, e := range u.statementsFor[t] { 149 if e == s { 150 continue terms 151 } 152 } 153 u.statementsFor[t] = append(u.statementsFor[t], s) 154 } 155 } 156 157 // todo is the list of non-normalized blank node identifiers. 158 todo := make(map[string]bool) // 3. 159 for b := range u.statementsFor { 160 todo[b] = true 161 } 162 163 simple := true // 4. 164 for simple { // 5. 165 simple = false // 5.1 166 167 termsFor = make(map[string][]string) // 5.2 168 169 for b := range todo { // 5.3 170 hash := u.hashFirstDegreeQuads(b) // 5.3.1 171 termsFor[hash] = append(termsFor[hash], b) // 5.3.2 172 } 173 174 for _, h := range lexicallySortedTermHashes(termsFor) { // 5.4 175 terms := termsFor[h] 176 if len(terms) > 1 { // 5.4.1 177 continue 178 } 179 u.canon.issueFor(terms[0]) // 5.4.2 180 delete(todo, terms[0]) // 5.4.3 181 delete(termsFor, h) // 5.4.4 182 simple = true // 5.4.5 183 } 184 } 185 186 for _, hash := range lexicallySortedTermHashes(termsFor) { // 6. 187 paths := make(map[string][]*issuer) // 6.1 188 for _, b := range termsFor[hash] { // 6.2 189 if u.canon.has(b) { // 6.2.1 190 continue 191 } 192 names := newIssuer("_:b") // 6.2.2 193 names.issueFor(b) // 6.2.3 194 195 // 6.2.4 196 hash, issuer := u.hashNDegreeQuads(b, names) 197 paths[string(hash)] = append(paths[string(hash)], issuer) 198 } 199 200 for _, hash := range lexicallySortedPathHashes(paths) { // 6.3 201 for _, i := range paths[hash] { 202 for _, existing := range i.ordered { // 6.3.1 203 u.canon.issueFor(existing) 204 } 205 } 206 } 207 } 208 209 // 7. 210 for i, s := range src { 211 if dst[i] == nil { 212 dst[i] = &Statement{} 213 } 214 n := dst[i] 215 n.Subject = Term{Value: translateURNA(s.Subject.Value, u.canon.issued), UID: s.Subject.UID} 216 n.Predicate = s.Predicate 217 n.Object = Term{Value: translateURNA(s.Object.Value, u.canon.issued), UID: s.Object.UID} 218 n.Label = Term{Value: translateURNA(s.Label.Value, u.canon.issued), UID: s.Label.UID} 219 } 220 sort.Sort(c14nStatements(dst)) 221 222 return dst, nil 223 } 224 225 // lexicallySortedPathHashes returns the lexically sorted hashes of paths. 226 func lexicallySortedPathHashes(paths map[string][]*issuer) []string { 227 lexicalHashPaths := make([]string, len(paths)) 228 i := 0 229 for h := range paths { 230 lexicalHashPaths[i] = h 231 i++ 232 } 233 sort.Strings(lexicalHashPaths) 234 return lexicalHashPaths 235 } 236 237 func translateURNA(term string, mapping map[string]string) string { 238 term = translate(term, mapping) 239 if term == "" { 240 return "" 241 } 242 text, qual, kind, err := extract([]rune(term)) 243 var t Term 244 switch kind { 245 case Blank: 246 return term 247 case IRI: 248 t, err = NewIRITerm(text) 249 case Literal: 250 t, err = NewLiteralTerm(text, qual) 251 } 252 if err != nil { 253 panic(fmt.Errorf("rdf: invalid term %q: %w", term, err)) 254 } 255 return t.Value 256 } 257 258 // hashFirstDegreeQuads is the algorithm described in section 4.6 of the spec 259 // at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1. 260 func (u *urna) hashFirstDegreeQuads(b string) string { 261 if h, ok := u.hashes[b]; ok { 262 return h 263 } 264 265 var statements []*Statement // 1. 266 267 for _, s := range u.statementsFor[b] { // 2. and 3. 268 var n Statement 269 n.Subject.Value = replaceBlank(s.Subject.Value, b, "") 270 n.Predicate.Value = s.Predicate.Value 271 n.Object.Value = replaceBlank(s.Object.Value, b, "") 272 n.Label.Value = replaceBlank(s.Label.Value, b, u.label) 273 statements = append(statements, &n) 274 } 275 276 sort.Sort(c14nStatements(statements)) // 4. 277 278 // 5. 279 u.hash.Reset() 280 for _, s := range statements { 281 fmt.Fprintln(u.hash, s) 282 } 283 u.hashes[b] = string(hex(u.hash.Sum(nil))) 284 285 return u.hashes[b] 286 } 287 288 // replaceBlank implements 3.1 of the algorithm described at 289 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1. 290 func replaceBlank(b, matching, label string) string { 291 if !isBlank(b) { // 3.1 292 return b 293 } 294 if label != "" { // URGNA2012 modification. 295 // When running in URGNA2012 mode, label is "_:g" for Label fields. 296 // 297 // If any blank node was used in the graph name position in the quad, 298 // then the value was serialized using the special blank node identifier, 299 // "_:g", instead of "_:z". 300 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012 301 return label 302 } 303 // 3.1.1.1 304 if b == matching { 305 return "_:a" 306 } 307 return "_:z" 308 } 309 310 // hashNDegreeQuads is the algorithm described in section 4.8 of the spec 311 // at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads. 312 func (u *urna) hashNDegreeQuads(b string, names *issuer) ([]byte, *issuer) { 313 // termsFor is the hash to related blank nodes map. 314 termsFor := u.hashToRelated(b, names) // 1., 2. and 3. 315 var final []byte // 4. 316 317 for _, hash := range lexicallySortedTermHashes(termsFor) { // 5. 318 terms := termsFor[hash] 319 final = append(final, hash...) // 5.1 320 var chosenPath []byte // 5.2 321 var chosenIssuer *issuer // 5.3 322 p := newPermutations(terms) // 5.4 323 permutations: 324 for p.next() { 325 namesCopy := names.clone() // 5.4.1 326 var path []byte // 5.4.2 327 var work []string // 5.4.3 328 for _, b := range p.permutation() { // 5.4.4 329 if u.canon.has(b) { // 5.4.4.1 330 path = append(path, u.canon.issueFor(b)...) 331 } else { // 5.4.4.1 332 if !namesCopy.has(b) { 333 work = append(work, b) 334 } 335 336 path = append(path, namesCopy.issueFor(b)...) // 5.4.4.2.2 337 } 338 339 // 5.4.4.3 340 if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 { 341 continue permutations 342 } 343 } 344 345 for _, b := range work { // 5.4.5 346 hash, issuer := u.hashNDegreeQuads(b, namesCopy) // 5.4.5.1 347 path = append(path, namesCopy.issueFor(b)...) // 5.4.5.2 348 349 // 5.4.5.3 350 path = append(path, '<') 351 path = append(path, hash...) 352 path = append(path, '>') 353 354 namesCopy = issuer // 5.4.5.4 355 356 // 5.4.5.5 357 if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 { 358 continue permutations 359 } 360 } 361 362 if len(chosenPath) == 0 || bytes.Compare(path, chosenPath) < 0 { // 5.4.6 363 chosenPath = path 364 chosenIssuer = namesCopy 365 } 366 367 } 368 // 5.5 369 final = append(final, chosenPath...) 370 u.hash.Reset() 371 u.hash.Write(final) //nolint:errcheck 372 373 names = chosenIssuer // 5.6 374 } 375 376 return hex(u.hash.Sum(nil)), names 377 } 378 379 // lexicallySortedTermHashes returns the lexically sorted hashes of termsFor. 380 func lexicallySortedTermHashes(termsFor map[string][]string) []string { 381 lexicalHashes := make([]string, len(termsFor)) 382 i := 0 383 for h := range termsFor { 384 lexicalHashes[i] = h 385 i++ 386 } 387 sort.Strings(lexicalHashes) 388 return lexicalHashes 389 } 390 391 type relatedHashCreator func(b string, names *issuer) map[string][]string 392 393 // hashToRelatedURDNA2015 is the section 1. 2. and 3. of 4.8.2 of the spec 394 // at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads. 395 func (u *urna) hashToRelatedURDNA2015(b string, names *issuer) map[string][]string { 396 // termsFor is the hash to related blank nodes map. 397 termsFor := make(map[string][]string) // 1. 398 399 for _, s := range u.statementsFor[b] { // 2. and 3. 400 for i, term := range []string{ // 3.1 401 s.Subject.Value, 402 s.Object.Value, 403 s.Label.Value, 404 } { 405 if !isBlank(term) || term == b { 406 continue 407 } 408 409 // 3.1.1 410 const position = "sog" 411 hash := u.hashRelatedBlank(term, s, names, position[i]) 412 413 // 3.1.2 414 termsFor[string(hash)] = append(termsFor[string(hash)], term) 415 } 416 } 417 418 return termsFor 419 } 420 421 // hashToRelatedURGNA2012 is the section 1., 2. and 3. of 4.8.2 of the spec 422 // at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads 423 // with changes made for URGNA2012 shown in the appendix for 4.8 at 424 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012. 425 // The numbering of steps here corresponds to the spec's numbering in the 426 // appendix. 427 func (u *urna) hashToRelatedURGNA2012(b string, names *issuer) map[string][]string { 428 // termsFor is the hash to related blank nodes map. 429 termsFor := make(map[string][]string) 430 431 for _, s := range u.statementsFor[b] { // 1. 432 var ( 433 term string 434 pos byte 435 ) 436 switch { 437 case isBlank(s.Subject.Value) && s.Subject.Value != b: // 1.1 438 term = s.Subject.Value 439 pos = 'p' 440 case isBlank(s.Object.Value) && s.Object.Value != b: // 1.2 441 term = s.Object.Value 442 pos = 'r' 443 default: 444 continue // 1.3 445 } 446 447 // 1.4 448 hash := u.hashRelatedBlank(term, s, names, pos) 449 termsFor[string(hash)] = append(termsFor[string(hash)], term) 450 } 451 452 return termsFor 453 } 454 455 // hashNDegreeQuads is the algorithm described in section 4.7 of the spec 456 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-related-blank-node. 457 func (u *urna) hashRelatedBlank(term string, s *Statement, names *issuer, pos byte) []byte { 458 // 1. 459 var b string 460 switch { 461 case u.canon.has(term): 462 b = u.canon.issueFor(term) 463 case names.has(term): 464 b = names.issueFor(term) 465 default: 466 b = u.hashFirstDegreeQuads(term) 467 } 468 469 // 2. 470 u.hash.Reset() 471 u.hash.Write([]byte{pos}) //nolint:errcheck 472 473 if pos != 'g' { // 3. 474 if u.label == "" { 475 // URDNA2015: Term.Value retained the angle quotes 476 // so we don't need to add them. 477 u.hash.Write([]byte(s.Predicate.Value)) //nolint:errcheck 478 } else { 479 // URGNA2012 does not delimit predicate by < and >. 480 // https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012 481 // with reference to 4.7. 482 u.hash.Write([]byte(unquoteIRI(s.Predicate.Value))) //nolint:errcheck 483 } 484 } 485 486 // 4. and 5. 487 u.hash.Write([]byte(b)) //nolint:errcheck 488 return hex(u.hash.Sum(nil)) 489 } 490 491 // issuer is an identifier issuer. 492 type issuer struct { 493 prefix string 494 issued map[string]string 495 ordered []string 496 } 497 498 // newIssuer returns a new identifier issuer with the given prefix. 499 func newIssuer(prefix string) *issuer { 500 return &issuer{prefix: prefix, issued: make(map[string]string)} 501 } 502 503 // issueFor implements the issue identifier algorithm. 504 // 505 // See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#issue-identifier-algorithm 506 func (i *issuer) issueFor(b string) string { 507 c, ok := i.issued[b] 508 if ok { 509 return c 510 } 511 c = fmt.Sprintf("%s%d", i.prefix, len(i.issued)) 512 i.issued[b] = c 513 i.ordered = append(i.ordered, b) 514 return c 515 } 516 517 func (i *issuer) has(id string) bool { 518 _, ok := i.issued[id] 519 return ok 520 } 521 522 func (i *issuer) clone() *issuer { 523 new := issuer{ 524 prefix: i.prefix, 525 issued: make(map[string]string, len(i.issued)), 526 ordered: make([]string, len(i.ordered)), 527 } 528 copy(new.ordered, i.ordered) 529 for k, v := range i.issued { 530 new.issued[k] = v 531 } 532 return &new 533 } 534 535 func hex(data []byte) []byte { 536 const digit = "0123456789abcdef" 537 buf := make([]byte, 0, len(data)*2) 538 for _, b := range data { 539 buf = append(buf, digit[b>>4], digit[b&0xf]) 540 } 541 return buf 542 } 543 544 // permutations is a string permutation generator. 545 type permutations struct { 546 src []string 547 dst []string 548 idx []int 549 perm *combin.PermutationGenerator 550 } 551 552 // newPermutation returns a new permutations. 553 func newPermutations(src []string) *permutations { 554 return &permutations{ 555 src: src, 556 dst: make([]string, len(src)), 557 perm: combin.NewPermutationGenerator(len(src), len(src)), 558 idx: make([]int, len(src)), 559 } 560 } 561 562 // next returns whether there is another permutation available. 563 func (p *permutations) next() bool { 564 return p.perm.Next() 565 } 566 567 // permutation returns the permutation. The caller may not retain the 568 // returned slice between iterations. 569 func (p *permutations) permutation() []string { 570 p.perm.Permutation(p.idx) 571 for i, j := range p.idx { 572 p.dst[j] = p.src[i] 573 } 574 return p.dst 575 }