gonum.org/v1/gonum@v0.14.0/graph/formats/rdf/iso_canonical.go

// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rdf

import (
	"bytes"
	"errors"
	"fmt"
	"hash"
	"sort"
)

// See "Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms
// for Leaning and Labelling Blank Nodes" by Aidan Hogan for a description of
// the algorithm, https://doi.org/10.1145/3068333, available free from
// the author's web page http://aidanhogan.com/docs/rdf-canonicalisation.pdf.
//
// Aspects of the implementation are from the discussion in v1.0 of the readme
// of the PoC at https://doi.org/10.5281/zenodo.3154322.

// Isomorphic returns whether the RDF graph datasets a and b are isomorphic,
// where there is a bijective mapping between blank nodes in a and b using
// the given hash function. If decomp is true, the graphs are decomposed
// before canonicalization.
func Isomorphic(a, b []*Statement, decomp bool, h hash.Hash) bool {
	if len(a) != len(b) {
		return false
	}

	zero := make([]byte, h.Size())
	ah, _ := IsoCanonicalHashes(a, decomp, true, h, zero)
	bh, _ := IsoCanonicalHashes(b, decomp, true, h, zero)
	if len(ah) != len(bh) {
		return false
	}

	work := make([][]byte, 2*len(ah))
	lexicalHashes(work[:len(ah)], ah)
	lexicalHashes(work[len(ah):], bh)
	for i := range work[:len(ah)] {
		if !bytes.Equal(work[i], work[i+len(ah)]) {
			return false
		}
	}
	return true
}

func lexicalHashes(dst [][]byte, hashes map[string][]byte) {
	i := 0
	for _, s := range hashes {
		dst[i] = s
		i++
	}
	sort.Sort(lexical(dst))
}
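
// Editorial note: the function below is an illustrative sketch, not part of
// the original file. It shows a minimal Isomorphic call; the statements are
// hypothetical and any fixed hash.Hash (for example sha256.New from
// crypto/sha256) may be supplied by the caller.
func exampleIsomorphic(h hash.Hash) bool {
	// a and b differ only in their blank node labels.
	a := []*Statement{{
		Subject:   Term{Value: "_:x"},
		Predicate: Term{Value: "<http://example.com/p>"},
		Object:    Term{Value: "_:y"},
	}}
	b := []*Statement{{
		Subject:   Term{Value: "_:u"},
		Predicate: Term{Value: "<http://example.com/p>"},
		Object:    Term{Value: "_:v"},
	}}
	return Isomorphic(a, b, false, h) // true: the graphs are isomorphic.
}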
// IsoCanonicalHashes returns a mapping between the nodes of the RDF graph
// dataset described by the given statements using the provided hash
// function. If decomp is true, the graphs are decomposed before hashing.
// If dist is true and the input graph is decomposed into identical splits,
// the entire graph will be hashed to distinguish nodes. If decomp is false,
// dist has no effect.
// Blank node hashes are initially set to the value of zero. Hash values
// are provided for literal and IRI nodes as well as for blank nodes. The
// hash input for literal nodes includes the quotes and the input for IRI
// nodes first removes the angle quotes around the IRI, although these are
// included in the map keys.
//
// Note that hashes returned by IsoCanonicalHashes with decomp=true are not
// comparable with hashes returned by IsoCanonicalHashes with decomp=false.
//
// See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
// the hashing algorithm.
func IsoCanonicalHashes(statements []*Statement, decomp, dist bool, h hash.Hash, zero []byte) (hashes map[string][]byte, terms map[string]map[string]bool) {
	if len(statements) == 0 {
		return nil, nil
	}

	if debug {
		debug.log(0, "Statements:")
		for _, s := range statements {
			debug.log(0, s)
		}
		debug.log(0)
	}

	hash, parts, ok := hashBNodesPerSplit(statements, decomp, h, zero)

	if debug {
		debug.log(0, "Blanks:")
		if len(hash.blanks) != 0 {
			for _, b := range hash.blanks {
				debug.log(0, b)
			}
		} else {
			debug.log(0, "none")
		}
		debug.log(0)

		debug.log(0, "Parts:")
		debug.logParts(0, parts)

		debug.logf(0, "Hashes from hashBNodesPerSplit (splitting=%t):\n", decomp)
		debug.logHashes(0, hash.hashOf, h.Size())
	}

	if ok {
		return hash.hashOf, hash.termsFor
	}

	// TODO: remove the triviality exception in distinguish and return
	// the original hashes if this result is nil. Make the triviality
	// exception optional.
	hashes = distinguish(statements, dist, h, zero, hash, parts, nil, 0)

	if hashes == nil {
		// distinguish was given trivial parts and
		// we did not ask it to try to merge them.
		return hash.hashOf, hash.termsFor
	}

	if debug {
		debug.log(0, "Final resolved Hashes:")
		debug.logHashes(0, hashes, h.Size())
	}

	terms = make(map[string]map[string]bool, len(hashes))
	for k, h := range hashes {
		terms[string(h)] = map[string]bool{k: true}
	}

	return hashes, terms
}
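
// Editorial note: a sketch, not part of the original file, of the intended
// pipeline from hashing to relabeling: the terms returned here are the input
// expected by C14n below. The zero slice must be h.Size() bytes long.
func exampleCanonicalize(statements []*Statement, h hash.Hash) ([]*Statement, error) {
	zero := make([]byte, h.Size())
	_, terms := IsoCanonicalHashes(statements, true, true, h, zero)
	return C14n(nil, statements, terms)
}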
// C14n performs a relabeling of the statements in src based on the terms
// obtained from IsoCanonicalHashes, placing the results in dst and returning
// them. The relabeling scheme is the same as for the Universal RDF Dataset
// Normalization Algorithm: blank terms are ordered lexically by their hash
// value and then given a blank label with the prefix "_:c14n" and an
// identifier counter corresponding to the label's sort rank.
//
// If dst is nil, it is allocated, otherwise the length of dst must match the
// length of src.
func C14n(dst, src []*Statement, terms map[string]map[string]bool) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	}

	if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}

	need := make(map[string]bool)
	for _, s := range src {
		for _, t := range []string{
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(t) {
				continue
			}
			need[t] = true
		}
	}

	blanks := make([]string, len(need))
	i := 0
	for h, m := range terms {
		var ok bool
		for t := range m {
			if isBlank(t) {
				ok = true
				break
			}
		}
		if !ok {
			continue
		}
		if i == len(blanks) {
			return dst, errors.New("rdf: too many blanks in terms")
		}
		blanks[i] = h
		i++
	}
	sort.Strings(blanks)

	c14n := make(map[string]string)
	for i, b := range blanks {
		if len(terms[b]) == 0 {
			return nil, fmt.Errorf("rdf: no term for blank with hash %x", b)
		}
		for t := range terms[b] {
			if !isBlank(t) {
				continue
			}
			if _, exists := c14n[t]; exists {
				continue
			}
			delete(need, t)
			c14n[t] = fmt.Sprintf("_:c14n%d", i)
		}
	}

	if len(need) != 0 {
		return dst, fmt.Errorf("rdf: missing term hashes for %d terms", len(need))
	}

	for i, s := range src {
		if dst[i] == nil {
			dst[i] = &Statement{}
		}
		n := dst[i]
		n.Subject = Term{Value: translate(s.Subject.Value, c14n)}
		n.Predicate = s.Predicate
		n.Object = Term{Value: translate(s.Object.Value, c14n)}
		n.Label = Term{Value: translate(s.Label.Value, c14n)}
	}
	sort.Sort(c14nStatements(dst))

	return dst, nil
}

func translate(term string, mapping map[string]string) string {
	if term, ok := mapping[term]; ok {
		return term
	}
	return term
}

type c14nStatements []*Statement

func (s c14nStatements) Len() int { return len(s) }
func (s c14nStatements) Less(i, j int) bool {
	si := s[i]
	sj := s[j]
	switch {
	case si.Subject.Value < sj.Subject.Value:
		return true
	case si.Subject.Value > sj.Subject.Value:
		return false
	}
	switch { // Always IRI.
	case si.Predicate.Value < sj.Predicate.Value:
		return true
	case si.Predicate.Value > sj.Predicate.Value:
		return false
	}
	switch {
	case si.Object.Value < sj.Object.Value:
		return true
	case si.Object.Value > sj.Object.Value:
		return false
	}
	return si.Label.Value < sj.Label.Value
}
func (s c14nStatements) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
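
// Editorial note: an illustrative sketch, not part of the original file,
// showing the shape of the terms map C14n consumes: hash value to the set
// of terms carrying that hash. The hash bytes here are hypothetical
// stand-ins for values produced by IsoCanonicalHashes.
func exampleC14nTerms() ([]*Statement, error) {
	src := []*Statement{{
		Subject:   Term{Value: "_:b1"},
		Predicate: Term{Value: "<http://example.com/p>"},
		Object:    Term{Value: `"o"`},
	}}
	terms := map[string]map[string]bool{
		"\x01": {"_:b1": true}, // Hypothetical hash for the only blank node.
	}
	// _:b1 carries the lexically first (and only) blank hash, so it is
	// relabeled to _:c14n0.
	return C14n(nil, src, terms)
}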
// hashBNodes returns the hashed blank nodes of the graph described by statements
// using the provided hash function. Hashes are initialised with zero.
//
// This is algorithm 1 in doi:10.1145/3068333.
func hashBNodes(statements []*Statement, h hash.Hash, zero []byte, hash0 map[string][]byte) (hash *table, disjoint bool) {
	curr := newTable()
	for _, s := range statements {
		for i, t := range []string{
			s.Subject.Value,
			s.Predicate.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			switch {
			case i == 3 && t == "":
				continue
			case isBlank(t):
				if hash0 == nil {
					curr.set(t, zero)
				} else {
					curr.set(t, hash0[t])
				}
			case isIRI(t):
				h.Reset()
				h.Write([]byte(t[1 : len(t)-1]))
				curr.set(t, h.Sum(nil))
			default:
				h.Reset()
				h.Write([]byte(t))
				curr.set(t, h.Sum(nil))
			}
		}
	}

	bag := newHashBag(h, curr)
	last := curr.clone()
	for {
		curr, last = last, curr
		for _, s := range statements {
			if isBlank(s.Subject.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Object.Value], last.hashOf[s.Predicate.Value], lab, []byte{'+'})
				bag.add(s.Subject.Value, c)
			}

			if isBlank(s.Object.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], lab, []byte{'-'})
				bag.add(s.Object.Value, c)
			}

			// This and the lab value above implement the label hashing
			// required for RDF dataset hashing as described in
			// https://doi.org/10.5281/zenodo.3154322 v1.0
			// Readme.md#adaptation-of-the-algorithms-to-handle-datasets.
			if isBlank(s.Label.Value) {
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], last.hashOf[s.Object.Value], []byte{'.'})
				bag.add(s.Label.Value, c)
			}
		}

		for t := range bag.hashesFor {
			curr.set(t, bag.sum(t))
		}

		disjoint = curr.allUnique()
		if disjoint || !curr.changedFrom(last) {
			return curr, disjoint
		}
	}
}

// table is a collision aware hash collection for RDF terms.
type table struct {
	// hashOf holds the hash for each term.
	hashOf map[string][]byte
	// termsFor holds the set of nodes in
	// the second key for terms that share
	// the hash in the first key.
	termsFor map[string]map[string]bool

	// isBlank and blanks are the set of blank
	// nodes.
	// isBlank is nil for cloned tables.
	isBlank map[string]bool
	// blanks is nil for tables created
	// with newTable.
	blanks []string
}

// newTable returns a new hash table.
func newTable() *table {
	return &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
		isBlank:  make(map[string]bool),
	}
}

// wasCloned returns whether t is a parent or child of a cloning operation.
func (t *table) wasCloned() bool { return t.isBlank == nil }

// isNew returns whether t is a new table.
func (t *table) isNew() bool { return t.blanks == nil }
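
// Editorial note: a small in-package sketch, not part of the original file,
// of algorithm 1 above: hashBNodes iterates until blank node hashes are
// stable or mutually distinct; disjoint reports whether they are distinct.
// The statements are hypothetical.
func exampleHashBNodes(h hash.Hash) bool {
	statements := []*Statement{
		{Subject: Term{Value: "_:a"}, Predicate: Term{Value: "<http://example.com/p>"}, Object: Term{Value: `"1"`}},
		{Subject: Term{Value: "_:b"}, Predicate: Term{Value: "<http://example.com/p>"}, Object: Term{Value: `"2"`}},
	}
	zero := make([]byte, h.Size())
	hashes, disjoint := hashBNodes(statements, h, zero, nil)
	_ = hashes.hashOf // Term to hash mapping for every term in the graph.
	return disjoint   // true: the blanks are distinguished by their objects.
}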
// clone returns a clone of the receiver.
func (t *table) clone() *table {
	new := &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
	}
	for term, hash := range t.hashOf {
		new.hashOf[term] = hash
	}
	for hash, coll := range t.termsFor {
		if len(coll) == 0 {
			continue
		}
		terms := make(map[string]bool)
		for term := range coll {
			terms[term] = true
		}
		new.termsFor[hash] = terms
	}
	if t.isNew() {
		t.blanks = make([]string, len(t.isBlank))
		i := 0
		for n := range t.isBlank {
			t.blanks[i] = n
			i++
		}
		t.isBlank = nil
	}
	new.blanks = t.blanks
	return new
}

// TODO(kortschak): Make hash table in table.hashOf reuse the []byte on update.
// This is not trivial since we need to check for changes, so we can't just get
// the current hash buffer and write into it. So if this is done we probably
// need a pair of buffers, a current and a waiting.

// set sets the hash of the term, removing any previously set hash.
func (t *table) set(term string, hash []byte) {
	prev := t.hashOf[term]
	if bytes.Equal(prev, hash) {
		return
	}
	t.hashOf[term] = hash

	// Delete any existing hashes for this term.
	switch terms := t.termsFor[string(prev)]; {
	case len(terms) == 1:
		delete(t.termsFor, string(prev))
	case len(terms) > 1:
		delete(terms, term)
	}

	terms, ok := t.termsFor[string(hash)]
	if ok {
		terms[term] = true
	} else {
		t.termsFor[string(hash)] = map[string]bool{term: true}
	}

	if !t.wasCloned() && isBlank(term) {
		// We are in the original table, so note
		// any blank node label that we see.
		t.isBlank[term] = true
	}
}

// allUnique returns whether every term has a unique hash. allUnique
// can only be called on a table that was returned by clone.
func (t *table) allUnique() bool {
	if t.isNew() {
		panic("checked hash bag from uncloned table")
	}
	for _, term := range t.blanks {
		if len(t.termsFor[string(t.hashOf[term])]) > 1 {
			return false
		}
	}
	return true
}

// changedFrom returns whether the receiver has been updated from last.
// changedFrom can only be called on a table that was returned by clone.
func (t *table) changedFrom(last *table) bool {
	if t.isNew() {
		panic("checked hash bag from uncloned table")
	}
	for i, x := range t.blanks {
		for _, y := range t.blanks[i+1:] {
			if bytes.Equal(t.hashOf[x], t.hashOf[y]) != bytes.Equal(last.hashOf[x], last.hashOf[y]) {
				return true
			}
		}
	}
	return false
}
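
// Editorial note: a tiny in-package sketch, not part of the original file,
// of the bookkeeping set maintains: hashOf and termsFor are kept mirrored,
// so all terms sharing a hash can be recovered from the hash alone. The
// hash bytes here are hypothetical.
func exampleTableSet() map[string]bool {
	t := newTable()
	t.set("_:a", []byte{0x01})
	t.set("_:b", []byte{0x01})
	return t.termsFor["\x01"] // {"_:a": true, "_:b": true}
}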
// hashBag implements a commutative and associative hash.
// See notes in https://doi.org/10.5281/zenodo.3154322 v1.0
// Readme.md#what-is-the-precise-specification-of-hashbag.
type hashBag struct {
	hash      hash.Hash
	hashesFor map[string][][]byte
}

// newHashBag returns a new hashBag using the provided hash function for
// the given hash table. newHashBag can only take a table parameter that
// was returned by newTable.
func newHashBag(h hash.Hash, t *table) hashBag {
	if t.wasCloned() {
		panic("made hash bag from cloned table")
	}
	b := hashBag{hash: h, hashesFor: make(map[string][][]byte, len(t.isBlank))}
	for n := range t.isBlank {
		b.hashesFor[n] = [][]byte{t.hashOf[n]}
	}
	return b
}

// add adds the hash to the hash bag for the term.
func (b hashBag) add(term string, hash []byte) {
	b.hashesFor[term] = append(b.hashesFor[term], hash)
}

// sum calculates the hash sum for the given term, updates the hash bag
// state and returns the hash.
func (b hashBag) sum(term string) []byte {
	p := b.hashesFor[term]
	sort.Sort(lexical(p))
	h := hashTuple(b.hash, p...)
	b.hashesFor[term] = b.hashesFor[term][:1]
	b.hashesFor[term][0] = h
	return h
}

// lexical implements lexical sorting of [][]byte.
type lexical [][]byte

func (b lexical) Len() int           { return len(b) }
func (b lexical) Less(i, j int) bool { return string(b[i]) < string(b[j]) }
func (b lexical) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }

// hashTuple returns the h hash of the concatenation of t.
func hashTuple(h hash.Hash, t ...[]byte) []byte {
	h.Reset()
	for _, b := range t {
		h.Write(b)
	}
	return h.Sum(nil)
}

// hashBNodesPerSplit returns the independently hashed blank nodes of the
// graph described by statements using the provided hash function. Hashes
// are initialised with zero.
//
// This is algorithm 2 in doi:10.1145/3068333.
func hashBNodesPerSplit(statements []*Statement, decomp bool, h hash.Hash, zero []byte) (hash *table, parts byLengthHash, disjoint bool) {
	if !decomp {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	splits := split(statements)

	// Avoid recombination work if there is only one split.
	if len(splits) == 1 {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	hash = &table{hashOf: make(map[string][]byte)}
	disjoint = true
	for _, g := range splits {
		part, ok := hashBNodes(g, h, zero, nil)
		// Each split is guaranteed to be disjoint in its
		// set of blank nodes, so we can just append to our
		// collection of blanks.
		hash.blanks = append(hash.blanks, part.blanks...)
		if !ok {
			// Allow a short-circuit of the allUnique check.
			disjoint = false
		}
		for k, v := range part.hashOf {
			hash.hashOf[k] = v
		}
		parts = appendOrdered(parts, part.termsFor)
	}
	sort.Sort(parts)
	return hash, parts, disjoint && allUnique(hash.hashOf)
}
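
// Editorial note: a sketch, not part of the original file, contrasting the
// two modes of hashBNodesPerSplit. Hashes produced by the two modes are not
// comparable with each other, matching the note on IsoCanonicalHashes above.
func exampleSplitModes(statements []*Statement, h hash.Hash) bool {
	zero := make([]byte, h.Size())
	whole, _, okWhole := hashBNodesPerSplit(statements, false, h, zero)
	parted, _, okParted := hashBNodesPerSplit(statements, true, h, zero)
	_, _ = whole, parted // Whole-graph and per-component hash tables.
	return okWhole && okParted
}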
// appendOrdered adds parts (labels stored in the second key) for each
// hash (stored in the first key) to parts.
func appendOrdered(parts byLengthHash, partSets map[string]map[string]bool) byLengthHash {
	for h, s := range partSets {
		var p []string
		for e := range s {
			if isBlank(e) {
				p = append(p, e)
			}
		}
		if p != nil {
			parts.nodes = append(parts.nodes, p)
			parts.hashes = append(parts.hashes, h)
		}
	}
	return parts
}

// byLengthHash implements ascending length sort of a set of blank RDF
// term partitions with ties broken by lexical ordering of the partitions'
// hashes.
type byLengthHash struct {
	// nodes holds the blank nodes of a part.
	nodes [][]string
	// hashes holds the hashes corresponding
	// to the nodes in the nodes field, using
	// the same index.
	hashes []string
}

func (s byLengthHash) Len() int { return len(s.nodes) }
func (s byLengthHash) Less(i, j int) bool {
	switch {
	case len(s.nodes[i]) < len(s.nodes[j]):
		return true
	case len(s.nodes[i]) > len(s.nodes[j]):
		return false
	}
	return s.hashes[i] < s.hashes[j]
}
func (s byLengthHash) Swap(i, j int) {
	s.nodes[i], s.nodes[j] = s.nodes[j], s.nodes[i]
	s.hashes[i], s.hashes[j] = s.hashes[j], s.hashes[i]
}

// allUnique returns whether the []byte hash values in hashes are all unique.
func allUnique(hashes map[string][]byte) bool {
	set := make(map[string]bool)
	for _, h := range hashes {
		if set[string(h)] {
			return false
		}
		set[string(h)] = true
	}
	return true
}

// split returns the statements forming connected components in the graph
// described by statements.
//
// This is split in algorithm 2 in doi:10.1145/3068333.
func split(statements []*Statement) [][]*Statement {
	ds := make(djSet)
	for _, s := range statements {
		ds.add(s.Subject.Value)
		ds.add(s.Object.Value)
		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
			ds.union(ds.find(s.Subject.Value), ds.find(s.Object.Value))
		}
	}

	var (
		splits [][]*Statement
		ground []*Statement
	)
	idxOf := make(map[*dsNode]int)
	for _, s := range statements {
		var t string
		switch {
		case isBlank(s.Subject.Value):
			t = s.Subject.Value
		case isBlank(s.Object.Value):
			t = s.Object.Value
		default:
			ground = append(ground, s)
			continue
		}
		r := ds.find(t)
		if r == nil {
			panic(fmt.Sprintf("term not found: %q", t))
		}
		i, ok := idxOf[r]
		if !ok {
			i = len(splits)
			idxOf[r] = i
			splits = append(splits, []*Statement{s})
		} else {
			splits[i] = append(splits[i], s)
		}
	}
	if ground != nil {
		splits = append(splits, ground)
	}

	if debug {
		debug.log(0, "Splits:")
		for i, s := range splits {
			for j, t := range s {
				if j == 0 {
					debug.logf(0, "%d.\t%s\n", i+1, t)
				} else {
					debug.logf(0, "\t%s\n", t)
				}
			}
			debug.log(0)
		}
	}

	return splits
}
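
// Editorial note: an illustrative sketch, not part of the original file:
// statements that share no blank node fall into separate splits, and fully
// ground statements are collected into a final split of their own. The
// statements are hypothetical.
func exampleSplit() int {
	statements := []*Statement{
		{Subject: Term{Value: "_:a"}, Predicate: Term{Value: "<http://example.com/p>"}, Object: Term{Value: `"1"`}},
		{Subject: Term{Value: "_:b"}, Predicate: Term{Value: "<http://example.com/p>"}, Object: Term{Value: `"2"`}},
		{Subject: Term{Value: "<http://example.com/s>"}, Predicate: Term{Value: "<http://example.com/p>"}, Object: Term{Value: `"3"`}},
	}
	return len(split(statements)) // 3: one split per blank plus the ground split.
}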
// distinguish returns G⊥, the smallest hash-labelled graph found thus far.
// The graph is returned as a node to hash lookup.
//
// This is part of algorithm 3 in doi:10.1145/3068333.
//
// The correspondence between the parameters for the function in the paper
// with the implementation here is as follows:
//   - G = statements
//   - hash = hash
//   - P = parts (already sorted by hashBNodesPerSplit)
//   - G⊥ = lowest
//   - B = hash.blanks
//
// The additional parameter dist specifies that distinguish should treat
// coequal trivial parts as a coarse intermediate part and distinguish
// the nodes in that merged part.
func distinguish(statements []*Statement, dist bool, h hash.Hash, zero []byte, hash *table, parts byLengthHash, lowest map[string][]byte, depth int) map[string][]byte {
	if debug {
		debug.log(depth, "Running Distinguish")
	}

	var small []string
	var k int
	for k, small = range parts.nodes {
		if len(small) > 1 {
			break
		}
	}
	if len(small) < 2 {
		if lowest != nil || !dist {
			if debug {
				debug.log(depth, "Return lowest (no non-trivial parts):")
				debug.logHashes(depth, lowest, h.Size())
			}

			return lowest
		}

		// We have been given a set of fine parts,
		// but to reach here they must have been
		// non-uniquely labeled, so treat them
		// as a single coarse part.
		k, small = 0, parts.nodes[0]
	}

	if debug {
		debug.logf(depth, "Part: %v %x\n\n", small, parts.hashes[k])
		debug.log(depth, "Orig hash:")
		debug.logHashes(depth, hash.hashOf, h.Size())
	}

	smallHash := hash.hashOf[small[0]]
	for _, p := range parts.nodes[k:] {
		if !bytes.Equal(smallHash, hash.hashOf[p[0]]) {
			if debug {
				debug.logf(depth, "End of co-equal hashes: %x != %x\n\n", smallHash, hash.hashOf[p[0]])
			}

			break
		}
		for i, b := range p {
			if debug {
				debug.logf(depth, "Iter: %d — B = %q\n\n", i, b)

				if depth == 0 {
					debug.log(depth, "Current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}
			}

			hashP := hash.clone()
			hashP.set(b, hashTuple(h, hashP.hashOf[b], []byte{'@'}))
			hashPP, ok := hashBNodes(statements, h, zero, hashP.hashOf)
			if ok {
				if debug {
					debug.log(depth, "hashPP is trivial")
					debug.log(depth, "comparing hashPP\n")
					debug.logHashes(depth, hashPP.hashOf, h.Size())
					debug.log(depth, "with previous\n")
					debug.logHashes(depth, lowest, h.Size())
				}

				if lowest == nil || graphLess(statements, hashPP.hashOf, lowest) {
					lowest = hashPP.hashOf
					debug.log(depth, "choose hashPP\n")
				}
			} else {
				partsP := appendOrdered(byLengthHash{}, hashPP.termsFor)
				sort.Sort(partsP)

				if debug {
					debug.log(depth, "Parts':")
					debug.logParts(depth, partsP)
					debug.log(depth, "Recursive distinguish")
					debug.log(depth, "Called with current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}

				lowest = distinguish(statements, dist, h, zero, hashPP, partsP, lowest, depth+1)
			}
		}
	}

	if debug {
		debug.log(depth, "Return lowest:")
		debug.logHashes(depth, lowest, h.Size())
	}

	return lowest
}
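
// Editorial note: a sketch, not part of the original file, of the
// individuation step distinguish applies to each candidate blank node b:
// b's hash is perturbed with a '@' marker and refinement is re-run on a
// cloned table, leaving the original hashes untouched.
func individuate(statements []*Statement, h hash.Hash, zero []byte, t *table, b string) (*table, bool) {
	tp := t.clone()
	tp.set(b, hashTuple(h, tp.hashOf[b], []byte{'@'}))
	return hashBNodes(statements, h, zero, tp.hashOf)
}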
// graphLess returns whether the a hash labeling of statements orders below
// the b labeling, with terms ordered syntactically, triples ordered
// lexicographically, and graphs ordered such that G < H if and only if
// G ⊂ H or there exists a triple t ∈ G \ H such that no triple t' ∈ H \ G
// exists where t' < t.
// p9 https://doi.org/10.1145/3068333
func graphLess(statements []*Statement, a, b map[string][]byte) bool {
	g := newLexicalStatements(statements, a)
	sort.Sort(g)
	h := newLexicalStatements(statements, b)
	sort.Sort(h)

	gSubH := sub(g, h, len(g.statements))
	if len(gSubH) == 0 {
		return true
	}

	hSubG := sub(h, g, 1)
	if len(hSubG) == 0 {
		return true
	}
	lowestH := relabeledStatement{hSubG[0], h.hashes}

	for _, s := range gSubH {
		rs := relabeledStatement{s, g.hashes}
		if rs.less(lowestH) {
			return true
		}
	}
	return false
}

// lexicalStatements is a sort implementation for Statements with blank
// node labels replaced with their hash.
type lexicalStatements struct {
	statements []*Statement
	hashes     map[string][]byte
}

func newLexicalStatements(statements []*Statement, hash map[string][]byte) lexicalStatements {
	s := lexicalStatements{
		statements: make([]*Statement, len(statements)),
		hashes:     hash,
	}
	copy(s.statements, statements)
	return s
}

// sub returns the difference between a and b up to max elements long.
func sub(a, b lexicalStatements, max int) []*Statement {
	var d []*Statement
	var i, j int
	for i < len(a.statements) && j < len(b.statements) && len(d) < max {
		ra := relabeledStatement{a.statements[i], a.hashes}
		rb := relabeledStatement{b.statements[j], b.hashes}
		switch {
		case ra.less(rb):
			d = append(d, a.statements[i])
			i++
		case rb.less(ra):
			j++
		default:
			i++
		}
	}
	if len(d) < max {
		d = append(d, a.statements[i:min(len(a.statements), i+max-len(d))]...)
	}
	return d
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

func (s lexicalStatements) Len() int { return len(s.statements) }
func (s lexicalStatements) Less(i, j int) bool {
	return relabeledStatement{s.statements[i], s.hashes}.less(relabeledStatement{s.statements[j], s.hashes})
}
func (s lexicalStatements) Swap(i, j int) {
	s.statements[i], s.statements[j] = s.statements[j], s.statements[i]
}
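
// Editorial note: a sketch, not part of the original file: graphLess
// compares two candidate blank node labelings of the same statements. The
// single-byte hash values here are hypothetical.
func exampleGraphLess() bool {
	statements := []*Statement{{
		Subject:   Term{Value: "_:x"},
		Predicate: Term{Value: "<http://example.com/p>"},
		Object:    Term{Value: `"o"`},
	}}
	a := map[string][]byte{"_:x": {0x01}}
	b := map[string][]byte{"_:x": {0x02}}
	return graphLess(statements, a, b) // true: the a labeling sorts lower.
}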
// relabeledStatement is a statement that is orderable by its blank node
// hash relabeling.
type relabeledStatement struct {
	statement *Statement
	labels    map[string][]byte
}

func (a relabeledStatement) less(b relabeledStatement) bool {
	switch {
	case relabeledTerm{a.statement.Subject, a.labels}.less(relabeledTerm{b.statement.Subject, b.labels}):
		return true
	case relabeledTerm{b.statement.Subject, b.labels}.less(relabeledTerm{a.statement.Subject, a.labels}):
		return false
	}
	switch { // Always IRI.
	case a.statement.Predicate.Value < b.statement.Predicate.Value:
		return true
	case a.statement.Predicate.Value > b.statement.Predicate.Value:
		return false
	}
	switch {
	case relabeledTerm{a.statement.Object, a.labels}.less(relabeledTerm{b.statement.Object, b.labels}):
		return true
	case relabeledTerm{b.statement.Object, b.labels}.less(relabeledTerm{a.statement.Object, a.labels}):
		return false
	}
	return relabeledTerm{a.statement.Label, a.labels}.less(relabeledTerm{b.statement.Label, b.labels})
}

func (s relabeledStatement) String() string {
	subj := relabeledTerm{term: s.statement.Subject, labels: s.labels}
	obj := relabeledTerm{term: s.statement.Object, labels: s.labels}
	if s.statement.Label.Value == "" {
		return fmt.Sprintf("%s %s %s .", subj, s.statement.Predicate.Value, obj)
	}
	lab := relabeledTerm{term: s.statement.Label, labels: s.labels}
	return fmt.Sprintf("%s %s %s %s .", subj, s.statement.Predicate.Value, obj, lab)
}

// relabeledTerm is a term that is orderable by its blank node hash relabeling.
type relabeledTerm struct {
	term   Term
	labels map[string][]byte
}

func (a relabeledTerm) less(b relabeledTerm) bool {
	aIsBlank := isBlank(a.term.Value)
	bIsBlank := isBlank(b.term.Value)
	switch {
	case aIsBlank && bIsBlank:
		return bytes.Compare(a.labels[a.term.Value], b.labels[b.term.Value]) < 0
	case aIsBlank:
		return blankPrefix < unquoteIRI(b.term.Value)
	case bIsBlank:
		return unquoteIRI(a.term.Value) < blankPrefix
	default:
		return unquoteIRI(a.term.Value) < unquoteIRI(b.term.Value)
	}
}

func unquoteIRI(s string) string {
	if len(s) > 1 && s[0] == '<' && s[len(s)-1] == '>' {
		s = s[1 : len(s)-1]
	}
	return s
}

func (t relabeledTerm) String() string {
	if !isBlank(t.term.Value) {
		return t.term.Value
	}
	h, ok := t.labels[t.term.Value]
	if !ok {
		return t.term.Value + "_missing_hash"
	}
	return fmt.Sprintf("_:%0x", h)
}
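
// Editorial note: an end-to-end sketch, not part of the original file:
// canonicalise a single-triple graph. Any fixed hash.Hash works; the single
// blank node is deterministically relabeled to _:c14n0.
func exampleEndToEnd(h hash.Hash) (string, error) {
	statements := []*Statement{{
		Subject:   Term{Value: "_:someBlank"},
		Predicate: Term{Value: "<http://example.com/p>"},
		Object:    Term{Value: `"v"`},
	}}
	zero := make([]byte, h.Size())
	_, terms := IsoCanonicalHashes(statements, false, true, h, zero)
	c14n, err := C14n(nil, statements, terms)
	if err != nil {
		return "", err
	}
	return fmt.Sprintf("%s %s %s .", c14n[0].Subject.Value, c14n[0].Predicate.Value, c14n[0].Object.Value), nil
}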