github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/equi_canonical.go (about) 1 // Copyright ©2021 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package rdf 6 7 import ( 8 "errors" 9 "sort" 10 ) 11 12 // Throughout, the comments refer to doi:10.1145/3068333 which should be 13 // understood as a synonym for http://aidanhogan.com/docs/rdf-canonicalisation.pdf 14 // although there are differences between the two, see http://aidanhogan.com/#errataH17. 15 // Where there are differences, the document at http://aidanhogan.com/ is the 16 // canonical truth. The DOI reference is referred to for persistence. 17 18 // Lean returns an RDF core of g that entails g. If g contains any non-zero 19 // labels, Lean will return a non-nil error and a core of g assuming no graph 20 // labels exist. 21 // 22 // See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of 23 // the algorithm. 24 func Lean(g []*Statement) ([]*Statement, error) { 25 // BUG(kortschak): Graph leaning does not take into account graph label terms 26 // since the formal semantics for a multiple graph data model have not been 27 // defined. See https://www.w3.org/TR/rdf11-datasets/#declaring. 28 29 var ( 30 hasBlanks bool 31 err error 32 ) 33 for _, s := range g { 34 if isBlank(s.Subject.Value) || isBlank(s.Object.Value) { 35 hasBlanks = true 36 if err != nil { 37 break 38 } 39 } 40 if s.Label.Value != "" && err == nil { 41 err = errors.New("rdf: data-set contains graph names") 42 if hasBlanks { 43 break 44 } 45 } 46 } 47 if hasBlanks { 48 g = lean(&dfs{}, g) 49 } 50 return g, err 51 } 52 53 // removeRedundantBnodes removes blank nodes whose edges are a subset of 54 // another term in the RDF graph. 55 // 56 // This is algorithm 4 in doi:10.1145/3068333. 57 func removeRedundantBnodes(g []*Statement) []*Statement { 58 g = append(g[:0:0], g...) 59 for { 60 edges := make(map[string]map[triple]bool) 61 for _, s := range g { 62 for i, t := range []string{ 63 s.Subject.Value, 64 s.Object.Value, 65 } { 66 e, ok := edges[t] 67 if !ok { 68 e = make(map[triple]bool) 69 edges[t] = e 70 } 71 switch i { 72 case 0: 73 e[triple{s.Predicate.Value, s.Object.Value, "+"}] = true 74 case 1: 75 e[triple{s.Predicate.Value, s.Subject.Value, "-"}] = true 76 } 77 } 78 } 79 80 seen := make(map[string]bool) 81 bNodes := make(map[string]bool) 82 terms := make(map[string]bool) 83 for _, s := range g { 84 for _, t := range []string{ 85 s.Subject.Value, 86 s.Predicate.Value, 87 s.Object.Value, 88 } { 89 terms[t] = true 90 if isBlank(t) { 91 bNodes[t] = true 92 } else { 93 seen[t] = true 94 } 95 } 96 } 97 98 redundant := make(map[string]bool) 99 for x := range bNodes { 100 for xp := range terms { 101 if isProperSubset(edges[x], edges[xp]) || (seen[xp] && isEqualEdges(edges[x], edges[xp])) { 102 redundant[x] = true 103 break 104 } 105 } 106 seen[x] = true 107 } 108 109 n := len(g) 110 for i := 0; i < len(g); { 111 if !redundant[g[i].Subject.Value] && !redundant[g[i].Object.Value] { 112 i++ 113 continue 114 } 115 g[i], g = g[len(g)-1], g[:len(g)-1] 116 } 117 if n == len(g) { 118 return g 119 } 120 } 121 } 122 123 type triple [3]string 124 125 func isProperSubset(a, b map[triple]bool) bool { 126 for k := range a { 127 if !b[k] { 128 return false 129 } 130 } 131 return len(a) < len(b) 132 } 133 134 func isEqualEdges(a, b map[triple]bool) bool { 135 if len(a) != len(b) { 136 return false 137 } 138 for k := range a { 139 if !b[k] { 140 return false 141 } 142 } 143 return true 144 } 145 146 // findCandidates finds candidates for blank nodes and blank nodes that are fixed. 147 // 148 // This is algorithm 5 in doi:10.1145/3068333. 149 func findCandidates(g []*Statement) ([]*Statement, map[string]bool, map[string]map[string]bool, bool) { 150 g = removeRedundantBnodes(g) 151 152 edges := make(map[triple]bool) 153 f := make(map[string]bool) 154 for _, s := range g { 155 sub := s.Subject.Value 156 prd := s.Predicate.Value 157 obj := s.Object.Value 158 159 edges[triple{sub, prd, obj}] = true 160 edges[triple{sub, prd, "*"}] = true 161 edges[triple{"*", prd, obj}] = true 162 switch { 163 case isBlank(sub) && isBlank(obj): 164 f[sub] = false 165 f[obj] = false 166 case isBlank(sub): 167 if _, ok := f[sub]; !ok { 168 f[sub] = true 169 } 170 case isBlank(obj): 171 if _, ok := f[obj]; !ok { 172 f[obj] = true 173 } 174 } 175 } 176 for k, v := range f { 177 if !v { 178 delete(f, k) 179 } 180 } 181 if len(f) == 0 { 182 f = nil 183 } 184 185 cands := make(map[string]map[string]bool) 186 bnodes := make(map[string]bool) 187 for _, s := range g { 188 for _, b := range []string{ 189 s.Subject.Value, 190 s.Object.Value, 191 } { 192 if !isBlank(b) { 193 continue 194 } 195 bnodes[b] = true 196 if f[b] { 197 cands[b] = map[string]bool{b: true} 198 } else { 199 terms := make(map[string]bool) 200 for _, s := range g { 201 for _, t := range []string{ 202 s.Subject.Value, 203 s.Predicate.Value, 204 s.Object.Value, 205 } { 206 terms[t] = true 207 } 208 } 209 cands[b] = terms 210 } 211 } 212 } 213 if isEqualTerms(f, bnodes) { 214 return g, f, cands, true 215 } 216 217 for { 218 bb := make(map[string]bool) 219 for b := range bnodes { 220 if !f[b] { 221 bb[b] = true 222 } 223 } 224 for b := range bb { 225 for x := range cands[b] { 226 if x == b { 227 continue 228 } 229 for _, s := range g { 230 if s.Subject.Value != b { 231 continue 232 } 233 prd := s.Predicate.Value 234 obj := s.Object.Value 235 if (inILF(obj, f) && !edges[triple{x, prd, obj}]) || (bb[obj] && !edges[triple{x, prd, "*"}]) { 236 delete(cands[b], x) 237 break 238 } 239 } 240 if !cands[b][x] { 241 continue 242 } 243 for _, s := range g { 244 if s.Object.Value != b { 245 continue 246 } 247 sub := s.Subject.Value 248 prd := s.Predicate.Value 249 if (inIF(sub, f) && !edges[triple{sub, prd, x}]) || (bb[sub] && !edges[triple{"*", prd, x}]) { 250 delete(cands[b], x) 251 break 252 } 253 } 254 } 255 } 256 257 fp := f 258 f = make(map[string]bool) 259 for b := range fp { 260 f[b] = true 261 } 262 for b := range bb { // Mark newly fixed blank nodes. 263 if len(cands[b]) == 1 && cands[b][b] { 264 f[b] = true 265 } 266 } 267 allFixed := isEqualTerms(f, bnodes) 268 if isEqualTerms(fp, f) || allFixed { 269 if len(f) == 0 { 270 f = nil 271 } 272 return g, f, cands, allFixed 273 } 274 } 275 } 276 277 // inILF returns whether t is in IL or F. 278 func inILF(t string, f map[string]bool) bool { 279 return isIRI(t) || isLiteral(t) || f[t] 280 } 281 282 // inIF returns whether t is in I or F. 283 func inIF(t string, f map[string]bool) bool { 284 return isIRI(t) || f[t] 285 } 286 287 // dfs is a depth-first search strategy. 288 type dfs struct{} 289 290 // lean returns a core of the RDF graph g using the given strategy. 291 // 292 // This is lines 1-9 of algorithm 6 in doi:10.1145/3068333. 293 func lean(strategy *dfs, g []*Statement) []*Statement { 294 foundBnode := false 295 search: 296 for _, s := range g { 297 for _, t := range []string{ 298 s.Subject.Value, 299 s.Object.Value, 300 } { 301 if isBlank(t) { 302 foundBnode = true 303 break search 304 } 305 } 306 } 307 if !foundBnode { 308 return g 309 } 310 g, fixed, cands, allFixed := findCandidates(g) 311 if allFixed { 312 return g 313 } 314 for _, s := range g { 315 if isBlank(s.Subject.Value) && isBlank(s.Object.Value) { 316 mu := make(map[string]string, len(fixed)) 317 for b := range fixed { 318 mu[b] = b 319 } 320 mu = findCoreEndomorphism(strategy, g, cands, mu) 321 return applyMu(g, mu) 322 } 323 } 324 return g 325 } 326 327 // findCoreEndomorphism returns a core solution using the given strategy. 328 // 329 // This is lines 10-14 of algorithm 6 in doi:10.1145/3068333. 330 func findCoreEndomorphism(strategy *dfs, g []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string { 331 var q []*Statement 332 preds := make(map[string]int) 333 seen := make(map[triple]bool) 334 for _, s := range g { 335 preds[s.Predicate.Value]++ 336 if isBlank(s.Subject.Value) && isBlank(s.Object.Value) { 337 if seen[triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}] { 338 continue 339 } 340 seen[triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}] = true 341 q = append(q, s) 342 } 343 } 344 sort.Slice(q, func(i, j int) bool { 345 return selectivity(q[i], cands, preds) < selectivity(q[j], cands, preds) 346 }) 347 return strategy.evaluate(g, q, cands, mu) 348 } 349 350 // selectivity returns the selectivity heuristic score for s. Lower scores 351 // are more selective. 352 func selectivity(s *Statement, cands map[string]map[string]bool, preds map[string]int) int { 353 return min(len(cands[s.Subject.Value])*len(cands[s.Object.Value]), preds[s.Predicate.Value]) 354 } 355 356 // evaluate returns an endomorphism using a DFS strategy. 357 // 358 // This is lines 25-32 of algorithm 6 in doi:10.1145/3068333. 359 func (st *dfs) evaluate(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string { 360 mu = st.search(g, q, cands, mu) 361 for len(mu) != len(codom(mu)) { 362 mupp := fixedFrom(cands) 363 mup := findCoreEndomorphism(st, applyMu(g, mu), cands, mupp) 364 if isAutomorphism(mup) { 365 return mu 366 } 367 for b, x := range mu { 368 if _, ok := mup[b]; !ok { 369 mup[b] = x 370 } 371 } 372 mu = mup 373 } 374 return mu 375 } 376 377 func fixedFrom(cands map[string]map[string]bool) map[string]string { 378 fixed := make(map[string]string) 379 for b, m := range cands { 380 if len(m) == 1 && m[b] { 381 fixed[b] = b 382 } 383 } 384 return fixed 385 } 386 387 // applyMu applies mu to g returning the result. 388 func applyMu(g []*Statement, mu map[string]string) []*Statement { 389 back := make([]Statement, 0, len(g)) 390 dst := make([]*Statement, 0, len(g)) 391 seen := make(map[Statement]bool) 392 for _, s := range g { 393 n := Statement{ 394 Subject: Term{Value: translate(s.Subject.Value, mu)}, 395 Predicate: Term{Value: s.Predicate.Value}, 396 Object: Term{Value: translate(s.Object.Value, mu)}, 397 Label: Term{Value: s.Label.Value}, 398 } 399 if seen[n] { 400 continue 401 } 402 seen[n] = true 403 back = append(back, n) 404 dst = append(dst, &back[len(back)-1]) 405 } 406 return dst 407 } 408 409 // search returns a minimum endomorphism using a DFS strategy. 410 // 411 // This is lines 33-46 of algorithm 6 in doi:10.1145/3068333. 412 func (st *dfs) search(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string { 413 qMin := q[0] 414 m := st.join(qMin, g, cands, mu) 415 if len(m) == 0 { 416 // Early exit if no mapping found. 417 return nil 418 } 419 sortByCodom(m) 420 mMin := m[0] 421 qp := q[1:] 422 if len(qp) != 0 { 423 for len(m) != 0 { 424 mMin = m[0] 425 mup := st.search(g, qp, cands, mMin) 426 if !isAutomorphism(mup) { 427 return mup 428 } 429 m = m[1:] 430 } 431 } 432 return mMin 433 } 434 435 // isAutomorphism returns whether mu is an automorphism, this is equivalent to 436 // dom(mu) == codom(mu). 437 func isAutomorphism(mu map[string]string) bool { 438 return isEqualTerms(dom(mu), codom(mu)) 439 } 440 441 // dom returns the domain of mu. 442 func dom(mu map[string]string) map[string]bool { 443 d := make(map[string]bool, len(mu)) 444 for v := range mu { 445 d[v] = true 446 } 447 return d 448 } 449 450 // codom returns the codomain of mu. 451 func codom(mu map[string]string) map[string]bool { 452 cd := make(map[string]bool, len(mu)) 453 for _, v := range mu { 454 cd[v] = true 455 } 456 return cd 457 } 458 459 // isEqualTerms returns whether a and b are identical. 460 func isEqualTerms(a, b map[string]bool) bool { 461 if len(a) != len(b) { 462 return false 463 } 464 for k := range a { 465 if !b[k] { 466 return false 467 } 468 } 469 return true 470 } 471 472 // sortByCodom performs a sort of maps ordered by fewest blank nodes in 473 // codomain, then fewest self mappings. 474 func sortByCodom(maps []map[string]string) { 475 m := orderedByCodom{ 476 maps: maps, 477 attrs: make([]attrs, len(maps)), 478 } 479 for i, mu := range maps { 480 m.attrs[i].blanks = make(map[string]bool) 481 for x, y := range mu { 482 if isBlank(y) { 483 m.attrs[i].blanks[y] = true 484 } 485 if x == y { 486 m.attrs[i].selfs++ 487 } 488 } 489 } 490 sort.Sort(m) 491 } 492 493 type orderedByCodom struct { 494 maps []map[string]string 495 attrs []attrs 496 } 497 498 type attrs struct { 499 blanks map[string]bool 500 selfs int 501 } 502 503 func (m orderedByCodom) Len() int { return len(m.maps) } 504 func (m orderedByCodom) Less(i, j int) bool { 505 attrI := m.attrs[i] 506 attrJ := m.attrs[j] 507 switch { 508 case len(attrI.blanks) < len(attrJ.blanks): 509 return true 510 case len(attrI.blanks) > len(attrJ.blanks): 511 return false 512 default: 513 return attrI.selfs < attrJ.selfs 514 } 515 } 516 func (m orderedByCodom) Swap(i, j int) { 517 m.maps[i], m.maps[j] = m.maps[j], m.maps[i] 518 m.attrs[i], m.attrs[j] = m.attrs[j], m.attrs[i] 519 } 520 521 // join evaluates the given pattern, q, joining with solutions in m. 522 // This takes only a single mapping and so only works for the DFS strategy. 523 // 524 // This is lines 47-51 of algorithm 6 in doi:10.1145/3068333. 525 func (st *dfs) join(q *Statement, g []*Statement, cands map[string]map[string]bool, m map[string]string) []map[string]string { 526 var mp []map[string]string 527 isLoop := q.Subject.Value == q.Object.Value 528 for _, s := range g { 529 // Line 45: M_q ← {µ | µ(q) ∈ G} 530 // | µ(q) ∈ G 531 // 532 // µ(q) ∈ G ↔ (µ(q_s),q_p,µ(q_o)) ∈ G 533 if q.Predicate.Value != s.Predicate.Value { 534 continue 535 } 536 // q_s = q_o ↔ µ(q_s) =_µ(q_o) 537 if isLoop && s.Subject.Value != s.Object.Value { 538 continue 539 } 540 541 // Line 46: M_q' ← {µ ∈ M_q | for all b ∈ bnodes({q}), µ(b) ∈ cands[b]} 542 // | for all b ∈ bnodes({q}), µ(b) ∈ cands[b] 543 if !cands[q.Subject.Value][s.Subject.Value] || !cands[q.Object.Value][s.Object.Value] { 544 continue 545 } 546 547 // Line 47: M' ← M_q' ⋈ M 548 // M₁ ⋈ M₂ = {μ₁ ∪ μ₂ | μ₁ ∈ M₁, μ₂ ∈ M₂ and μ₁, μ₂ are compatible mappings} 549 // | μ₁ ∈ M₁, μ₂ ∈ M₂ and μ₁, μ₂ are compatible mappings 550 if mq, ok := m[q.Subject.Value]; ok && mq != s.Subject.Value { 551 continue 552 } 553 if !isLoop { 554 if mq, ok := m[q.Object.Value]; ok && mq != s.Object.Value { 555 continue 556 } 557 } 558 // Line 47: μ₁ ∪ μ₂ 559 var mu map[string]string 560 if isLoop { 561 mu = map[string]string{ 562 q.Subject.Value: s.Subject.Value, 563 } 564 } else { 565 mu = map[string]string{ 566 q.Subject.Value: s.Subject.Value, 567 q.Object.Value: s.Object.Value, 568 } 569 } 570 for b, mb := range m { 571 mu[b] = mb 572 } 573 mp = append(mp, mu) 574 } 575 return mp 576 }