github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/iso_canonical_test.go (about) 1 // Copyright ©2020 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package rdf 6 7 import ( 8 "crypto/md5" 9 "flag" 10 "fmt" 11 "hash" 12 "io" 13 "os" 14 "path/filepath" 15 "reflect" 16 "sort" 17 "testing" 18 "text/tabwriter" 19 "time" 20 21 "math/rand" 22 ) 23 24 var ( 25 origSeed = flag.Int64("seed", 1, "specify random seed to use for each test (negative for Unix time)") 26 tests = flag.String("test", "*-in.n[qt]", "specify test case in testdata") 27 ) 28 29 func TestIsoCanonicalHashes(t *testing.T) { 30 seed := uint64(*origSeed) 31 if *origSeed < 0 { 32 seed = uint64(time.Now().UnixNano()) 33 } 34 defer func() { 35 if t.Failed() && *origSeed < 0 { 36 t.Logf("time based seed: %d", seed) 37 } 38 }() 39 40 // Number of times to run IsoCanonicalHashes to check consistency. 41 const retries = 5 42 43 // Share a global hash function to ensure that we 44 // are resetting the function internally on each use. 45 hash := md5.New() 46 47 glob, err := filepath.Glob(filepath.Join("testdata", *tests)) 48 if err != nil { 49 t.Fatalf("Failed to open test suite: %v", err) 50 } 51 for _, path := range glob { 52 name := filepath.Base(path) 53 t.Run(name, func(t *testing.T) { 54 src := rand.NewSource(seed) 55 56 f, err := os.Open(path) 57 if err != nil { 58 t.Fatalf("Failed to open test suite in %q: %v", path, err) 59 } 60 var statements []*Statement 61 dec := NewDecoder(f) 62 for { 63 s, err := dec.Unmarshal() 64 if err != nil { 65 if err == io.EOF { 66 break 67 } 68 t.Fatalf("Unexpected error reading from %q: %v", path, err) 69 } 70 statements = append(statements, s) 71 } 72 f.Close() 73 74 for _, decomp := range []bool{false, true} { 75 t.Run(fmt.Sprintf("decomp=%t", decomp), func(t *testing.T) { 76 var last map[string][]byte 77 for i := 0; i < retries; i++ { 78 curr, terms := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16)) 79 if !hashesDisjoint(terms) { 80 t.Errorf("IsoCanonicalHashes did not uniquely identify nodes %q with decomp=%t", 81 name, decomp) 82 } 83 if last != nil { 84 last := relabelStatements(statements, termsFor(last, hash)) 85 sort.Sort(simpleLexicalStatements(last)) 86 87 curr := relabelStatements(statements, termsFor(curr, hash)) 88 sort.Sort(simpleLexicalStatements(curr)) 89 90 if !reflect.DeepEqual(last, curr) { 91 t.Errorf("IsoCanonicalHashes was not stable between runs on %q with decomp=%t", 92 name, decomp) 93 94 t.Log("Current run:") 95 for _, s := range curr { 96 t.Logf("\t%s", s) 97 } 98 99 t.Log("Previous run:") 100 for _, s := range last { 101 t.Logf("\t%s", s) 102 } 103 104 break 105 } 106 } 107 last = curr 108 } 109 hashes := last 110 ok := allUnique(hashes) 111 if !ok { 112 t.Errorf("Failed to get unique hashes for %q disjoint with decomp=%t", name, decomp) 113 t.Logf("skipping %q decomp=%t", path, decomp) 114 return 115 } 116 117 // Test that a graph is not isomorphic with one generated 118 // by deleting the last statement. 119 t.Run("isomorphic G != G-s", func(t *testing.T) { 120 if len(statements) == 0 { 121 return 122 } 123 if Isomorphic(statements, statements[:len(statements)-1], decomp, hash) { 124 t.Error("Isomorphic(G, G-s)=true") 125 } 126 }) 127 128 // Test that a graph is not isomorphic with one generated 129 // by hashing the first grounded statement. 130 t.Run("isomorphic G != Gμ(g)", func(t *testing.T) { 131 mangled, mangTerms := mangleFirstIL(statements, hash) 132 if mangTerms == nil { 133 // All terms were blanks. 134 return 135 } 136 if Isomorphic(statements, mangled, decomp, hash) { 137 t.Error("Isomorphic(G, Gμ(g))=true") 138 } 139 }) 140 141 // Test that a graph is not isomorphic with one generated 142 // by merging the first two lexically sorted blank nodes 143 // into one. 144 t.Run("isomorphic G != G(b1∪b2)", func(t *testing.T) { 145 mangled, mangTerms := mergeFirst2B(statements) 146 if mangTerms == nil { 147 // All terms were blanks. 148 return 149 } 150 if Isomorphic(statements, mangled, decomp, hash) { 151 t.Error("Isomorphic(G, G(b1∪b2))=true") 152 } 153 }) 154 155 // Relabel a copy of the statements and then sort. 156 orig := relabelStatements(statements, termsFor(hashes, hash)) 157 sort.Sort(simpleLexicalStatements(orig)) 158 159 for _, perm := range []struct { 160 name string 161 data func() ([]*Statement, map[string]string) 162 }{ 163 { 164 name: "reverse statements", 165 data: func() ([]*Statement, map[string]string) { return reverseStatements(statements) }, 166 }, 167 { 168 name: "permute statements", 169 data: func() ([]*Statement, map[string]string) { return permuteStatements(statements, src) }, 170 }, 171 { 172 name: "permute blank labels", 173 data: func() ([]*Statement, map[string]string) { return permuteBlanks(statements, src) }, 174 }, 175 { 176 name: "hash blank labels", 177 data: func() ([]*Statement, map[string]string) { return hashBlanks(statements, md5.New()) }, 178 }, 179 { 180 name: "reverse statements and hash blank labels", 181 data: func() ([]*Statement, map[string]string) { 182 // Reordering must come first since it does not return 183 // a non-nil terms map, but hashBlanks does. 184 s, _ := reverseStatements(statements) 185 return hashBlanks(s, md5.New()) 186 }, 187 }, 188 { 189 name: "permute statements and hash blank labels", 190 data: func() ([]*Statement, map[string]string) { 191 // Reordering must come first since it does not return 192 // a non-nil terms map, but hashBlanks does. 193 s, _ := permuteStatements(statements, src) 194 return hashBlanks(s, md5.New()) 195 }, 196 }, 197 } { 198 t.Run(perm.name, func(t *testing.T) { 199 if debug { 200 fmt.Fprintf(os.Stderr, "\n%q %q decomp=%t:\n", path, perm.name, decomp) 201 } 202 203 altStatements, terms := perm.data() 204 altHashes, altTerms := IsoCanonicalHashes(altStatements, decomp, true, hash, make([]byte, 16)) 205 ok := allUnique(altHashes) && hashesDisjoint(altTerms) 206 if !ok { 207 t.Errorf("Failed to get unique hashes for %q alternative disjoint %q with decomp=%t", 208 path, perm.name, decomp) 209 } 210 211 if debug { 212 fmt.Fprintln(os.Stderr, "Name mappings from original dataset:") 213 keys := make([]string, len(hashes)) 214 var i int 215 for k := range hashes { 216 keys[i] = k 217 i++ 218 } 219 sort.Strings(keys) 220 w := tabwriter.NewWriter(os.Stderr, 0, 4, 8, ' ', 0) 221 for _, k := range keys { 222 fmt.Fprintf(w, "\t%s\t%s\n", k, translate(k, terms)) 223 } 224 w.Flush() 225 fmt.Fprintln(os.Stderr) 226 } 227 228 // Relabel a copy of the alternative statements and then sort. 229 alt := relabelStatements(altStatements, termsFor(altHashes, hash)) 230 sort.Sort(simpleLexicalStatements(alt)) 231 232 for i := range statements { 233 if *orig[i] != *alt[i] { // Otherwise we have pointer inequality. 234 t.Errorf("Unexpected statement in %q %q decomp=%t:\ngot: %#v\nwant:%#v", 235 path, perm.name, decomp, orig[i], alt[i]) 236 237 break 238 } 239 } 240 241 if !Isomorphic(statements, altStatements, decomp, hash) { 242 t.Errorf("Isomorphic(G, perm(G))=false in %q %q decomp=%t", 243 path, perm.name, decomp) 244 } 245 }) 246 } 247 }) 248 } 249 }) 250 } 251 } 252 253 func permuteStatements(s []*Statement, src rand.Source) ([]*Statement, map[string]string) { 254 rnd := rand.New(src) 255 m := make([]*Statement, len(s)) 256 for x, y := range rnd.Perm(len(s)) { 257 m[x] = s[y] 258 } 259 return m, nil 260 } 261 262 func reverseStatements(s []*Statement) ([]*Statement, map[string]string) { 263 m := make([]*Statement, len(s)) 264 for i, j := 0, len(s)-1; i < len(s); i, j = i+1, j-1 { 265 m[j] = s[i] 266 } 267 return m, nil 268 } 269 270 func permuteBlanks(s []*Statement, src rand.Source) ([]*Statement, map[string]string) { 271 rnd := rand.New(src) 272 terms := make(map[string]string) 273 for _, e := range s { 274 for _, t := range []string{ 275 e.Subject.Value, 276 e.Predicate.Value, 277 e.Object.Value, 278 e.Label.Value, 279 } { 280 if t == "" { 281 continue 282 } 283 terms[t] = t 284 } 285 } 286 287 var blanks []string 288 for t := range terms { 289 if isBlank(t) { 290 blanks = append(blanks, t) 291 } 292 } 293 sort.Strings(blanks) 294 for x, y := range rnd.Perm(len(blanks)) { 295 terms[blanks[x]] = blanks[y] 296 } 297 298 m := relabelStatements(s, terms) 299 return m, terms 300 } 301 302 func hashBlanks(s []*Statement, h hash.Hash) ([]*Statement, map[string]string) { 303 terms := make(map[string]string) 304 for _, e := range s { 305 for _, t := range []string{ 306 e.Subject.Value, 307 e.Predicate.Value, 308 e.Object.Value, 309 e.Label.Value, 310 } { 311 if !isBlank(t) { 312 continue 313 } 314 h.Reset() 315 h.Write([]byte(t)) //nolint:errcheck 316 terms[t] = fmt.Sprintf("_:%0*x", 2*h.Size(), h.Sum(nil)) 317 } 318 } 319 320 m := relabelStatements(s, terms) 321 return m, terms 322 } 323 324 func mangleFirstIL(s []*Statement, h hash.Hash) ([]*Statement, map[string]string) { 325 terms := make(map[string]string) 326 for _, e := range s { 327 for _, t := range []string{ 328 e.Subject.Value, 329 e.Predicate.Value, 330 e.Object.Value, 331 e.Label.Value, 332 } { 333 if isBlank(t) { 334 continue 335 } 336 h.Reset() 337 h.Write([]byte(t)) //nolint:errcheck 338 terms[t] = fmt.Sprintf(`"%0*x"`, 2*h.Size(), h.Sum(nil)) 339 return relabelStatements(s, terms), terms 340 } 341 } 342 343 m := relabelStatements(s, nil) 344 return m, nil 345 } 346 347 func mergeFirst2B(s []*Statement) ([]*Statement, map[string]string) { 348 terms := make(map[string]string) 349 for _, e := range s { 350 for _, t := range []string{ 351 e.Subject.Value, 352 e.Predicate.Value, 353 e.Object.Value, 354 e.Label.Value, 355 } { 356 if !isBlank(t) { 357 continue 358 } 359 terms[t] = t 360 } 361 } 362 if len(terms) < 2 { 363 return relabelStatements(s, nil), nil 364 } 365 366 blanks := make([]string, len(terms)) 367 i := 0 368 for _, b := range terms { 369 blanks[i] = b 370 i++ 371 } 372 sort.Strings(blanks) 373 terms[blanks[1]] = terms[blanks[0]] 374 375 m := relabelStatements(s, terms) 376 return m, nil 377 } 378 379 func hashesDisjoint(terms map[string]map[string]bool) bool { 380 for _, t := range terms { 381 if len(t) != 1 { 382 return false 383 } 384 } 385 return true 386 } 387 388 func TestLexicalStatements(t *testing.T) { 389 if *tests == "" { 390 *tests = "*" 391 } 392 393 hash := md5.New() 394 395 glob, err := filepath.Glob(filepath.Join("testdata", *tests)) 396 if err != nil { 397 t.Fatalf("Failed to open test suite: %v", err) 398 } 399 for _, path := range glob { 400 f, err := os.Open(path) 401 if err != nil { 402 t.Fatalf("Failed to open test suite in %q: %v", path, err) 403 } 404 var statements []*Statement 405 dec := NewDecoder(f) 406 for { 407 s, err := dec.Unmarshal() 408 if err != nil { 409 if err == io.EOF { 410 break 411 } 412 t.Fatalf("Unexpected error reading from %q: %v", path, err) 413 } 414 statements = append(statements, s) 415 } 416 f.Close() 417 418 for _, decomp := range []bool{false, true} { 419 hashes, _ := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16)) 420 421 terms := termsFor(hashes, hash) 422 423 // Sort a copy of the statements based on hashes and then relabel. 424 indirect := make([]*Statement, len(statements)) 425 copy(indirect, statements) 426 sort.Sort(lexicalStatements{indirect, hashes}) 427 indirect = relabelStatements(indirect, terms) 428 429 // Relabel a copy of the statements and then sort. 430 direct := relabelStatements(statements, terms) 431 sort.Sort(simpleLexicalStatements(direct)) 432 433 for i := range statements { 434 if *indirect[i] != *direct[i] { // Otherwise we have pointer inequality. 435 t.Errorf("Unexpected ordering of indirect sort in %q:\ngot: %#v\nwant:%#v", 436 path, indirect[i], direct[i]) 437 } 438 } 439 } 440 } 441 } 442 443 func termsFor(hashes map[string][]byte, hash hash.Hash) map[string]string { 444 terms := make(map[string]string) 445 for t, h := range hashes { 446 if isBlank(t) { 447 terms[t] = fmt.Sprintf("_:%0*x", 2*hash.Size(), h) 448 } 449 } 450 return terms 451 } 452 453 // simpleLexicalStatements implements lexical statement sorting on the 454 // literal values without interpolation. 455 type simpleLexicalStatements []*Statement 456 457 func (s simpleLexicalStatements) Len() int { return len(s) } 458 func (s simpleLexicalStatements) Less(i, j int) bool { 459 si := s[i] 460 sj := s[j] 461 switch { 462 case unquoteIRI(si.Subject.Value) < unquoteIRI(sj.Subject.Value): 463 return true 464 case unquoteIRI(si.Subject.Value) > unquoteIRI(sj.Subject.Value): 465 return false 466 } 467 switch { // Always IRI. 468 case si.Predicate.Value < sj.Predicate.Value: 469 return true 470 case si.Predicate.Value > sj.Predicate.Value: 471 return false 472 } 473 switch { 474 case unquoteIRI(si.Object.Value) < unquoteIRI(sj.Object.Value): 475 return true 476 case unquoteIRI(si.Object.Value) > unquoteIRI(sj.Object.Value): 477 return false 478 } 479 return unquoteIRI(si.Label.Value) < unquoteIRI(sj.Label.Value) 480 } 481 func (s simpleLexicalStatements) Swap(i, j int) { 482 s[i], s[j] = s[j], s[i] 483 } 484 485 func relabelStatements(s []*Statement, terms map[string]string) []*Statement { 486 m := make([]*Statement, len(s)) 487 for i, e := range s { 488 n := *e 489 n.Subject = Term{Value: translate(n.Subject.Value, terms)} 490 n.Predicate = Term{Value: translate(n.Predicate.Value, terms)} 491 n.Object = Term{Value: translate(n.Object.Value, terms)} 492 n.Label = Term{Value: translate(n.Label.Value, terms)} 493 m[i] = &n 494 } 495 return m 496 } 497 498 func BenchmarkIsoCanonicalHashes(b *testing.B) { 499 hash := md5.New() 500 501 benchmarks := []string{ 502 "test019-in.nq", 503 "test044-in.nq", 504 } 505 506 for _, name := range benchmarks { 507 path := filepath.Join("testdata", name) 508 b.Run(name, func(b *testing.B) { 509 f, err := os.Open(path) 510 if err != nil { 511 b.Fatalf("Failed to open test suite in %q: %v", path, err) 512 } 513 var statements []*Statement 514 dec := NewDecoder(f) 515 for { 516 s, err := dec.Unmarshal() 517 if err != nil { 518 if err == io.EOF { 519 break 520 } 521 b.Fatalf("Unexpected error reading from %q: %v", path, err) 522 } 523 statements = append(statements, s) 524 } 525 f.Close() 526 527 nodes := make(map[string]bool) 528 for _, s := range statements { 529 for _, t := range []string{ 530 s.Subject.Value, 531 s.Predicate.Value, 532 s.Object.Value, 533 s.Label.Value, 534 } { 535 if t != "" { 536 nodes[t] = true 537 } 538 } 539 } 540 n := len(nodes) 541 542 for _, decomp := range []bool{false, true} { 543 b.Run(fmt.Sprintf("decomp=%t", decomp), func(b *testing.B) { 544 for i := 0; i < b.N; i++ { 545 hashes, _ := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16)) 546 if len(hashes) != n { 547 b.Fatalf("unexpected number of hashes: %d != %d", len(hashes), len(statements)) 548 } 549 } 550 }) 551 } 552 }) 553 } 554 }