gonum.org/v1/gonum@v0.14.0/graph/formats/rdf/iso_canonical_test.go (about)

     1  // Copyright ©2020 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package rdf
     6  
     7  import (
     8  	"crypto/md5"
     9  	"flag"
    10  	"fmt"
    11  	"hash"
    12  	"io"
    13  	"os"
    14  	"path/filepath"
    15  	"reflect"
    16  	"sort"
    17  	"testing"
    18  	"text/tabwriter"
    19  	"time"
    20  
    21  	"golang.org/x/exp/rand"
    22  )
    23  
    24  var (
    25  	origSeed = flag.Int64("seed", 1, "specify random seed to use for each test (negative for Unix time)")
    26  	tests    = flag.String("test", "*-in.n[qt]", "specify test case in testdata")
    27  )
    28  
    29  func TestIsoCanonicalHashes(t *testing.T) {
    30  	seed := uint64(*origSeed)
    31  	if *origSeed < 0 {
    32  		seed = uint64(time.Now().UnixNano())
    33  	}
    34  	defer func() {
    35  		if t.Failed() && *origSeed < 0 {
    36  			t.Logf("time based seed: %d", seed)
    37  		}
    38  	}()
    39  
    40  	// Number of times to run IsoCanonicalHashes to check consistency.
    41  	const retries = 5
    42  
    43  	// Share a global hash function to ensure that we
    44  	// are resetting the function internally on each use.
    45  	hash := md5.New()
    46  
    47  	glob, err := filepath.Glob(filepath.Join("testdata", *tests))
    48  	if err != nil {
    49  		t.Fatalf("Failed to open test suite: %v", err)
    50  	}
    51  	for _, path := range glob {
    52  		name := filepath.Base(path)
    53  		t.Run(name, func(t *testing.T) {
    54  			src := rand.NewSource(seed)
    55  
    56  			f, err := os.Open(path)
    57  			if err != nil {
    58  				t.Fatalf("Failed to open test suite in %q: %v", path, err)
    59  			}
    60  			var statements []*Statement
    61  			dec := NewDecoder(f)
    62  			for {
    63  				s, err := dec.Unmarshal()
    64  				if err != nil {
    65  					if err == io.EOF {
    66  						break
    67  					}
    68  					t.Fatalf("Unexpected error reading from %q: %v", path, err)
    69  				}
    70  				statements = append(statements, s)
    71  			}
    72  			f.Close()
    73  
    74  			for _, decomp := range []bool{false, true} {
    75  				t.Run(fmt.Sprintf("decomp=%t", decomp), func(t *testing.T) {
    76  					var last map[string][]byte
    77  					for i := 0; i < retries; i++ {
    78  						curr, terms := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16))
    79  						if !hashesDisjoint(terms) {
    80  							t.Errorf("IsoCanonicalHashes did not uniquely identify nodes %q with decomp=%t",
    81  								name, decomp)
    82  						}
    83  						if last != nil {
    84  							last := relabelStatements(statements, termsFor(last, hash))
    85  							sort.Sort(simpleLexicalStatements(last))
    86  
    87  							curr := relabelStatements(statements, termsFor(curr, hash))
    88  							sort.Sort(simpleLexicalStatements(curr))
    89  
    90  							if !reflect.DeepEqual(last, curr) {
    91  								t.Errorf("IsoCanonicalHashes was not stable between runs on %q with decomp=%t",
    92  									name, decomp)
    93  
    94  								t.Log("Current run:")
    95  								for _, s := range curr {
    96  									t.Logf("\t%s", s)
    97  								}
    98  
    99  								t.Log("Previous run:")
   100  								for _, s := range last {
   101  									t.Logf("\t%s", s)
   102  								}
   103  
   104  								break
   105  							}
   106  						}
   107  						last = curr
   108  					}
   109  					hashes := last
   110  					ok := allUnique(hashes)
   111  					if !ok {
   112  						t.Errorf("Failed to get unique hashes for %q disjoint with decomp=%t", name, decomp)
   113  						t.Logf("skipping %q decomp=%t", path, decomp)
   114  						return
   115  					}
   116  
   117  					// Test that a graph is not isomorphic with one generated
   118  					// by deleting the last statement.
   119  					t.Run("isomorphic G != G-s", func(t *testing.T) {
   120  						if len(statements) == 0 {
   121  							return
   122  						}
   123  						if Isomorphic(statements, statements[:len(statements)-1], decomp, hash) {
   124  							t.Error("Isomorphic(G, G-s)=true")
   125  						}
   126  					})
   127  
   128  					// Test that a graph is not isomorphic with one generated
   129  					// by hashing the first grounded statement.
   130  					t.Run("isomorphic G != Gμ(g)", func(t *testing.T) {
   131  						mangled, mangTerms := mangleFirstIL(statements, hash)
   132  						if mangTerms == nil {
   133  							// All terms were blanks.
   134  							return
   135  						}
   136  						if Isomorphic(statements, mangled, decomp, hash) {
   137  							t.Error("Isomorphic(G, Gμ(g))=true")
   138  						}
   139  					})
   140  
   141  					// Test that a graph is not isomorphic with one generated
   142  					// by merging the first two lexically sorted blank nodes
   143  					// into one.
   144  					t.Run("isomorphic G != G(b1∪b2)", func(t *testing.T) {
   145  						mangled, mangTerms := mergeFirst2B(statements)
   146  						if mangTerms == nil {
   147  							// All terms were blanks.
   148  							return
   149  						}
   150  						if Isomorphic(statements, mangled, decomp, hash) {
   151  							t.Error("Isomorphic(G, G(b1∪b2))=true")
   152  						}
   153  					})
   154  
   155  					// Relabel a copy of the statements and then sort.
   156  					orig := relabelStatements(statements, termsFor(hashes, hash))
   157  					sort.Sort(simpleLexicalStatements(orig))
   158  
   159  					for _, perm := range []struct {
   160  						name string
   161  						data func() ([]*Statement, map[string]string)
   162  					}{
   163  						{
   164  							name: "reverse statements",
   165  							data: func() ([]*Statement, map[string]string) { return reverseStatements(statements) },
   166  						},
   167  						{
   168  							name: "permute statements",
   169  							data: func() ([]*Statement, map[string]string) { return permuteStatements(statements, src) },
   170  						},
   171  						{
   172  							name: "permute blank labels",
   173  							data: func() ([]*Statement, map[string]string) { return permuteBlanks(statements, src) },
   174  						},
   175  						{
   176  							name: "hash blank labels",
   177  							data: func() ([]*Statement, map[string]string) { return hashBlanks(statements, md5.New()) },
   178  						},
   179  						{
   180  							name: "reverse statements and hash blank labels",
   181  							data: func() ([]*Statement, map[string]string) {
   182  								// Reordering must come first since it does not return
   183  								// a non-nil terms map, but hashBlanks does.
   184  								s, _ := reverseStatements(statements)
   185  								return hashBlanks(s, md5.New())
   186  							},
   187  						},
   188  						{
   189  							name: "permute statements and hash blank labels",
   190  							data: func() ([]*Statement, map[string]string) {
   191  								// Reordering must come first since it does not return
   192  								// a non-nil terms map, but hashBlanks does.
   193  								s, _ := permuteStatements(statements, src)
   194  								return hashBlanks(s, md5.New())
   195  							},
   196  						},
   197  					} {
   198  						t.Run(perm.name, func(t *testing.T) {
   199  							if debug {
   200  								fmt.Fprintf(os.Stderr, "\n%q %q decomp=%t:\n", path, perm.name, decomp)
   201  							}
   202  
   203  							altStatements, terms := perm.data()
   204  							altHashes, altTerms := IsoCanonicalHashes(altStatements, decomp, true, hash, make([]byte, 16))
   205  							ok := allUnique(altHashes) && hashesDisjoint(altTerms)
   206  							if !ok {
   207  								t.Errorf("Failed to get unique hashes for %q alternative disjoint %q with decomp=%t",
   208  									path, perm.name, decomp)
   209  							}
   210  
   211  							if debug {
   212  								fmt.Fprintln(os.Stderr, "Name mappings from original dataset:")
   213  								keys := make([]string, len(hashes))
   214  								var i int
   215  								for k := range hashes {
   216  									keys[i] = k
   217  									i++
   218  								}
   219  								sort.Strings(keys)
   220  								w := tabwriter.NewWriter(os.Stderr, 0, 4, 8, ' ', 0)
   221  								for _, k := range keys {
   222  									fmt.Fprintf(w, "\t%s\t%s\n", k, translate(k, terms))
   223  								}
   224  								w.Flush()
   225  								fmt.Fprintln(os.Stderr)
   226  							}
   227  
   228  							// Relabel a copy of the alternative statements and then sort.
   229  							alt := relabelStatements(altStatements, termsFor(altHashes, hash))
   230  							sort.Sort(simpleLexicalStatements(alt))
   231  
   232  							for i := range statements {
   233  								if *orig[i] != *alt[i] { // Otherwise we have pointer inequality.
   234  									t.Errorf("Unexpected statement in %q %q decomp=%t:\ngot: %#v\nwant:%#v",
   235  										path, perm.name, decomp, orig[i], alt[i])
   236  
   237  									break
   238  								}
   239  							}
   240  
   241  							if !Isomorphic(statements, altStatements, decomp, hash) {
   242  								t.Errorf("Isomorphic(G, perm(G))=false in %q %q decomp=%t",
   243  									path, perm.name, decomp)
   244  							}
   245  						})
   246  					}
   247  				})
   248  			}
   249  		})
   250  	}
   251  }
   252  
   253  func permuteStatements(s []*Statement, src rand.Source) ([]*Statement, map[string]string) {
   254  	rnd := rand.New(src)
   255  	m := make([]*Statement, len(s))
   256  	for x, y := range rnd.Perm(len(s)) {
   257  		m[x] = s[y]
   258  	}
   259  	return m, nil
   260  }
   261  
   262  func reverseStatements(s []*Statement) ([]*Statement, map[string]string) {
   263  	m := make([]*Statement, len(s))
   264  	for i, j := 0, len(s)-1; i < len(s); i, j = i+1, j-1 {
   265  		m[j] = s[i]
   266  	}
   267  	return m, nil
   268  }
   269  
   270  func permuteBlanks(s []*Statement, src rand.Source) ([]*Statement, map[string]string) {
   271  	rnd := rand.New(src)
   272  	terms := make(map[string]string)
   273  	for _, e := range s {
   274  		for _, t := range []string{
   275  			e.Subject.Value,
   276  			e.Predicate.Value,
   277  			e.Object.Value,
   278  			e.Label.Value,
   279  		} {
   280  			if t == "" {
   281  				continue
   282  			}
   283  			terms[t] = t
   284  		}
   285  	}
   286  
   287  	var blanks []string
   288  	for t := range terms {
   289  		if isBlank(t) {
   290  			blanks = append(blanks, t)
   291  		}
   292  	}
   293  	sort.Strings(blanks)
   294  	for x, y := range rnd.Perm(len(blanks)) {
   295  		terms[blanks[x]] = blanks[y]
   296  	}
   297  
   298  	m := relabelStatements(s, terms)
   299  	return m, terms
   300  }
   301  
   302  func hashBlanks(s []*Statement, h hash.Hash) ([]*Statement, map[string]string) {
   303  	terms := make(map[string]string)
   304  	for _, e := range s {
   305  		for _, t := range []string{
   306  			e.Subject.Value,
   307  			e.Predicate.Value,
   308  			e.Object.Value,
   309  			e.Label.Value,
   310  		} {
   311  			if !isBlank(t) {
   312  				continue
   313  			}
   314  			h.Reset()
   315  			h.Write([]byte(t))
   316  			terms[t] = fmt.Sprintf("_:%0*x", 2*h.Size(), h.Sum(nil))
   317  		}
   318  	}
   319  
   320  	m := relabelStatements(s, terms)
   321  	return m, terms
   322  }
   323  
   324  func mangleFirstIL(s []*Statement, h hash.Hash) ([]*Statement, map[string]string) {
   325  	terms := make(map[string]string)
   326  	for _, e := range s {
   327  		for _, t := range []string{
   328  			e.Subject.Value,
   329  			e.Predicate.Value,
   330  			e.Object.Value,
   331  			e.Label.Value,
   332  		} {
   333  			if isBlank(t) {
   334  				continue
   335  			}
   336  			h.Reset()
   337  			h.Write([]byte(t))
   338  			terms[t] = fmt.Sprintf(`"%0*x"`, 2*h.Size(), h.Sum(nil))
   339  			return relabelStatements(s, terms), terms
   340  		}
   341  	}
   342  
   343  	m := relabelStatements(s, nil)
   344  	return m, nil
   345  }
   346  
   347  func mergeFirst2B(s []*Statement) ([]*Statement, map[string]string) {
   348  	terms := make(map[string]string)
   349  	for _, e := range s {
   350  		for _, t := range []string{
   351  			e.Subject.Value,
   352  			e.Predicate.Value,
   353  			e.Object.Value,
   354  			e.Label.Value,
   355  		} {
   356  			if !isBlank(t) {
   357  				continue
   358  			}
   359  			terms[t] = t
   360  		}
   361  	}
   362  	if len(terms) < 2 {
   363  		return relabelStatements(s, nil), nil
   364  	}
   365  
   366  	blanks := make([]string, len(terms))
   367  	i := 0
   368  	for _, b := range terms {
   369  		blanks[i] = b
   370  		i++
   371  	}
   372  	sort.Strings(blanks)
   373  	terms[blanks[1]] = terms[blanks[0]]
   374  
   375  	m := relabelStatements(s, terms)
   376  	return m, nil
   377  }
   378  
   379  func hashesDisjoint(terms map[string]map[string]bool) bool {
   380  	for _, t := range terms {
   381  		if len(t) != 1 {
   382  			return false
   383  		}
   384  	}
   385  	return true
   386  }
   387  
   388  func TestLexicalStatements(t *testing.T) {
   389  	if *tests == "" {
   390  		*tests = "*"
   391  	}
   392  
   393  	hash := md5.New()
   394  
   395  	glob, err := filepath.Glob(filepath.Join("testdata", *tests))
   396  	if err != nil {
   397  		t.Fatalf("Failed to open test suite: %v", err)
   398  	}
   399  	for _, path := range glob {
   400  		f, err := os.Open(path)
   401  		if err != nil {
   402  			t.Fatalf("Failed to open test suite in %q: %v", path, err)
   403  		}
   404  		var statements []*Statement
   405  		dec := NewDecoder(f)
   406  		for {
   407  			s, err := dec.Unmarshal()
   408  			if err != nil {
   409  				if err == io.EOF {
   410  					break
   411  				}
   412  				t.Fatalf("Unexpected error reading from %q: %v", path, err)
   413  			}
   414  			statements = append(statements, s)
   415  		}
   416  		f.Close()
   417  
   418  		for _, decomp := range []bool{false, true} {
   419  			hashes, _ := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16))
   420  
   421  			terms := termsFor(hashes, hash)
   422  
   423  			// Sort a copy of the statements based on hashes and then relabel.
   424  			indirect := make([]*Statement, len(statements))
   425  			copy(indirect, statements)
   426  			sort.Sort(lexicalStatements{indirect, hashes})
   427  			indirect = relabelStatements(indirect, terms)
   428  
   429  			// Relabel a copy of the statements and then sort.
   430  			direct := relabelStatements(statements, terms)
   431  			sort.Sort(simpleLexicalStatements(direct))
   432  
   433  			for i := range statements {
   434  				if *indirect[i] != *direct[i] { // Otherwise we have pointer inequality.
   435  					t.Errorf("Unexpected ordering of indirect sort in %q:\ngot: %#v\nwant:%#v",
   436  						path, indirect[i], direct[i])
   437  				}
   438  			}
   439  		}
   440  	}
   441  }
   442  
   443  func termsFor(hashes map[string][]byte, hash hash.Hash) map[string]string {
   444  	terms := make(map[string]string)
   445  	for t, h := range hashes {
   446  		if isBlank(t) {
   447  			terms[t] = fmt.Sprintf("_:%0*x", 2*hash.Size(), h)
   448  		}
   449  	}
   450  	return terms
   451  }
   452  
   453  // simpleLexicalStatements implements lexical statement sorting on the
   454  // literal values without interpolation.
   455  type simpleLexicalStatements []*Statement
   456  
   457  func (s simpleLexicalStatements) Len() int { return len(s) }
   458  func (s simpleLexicalStatements) Less(i, j int) bool {
   459  	si := s[i]
   460  	sj := s[j]
   461  	switch {
   462  	case unquoteIRI(si.Subject.Value) < unquoteIRI(sj.Subject.Value):
   463  		return true
   464  	case unquoteIRI(si.Subject.Value) > unquoteIRI(sj.Subject.Value):
   465  		return false
   466  	}
   467  	switch { // Always IRI.
   468  	case si.Predicate.Value < sj.Predicate.Value:
   469  		return true
   470  	case si.Predicate.Value > sj.Predicate.Value:
   471  		return false
   472  	}
   473  	switch {
   474  	case unquoteIRI(si.Object.Value) < unquoteIRI(sj.Object.Value):
   475  		return true
   476  	case unquoteIRI(si.Object.Value) > unquoteIRI(sj.Object.Value):
   477  		return false
   478  	}
   479  	return unquoteIRI(si.Label.Value) < unquoteIRI(sj.Label.Value)
   480  }
   481  func (s simpleLexicalStatements) Swap(i, j int) {
   482  	s[i], s[j] = s[j], s[i]
   483  }
   484  
   485  func relabelStatements(s []*Statement, terms map[string]string) []*Statement {
   486  	m := make([]*Statement, len(s))
   487  	for i, e := range s {
   488  		n := *e
   489  		n.Subject = Term{Value: translate(n.Subject.Value, terms)}
   490  		n.Predicate = Term{Value: translate(n.Predicate.Value, terms)}
   491  		n.Object = Term{Value: translate(n.Object.Value, terms)}
   492  		n.Label = Term{Value: translate(n.Label.Value, terms)}
   493  		m[i] = &n
   494  	}
   495  	return m
   496  }
   497  
   498  func BenchmarkIsoCanonicalHashes(b *testing.B) {
   499  	hash := md5.New()
   500  
   501  	benchmarks := []string{
   502  		"test019-in.nq",
   503  		"test044-in.nq",
   504  	}
   505  
   506  	for _, name := range benchmarks {
   507  		path := filepath.Join("testdata", name)
   508  		b.Run(name, func(b *testing.B) {
   509  			f, err := os.Open(path)
   510  			if err != nil {
   511  				b.Fatalf("Failed to open test suite in %q: %v", path, err)
   512  			}
   513  			var statements []*Statement
   514  			dec := NewDecoder(f)
   515  			for {
   516  				s, err := dec.Unmarshal()
   517  				if err != nil {
   518  					if err == io.EOF {
   519  						break
   520  					}
   521  					b.Fatalf("Unexpected error reading from %q: %v", path, err)
   522  				}
   523  				statements = append(statements, s)
   524  			}
   525  			f.Close()
   526  
   527  			nodes := make(map[string]bool)
   528  			for _, s := range statements {
   529  				for _, t := range []string{
   530  					s.Subject.Value,
   531  					s.Predicate.Value,
   532  					s.Object.Value,
   533  					s.Label.Value,
   534  				} {
   535  					if t != "" {
   536  						nodes[t] = true
   537  					}
   538  				}
   539  			}
   540  			n := len(nodes)
   541  
   542  			for _, decomp := range []bool{false, true} {
   543  				b.Run(fmt.Sprintf("decomp=%t", decomp), func(b *testing.B) {
   544  					for i := 0; i < b.N; i++ {
   545  						hashes, _ := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16))
   546  						if len(hashes) != n {
   547  							b.Fatalf("unexpected number of hashes: %d != %d", len(hashes), len(statements))
   548  						}
   549  					}
   550  				})
   551  			}
   552  		})
   553  	}
   554  }