github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/rdf.go (about)

     1  // Copyright ©2020 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate ragel -Z -G2 parse.rl
     6  //go:generate ragel -Z -G2 extract.rl
     7  //go:generate ragel -Z -G2 check.rl
     8  //go:generate stringer -type=Kind
     9  
    10  package rdf
    11  
    12  import (
    13  	"bufio"
    14  	"bytes"
    15  	"errors"
    16  	"fmt"
    17  	"io"
    18  	"net/url"
    19  	"strconv"
    20  	"strings"
    21  	"unicode"
    22  	"unicode/utf8"
    23  
    24  	"github.com/gopherd/gonum/graph"
    25  )
    26  
// Compile-time assertions that Term can be used as a graph.Node and
// that *Statement can be used as both a graph.Edge and a graph.Line.
var (
	_ graph.Node = Term{}
	_ graph.Edge = (*Statement)(nil)
	_ graph.Line = (*Statement)(nil)
)
    32  
// Sentinel errors for N-Quad statements and individual terms.
// ErrInvalidTerm is wrapped with %w by checkIRIText, so it can be
// detected with errors.Is.
// NOTE(review): the remaining values are presumably returned or wrapped
// by the generated parsers — confirm against parse.rl, extract.rl and
// check.rl.
var (
	ErrInvalid        = errors.New("invalid N-Quad")
	ErrIncomplete     = errors.New("incomplete N-Quad")
	ErrInvalidTerm    = errors.New("invalid term")
	ErrIncompleteTerm = errors.New("incomplete term")
)
    39  
// Kind represents the kind of an RDF term.
type Kind int

const (
	// Invalid is an invalid RDF term. It is the zero value of Kind.
	Invalid Kind = iota

	// IRI is the kind of an IRI term.
	// https://www.w3.org/TR/n-quads/#sec-iri
	IRI

	// Literal is the kind of an RDF literal.
	// https://www.w3.org/TR/n-quads/#sec-literals
	Literal

	// Blank is the kind of an RDF blank node term.
	// https://www.w3.org/TR/n-quads/#BNodes
	Blank
)
    59  
// Term is an RDF term. It implements the graph.Node interface.
type Term struct {
	// Value is the text value of term. The constructors in this
	// package store it in serialized form: prefixed with "_:" for
	// blank nodes, wrapped in angle brackets for IRIs, and wrapped
	// in double quotes with any qualifier appended for literals.
	Value string

	// UID is the unique ID for the term
	// in a collection of RDF terms. It is the value returned by
	// the ID method.
	UID int64
}
    69  
    70  // NewBlankTerm returns a Term based on the provided RDF blank node
    71  // label. The label should not include the "_:" prefix. The returned
    72  // Term will not have the UID set.
    73  func NewBlankTerm(label string) (Term, error) {
    74  	err := checkLabelText([]rune(label))
    75  	if err != nil {
    76  		return Term{}, err
    77  	}
    78  	return Term{Value: blankPrefix + label}, nil
    79  }
    80  
    81  const blankPrefix = "_:"
    82  
    83  func isBlank(s string) bool {
    84  	return strings.HasPrefix(s, blankPrefix)
    85  }
    86  
    87  // NewIRITerm returns a Term based on the provided IRI which must
    88  // be valid and include a scheme. The returned Term will not have
    89  // the UID set.
    90  func NewIRITerm(iri string) (Term, error) {
    91  	err := checkIRIText(iri)
    92  	if err != nil {
    93  		return Term{}, err
    94  	}
    95  	return Term{Value: escape("<", iri, ">")}, nil
    96  }
    97  
    98  func isIRI(s string) bool {
    99  	return strings.HasPrefix(s, "<") && strings.HasSuffix(s, ">")
   100  }
   101  
   102  // NewLiteralTerm returns a Term based on the literal text and an
   103  // optional qualifier which may either be a "@"-prefixed language
   104  // tag or a valid IRI. The text will be escaped if necessary and quoted,
   105  // and if an IRI is given it will be escaped if necessary. The returned
   106  // Term will not have the UID set.
   107  func NewLiteralTerm(text, qual string) (Term, error) {
   108  	text = escape(`"`, text, `"`)
   109  	if qual == "" {
   110  		return Term{Value: text}, nil
   111  	}
   112  	if strings.HasPrefix(qual, "@") {
   113  		err := checkLangText([]byte(qual))
   114  		if err != nil {
   115  			return Term{}, err
   116  		}
   117  		return Term{Value: text + qual}, nil
   118  	}
   119  	err := checkIRIText(qual)
   120  	if err != nil {
   121  		return Term{}, err
   122  	}
   123  	return Term{Value: text + escape("^^<", qual, ">")}, nil
   124  }
   125  
   126  func checkIRIText(iri string) error {
   127  	switch u, err := url.Parse(iri); {
   128  	case err != nil:
   129  		return err
   130  	case u.Scheme == "":
   131  		return fmt.Errorf("rdf: %w: relative IRI ref %q", ErrInvalidTerm, iri)
   132  	default:
   133  		return nil
   134  	}
   135  }
   136  
   137  func isLiteral(s string) bool {
   138  	return strings.HasPrefix(s, `"`) && strings.HasSuffix(s, `"`)
   139  }
   140  
   141  // Parts returns the parts of the term and the kind of the term.
   142  // IRI node text is returned as a valid IRI with the quoting angle
   143  // brackets removed and escape sequences interpreted, and blank
   144  // nodes are stripped of the "_:" prefix.
   145  // When the term is a literal, qual will either be empty, an unescaped
   146  // IRI, or an RDF language tag prefixed with an @ symbol. The literal
   147  // text is returned unquoted and unescaped.
   148  func (t Term) Parts() (text, qual string, kind Kind, err error) {
   149  	return extract([]rune(t.Value))
   150  }
   151  
   152  // ID returns the value of the Term's UID field.
   153  func (t Term) ID() int64 { return t.UID }
   154  
// Statement is an RDF statement. It implements the graph.Edge and graph.Line
// interfaces.
//
// Subject and Object are the nodes returned by From and To respectively.
// The UID of Predicate is returned by ID. Label is optional: String omits
// it from the formatted statement when its Value is empty.
type Statement struct {
	Subject   Term
	Predicate Term
	Object    Term
	Label     Term
}
   163  
   164  // String returns the RDF 1.1 N-Quad formatted statement.
   165  func (s *Statement) String() string {
   166  	if s.Label.Value == "" {
   167  		return fmt.Sprintf("%s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value)
   168  	}
   169  	return fmt.Sprintf("%s %s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value)
   170  }
   171  
   172  // From returns the subject of the statement.
   173  func (s *Statement) From() graph.Node { return s.Subject }
   174  
   175  // To returns the object of the statement.
   176  func (s *Statement) To() graph.Node { return s.Object }
   177  
   178  // ID returns the UID of the Predicate field.
   179  func (s *Statement) ID() int64 { return s.Predicate.UID }
   180  
   181  // ReversedEdge returns the receiver unaltered. If there is a semantically
   182  // valid edge reversal operation for the data, the user should implement
   183  // this by wrapping Statement in a type performing that operation.
   184  // See the ReversedLine example for details.
   185  func (s *Statement) ReversedEdge() graph.Edge { return s }
   186  
   187  // ReversedLine returns the receiver unaltered. If there is a semantically
   188  // valid line reversal operation for the data, the user should implement
   189  // this by wrapping Statement in a type performing that operation.
   190  func (s *Statement) ReversedLine() graph.Line { return s }
   191  
   192  // ParseNQuad parses the statement and returns the corresponding Statement.
   193  // All Term UID fields are zero on return.
   194  func ParseNQuad(statement string) (*Statement, error) {
   195  	s, err := parse([]rune(statement))
   196  	if err != nil {
   197  		return nil, err
   198  	}
   199  	return &s, err
   200  }
   201  
// Decoder is an RDF stream decoder. Statements returned by calls to the
// Unmarshal method have their Terms' UID fields set so that unique terms
// will have unique IDs and so can be used directly in a graph.Multi, or
// in a graph.Graph if all predicate terms are identical. IDs created by
// the decoder all exist within a single namespace and so Terms can be
// uniquely identified by their UID. Term UIDs are based from 1 to allow
// RDF-aware client graphs to assign ID if no ID has been assigned.
type Decoder struct {
	// scanner supplies the input one token at a time; it is created
	// with bufio.NewScanner and its default split function.
	scanner *bufio.Scanner

	// strings interns term text for the current input; it is replaced
	// by Reset and released once the input is exhausted. ids maps term
	// text to its assigned UID and is retained across calls to Reset.
	strings store
	ids     map[string]int64
}
   215  
   216  // NewDecoder returns a new Decoder that takes input from r.
   217  func NewDecoder(r io.Reader) *Decoder {
   218  	return &Decoder{
   219  		scanner: bufio.NewScanner(r),
   220  		strings: make(store),
   221  		ids:     make(map[string]int64),
   222  	}
   223  }
   224  
   225  // Reset resets the decoder to use the provided io.Reader, retaining
   226  // the existing Term ID mapping.
   227  func (dec *Decoder) Reset(r io.Reader) {
   228  	dec.scanner = bufio.NewScanner(r)
   229  	dec.strings = make(store)
   230  	if dec.ids == nil {
   231  		dec.ids = make(map[string]int64)
   232  	}
   233  }
   234  
   235  // Unmarshal returns the next statement from the input stream.
   236  func (dec *Decoder) Unmarshal() (*Statement, error) {
   237  	for dec.scanner.Scan() {
   238  		data := bytes.TrimSpace(dec.scanner.Bytes())
   239  		if len(data) == 0 || data[0] == '#' {
   240  			continue
   241  		}
   242  
   243  		s, err := ParseNQuad(string(data))
   244  		if err != nil {
   245  			return nil, fmt.Errorf("rdf: failed to parse %q: %w", data, err)
   246  		}
   247  		if s == nil {
   248  			continue
   249  		}
   250  
   251  		s.Subject.Value = dec.strings.intern(s.Subject.Value)
   252  		s.Predicate.Value = dec.strings.intern(s.Predicate.Value)
   253  		s.Object.Value = dec.strings.intern(s.Object.Value)
   254  		s.Subject.UID = dec.idFor(s.Subject.Value)
   255  		s.Object.UID = dec.idFor(s.Object.Value)
   256  		s.Predicate.UID = dec.idFor(s.Predicate.Value)
   257  		if s.Label.Value != "" {
   258  			s.Label.Value = dec.strings.intern(s.Label.Value)
   259  			s.Label.UID = dec.idFor(s.Label.Value)
   260  		}
   261  		return s, nil
   262  	}
   263  	dec.strings = nil
   264  	err := dec.scanner.Err()
   265  	if err != nil {
   266  		return nil, err
   267  	}
   268  	return nil, io.EOF
   269  }
   270  
   271  func (dec *Decoder) idFor(s string) int64 {
   272  	id, ok := dec.ids[s]
   273  	if ok {
   274  		return id
   275  	}
   276  	id = int64(len(dec.ids)) + 1
   277  	dec.ids[s] = id
   278  	return id
   279  }
   280  
   281  // Terms returns the mapping between terms and graph node IDs constructed
   282  // during decoding the RDF statement stream.
   283  func (dec *Decoder) Terms() map[string]int64 {
   284  	return dec.ids
   285  }
   286  
   287  // store is a string internment implementation.
   288  type store map[string]string
   289  
   290  // intern returns an interned version of the parameter.
   291  func (is store) intern(s string) string {
   292  	if s == "" {
   293  		return ""
   294  	}
   295  
   296  	if len(s) < 2 || len(s) > 512 {
   297  		// Not enough benefit on average with real data.
   298  		return s
   299  	}
   300  
   301  	t, ok := is[s]
   302  	if ok {
   303  		return t
   304  	}
   305  	is[s] = s
   306  	return s
   307  }
   308  
   309  func escape(lq, s, rq string) string {
   310  	var buf strings.Builder
   311  	if lq != "" {
   312  		buf.WriteString(lq)
   313  	}
   314  	for _, r := range s {
   315  		var c byte
   316  		switch r {
   317  		case '\n':
   318  			c = 'n'
   319  		case '\r':
   320  			c = 'r'
   321  		case '"', '\\':
   322  			c = byte(r)
   323  		default:
   324  			const hex = "0123456789abcdef"
   325  			switch {
   326  			case r <= unicode.MaxASCII || strconv.IsPrint(r):
   327  				buf.WriteRune(r)
   328  			case r > utf8.MaxRune:
   329  				r = 0xFFFD
   330  				fallthrough
   331  			case r < 0x10000:
   332  				buf.WriteString("\\u")
   333  				for s := 12; s >= 0; s -= 4 {
   334  					buf.WriteByte(hex[r>>uint(s)&0xf])
   335  				}
   336  			default:
   337  				buf.WriteString("\\U")
   338  				for s := 28; s >= 0; s -= 4 {
   339  					buf.WriteByte(hex[r>>uint(s)&0xf])
   340  				}
   341  			}
   342  			continue
   343  		}
   344  		buf.Write([]byte{'\\', c})
   345  	}
   346  	if rq != "" {
   347  		buf.WriteString(rq)
   348  	}
   349  	return buf.String()
   350  }
   351  
   352  func unEscape(r []rune) string {
   353  	var buf strings.Builder
   354  	for i := 0; i < len(r); {
   355  		switch r[i] {
   356  		case '\\':
   357  			i++
   358  			var c byte
   359  			switch r[i] {
   360  			case 't':
   361  				c = '\t'
   362  			case 'b':
   363  				c = '\b'
   364  			case 'n':
   365  				c = '\n'
   366  			case 'r':
   367  				c = '\r'
   368  			case 'f':
   369  				c = '\f'
   370  			case '"':
   371  				c = '"'
   372  			case '\\':
   373  				c = '\\'
   374  			case '\'':
   375  				c = '\''
   376  			case 'u':
   377  				rc, err := strconv.ParseInt(string(r[i+1:i+5]), 16, 32)
   378  				if err != nil {
   379  					panic(fmt.Errorf("internal parser error: %w", err))
   380  				}
   381  				buf.WriteRune(rune(rc))
   382  				i += 5
   383  				continue
   384  			case 'U':
   385  				rc, err := strconv.ParseInt(string(r[i+1:i+9]), 16, 32)
   386  				if err != nil {
   387  					panic(fmt.Errorf("internal parser error: %w", err))
   388  				}
   389  				buf.WriteRune(rune(rc))
   390  				i += 9
   391  				continue
   392  			}
   393  			buf.WriteByte(c)
   394  		default:
   395  			buf.WriteRune(r[i])
   396  		}
   397  		i++
   398  	}
   399  
   400  	return buf.String()
   401  }