github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/graph/formats/rdf/rdf.go (about)

     1  // Copyright ©2020 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate ragel -Z -G2 parse.rl
     6  //go:generate ragel -Z -G2 extract.rl
     7  //go:generate ragel -Z -G2 check.rl
     8  //go:generate stringer -type=Kind
     9  
    10  package rdf
    11  
    12  import (
    13  	"bufio"
    14  	"bytes"
    15  	"errors"
    16  	"fmt"
    17  	"io"
    18  	"net/url"
    19  	"strconv"
    20  	"strings"
    21  	"unicode"
    22  	"unicode/utf8"
    23  
    24  	"github.com/jingcheng-WU/gonum/graph"
    25  )
    26  
    27  var (
    28  	_ graph.Node = Term{}
    29  	_ graph.Edge = (*Statement)(nil)
    30  	_ graph.Line = (*Statement)(nil)
    31  )
    32  
    33  var (
    34  	ErrInvalid        = errors.New("invalid N-Quad")
    35  	ErrIncomplete     = errors.New("incomplete N-Quad")
    36  	ErrInvalidTerm    = errors.New("invalid term")
    37  	ErrIncompleteTerm = errors.New("incomplete term")
    38  )
    39  
    40  // Kind represents the kind of an RDF term.
    41  type Kind int
    42  
    43  const (
    44  	// Invalid is an invalid RDF term.
    45  	Invalid Kind = iota
    46  
    47  	// IRI is the kind of an IRI term.
    48  	// https://www.w3.org/TR/n-quads/#sec-iri
    49  	IRI
    50  
    51  	// Literal is the kind of an RDF literal.
    52  	// https://www.w3.org/TR/n-quads/#sec-literals
    53  	Literal
    54  
    55  	// Blank is the kind of an RDF blank node term.
    56  	// https://www.w3.org/TR/n-quads/#BNodes
    57  	Blank
    58  )
    59  
    60  // Term is an RDF term. It implements the graph.Node interface.
    61  type Term struct {
    62  	// Value is the text value of term.
    63  	Value string
    64  
    65  	// UID is the unique ID for the term
    66  	// in a collection of RDF terms.
    67  	UID int64
    68  }
    69  
    70  // NewBlankTerm returns a Term based on the provided RDF blank node
    71  // label. The label should not include the "_:" prefix. The returned
    72  // Term will not have the UID set.
    73  func NewBlankTerm(label string) (Term, error) {
    74  	err := checkLabelText([]rune(label))
    75  	if err != nil {
    76  		return Term{}, err
    77  	}
    78  	return Term{Value: "_:" + label}, nil
    79  }
    80  
    81  // NewIRITerm returns a Term based on the provided IRI which must
    82  // be valid and include a scheme. The returned Term will not have
    83  // the UID set.
    84  func NewIRITerm(iri string) (Term, error) {
    85  	err := checkIRIText(iri)
    86  	if err != nil {
    87  		return Term{}, err
    88  	}
    89  	return Term{Value: escape("<", iri, ">")}, nil
    90  }
    91  
    92  // NewLiteralTerm returns a Term based on the literal text and an
    93  // optional qualifier which may either be a "@"-prefixed language
    94  // tag or a valid IRI. The text will be escaped if necessary and quoted,
    95  // and if an IRI is given it will be escaped if necessary. The returned
    96  // Term will not have the UID set.
    97  func NewLiteralTerm(text, qual string) (Term, error) {
    98  	text = escape(`"`, text, `"`)
    99  	if qual == "" {
   100  		return Term{Value: text}, nil
   101  	}
   102  	if strings.HasPrefix(qual, "@") {
   103  		err := checkLangText([]byte(qual))
   104  		if err != nil {
   105  			return Term{}, err
   106  		}
   107  		return Term{Value: text + qual}, nil
   108  	}
   109  	err := checkIRIText(qual)
   110  	if err != nil {
   111  		return Term{}, err
   112  	}
   113  	return Term{Value: text + escape("^^<", qual, ">")}, nil
   114  }
   115  
   116  func checkIRIText(iri string) error {
   117  	switch u, err := url.Parse(iri); {
   118  	case err != nil:
   119  		return err
   120  	case u.Scheme == "":
   121  		return fmt.Errorf("rdf: %w: relative IRI ref %q", ErrInvalidTerm, iri)
   122  	default:
   123  		return nil
   124  	}
   125  }
   126  
   127  // Parts returns the pars of the term and the kind of the term.
   128  // IRI node text is returned as a valid IRI with the quoting angle
   129  // brackets removed and escape sequences interpreted, and blank
   130  // nodes are stripped of the "_:" prefix.
   131  // When the term is a literal, qual will either be empty, an unescaped
   132  // IRI, or an RDF language tag prefixed with an @ symbol. The literal
   133  // text is returned unquoted and unescaped.
   134  func (t Term) Parts() (text, qual string, kind Kind, err error) {
   135  	return extract([]rune(t.Value))
   136  }
   137  
   138  // ID returns the value of the Term's UID field.
   139  func (t Term) ID() int64 { return t.UID }
   140  
   141  // Statement is an RDF statement. It implements the graph.Edge and graph.Line
   142  // interfaces.
   143  type Statement struct {
   144  	Subject   Term
   145  	Predicate Term
   146  	Object    Term
   147  	Label     Term
   148  }
   149  
   150  // String returns the RDF 1.1 N-Quad formatted statement.
   151  func (s *Statement) String() string {
   152  	if s.Label.Value == "" {
   153  		return fmt.Sprintf("%s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value)
   154  	}
   155  	return fmt.Sprintf("%s %s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value)
   156  }
   157  
   158  // From returns the subject of the statement.
   159  func (s *Statement) From() graph.Node { return s.Subject }
   160  
   161  // To returns the object of the statement.
   162  func (s *Statement) To() graph.Node { return s.Object }
   163  
   164  // ID returns the UID of the Predicate field.
   165  func (s *Statement) ID() int64 { return s.Predicate.UID }
   166  
   167  // ReversedEdge returns the receiver unaltered. If there is a semantically
   168  // valid edge reversal operation for the data, the user should implement
   169  // this by wrapping Statement in a type performing that operation.
   170  // See the ReversedLine example for details.
   171  func (s *Statement) ReversedEdge() graph.Edge { return s }
   172  
   173  // ReversedLine returns the receiver unaltered. If there is a semantically
   174  // valid line reversal operation for the data, the user should implement
   175  // this by wrapping Statement in a type performing that operation.
   176  func (s *Statement) ReversedLine() graph.Line { return s }
   177  
   178  // ParseNQuad parses the statement and returns the corresponding Statement.
   179  // All Term UID fields are zero on return.
   180  func ParseNQuad(statement string) (*Statement, error) {
   181  	s, err := parse([]rune(statement))
   182  	if err != nil {
   183  		return nil, err
   184  	}
   185  	return &s, err
   186  }
   187  
   188  // Decoder is an RDF stream decoder. Statements returned by calls to the
   189  // Unmarshal method have their Terms' UID fields set so that unique terms
   190  // will have unique IDs and so can be used directly in a graph.Multi, or
   191  // in a graph.Graph if all predicate terms are identical. IDs created by
   192  // the decoder all exist within a single namespace and so Terms can be
   193  // uniquely identified by their UID. Term UIDs are based from 1 to allow
   194  // RDF-aware client graphs to assign ID if no ID has been assigned.
   195  type Decoder struct {
   196  	scanner *bufio.Scanner
   197  
   198  	strings store
   199  	ids     map[string]int64
   200  }
   201  
   202  // NewDecoder returns a new Decoder that takes input from r.
   203  func NewDecoder(r io.Reader) *Decoder {
   204  	return &Decoder{
   205  		scanner: bufio.NewScanner(r),
   206  		strings: make(store),
   207  		ids:     make(map[string]int64),
   208  	}
   209  }
   210  
   211  // Reset resets the decoder to use the provided io.Reader, retaining
   212  // the existing Term ID mapping.
   213  func (dec *Decoder) Reset(r io.Reader) {
   214  	dec.scanner = bufio.NewScanner(r)
   215  	dec.strings = make(store)
   216  	if dec.ids == nil {
   217  		dec.ids = make(map[string]int64)
   218  	}
   219  }
   220  
   221  // Unmarshal returns the next statement from the input stream.
   222  func (dec *Decoder) Unmarshal() (*Statement, error) {
   223  	for dec.scanner.Scan() {
   224  		data := bytes.TrimSpace(dec.scanner.Bytes())
   225  		if len(data) == 0 || data[0] == '#' {
   226  			continue
   227  		}
   228  
   229  		s, err := ParseNQuad(string(data))
   230  		if err != nil {
   231  			return nil, fmt.Errorf("rdf: failed to parse %q: %w", data, err)
   232  		}
   233  		if s == nil {
   234  			continue
   235  		}
   236  
   237  		s.Subject.Value = dec.strings.intern(s.Subject.Value)
   238  		s.Predicate.Value = dec.strings.intern(s.Predicate.Value)
   239  		s.Object.Value = dec.strings.intern(s.Object.Value)
   240  		s.Subject.UID = dec.idFor(s.Subject.Value)
   241  		s.Object.UID = dec.idFor(s.Object.Value)
   242  		s.Predicate.UID = dec.idFor(s.Predicate.Value)
   243  		if s.Label.Value != "" {
   244  			s.Label.Value = dec.strings.intern(s.Label.Value)
   245  			s.Label.UID = dec.idFor(s.Label.Value)
   246  		}
   247  		return s, nil
   248  	}
   249  	dec.strings = nil
   250  	err := dec.scanner.Err()
   251  	if err != nil {
   252  		return nil, err
   253  	}
   254  	return nil, io.EOF
   255  }
   256  
   257  func (dec *Decoder) idFor(s string) int64 {
   258  	id, ok := dec.ids[s]
   259  	if ok {
   260  		return id
   261  	}
   262  	id = int64(len(dec.ids)) + 1
   263  	dec.ids[s] = id
   264  	return id
   265  }
   266  
   267  // Terms returns the mapping between terms and graph node IDs constructed
   268  // during decoding the RDF statement stream.
   269  func (dec *Decoder) Terms() map[string]int64 {
   270  	return dec.ids
   271  }
   272  
   273  // store is a string internment implementation.
   274  type store map[string]string
   275  
   276  // intern returns an interned version of the parameter.
   277  func (is store) intern(s string) string {
   278  	if s == "" {
   279  		return ""
   280  	}
   281  
   282  	if len(s) < 2 || len(s) > 512 {
   283  		// Not enough benefit on average with real data.
   284  		return s
   285  	}
   286  
   287  	t, ok := is[s]
   288  	if ok {
   289  		return t
   290  	}
   291  	is[s] = s
   292  	return s
   293  }
   294  
   295  func escape(lq, s, rq string) string {
   296  	var buf strings.Builder
   297  	if lq != "" {
   298  		buf.WriteString(lq)
   299  	}
   300  	for _, r := range s {
   301  		var c byte
   302  		switch r {
   303  		case '\n':
   304  			c = 'n'
   305  		case '\r':
   306  			c = 'r'
   307  		case '"', '\\':
   308  			c = byte(r)
   309  		default:
   310  			const hex = "0123456789abcdef"
   311  			switch {
   312  			case r <= unicode.MaxASCII || strconv.IsPrint(r):
   313  				buf.WriteRune(r)
   314  			case r > utf8.MaxRune:
   315  				r = 0xFFFD
   316  				fallthrough
   317  			case r < 0x10000:
   318  				buf.WriteString("\\u")
   319  				for s := 12; s >= 0; s -= 4 {
   320  					buf.WriteByte(hex[r>>uint(s)&0xf])
   321  				}
   322  			default:
   323  				buf.WriteString("\\U")
   324  				for s := 28; s >= 0; s -= 4 {
   325  					buf.WriteByte(hex[r>>uint(s)&0xf])
   326  				}
   327  			}
   328  			continue
   329  		}
   330  		buf.Write([]byte{'\\', c})
   331  	}
   332  	if rq != "" {
   333  		buf.WriteString(rq)
   334  	}
   335  	return buf.String()
   336  }
   337  
   338  func unEscape(r []rune) string {
   339  	var buf strings.Builder
   340  	for i := 0; i < len(r); {
   341  		switch r[i] {
   342  		case '\\':
   343  			i++
   344  			var c byte
   345  			switch r[i] {
   346  			case 't':
   347  				c = '\t'
   348  			case 'b':
   349  				c = '\b'
   350  			case 'n':
   351  				c = '\n'
   352  			case 'r':
   353  				c = '\r'
   354  			case 'f':
   355  				c = '\f'
   356  			case '"':
   357  				c = '"'
   358  			case '\\':
   359  				c = '\\'
   360  			case '\'':
   361  				c = '\''
   362  			case 'u':
   363  				rc, err := strconv.ParseInt(string(r[i+1:i+5]), 16, 32)
   364  				if err != nil {
   365  					panic(fmt.Errorf("internal parser error: %w", err))
   366  				}
   367  				buf.WriteRune(rune(rc))
   368  				i += 5
   369  				continue
   370  			case 'U':
   371  				rc, err := strconv.ParseInt(string(r[i+1:i+9]), 16, 32)
   372  				if err != nil {
   373  					panic(fmt.Errorf("internal parser error: %w", err))
   374  				}
   375  				buf.WriteRune(rune(rc))
   376  				i += 9
   377  				continue
   378  			}
   379  			buf.WriteByte(c)
   380  		default:
   381  			buf.WriteRune(r[i])
   382  		}
   383  		i++
   384  	}
   385  
   386  	return buf.String()
   387  }