// Source: github.com/gopherd/gonum@v0.0.4/graph/formats/rdf/rdf.go

// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:generate ragel -Z -G2 parse.rl
//go:generate ragel -Z -G2 extract.rl
//go:generate ragel -Z -G2 check.rl
//go:generate stringer -type=Kind

package rdf

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/url"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/gopherd/gonum/graph"
)

// Compile-time assertions that Term satisfies graph.Node and that
// *Statement satisfies both graph.Edge and graph.Line.
var (
	_ graph.Node = Term{}
	_ graph.Edge = (*Statement)(nil)
	_ graph.Line = (*Statement)(nil)
)

// Sentinel errors for N-Quad statements and individual terms. Callers
// can match them with errors.Is. Of these, only ErrInvalidTerm is
// referenced in this file (it is wrapped by checkIRIText).
// NOTE(review): the others are presumably returned by the
// ragel-generated parsers; confirm against parse.rl, extract.rl and
// check.rl.
var (
	ErrInvalid        = errors.New("invalid N-Quad")
	ErrIncomplete     = errors.New("incomplete N-Quad")
	ErrInvalidTerm    = errors.New("invalid term")
	ErrIncompleteTerm = errors.New("incomplete term")
)

// Kind represents the kind of an RDF term. The zero value is Invalid.
type Kind int

const (
	// Invalid is an invalid RDF term.
	Invalid Kind = iota

	// IRI is the kind of an IRI term.
	// https://www.w3.org/TR/n-quads/#sec-iri
	IRI

	// Literal is the kind of an RDF literal.
	// https://www.w3.org/TR/n-quads/#sec-literals
	Literal

	// Blank is the kind of an RDF blank node term.
	// https://www.w3.org/TR/n-quads/#BNodes
	Blank
)

// Term is an RDF term. It implements the graph.Node interface.
type Term struct {
	// Value is the text value of term. The constructors in this
	// package store it in quoted form: an IRI wrapped in angle
	// brackets, a double-quoted literal with an optional language
	// tag or datatype IRI, or a blank node label prefixed with "_:".
	Value string

	// UID is the unique ID for the term
	// in a collection of RDF terms.
	UID int64
}

// NewBlankTerm returns a Term based on the provided RDF blank node
// label. The label should not include the "_:" prefix. The returned
// Term will not have the UID set.
73 func NewBlankTerm(label string) (Term, error) { 74 err := checkLabelText([]rune(label)) 75 if err != nil { 76 return Term{}, err 77 } 78 return Term{Value: blankPrefix + label}, nil 79 } 80 81 const blankPrefix = "_:" 82 83 func isBlank(s string) bool { 84 return strings.HasPrefix(s, blankPrefix) 85 } 86 87 // NewIRITerm returns a Term based on the provided IRI which must 88 // be valid and include a scheme. The returned Term will not have 89 // the UID set. 90 func NewIRITerm(iri string) (Term, error) { 91 err := checkIRIText(iri) 92 if err != nil { 93 return Term{}, err 94 } 95 return Term{Value: escape("<", iri, ">")}, nil 96 } 97 98 func isIRI(s string) bool { 99 return strings.HasPrefix(s, "<") && strings.HasSuffix(s, ">") 100 } 101 102 // NewLiteralTerm returns a Term based on the literal text and an 103 // optional qualifier which may either be a "@"-prefixed language 104 // tag or a valid IRI. The text will be escaped if necessary and quoted, 105 // and if an IRI is given it will be escaped if necessary. The returned 106 // Term will not have the UID set. 
107 func NewLiteralTerm(text, qual string) (Term, error) { 108 text = escape(`"`, text, `"`) 109 if qual == "" { 110 return Term{Value: text}, nil 111 } 112 if strings.HasPrefix(qual, "@") { 113 err := checkLangText([]byte(qual)) 114 if err != nil { 115 return Term{}, err 116 } 117 return Term{Value: text + qual}, nil 118 } 119 err := checkIRIText(qual) 120 if err != nil { 121 return Term{}, err 122 } 123 return Term{Value: text + escape("^^<", qual, ">")}, nil 124 } 125 126 func checkIRIText(iri string) error { 127 switch u, err := url.Parse(iri); { 128 case err != nil: 129 return err 130 case u.Scheme == "": 131 return fmt.Errorf("rdf: %w: relative IRI ref %q", ErrInvalidTerm, iri) 132 default: 133 return nil 134 } 135 } 136 137 func isLiteral(s string) bool { 138 return strings.HasPrefix(s, `"`) && strings.HasSuffix(s, `"`) 139 } 140 141 // Parts returns the parts of the term and the kind of the term. 142 // IRI node text is returned as a valid IRI with the quoting angle 143 // brackets removed and escape sequences interpreted, and blank 144 // nodes are stripped of the "_:" prefix. 145 // When the term is a literal, qual will either be empty, an unescaped 146 // IRI, or an RDF language tag prefixed with an @ symbol. The literal 147 // text is returned unquoted and unescaped. 148 func (t Term) Parts() (text, qual string, kind Kind, err error) { 149 return extract([]rune(t.Value)) 150 } 151 152 // ID returns the value of the Term's UID field. 153 func (t Term) ID() int64 { return t.UID } 154 155 // Statement is an RDF statement. It implements the graph.Edge and graph.Line 156 // interfaces. 157 type Statement struct { 158 Subject Term 159 Predicate Term 160 Object Term 161 Label Term 162 } 163 164 // String returns the RDF 1.1 N-Quad formatted statement. 
165 func (s *Statement) String() string { 166 if s.Label.Value == "" { 167 return fmt.Sprintf("%s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value) 168 } 169 return fmt.Sprintf("%s %s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value) 170 } 171 172 // From returns the subject of the statement. 173 func (s *Statement) From() graph.Node { return s.Subject } 174 175 // To returns the object of the statement. 176 func (s *Statement) To() graph.Node { return s.Object } 177 178 // ID returns the UID of the Predicate field. 179 func (s *Statement) ID() int64 { return s.Predicate.UID } 180 181 // ReversedEdge returns the receiver unaltered. If there is a semantically 182 // valid edge reversal operation for the data, the user should implement 183 // this by wrapping Statement in a type performing that operation. 184 // See the ReversedLine example for details. 185 func (s *Statement) ReversedEdge() graph.Edge { return s } 186 187 // ReversedLine returns the receiver unaltered. If there is a semantically 188 // valid line reversal operation for the data, the user should implement 189 // this by wrapping Statement in a type performing that operation. 190 func (s *Statement) ReversedLine() graph.Line { return s } 191 192 // ParseNQuad parses the statement and returns the corresponding Statement. 193 // All Term UID fields are zero on return. 194 func ParseNQuad(statement string) (*Statement, error) { 195 s, err := parse([]rune(statement)) 196 if err != nil { 197 return nil, err 198 } 199 return &s, err 200 } 201 202 // Decoder is an RDF stream decoder. Statements returned by calls to the 203 // Unmarshal method have their Terms' UID fields set so that unique terms 204 // will have unique IDs and so can be used directly in a graph.Multi, or 205 // in a graph.Graph if all predicate terms are identical. IDs created by 206 // the decoder all exist within a single namespace and so Terms can be 207 // uniquely identified by their UID. 
Term UIDs are based from 1 to allow 208 // RDF-aware client graphs to assign ID if no ID has been assigned. 209 type Decoder struct { 210 scanner *bufio.Scanner 211 212 strings store 213 ids map[string]int64 214 } 215 216 // NewDecoder returns a new Decoder that takes input from r. 217 func NewDecoder(r io.Reader) *Decoder { 218 return &Decoder{ 219 scanner: bufio.NewScanner(r), 220 strings: make(store), 221 ids: make(map[string]int64), 222 } 223 } 224 225 // Reset resets the decoder to use the provided io.Reader, retaining 226 // the existing Term ID mapping. 227 func (dec *Decoder) Reset(r io.Reader) { 228 dec.scanner = bufio.NewScanner(r) 229 dec.strings = make(store) 230 if dec.ids == nil { 231 dec.ids = make(map[string]int64) 232 } 233 } 234 235 // Unmarshal returns the next statement from the input stream. 236 func (dec *Decoder) Unmarshal() (*Statement, error) { 237 for dec.scanner.Scan() { 238 data := bytes.TrimSpace(dec.scanner.Bytes()) 239 if len(data) == 0 || data[0] == '#' { 240 continue 241 } 242 243 s, err := ParseNQuad(string(data)) 244 if err != nil { 245 return nil, fmt.Errorf("rdf: failed to parse %q: %w", data, err) 246 } 247 if s == nil { 248 continue 249 } 250 251 s.Subject.Value = dec.strings.intern(s.Subject.Value) 252 s.Predicate.Value = dec.strings.intern(s.Predicate.Value) 253 s.Object.Value = dec.strings.intern(s.Object.Value) 254 s.Subject.UID = dec.idFor(s.Subject.Value) 255 s.Object.UID = dec.idFor(s.Object.Value) 256 s.Predicate.UID = dec.idFor(s.Predicate.Value) 257 if s.Label.Value != "" { 258 s.Label.Value = dec.strings.intern(s.Label.Value) 259 s.Label.UID = dec.idFor(s.Label.Value) 260 } 261 return s, nil 262 } 263 dec.strings = nil 264 err := dec.scanner.Err() 265 if err != nil { 266 return nil, err 267 } 268 return nil, io.EOF 269 } 270 271 func (dec *Decoder) idFor(s string) int64 { 272 id, ok := dec.ids[s] 273 if ok { 274 return id 275 } 276 id = int64(len(dec.ids)) + 1 277 dec.ids[s] = id 278 return id 279 } 280 281 // Terms 
returns the mapping between terms and graph node IDs constructed 282 // during decoding the RDF statement stream. 283 func (dec *Decoder) Terms() map[string]int64 { 284 return dec.ids 285 } 286 287 // store is a string internment implementation. 288 type store map[string]string 289 290 // intern returns an interned version of the parameter. 291 func (is store) intern(s string) string { 292 if s == "" { 293 return "" 294 } 295 296 if len(s) < 2 || len(s) > 512 { 297 // Not enough benefit on average with real data. 298 return s 299 } 300 301 t, ok := is[s] 302 if ok { 303 return t 304 } 305 is[s] = s 306 return s 307 } 308 309 func escape(lq, s, rq string) string { 310 var buf strings.Builder 311 if lq != "" { 312 buf.WriteString(lq) 313 } 314 for _, r := range s { 315 var c byte 316 switch r { 317 case '\n': 318 c = 'n' 319 case '\r': 320 c = 'r' 321 case '"', '\\': 322 c = byte(r) 323 default: 324 const hex = "0123456789abcdef" 325 switch { 326 case r <= unicode.MaxASCII || strconv.IsPrint(r): 327 buf.WriteRune(r) 328 case r > utf8.MaxRune: 329 r = 0xFFFD 330 fallthrough 331 case r < 0x10000: 332 buf.WriteString("\\u") 333 for s := 12; s >= 0; s -= 4 { 334 buf.WriteByte(hex[r>>uint(s)&0xf]) 335 } 336 default: 337 buf.WriteString("\\U") 338 for s := 28; s >= 0; s -= 4 { 339 buf.WriteByte(hex[r>>uint(s)&0xf]) 340 } 341 } 342 continue 343 } 344 buf.Write([]byte{'\\', c}) 345 } 346 if rq != "" { 347 buf.WriteString(rq) 348 } 349 return buf.String() 350 } 351 352 func unEscape(r []rune) string { 353 var buf strings.Builder 354 for i := 0; i < len(r); { 355 switch r[i] { 356 case '\\': 357 i++ 358 var c byte 359 switch r[i] { 360 case 't': 361 c = '\t' 362 case 'b': 363 c = '\b' 364 case 'n': 365 c = '\n' 366 case 'r': 367 c = '\r' 368 case 'f': 369 c = '\f' 370 case '"': 371 c = '"' 372 case '\\': 373 c = '\\' 374 case '\'': 375 c = '\'' 376 case 'u': 377 rc, err := strconv.ParseInt(string(r[i+1:i+5]), 16, 32) 378 if err != nil { 379 panic(fmt.Errorf("internal parser 
error: %w", err)) 380 } 381 buf.WriteRune(rune(rc)) 382 i += 5 383 continue 384 case 'U': 385 rc, err := strconv.ParseInt(string(r[i+1:i+9]), 16, 32) 386 if err != nil { 387 panic(fmt.Errorf("internal parser error: %w", err)) 388 } 389 buf.WriteRune(rune(rc)) 390 i += 9 391 continue 392 } 393 buf.WriteByte(c) 394 default: 395 buf.WriteRune(r[i]) 396 } 397 i++ 398 } 399 400 return buf.String() 401 }