github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/graph/formats/rdf/rdf.go (about) 1 // Copyright ©2020 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate ragel -Z -G2 parse.rl 6 //go:generate ragel -Z -G2 extract.rl 7 //go:generate ragel -Z -G2 check.rl 8 //go:generate stringer -type=Kind 9 10 package rdf 11 12 import ( 13 "bufio" 14 "bytes" 15 "errors" 16 "fmt" 17 "io" 18 "net/url" 19 "strconv" 20 "strings" 21 "unicode" 22 "unicode/utf8" 23 24 "github.com/jingcheng-WU/gonum/graph" 25 ) 26 27 var ( 28 _ graph.Node = Term{} 29 _ graph.Edge = (*Statement)(nil) 30 _ graph.Line = (*Statement)(nil) 31 ) 32 33 var ( 34 ErrInvalid = errors.New("invalid N-Quad") 35 ErrIncomplete = errors.New("incomplete N-Quad") 36 ErrInvalidTerm = errors.New("invalid term") 37 ErrIncompleteTerm = errors.New("incomplete term") 38 ) 39 40 // Kind represents the kind of an RDF term. 41 type Kind int 42 43 const ( 44 // Invalid is an invalid RDF term. 45 Invalid Kind = iota 46 47 // IRI is the kind of an IRI term. 48 // https://www.w3.org/TR/n-quads/#sec-iri 49 IRI 50 51 // Literal is the kind of an RDF literal. 52 // https://www.w3.org/TR/n-quads/#sec-literals 53 Literal 54 55 // Blank is the kind of an RDF blank node term. 56 // https://www.w3.org/TR/n-quads/#BNodes 57 Blank 58 ) 59 60 // Term is an RDF term. It implements the graph.Node interface. 61 type Term struct { 62 // Value is the text value of term. 63 Value string 64 65 // UID is the unique ID for the term 66 // in a collection of RDF terms. 67 UID int64 68 } 69 70 // NewBlankTerm returns a Term based on the provided RDF blank node 71 // label. The label should not include the "_:" prefix. The returned 72 // Term will not have the UID set. 73 func NewBlankTerm(label string) (Term, error) { 74 err := checkLabelText([]rune(label)) 75 if err != nil { 76 return Term{}, err 77 } 78 return Term{Value: "_:" + label}, nil 79 } 80 81 // NewIRITerm returns a Term based on the provided IRI which must 82 // be valid and include a scheme. The returned Term will not have 83 // the UID set. 84 func NewIRITerm(iri string) (Term, error) { 85 err := checkIRIText(iri) 86 if err != nil { 87 return Term{}, err 88 } 89 return Term{Value: escape("<", iri, ">")}, nil 90 } 91 92 // NewLiteralTerm returns a Term based on the literal text and an 93 // optional qualifier which may either be a "@"-prefixed language 94 // tag or a valid IRI. The text will be escaped if necessary and quoted, 95 // and if an IRI is given it will be escaped if necessary. The returned 96 // Term will not have the UID set. 97 func NewLiteralTerm(text, qual string) (Term, error) { 98 text = escape(`"`, text, `"`) 99 if qual == "" { 100 return Term{Value: text}, nil 101 } 102 if strings.HasPrefix(qual, "@") { 103 err := checkLangText([]byte(qual)) 104 if err != nil { 105 return Term{}, err 106 } 107 return Term{Value: text + qual}, nil 108 } 109 err := checkIRIText(qual) 110 if err != nil { 111 return Term{}, err 112 } 113 return Term{Value: text + escape("^^<", qual, ">")}, nil 114 } 115 116 func checkIRIText(iri string) error { 117 switch u, err := url.Parse(iri); { 118 case err != nil: 119 return err 120 case u.Scheme == "": 121 return fmt.Errorf("rdf: %w: relative IRI ref %q", ErrInvalidTerm, iri) 122 default: 123 return nil 124 } 125 } 126 127 // Parts returns the pars of the term and the kind of the term. 128 // IRI node text is returned as a valid IRI with the quoting angle 129 // brackets removed and escape sequences interpreted, and blank 130 // nodes are stripped of the "_:" prefix. 131 // When the term is a literal, qual will either be empty, an unescaped 132 // IRI, or an RDF language tag prefixed with an @ symbol. The literal 133 // text is returned unquoted and unescaped. 134 func (t Term) Parts() (text, qual string, kind Kind, err error) { 135 return extract([]rune(t.Value)) 136 } 137 138 // ID returns the value of the Term's UID field. 139 func (t Term) ID() int64 { return t.UID } 140 141 // Statement is an RDF statement. It implements the graph.Edge and graph.Line 142 // interfaces. 143 type Statement struct { 144 Subject Term 145 Predicate Term 146 Object Term 147 Label Term 148 } 149 150 // String returns the RDF 1.1 N-Quad formatted statement. 151 func (s *Statement) String() string { 152 if s.Label.Value == "" { 153 return fmt.Sprintf("%s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value) 154 } 155 return fmt.Sprintf("%s %s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value) 156 } 157 158 // From returns the subject of the statement. 159 func (s *Statement) From() graph.Node { return s.Subject } 160 161 // To returns the object of the statement. 162 func (s *Statement) To() graph.Node { return s.Object } 163 164 // ID returns the UID of the Predicate field. 165 func (s *Statement) ID() int64 { return s.Predicate.UID } 166 167 // ReversedEdge returns the receiver unaltered. If there is a semantically 168 // valid edge reversal operation for the data, the user should implement 169 // this by wrapping Statement in a type performing that operation. 170 // See the ReversedLine example for details. 171 func (s *Statement) ReversedEdge() graph.Edge { return s } 172 173 // ReversedLine returns the receiver unaltered. If there is a semantically 174 // valid line reversal operation for the data, the user should implement 175 // this by wrapping Statement in a type performing that operation. 176 func (s *Statement) ReversedLine() graph.Line { return s } 177 178 // ParseNQuad parses the statement and returns the corresponding Statement. 179 // All Term UID fields are zero on return. 180 func ParseNQuad(statement string) (*Statement, error) { 181 s, err := parse([]rune(statement)) 182 if err != nil { 183 return nil, err 184 } 185 return &s, err 186 } 187 188 // Decoder is an RDF stream decoder. Statements returned by calls to the 189 // Unmarshal method have their Terms' UID fields set so that unique terms 190 // will have unique IDs and so can be used directly in a graph.Multi, or 191 // in a graph.Graph if all predicate terms are identical. IDs created by 192 // the decoder all exist within a single namespace and so Terms can be 193 // uniquely identified by their UID. Term UIDs are based from 1 to allow 194 // RDF-aware client graphs to assign ID if no ID has been assigned. 195 type Decoder struct { 196 scanner *bufio.Scanner 197 198 strings store 199 ids map[string]int64 200 } 201 202 // NewDecoder returns a new Decoder that takes input from r. 203 func NewDecoder(r io.Reader) *Decoder { 204 return &Decoder{ 205 scanner: bufio.NewScanner(r), 206 strings: make(store), 207 ids: make(map[string]int64), 208 } 209 } 210 211 // Reset resets the decoder to use the provided io.Reader, retaining 212 // the existing Term ID mapping. 213 func (dec *Decoder) Reset(r io.Reader) { 214 dec.scanner = bufio.NewScanner(r) 215 dec.strings = make(store) 216 if dec.ids == nil { 217 dec.ids = make(map[string]int64) 218 } 219 } 220 221 // Unmarshal returns the next statement from the input stream. 222 func (dec *Decoder) Unmarshal() (*Statement, error) { 223 for dec.scanner.Scan() { 224 data := bytes.TrimSpace(dec.scanner.Bytes()) 225 if len(data) == 0 || data[0] == '#' { 226 continue 227 } 228 229 s, err := ParseNQuad(string(data)) 230 if err != nil { 231 return nil, fmt.Errorf("rdf: failed to parse %q: %w", data, err) 232 } 233 if s == nil { 234 continue 235 } 236 237 s.Subject.Value = dec.strings.intern(s.Subject.Value) 238 s.Predicate.Value = dec.strings.intern(s.Predicate.Value) 239 s.Object.Value = dec.strings.intern(s.Object.Value) 240 s.Subject.UID = dec.idFor(s.Subject.Value) 241 s.Object.UID = dec.idFor(s.Object.Value) 242 s.Predicate.UID = dec.idFor(s.Predicate.Value) 243 if s.Label.Value != "" { 244 s.Label.Value = dec.strings.intern(s.Label.Value) 245 s.Label.UID = dec.idFor(s.Label.Value) 246 } 247 return s, nil 248 } 249 dec.strings = nil 250 err := dec.scanner.Err() 251 if err != nil { 252 return nil, err 253 } 254 return nil, io.EOF 255 } 256 257 func (dec *Decoder) idFor(s string) int64 { 258 id, ok := dec.ids[s] 259 if ok { 260 return id 261 } 262 id = int64(len(dec.ids)) + 1 263 dec.ids[s] = id 264 return id 265 } 266 267 // Terms returns the mapping between terms and graph node IDs constructed 268 // during decoding the RDF statement stream. 269 func (dec *Decoder) Terms() map[string]int64 { 270 return dec.ids 271 } 272 273 // store is a string internment implementation. 274 type store map[string]string 275 276 // intern returns an interned version of the parameter. 277 func (is store) intern(s string) string { 278 if s == "" { 279 return "" 280 } 281 282 if len(s) < 2 || len(s) > 512 { 283 // Not enough benefit on average with real data. 284 return s 285 } 286 287 t, ok := is[s] 288 if ok { 289 return t 290 } 291 is[s] = s 292 return s 293 } 294 295 func escape(lq, s, rq string) string { 296 var buf strings.Builder 297 if lq != "" { 298 buf.WriteString(lq) 299 } 300 for _, r := range s { 301 var c byte 302 switch r { 303 case '\n': 304 c = 'n' 305 case '\r': 306 c = 'r' 307 case '"', '\\': 308 c = byte(r) 309 default: 310 const hex = "0123456789abcdef" 311 switch { 312 case r <= unicode.MaxASCII || strconv.IsPrint(r): 313 buf.WriteRune(r) 314 case r > utf8.MaxRune: 315 r = 0xFFFD 316 fallthrough 317 case r < 0x10000: 318 buf.WriteString("\\u") 319 for s := 12; s >= 0; s -= 4 { 320 buf.WriteByte(hex[r>>uint(s)&0xf]) 321 } 322 default: 323 buf.WriteString("\\U") 324 for s := 28; s >= 0; s -= 4 { 325 buf.WriteByte(hex[r>>uint(s)&0xf]) 326 } 327 } 328 continue 329 } 330 buf.Write([]byte{'\\', c}) 331 } 332 if rq != "" { 333 buf.WriteString(rq) 334 } 335 return buf.String() 336 } 337 338 func unEscape(r []rune) string { 339 var buf strings.Builder 340 for i := 0; i < len(r); { 341 switch r[i] { 342 case '\\': 343 i++ 344 var c byte 345 switch r[i] { 346 case 't': 347 c = '\t' 348 case 'b': 349 c = '\b' 350 case 'n': 351 c = '\n' 352 case 'r': 353 c = '\r' 354 case 'f': 355 c = '\f' 356 case '"': 357 c = '"' 358 case '\\': 359 c = '\\' 360 case '\'': 361 c = '\'' 362 case 'u': 363 rc, err := strconv.ParseInt(string(r[i+1:i+5]), 16, 32) 364 if err != nil { 365 panic(fmt.Errorf("internal parser error: %w", err)) 366 } 367 buf.WriteRune(rune(rc)) 368 i += 5 369 continue 370 case 'U': 371 rc, err := strconv.ParseInt(string(r[i+1:i+9]), 16, 32) 372 if err != nil { 373 panic(fmt.Errorf("internal parser error: %w", err)) 374 } 375 buf.WriteRune(rune(rc)) 376 i += 9 377 continue 378 } 379 buf.WriteByte(c) 380 default: 381 buf.WriteRune(r[i]) 382 } 383 i++ 384 } 385 386 return buf.String() 387 }