github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/chunker/chunk.go (about)

     1  /*
     2   * Copyright 2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package chunker
    18  
    19  import (
    20  	"bufio"
    21  	"bytes"
    22  	"compress/gzip"
    23  	encjson "encoding/json"
    24  	"io"
    25  	"net/http"
    26  	"os"
    27  	"path/filepath"
    28  	"strings"
    29  	"unicode"
    30  
    31  	"github.com/dgraph-io/dgo/x"
    32  	"github.com/dgraph-io/dgraph/lex"
    33  
    34  	"github.com/pkg/errors"
    35  )
    36  
// Chunker describes the interface to parse and process the input to the live and bulk loaders.
type Chunker interface {
	// Chunk reads the next portion of raw input from r and returns it in a new buffer.
	// The implementations in this file return io.EOF together with the final chunk once
	// the input is exhausted, and a nil buffer with any other error.
	Chunk(r *bufio.Reader) (*bytes.Buffer, error)
	// Parse converts a chunk produced by Chunk into N-Quads. The implementations in this
	// file store the results in the buffer returned by NQuads and treat a nil or empty
	// chunkBuf as a no-op.
	Parse(chunkBuf *bytes.Buffer) error
	// NQuads returns the buffer that this chunker fills while parsing.
	NQuads() *NQuadBuffer
}
    43  
// rdfChunker is the Chunker implementation for RDF input.
type rdfChunker struct {
	// lexer is handed to ParseRDF for every line and is reused across Parse calls.
	lexer *lex.Lexer
	// nqs receives the N-Quads produced by Parse.
	nqs   *NQuadBuffer
}
    48  
// NQuads returns the buffer into which Parse pushes the parsed N-Quads.
func (rc *rdfChunker) NQuads() *NQuadBuffer {
	return rc.nqs
}
    52  
// jsonChunker is the Chunker implementation for JSON input.
type jsonChunker struct {
	// nqs receives the N-Quads produced by Parse.
	nqs    *NQuadBuffer
	// inList is set by Chunk when it reads a leading '[', i.e. the input is a JSON list
	// of maps; Chunk then requires a closing ']' before the end of the input.
	inList bool
}
    57  
// NQuads returns the buffer that Parse fills via ParseJSON.
func (jc *jsonChunker) NQuads() *NQuadBuffer {
	return jc.nqs
}
    61  
// InputFormat represents the multiple formats supported by Chunker.
type InputFormat byte

const (
	// UnknownFormat is a constant to denote a format not supported by the bulk/live loaders.
	// It is the zero value of InputFormat.
	UnknownFormat InputFormat = iota
	// RdfFormat is a constant to denote the input to the live/bulk loader is in the RDF format.
	RdfFormat
	// JsonFormat is a constant to denote the input to the live/bulk loader is in the JSON format.
	JsonFormat
)
    73  
    74  // NewChunker returns a new chunker for the specified format.
    75  func NewChunker(inputFormat InputFormat, batchSize int) Chunker {
    76  	switch inputFormat {
    77  	case RdfFormat:
    78  		return &rdfChunker{
    79  			nqs:   NewNQuadBuffer(batchSize),
    80  			lexer: &lex.Lexer{},
    81  		}
    82  	case JsonFormat:
    83  		return &jsonChunker{
    84  			nqs: NewNQuadBuffer(batchSize),
    85  		}
    86  	default:
    87  		panic("unknown input format")
    88  	}
    89  }
    90  
    91  // Chunk reads the input line by line until one of the following 3 conditions happens
    92  // 1) the EOF is reached
    93  // 2) 1e5 lines have been read
    94  // 3) some unexpected error happened
    95  func (*rdfChunker) Chunk(r *bufio.Reader) (*bytes.Buffer, error) {
    96  	batch := new(bytes.Buffer)
    97  	batch.Grow(1 << 20)
    98  	for lineCount := 0; lineCount < 1e5; lineCount++ {
    99  		slc, err := r.ReadSlice('\n')
   100  		if err == io.EOF {
   101  			batch.Write(slc)
   102  			return batch, err
   103  		}
   104  		if err == bufio.ErrBufferFull {
   105  			// This should only happen infrequently.
   106  			batch.Write(slc)
   107  			var str string
   108  			str, err = r.ReadString('\n')
   109  			if err == io.EOF {
   110  				batch.WriteString(str)
   111  				return batch, err
   112  			}
   113  			if err != nil {
   114  				return nil, err
   115  			}
   116  			batch.WriteString(str)
   117  			continue
   118  		}
   119  		if err != nil {
   120  			return nil, err
   121  		}
   122  		batch.Write(slc)
   123  	}
   124  	return batch, nil
   125  }
   126  
   127  // Parse is not thread-safe. Only call it serially, because it reuses lexer object.
   128  func (rc *rdfChunker) Parse(chunkBuf *bytes.Buffer) error {
   129  	if chunkBuf == nil || chunkBuf.Len() == 0 {
   130  		return nil
   131  	}
   132  
   133  	for chunkBuf.Len() > 0 {
   134  		str, err := chunkBuf.ReadString('\n')
   135  		if err != nil && err != io.EOF {
   136  			x.Check(err)
   137  		}
   138  
   139  		nq, err := ParseRDF(str, rc.lexer)
   140  		if err == ErrEmpty {
   141  			continue // blank line or comment
   142  		} else if err != nil {
   143  			return errors.Wrapf(err, "while parsing line %q", str)
   144  		}
   145  		rc.nqs.Push(&nq)
   146  	}
   147  	return nil
   148  }
   149  
   150  // Chunk tries to consume multiple top-level maps from the reader until a size threshold is
   151  // reached, or the end of file is reached.
   152  func (jc *jsonChunker) Chunk(r *bufio.Reader) (*bytes.Buffer, error) {
   153  	ch, err := jc.nextRune(r)
   154  	if err != nil {
   155  		return nil, err
   156  	}
   157  	// If the file starts with a list rune [, we set the inList flag, and keep consuming maps
   158  	// until we reach the threshold.
   159  	if ch == '[' {
   160  		jc.inList = true
   161  	} else if ch == '{' {
   162  		// put the rune back for it to be consumed in the consumeMap function
   163  		if err := r.UnreadRune(); err != nil {
   164  			return nil, err
   165  		}
   166  	} else {
   167  		return nil, errors.Errorf("file is not JSON")
   168  	}
   169  
   170  	out := new(bytes.Buffer)
   171  	out.WriteRune('[')
   172  	hasMapsBefore := false
   173  	for out.Len() < 1e5 {
   174  		if hasMapsBefore {
   175  			out.WriteRune(',')
   176  		}
   177  		if err := jc.consumeMap(r, out); err != nil {
   178  			return nil, err
   179  		}
   180  		hasMapsBefore = true
   181  
   182  		// handle the legal termination cases, by checking the next rune after the map
   183  		ch, err := jc.nextRune(r)
   184  		if err == io.EOF {
   185  			// handles the EOF case, return the buffer which represents the top level map
   186  			if jc.inList {
   187  				return nil, errors.Errorf("JSON file ends abruptly, expecting ]")
   188  			}
   189  
   190  			out.WriteRune(']')
   191  			return out, io.EOF
   192  		} else if err != nil {
   193  			return nil, err
   194  		}
   195  
   196  		if ch == ']' {
   197  			if !jc.inList {
   198  				return nil, errors.Errorf("JSON map is followed by an extraneous ]")
   199  			}
   200  
   201  			// validate that there are no more non-space chars after the ]
   202  			if slurpSpace(r) != io.EOF {
   203  				return nil, errors.New("Not all of JSON file consumed")
   204  			}
   205  
   206  			out.WriteRune(']')
   207  			return out, io.EOF
   208  		}
   209  
   210  		// In the non termination cases, ensure at least one map has been consumed, and
   211  		// the only allowed char after the map is ",".
   212  		if out.Len() == 1 { // 1 represents the [ inserted before the for loop
   213  			return nil, errors.Errorf("Illegal rune found \"%c\", expecting {", ch)
   214  		}
   215  		if ch != ',' {
   216  			return nil, errors.Errorf("JSON map is followed by illegal rune \"%c\"", ch)
   217  		}
   218  	}
   219  	out.WriteRune(']')
   220  	return out, nil
   221  }
   222  
   223  // consumeMap consumes the next map from the reader, and stores the result into the buffer out.
   224  // After ignoring spaces, if the reader does not begin with {, no rune will be consumed
   225  // from the reader.
   226  func (jc *jsonChunker) consumeMap(r *bufio.Reader, out *bytes.Buffer) error {
   227  	// Just find the matching closing brace. Let the JSON-to-nquad parser in the mapper worry
   228  	// about whether everything in between is valid JSON or not.
   229  	depth := 0
   230  	for {
   231  		ch, err := jc.nextRune(r)
   232  		if err != nil {
   233  			return errors.New("Malformed JSON")
   234  		}
   235  		if depth == 0 && ch != '{' {
   236  			// We encountered a beginning rune that's not {,
   237  			// unread the char and return without consuming anything.
   238  			if err := r.UnreadRune(); err != nil {
   239  				return err
   240  			}
   241  			return nil
   242  		}
   243  
   244  		x.Check2(out.WriteRune(ch))
   245  		switch ch {
   246  		case '{':
   247  			depth++
   248  		case '}':
   249  			depth--
   250  		case '"':
   251  			if err := slurpQuoted(r, out); err != nil {
   252  				return err
   253  			}
   254  		default:
   255  			// We just write the rune to out, and let the Go JSON parser do its job.
   256  		}
   257  		if depth <= 0 {
   258  			break
   259  		}
   260  	}
   261  	return nil
   262  }
   263  
   264  // nextRune ignores any number of spaces that may precede a rune
   265  func (*jsonChunker) nextRune(r *bufio.Reader) (rune, error) {
   266  	if err := slurpSpace(r); err != nil {
   267  		return ' ', err
   268  	}
   269  	ch, _, err := r.ReadRune()
   270  	if err != nil {
   271  		return ' ', err
   272  	}
   273  	return ch, nil
   274  }
   275  
   276  func (jc *jsonChunker) Parse(chunkBuf *bytes.Buffer) error {
   277  	if chunkBuf == nil || chunkBuf.Len() == 0 {
   278  		return nil
   279  	}
   280  
   281  	err := jc.nqs.ParseJSON(chunkBuf.Bytes(), SetNquads)
   282  	return err
   283  }
   284  
   285  func slurpSpace(r *bufio.Reader) error {
   286  	for {
   287  		ch, _, err := r.ReadRune()
   288  		if err != nil {
   289  			return err
   290  		}
   291  		if !unicode.IsSpace(ch) {
   292  			x.Check(r.UnreadRune())
   293  			return nil
   294  		}
   295  	}
   296  }
   297  
   298  func slurpQuoted(r *bufio.Reader, out *bytes.Buffer) error {
   299  	for {
   300  		ch, _, err := r.ReadRune()
   301  		if err != nil {
   302  			return err
   303  		}
   304  		x.Check2(out.WriteRune(ch))
   305  
   306  		if ch == '\\' {
   307  			// Pick one more rune.
   308  			esc, _, err := r.ReadRune()
   309  			if err != nil {
   310  				return err
   311  			}
   312  			x.Check2(out.WriteRune(esc))
   313  			continue
   314  		}
   315  		if ch == '"' {
   316  			return nil
   317  		}
   318  	}
   319  }
   320  
   321  // FileReader returns an open reader and file on the given file. Gzip-compressed input is detected
   322  // and decompressed automatically even without the gz extension. The caller is responsible for
   323  // calling the returned cleanup function when done with the reader.
   324  func FileReader(file string) (rd *bufio.Reader, cleanup func()) {
   325  	var f *os.File
   326  	var err error
   327  	if file == "-" {
   328  		f = os.Stdin
   329  	} else {
   330  		f, err = os.Open(file)
   331  	}
   332  
   333  	x.Check(err)
   334  
   335  	cleanup = func() { f.Close() }
   336  
   337  	if filepath.Ext(file) == ".gz" {
   338  		gzr, err := gzip.NewReader(f)
   339  		x.Check(err)
   340  		rd = bufio.NewReader(gzr)
   341  		cleanup = func() { f.Close(); gzr.Close() }
   342  	} else {
   343  		rd = bufio.NewReader(f)
   344  		buf, _ := rd.Peek(512)
   345  
   346  		typ := http.DetectContentType(buf)
   347  		if typ == "application/x-gzip" {
   348  			gzr, err := gzip.NewReader(rd)
   349  			x.Check(err)
   350  			rd = bufio.NewReader(gzr)
   351  			cleanup = func() { f.Close(); gzr.Close() }
   352  		}
   353  	}
   354  
   355  	return rd, cleanup
   356  }
   357  
   358  // IsJSONData returns true if the reader, which should be at the start of the stream, is reading
   359  // a JSON stream, false otherwise.
   360  func IsJSONData(r *bufio.Reader) (bool, error) {
   361  	buf, err := r.Peek(512)
   362  	if err != nil && err != io.EOF {
   363  		return false, err
   364  	}
   365  
   366  	de := encjson.NewDecoder(bytes.NewReader(buf))
   367  	_, err = de.Token()
   368  
   369  	return err == nil, nil
   370  }
   371  
   372  // DataFormat returns a file's data format (RDF, JSON, or unknown) based on the filename
   373  // or the user-provided format option. The file extension has precedence.
   374  func DataFormat(filename string, format string) InputFormat {
   375  	format = strings.ToLower(format)
   376  	filename = strings.TrimSuffix(strings.ToLower(filename), ".gz")
   377  	switch {
   378  	case strings.HasSuffix(filename, ".rdf") || format == "rdf":
   379  		return RdfFormat
   380  	case strings.HasSuffix(filename, ".json") || format == "json":
   381  		return JsonFormat
   382  	default:
   383  		return UnknownFormat
   384  	}
   385  }