github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/pgwire/hba/scanner.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package hba
    12  
    13  import (
    14  	"regexp"
    15  	"strings"
    16  
    17  	"github.com/cockroachdb/errors"
    18  )
    19  
    20  // This file contains a scanner for the pg_hba.conf token syntax.
    21  //
    22  // The algorithm used here is as follows: first the input is split
    23  // into lines. Then each line is scanned using a rule-based algorithm.
    24  //
    25  
// rule represents one scanning rule: a regular expression to try at the
// current position in the input, paired with the action to take on a match.
type rule struct {
	// re is the regular expression to match at the current text position.
	// It is compiled and anchored to the start of the input by init() below.
	re string
	// fn is the action function to call if the rule matches. The matched
	// text is provided to fn via l.lexed.
	// - if fn returns foundToken true, the lexer stops scanning and
	//   returns the current token.
	// - a non-nil err stops the scan and returns an error.
	fn func(l *lex) (foundToken bool, err error)
}
    35  
// lex represents the state of the scanner.
// This is not meant to be used in parsing rules.
type lex struct {
	// String is the token being built: rule actions store the matched
	// text in Value (and set Quoted for quoted tokens), and nextToken
	// returns this embedded value as its result.
	String

	// comma is set to true if the last found token was succeeded by a
	// comma.
	comma bool

	// lexed is set to the portion of the text matched by the current
	// rule, and is provided as input to the rule's action function.
	lexed string
}
    49  
    50  // rules describes the scanning rules.
    51  //
    52  // As per pg's source, file src/backend/libpq/hba.c:
    53  //   Tokens are strings of non-blank
    54  //   characters bounded by blank characters, commas, beginning of line, and
    55  //   end of line. Blank means space or tab. Tokens can be delimited by
    56  //   double quotes (this allows the inclusion of blanks, but not newlines).
    57  //
    58  // The scanner implemented here is slightly more strict than the one
    59  // used by PostgreSQL. For example, PostgreSQL supports tokens written
    60  // as: abc"def"geh to represent the single string "abcdefgeh". The
    61  // same input here will yield 3 different tokens "abc", "def"(quoted),
    62  // "geh".
    63  //
    64  // PostgreSQL also accepts including special (control) characters
    65  // inside quoted and unquoted strings, including tabs (\t) and
    66  // carriage returns (\r) inside quoted strings. These are not accepted
    67  // here for the sake of simplicity in the pretty-printer. If a use
    68  // case comes up where they should be accepted, care should be taken
    69  // to implement a new pretty-printer that does not rewrite whitespace
    70  // in HBA strings.
    71  //
    72  // This difference is intended; it makes the implementation simpler
    73  // and the result less surprising.
    74  //
// Meanwhile, the scanner does implement some other oddities of
    76  // PostgreSQL. For example:
    77  //    a, b       (space after comma) counts as a single comma-delimited field.
    78  //    a ,b       (space before comma) counts as two fields.
    79  //
var rules = []struct {
	r  rule
	// rg is the compiled, start-anchored form of r.re (see init below).
	rg *regexp.Regexp
}{
	// Skip blanks (space, tab, CR) and empty comma-delimited fields.
	{r: rule{`[ \t\r,]*` /***********/, func(l *lex) (bool, error) { return false, nil }}},
	// Skip a comment running to the end of the line.
	{r: rule{`#.*$` /****************/, func(l *lex) (bool, error) { return false, nil }}},
	// An unquoted token, optionally followed by a comma.
	{r: rule{`[^[:cntrl:] ",]+,?` /**/, func(l *lex) (bool, error) { l.checkComma(); l.Value = l.lexed; return true, nil }}},
	// A quoted token, optionally followed by a comma.
	{r: rule{`"[^[:cntrl:]"]*",?` /**/, func(l *lex) (bool, error) { l.checkComma(); l.stripQuotes(); l.Value = l.lexed; return true, nil }}},
	// A quote with no closing quote before end of line.
	{r: rule{`"[^"]*$` /*************/, func(l *lex) (bool, error) { return false, errors.New("unterminated quoted string") }}},
	// Reached only when the quoted-token rule above failed: the quoted
	// text contains control characters, which we reject.
	{r: rule{`"[^"]*"` /*************/, func(l *lex) (bool, error) { return false, errors.New("invalid characters in quoted string") }}},
	// Catch-all: any other single character is unsupported.
	{r: rule{`.` /*******************/, func(l *lex) (bool, error) { return false, errors.Newf("unsupported character: %q", l.lexed) }}},
}
    92  
    93  func (l *lex) checkComma() {
    94  	l.comma = l.lexed[len(l.lexed)-1] == ','
    95  	if l.comma {
    96  		l.lexed = l.lexed[:len(l.lexed)-1]
    97  	}
    98  }
    99  
   100  func (l *lex) stripQuotes() {
   101  	l.Quoted = true
   102  	l.lexed = l.lexed[1 : len(l.lexed)-1]
   103  }
   104  
   105  func init() {
   106  	for i := range rules {
   107  		rules[i].rg = regexp.MustCompile("^" + rules[i].r.re)
   108  	}
   109  }
   110  
   111  // nextToken reads the next token from buf. A token is a simple or
   112  // quoted string. If there is no token (e.g. just whitespace), the
   113  // returned token is empty. trailingComma indicates whether the token
   114  // is immediately followed by a comma.
   115  //
   116  // Inspired from pg's src/backend/libpq/hba.c, next_token().
   117  func nextToken(buf string) (remaining string, tok String, trailingComma bool, err error) {
   118  	remaining = buf
   119  	var l lex
   120  outer:
   121  	for remaining != "" {
   122  		l = lex{}
   123  	inner:
   124  		for _, rule := range rules {
   125  			l.lexed = rule.rg.FindString(remaining)
   126  			remaining = remaining[len(l.lexed):]
   127  			if l.lexed != "" {
   128  				var foundToken bool
   129  				foundToken, err = rule.r.fn(&l)
   130  				if foundToken || err != nil {
   131  					break outer
   132  				}
   133  				break inner
   134  			}
   135  		}
   136  	}
   137  	return remaining, l.String, l.comma, err
   138  }
   139  
   140  // nextFieldExpand reads the next comma-separated list of string from buf.
   141  // commas count as separator only when they immediately follow a string.
   142  //
   143  // Inspired from pg's src/backend/libpq/hba.c, next_field_expand().
   144  func nextFieldExpand(buf string) (remaining string, field []String, err error) {
   145  	remaining = buf
   146  	for {
   147  		var trailingComma bool
   148  		var tok String
   149  		remaining, tok, trailingComma, err = nextToken(remaining)
   150  		if tok.Empty() || err != nil {
   151  			return
   152  		}
   153  		field = append(field, tok)
   154  		if !trailingComma {
   155  			break
   156  		}
   157  	}
   158  	return
   159  }
   160  
   161  // tokenize splits the input into tokens.
   162  //
   163  // Inspired from pg's src/backend/libpq/hba.c, tokenize_file().
   164  func tokenize(input string) (res scannedInput, err error) {
   165  	inputLines := strings.Split(input, "\n")
   166  
   167  	for lineIdx, lineS := range inputLines {
   168  		var currentLine hbaLine
   169  		currentLine.input = strings.TrimSpace(lineS)
   170  		for remaining := lineS; remaining != ""; {
   171  			var currentField []String
   172  			remaining, currentField, err = nextFieldExpand(remaining)
   173  			if err != nil {
   174  				return res, errors.Wrapf(err, "line %d", lineIdx+1)
   175  			}
   176  			if len(currentField) > 0 {
   177  				currentLine.tokens = append(currentLine.tokens, currentField)
   178  			}
   179  		}
   180  		if len(currentLine.tokens) > 0 {
   181  			res.lines = append(res.lines, currentLine)
   182  			res.linenos = append(res.linenos, lineIdx+1)
   183  		}
   184  	}
   185  	return res, err
   186  }