github.com/m3db/m3@v1.5.0/src/m3ninx/index/regexp_test.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package index
    22  
    23  import (
    24  	"fmt"
    25  	"regexp/syntax"
    26  	"strings"
    27  	"testing"
    28  	"unicode"
    29  
    30  	"github.com/m3db/m3/src/x/tallytest"
    31  
    32  	"github.com/stretchr/testify/assert"
    33  	"github.com/stretchr/testify/require"
    34  	"github.com/uber-go/tally"
    35  )
    36  
    37  func TestEnsureSyntaxPerlTreatsAnchorsAsTextTerminator(t *testing.T) {
    38  	// Test to ensure future compatibility with changes in `regexp/syntax`.
    39  	//
    40  	// We require that '^' and '$' only match input terminating characters (i.e.
    41  	// text boundaries, not line boundaries within the input). The line of code
    42  	// below ensures that syntax.Perl does the same.
    43  	require.NotZero(t, syntax.Perl&syntax.OneLine)
    44  
    45  	// ensure our `parseRegexp` internal function uses the right flags too.
    46  	re, err := parseRegexp(".*")
    47  	require.NoError(t, err)
    48  	require.NotZero(t, re.Flags&syntax.OneLine)
    49  }
    50  
    51  func TestEnsureRegexpUnachoredee(t *testing.T) {
    52  	ast, err := parseRegexp("(?:^abc$){0,4}")
    53  	require.NoError(t, err)
    54  	pprintAst(ast)
    55  	println(fmt.Sprintf("%v", dumpRegexp(ast)))
    56  }
    57  
    58  func TestEnsureRegexpUnachored(t *testing.T) {
    59  	testCases := []testCase{
    60  		testCase{
    61  			name:           "naked ^",
    62  			input:          "^",
    63  			expectedOutput: "emp{}",
    64  		},
    65  		testCase{
    66  			name:           "naked $",
    67  			input:          "$",
    68  			expectedOutput: "emp{}",
    69  		},
    70  		testCase{
    71  			name:           "empty string ^$",
    72  			input:          "^$",
    73  			expectedOutput: "cat{}",
    74  		},
    75  		testCase{
    76  			name:           "invalid naked concat ^$",
    77  			input:          "$^",
    78  			expectedOutput: "cat{eot{}bot{}}",
    79  		},
    80  		testCase{
    81  			name:           "simple case of ^",
    82  			input:          "^abc",
    83  			expectedOutput: "str{abc}",
    84  		},
    85  		testCase{
    86  			name:           "simple case of $",
    87  			input:          "abc$",
    88  			expectedOutput: "str{abc}",
    89  		},
    90  		testCase{
    91  			name:           "simple case of both ^ & $",
    92  			input:          "^abc$",
    93  			expectedOutput: "str{abc}",
    94  		},
    95  		testCase{
    96  			name:           "weird case of internal ^",
    97  			input:          "^a^bc$",
    98  			expectedOutput: "cat{lit{a}bot{}str{bc}}",
    99  		},
   100  		testCase{
   101  			name:           "weird case of internal $",
   102  			input:          "^a$bc$",
   103  			expectedOutput: "cat{lit{a}eot{}str{bc}}",
   104  		},
   105  		testCase{
   106  			name:           "alternate of sub expressions with only legal ^ and $",
   107  			input:          "(?:^abc$)|(?:^xyz$)",
   108  			expectedOutput: "alt{str{abc}str{xyz}}",
   109  		},
   110  		testCase{
   111  			name:           "concat of sub expressions with only legal ^ and $",
   112  			input:          "(^abc$)(?:^xyz$)",
   113  			expectedOutput: "cat{cap{cat{str{abc}eot{}}}bot{}str{xyz}}",
   114  		},
   115  		testCase{
   116  			name:           "alternate of sub expressions with illegal ^ and $",
   117  			input:          "(?:^a$bc$)|(?:^xyz$)",
   118  			expectedOutput: "alt{cat{lit{a}eot{}str{bc}}str{xyz}}",
   119  		},
   120  		testCase{
   121  			name:           "concat of sub expressions with illegal ^ and $",
   122  			input:          "(?:^a$bc$)(?:^xyz$)",
   123  			expectedOutput: "cat{lit{a}eot{}str{bc}eot{}bot{}str{xyz}}",
   124  		},
   125  		testCase{
   126  			name:           "question mark case both boundaries success",
   127  			input:          "(?:^abc$)?",
   128  			expectedOutput: "que{str{abc}}",
   129  		},
   130  		testCase{
   131  			name:           "question mark case only ^",
   132  			input:          "(?:^abc)?",
   133  			expectedOutput: "que{str{abc}}",
   134  		},
   135  		testCase{
   136  			name:           "question mark case only $",
   137  			input:          "(?:abc$)?",
   138  			expectedOutput: "que{str{abc}}",
   139  		},
   140  		testCase{
   141  			name:           "question concat case $",
   142  			input:          "abc$?",
   143  			expectedOutput: "str{abc}",
   144  		},
   145  		testCase{
   146  			name:           "star mark case both boundaries success",
   147  			input:          "(?:^abc$)*",
   148  			expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}eot{}}}}",
   149  		},
   150  		testCase{
   151  			name:           "star mark case only ^",
   152  			input:          "(?:^abc)*",
   153  			expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}}}}",
   154  		},
   155  		testCase{
   156  			name:           "star mark case only $",
   157  			input:          "(?:abc$)*",
   158  			expectedOutput: "cat{que{str{abc}}star{cat{str{abc}eot{}}}}",
   159  		},
   160  		testCase{
   161  			name:           "star concat case $",
   162  			input:          "abc$*",
   163  			expectedOutput: "cat{str{abc}star{eot{}}}",
   164  		},
   165  		testCase{
   166  			name:           "star concat case ^",
   167  			input:          "^*abc",
   168  			expectedOutput: "cat{star{bot{}}str{abc}}",
   169  		},
   170  		testCase{
   171  			name:           "plus mark case both boundaries success",
   172  			input:          "(?:^abc$)+",
   173  			expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}eot{}}}}",
   174  		},
   175  		testCase{
   176  			name:           "plus mark case with capturing group",
   177  			input:          "(^abc$)+",
   178  			expectedOutput: "cat{cap{str{abc}}star{cap{cat{bot{}str{abc}eot{}}}}}",
   179  		},
   180  		testCase{
   181  			name:           "plus mark case only ^",
   182  			input:          "(?:^abc)+",
   183  			expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}}}}",
   184  		},
   185  		testCase{
   186  			name:           "plus mark case only $",
   187  			input:          "(?:abc$)+",
   188  			expectedOutput: "cat{str{abc}star{cat{str{abc}eot{}}}}",
   189  		},
   190  		testCase{
   191  			name:           "plus concat case $",
   192  			input:          "abc$+",
   193  			expectedOutput: "cat{str{abc}star{eot{}}}",
   194  		},
   195  		testCase{
   196  			name:           "plus concat case ^",
   197  			input:          "^+abc",
   198  			expectedOutput: "cat{star{bot{}}str{abc}}",
   199  		},
   200  		testCase{
   201  			name:           "repeat case both boundaries success",
   202  			input:          "(?:^abc$){3,4}",
   203  			expectedOutput: "cat{str{abc}rep{2,3 cat{bot{}str{abc}eot{}}}}",
   204  		},
   205  		testCase{
   206  			name:           "repeat case unbounded max",
   207  			input:          "(?:^abc$){3,}",
   208  			expectedOutput: "cat{str{abc}rep{2,-1 cat{bot{}str{abc}eot{}}}}",
   209  		},
   210  		testCase{
   211  			name:           "repeat case unbounded max with 1 min",
   212  			input:          "(?:^abc$){1,2}",
   213  			expectedOutput: "cat{str{abc}rep{0,1 cat{bot{}str{abc}eot{}}}}",
   214  		},
   215  		testCase{
   216  			name:           "repeat case unbounded max with 0 min",
   217  			input:          "(?:^abc$){0,2}",
   218  			expectedOutput: "rep{0,2 cat{bot{}str{abc}eot{}}}",
   219  		},
   220  	}
   221  	for _, tc := range testCases {
   222  		t.Run(tc.name, func(t *testing.T) {
   223  			re, err := parseRegexp(tc.input)
   224  			require.NoError(t, err)
   225  			parsed, err := ensureRegexpUnanchored(re)
   226  			require.NoError(t, err)
   227  			assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed))
   228  		})
   229  	}
   230  }
   231  
   232  func TestEnsureRegexpAnchored(t *testing.T) {
   233  	testCases := []testCase{
   234  		testCase{
   235  			name:           "naked ^",
   236  			input:          "(?:)",
   237  			expectedOutput: "cat{bot{}eot{\\z}}",
   238  		},
   239  		testCase{
   240  			name:           "invalid naked concat ^$",
   241  			input:          "$^",
   242  			expectedOutput: "cat{bot{}eot{}bot{}eot{\\z}}",
   243  		},
   244  		testCase{
   245  			name:           "simple case of literal",
   246  			input:          "abc",
   247  			expectedOutput: "cat{bot{}str{abc}eot{\\z}}",
   248  		},
   249  		testCase{
   250  			name:           "weird case of internal ^",
   251  			input:          "a^bc",
   252  			expectedOutput: "cat{bot{}lit{a}bot{}str{bc}eot{\\z}}",
   253  		},
   254  		testCase{
   255  			name:           "weird case of internal $",
   256  			input:          "a$bc",
   257  			expectedOutput: "cat{bot{}lit{a}eot{}str{bc}eot{\\z}}",
   258  		},
   259  		testCase{
   260  			name:           "alternate of sub expressions with only legal ^ and $",
   261  			input:          "abc|xyz",
   262  			expectedOutput: "cat{bot{}alt{str{abc}str{xyz}}eot{\\z}}",
   263  		},
   264  		testCase{
   265  			name:           "concat of sub expressions with only legal ^ and $",
   266  			input:          "(?:abc)(?:xyz)",
   267  			expectedOutput: "cat{bot{}str{abcxyz}eot{\\z}}",
   268  		},
   269  		testCase{
   270  			name:           "question mark case both boundaries success",
   271  			input:          "(?:abc)?",
   272  			expectedOutput: "cat{bot{}que{str{abc}}eot{\\z}}",
   273  		},
   274  		testCase{
   275  			name:           "star mark case both boundaries success",
   276  			input:          "(?:abc)*",
   277  			expectedOutput: "cat{bot{}star{str{abc}}eot{\\z}}",
   278  		},
   279  		testCase{
   280  			name:           "plus mark case both boundaries success",
   281  			input:          "(?:abc)+",
   282  			expectedOutput: "cat{bot{}plus{str{abc}}eot{\\z}}",
   283  		},
   284  		testCase{
   285  			name:           "repeat case both boundaries success",
   286  			input:          "(?:abc){3,4}",
   287  			expectedOutput: "cat{bot{}str{abc}str{abc}str{abc}que{str{abc}}eot{\\z}}",
   288  		},
   289  	}
   290  	for _, tc := range testCases {
   291  		t.Run(tc.name, func(t *testing.T) {
   292  			re, err := parseRegexp(tc.input)
   293  			require.NoError(t, err)
   294  			parsed, err := ensureRegexpAnchored(re)
   295  			require.NoError(t, err)
   296  			assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed))
   297  		})
   298  	}
   299  }
   300  
   301  type testCase struct {
   302  	name           string
   303  	input          string
   304  	expectedOutput string
   305  }
   306  
   307  // nolint
   308  // only used for debugging
   309  func pprintAst(ast *syntax.Regexp) {
   310  	println(fmt.Sprintf("%+v", *ast))
   311  	for i, s := range ast.Sub {
   312  		println(fmt.Sprintf("%d>", i))
   313  		pprintAst(s)
   314  	}
   315  }
   316  
   317  // NB(prateek): adapted from https://golang.org/src/regexp/syntax/parse_test.go#L315
   318  var opNames = []string{
   319  	syntax.OpNoMatch:        "no",
   320  	syntax.OpEmptyMatch:     "emp",
   321  	syntax.OpLiteral:        "lit",
   322  	syntax.OpCharClass:      "cc",
   323  	syntax.OpAnyCharNotNL:   "dnl",
   324  	syntax.OpAnyChar:        "dot",
   325  	syntax.OpBeginLine:      "bol",
   326  	syntax.OpEndLine:        "eol",
   327  	syntax.OpBeginText:      "bot",
   328  	syntax.OpEndText:        "eot",
   329  	syntax.OpWordBoundary:   "wb",
   330  	syntax.OpNoWordBoundary: "nwb",
   331  	syntax.OpCapture:        "cap",
   332  	syntax.OpStar:           "star",
   333  	syntax.OpPlus:           "plus",
   334  	syntax.OpQuest:          "que",
   335  	syntax.OpRepeat:         "rep",
   336  	syntax.OpConcat:         "cat",
   337  	syntax.OpAlternate:      "alt",
   338  }
   339  
   340  // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
   341  // It is used during testing to distinguish between parses that might print
   342  // the same using re's String method.
   343  func dumpRegexp(re *syntax.Regexp) string {
   344  	var b strings.Builder
   345  	dumpRegexpHelper(&b, re)
   346  	return b.String()
   347  }
   348  
   349  func dumpRegexpHelper(b *strings.Builder, re *syntax.Regexp) {
   350  	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
   351  		fmt.Fprintf(b, "op%d", re.Op)
   352  	} else {
   353  		switch re.Op {
   354  		default:
   355  			b.WriteString(opNames[re.Op])
   356  		case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
   357  			if re.Flags&syntax.NonGreedy != 0 {
   358  				b.WriteByte('n')
   359  			}
   360  			b.WriteString(opNames[re.Op])
   361  		case syntax.OpLiteral:
   362  			if len(re.Rune) > 1 {
   363  				b.WriteString("str")
   364  			} else {
   365  				b.WriteString("lit")
   366  			}
   367  			if re.Flags&syntax.FoldCase != 0 {
   368  				for _, r := range re.Rune {
   369  					if unicode.SimpleFold(r) != r {
   370  						b.WriteString("fold")
   371  						break
   372  					}
   373  				}
   374  			}
   375  		}
   376  	}
   377  	b.WriteByte('{')
   378  	switch re.Op {
   379  	case syntax.OpEndText:
   380  		if re.Flags&syntax.WasDollar == 0 {
   381  			b.WriteString(`\z`)
   382  		}
   383  	case syntax.OpLiteral:
   384  		for _, r := range re.Rune {
   385  			b.WriteRune(r)
   386  		}
   387  	case syntax.OpConcat, syntax.OpAlternate:
   388  		for _, sub := range re.Sub {
   389  			dumpRegexpHelper(b, sub)
   390  		}
   391  	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
   392  		dumpRegexpHelper(b, re.Sub[0])
   393  	case syntax.OpRepeat:
   394  		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
   395  		dumpRegexpHelper(b, re.Sub[0])
   396  	case syntax.OpCapture:
   397  		if re.Name != "" {
   398  			b.WriteString(re.Name)
   399  			b.WriteByte(':')
   400  		}
   401  		dumpRegexpHelper(b, re.Sub[0])
   402  	case syntax.OpCharClass:
   403  		sep := ""
   404  		for i := 0; i < len(re.Rune); i += 2 {
   405  			b.WriteString(sep)
   406  			sep = " "
   407  			lo, hi := re.Rune[i], re.Rune[i+1]
   408  			if lo == hi {
   409  				fmt.Fprintf(b, "%#x", lo)
   410  			} else {
   411  				fmt.Fprintf(b, "%#x-%#x", lo, hi)
   412  			}
   413  		}
   414  	}
   415  	b.WriteByte('}')
   416  }
   417  
   418  func TestRegexpCache(t *testing.T) {
   419  	scope := tally.NewTestScope("", nil)
   420  
   421  	SetRegexpCacheOptions(RegexpCacheOptions{Size: 1, Scope: scope})
   422  	defer SetRegexpCacheOptions(RegexpCacheOptions{Size: 0})
   423  
   424  	_, err := CompileRegex([]byte("foo.*bar"))
   425  	require.NoError(t, err)
   426  
   427  	tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.miss", nil)
   428  
   429  	_, err = CompileRegex([]byte("foo.*bar"))
   430  	require.NoError(t, err)
   431  
   432  	tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.hit", nil)
   433  }