github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/regexp_test.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package index
    22  
    23  import (
    24  	"fmt"
    25  	"regexp/syntax"
    26  	"strings"
    27  	"testing"
    28  	"unicode"
    29  
    30  	"github.com/m3db/m3/src/x/tallytest"
    31  
    32  	"github.com/stretchr/testify/assert"
    33  	"github.com/stretchr/testify/require"
    34  	"github.com/uber-go/tally"
    35  )
    36  
    37  func TestEnsureSyntaxPerlTreatsAnchorsAsTextTerminator(t *testing.T) {
    38  	// Test to ensure future compatibility with changes in `regexp/syntax`.
    39  	//
    40  	// We require that '^' and '$' only match input terminating characters (i.e.
    41  	// text boundaries, not line boundaries within the input). The line of code
    42  	// below ensures that syntax.Perl does the same.
    43  	require.NotZero(t, syntax.Perl&syntax.OneLine)
    44  
    45  	// ensure our `parseRegexp` internal function uses the right flags too.
    46  	re, err := parseRegexp(".*")
    47  	require.NoError(t, err)
    48  	require.NotZero(t, re.Flags&syntax.OneLine)
    49  }
    50  
    51  func TestEnsureRegexpUnachoredee(t *testing.T) {
    52  	ast, err := parseRegexp("(?:^abc$){0,4}")
    53  	require.NoError(t, err)
    54  	pprintAst(ast)
    55  	println(fmt.Sprintf("%v", dumpRegexp(ast)))
    56  }
    57  
    58  func TestEnsureRegexpUnachored(t *testing.T) {
    59  	testCases := []testCase{
    60  		{
    61  			name:           "naked ^",
    62  			input:          "^",
    63  			expectedOutput: "emp{}",
    64  		},
    65  		{
    66  			name:           "naked $",
    67  			input:          "$",
    68  			expectedOutput: "emp{}",
    69  		},
    70  		{
    71  			name:           "empty string ^$",
    72  			input:          "^$",
    73  			expectedOutput: "cat{}",
    74  		},
    75  		{
    76  			name:           "invalid naked concat ^$",
    77  			input:          "$^",
    78  			expectedOutput: "cat{eot{}bot{}}",
    79  		},
    80  		{
    81  			name:           "simple case of ^",
    82  			input:          "^abc",
    83  			expectedOutput: "str{abc}",
    84  		},
    85  		{
    86  			name:           "simple case of $",
    87  			input:          "abc$",
    88  			expectedOutput: "str{abc}",
    89  		},
    90  		{
    91  			name:           "simple case of both ^ & $",
    92  			input:          "^abc$",
    93  			expectedOutput: "str{abc}",
    94  		},
    95  		{
    96  			name:           "weird case of internal ^",
    97  			input:          "^a^bc$",
    98  			expectedOutput: "cat{lit{a}bot{}str{bc}}",
    99  		},
   100  		{
   101  			name:           "weird case of internal $",
   102  			input:          "^a$bc$",
   103  			expectedOutput: "cat{lit{a}eot{}str{bc}}",
   104  		},
   105  		{
   106  			name:           "alternate of sub expressions with only legal ^ and $",
   107  			input:          "(?:^abc$)|(?:^xyz$)",
   108  			expectedOutput: "alt{str{abc}str{xyz}}",
   109  		},
   110  		{
   111  			name:           "concat of sub expressions with only legal ^ and $",
   112  			input:          "(^abc$)(?:^xyz$)",
   113  			expectedOutput: "cat{cap{cat{str{abc}eot{}}}bot{}str{xyz}}",
   114  		},
   115  		{
   116  			name:           "alternate of sub expressions with illegal ^ and $",
   117  			input:          "(?:^a$bc$)|(?:^xyz$)",
   118  			expectedOutput: "alt{cat{lit{a}eot{}str{bc}}str{xyz}}",
   119  		},
   120  		{
   121  			name:           "concat of sub expressions with illegal ^ and $",
   122  			input:          "(?:^a$bc$)(?:^xyz$)",
   123  			expectedOutput: "cat{lit{a}eot{}str{bc}eot{}bot{}str{xyz}}",
   124  		},
   125  		{
   126  			name:           "question mark case both boundaries success",
   127  			input:          "(?:^abc$)?",
   128  			expectedOutput: "que{str{abc}}",
   129  		},
   130  		{
   131  			name:           "question mark case only ^",
   132  			input:          "(?:^abc)?",
   133  			expectedOutput: "que{str{abc}}",
   134  		},
   135  		{
   136  			name:           "question mark case only $",
   137  			input:          "(?:abc$)?",
   138  			expectedOutput: "que{str{abc}}",
   139  		},
   140  		{
   141  			name:           "question concat case $",
   142  			input:          "abc$?",
   143  			expectedOutput: "str{abc}",
   144  		},
   145  		{
   146  			name:           "star mark case both boundaries success",
   147  			input:          "(?:^abc$)*",
   148  			expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}eot{}}}}",
   149  		},
   150  		{
   151  			name:           "star mark case only ^",
   152  			input:          "(?:^abc)*",
   153  			expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}}}}",
   154  		},
   155  		{
   156  			name:           "star mark case only $",
   157  			input:          "(?:abc$)*",
   158  			expectedOutput: "cat{que{str{abc}}star{cat{str{abc}eot{}}}}",
   159  		},
   160  		{
   161  			name:           "star concat case $",
   162  			input:          "abc$*",
   163  			expectedOutput: "cat{str{abc}star{eot{}}}",
   164  		},
   165  		{
   166  			name:           "star concat case ^",
   167  			input:          "^*abc",
   168  			expectedOutput: "cat{star{bot{}}str{abc}}",
   169  		},
   170  		{
   171  			name:           "plus mark case both boundaries success",
   172  			input:          "(?:^abc$)+",
   173  			expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}eot{}}}}",
   174  		},
   175  		{
   176  			name:           "plus mark case with capturing group",
   177  			input:          "(^abc$)+",
   178  			expectedOutput: "cat{cap{str{abc}}star{cap{cat{bot{}str{abc}eot{}}}}}",
   179  		},
   180  		{
   181  			name:           "plus mark case only ^",
   182  			input:          "(?:^abc)+",
   183  			expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}}}}",
   184  		},
   185  		{
   186  			name:           "plus mark case only $",
   187  			input:          "(?:abc$)+",
   188  			expectedOutput: "cat{str{abc}star{cat{str{abc}eot{}}}}",
   189  		},
   190  		{
   191  			name:           "plus concat case $",
   192  			input:          "abc$+",
   193  			expectedOutput: "cat{str{abc}star{eot{}}}",
   194  		},
   195  		{
   196  			name:           "plus concat case ^",
   197  			input:          "^+abc",
   198  			expectedOutput: "cat{star{bot{}}str{abc}}",
   199  		},
   200  		{
   201  			name:           "repeat case both boundaries success",
   202  			input:          "(?:^abc$){3,4}",
   203  			expectedOutput: "cat{str{abc}rep{2,3 cat{bot{}str{abc}eot{}}}}",
   204  		},
   205  		{
   206  			name:           "repeat case unbounded max",
   207  			input:          "(?:^abc$){3,}",
   208  			expectedOutput: "cat{str{abc}rep{2,-1 cat{bot{}str{abc}eot{}}}}",
   209  		},
   210  		{
   211  			name:           "repeat case unbounded max with 1 min",
   212  			input:          "(?:^abc$){1,2}",
   213  			expectedOutput: "cat{str{abc}rep{0,1 cat{bot{}str{abc}eot{}}}}",
   214  		},
   215  		{
   216  			name:           "repeat case unbounded max with 0 min",
   217  			input:          "(?:^abc$){0,2}",
   218  			expectedOutput: "rep{0,2 cat{bot{}str{abc}eot{}}}",
   219  		},
   220  	}
   221  	for _, tc := range testCases {
   222  		t.Run(tc.name, func(t *testing.T) {
   223  			re, err := parseRegexp(tc.input)
   224  			require.NoError(t, err)
   225  			parsed, err := EnsureRegexpUnanchored(re)
   226  			require.NoError(t, err)
   227  			assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed))
   228  		})
   229  	}
   230  }
   231  
   232  func TestEnsureRegexpAnchored(t *testing.T) {
   233  	testCases := []testCase{
   234  		{
   235  			name:           "naked ^",
   236  			input:          "(?:)",
   237  			expectedOutput: "cat{bot{}eot{\\z}}",
   238  		},
   239  		{
   240  			name:           "invalid naked concat ^$",
   241  			input:          "$^",
   242  			expectedOutput: "cat{bot{}eot{}bot{}eot{\\z}}",
   243  		},
   244  		{
   245  			name:           "simple case of literal",
   246  			input:          "abc",
   247  			expectedOutput: "cat{bot{}str{abc}eot{\\z}}",
   248  		},
   249  		{
   250  			name:           "weird case of internal ^",
   251  			input:          "a^bc",
   252  			expectedOutput: "cat{bot{}lit{a}bot{}str{bc}eot{\\z}}",
   253  		},
   254  		{
   255  			name:           "weird case of internal $",
   256  			input:          "a$bc",
   257  			expectedOutput: "cat{bot{}lit{a}eot{}str{bc}eot{\\z}}",
   258  		},
   259  		{
   260  			name:           "alternate of sub expressions with only legal ^ and $",
   261  			input:          "abc|xyz",
   262  			expectedOutput: "cat{bot{}alt{str{abc}str{xyz}}eot{\\z}}",
   263  		},
   264  		{
   265  			name:           "concat of sub expressions with only legal ^ and $",
   266  			input:          "(?:abc)(?:xyz)",
   267  			expectedOutput: "cat{bot{}str{abcxyz}eot{\\z}}",
   268  		},
   269  		{
   270  			name:           "question mark case both boundaries success",
   271  			input:          "(?:abc)?",
   272  			expectedOutput: "cat{bot{}que{str{abc}}eot{\\z}}",
   273  		},
   274  		{
   275  			name:           "star mark case both boundaries success",
   276  			input:          "(?:abc)*",
   277  			expectedOutput: "cat{bot{}star{str{abc}}eot{\\z}}",
   278  		},
   279  		{
   280  			name:           "plus mark case both boundaries success",
   281  			input:          "(?:abc)+",
   282  			expectedOutput: "cat{bot{}plus{str{abc}}eot{\\z}}",
   283  		},
   284  		{
   285  			name:           "repeat case both boundaries success",
   286  			input:          "(?:abc){3,4}",
   287  			expectedOutput: "cat{bot{}str{abc}str{abc}str{abc}que{str{abc}}eot{\\z}}",
   288  		},
   289  	}
   290  	for _, tc := range testCases {
   291  		t.Run(tc.name, func(t *testing.T) {
   292  			re, err := parseRegexp(tc.input)
   293  			require.NoError(t, err)
   294  			parsed := EnsureRegexpAnchored(re)
   295  			assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed))
   296  		})
   297  	}
   298  }
   299  
   300  type testCase struct {
   301  	name           string
   302  	input          string
   303  	expectedOutput string
   304  }
   305  
   306  // nolint
   307  // only used for debugging
   308  func pprintAst(ast *syntax.Regexp) {
   309  	println(fmt.Sprintf("%+v", *ast))
   310  	for i, s := range ast.Sub {
   311  		println(fmt.Sprintf("%d>", i))
   312  		pprintAst(s)
   313  	}
   314  }
   315  
   316  // NB(prateek): adapted from https://golang.org/src/regexp/syntax/parse_test.go#L315
   317  var opNames = []string{
   318  	syntax.OpNoMatch:        "no",
   319  	syntax.OpEmptyMatch:     "emp",
   320  	syntax.OpLiteral:        "lit",
   321  	syntax.OpCharClass:      "cc",
   322  	syntax.OpAnyCharNotNL:   "dnl",
   323  	syntax.OpAnyChar:        "dot",
   324  	syntax.OpBeginLine:      "bol",
   325  	syntax.OpEndLine:        "eol",
   326  	syntax.OpBeginText:      "bot",
   327  	syntax.OpEndText:        "eot",
   328  	syntax.OpWordBoundary:   "wb",
   329  	syntax.OpNoWordBoundary: "nwb",
   330  	syntax.OpCapture:        "cap",
   331  	syntax.OpStar:           "star",
   332  	syntax.OpPlus:           "plus",
   333  	syntax.OpQuest:          "que",
   334  	syntax.OpRepeat:         "rep",
   335  	syntax.OpConcat:         "cat",
   336  	syntax.OpAlternate:      "alt",
   337  }
   338  
   339  // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
   340  // It is used during testing to distinguish between parses that might print
   341  // the same using re's String method.
   342  func dumpRegexp(re *syntax.Regexp) string {
   343  	var b strings.Builder
   344  	dumpRegexpHelper(&b, re)
   345  	return b.String()
   346  }
   347  
   348  func dumpRegexpHelper(b *strings.Builder, re *syntax.Regexp) {
   349  	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
   350  		fmt.Fprintf(b, "op%d", re.Op)
   351  	} else {
   352  		switch re.Op {
   353  		default:
   354  			b.WriteString(opNames[re.Op])
   355  		case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
   356  			if re.Flags&syntax.NonGreedy != 0 {
   357  				b.WriteByte('n')
   358  			}
   359  			b.WriteString(opNames[re.Op])
   360  		case syntax.OpLiteral:
   361  			if len(re.Rune) > 1 {
   362  				b.WriteString("str")
   363  			} else {
   364  				b.WriteString("lit")
   365  			}
   366  			if re.Flags&syntax.FoldCase != 0 {
   367  				for _, r := range re.Rune {
   368  					if unicode.SimpleFold(r) != r {
   369  						b.WriteString("fold")
   370  						break
   371  					}
   372  				}
   373  			}
   374  		}
   375  	}
   376  	b.WriteByte('{')
   377  	switch re.Op {
   378  	case syntax.OpEndText:
   379  		if re.Flags&syntax.WasDollar == 0 {
   380  			b.WriteString(`\z`)
   381  		}
   382  	case syntax.OpLiteral:
   383  		for _, r := range re.Rune {
   384  			b.WriteRune(r)
   385  		}
   386  	case syntax.OpConcat, syntax.OpAlternate:
   387  		for _, sub := range re.Sub {
   388  			dumpRegexpHelper(b, sub)
   389  		}
   390  	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
   391  		dumpRegexpHelper(b, re.Sub[0])
   392  	case syntax.OpRepeat:
   393  		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
   394  		dumpRegexpHelper(b, re.Sub[0])
   395  	case syntax.OpCapture:
   396  		if re.Name != "" {
   397  			b.WriteString(re.Name)
   398  			b.WriteByte(':')
   399  		}
   400  		dumpRegexpHelper(b, re.Sub[0])
   401  	case syntax.OpCharClass:
   402  		sep := ""
   403  		for i := 0; i < len(re.Rune); i += 2 {
   404  			b.WriteString(sep)
   405  			sep = " "
   406  			lo, hi := re.Rune[i], re.Rune[i+1]
   407  			if lo == hi {
   408  				fmt.Fprintf(b, "%#x", lo)
   409  			} else {
   410  				fmt.Fprintf(b, "%#x-%#x", lo, hi)
   411  			}
   412  		}
   413  	}
   414  	b.WriteByte('}')
   415  }
   416  
   417  func TestRegexpCache(t *testing.T) {
   418  	scope := tally.NewTestScope("", nil)
   419  
   420  	SetRegexpCacheOptions(RegexpCacheOptions{Size: 1, Scope: scope})
   421  	defer SetRegexpCacheOptions(RegexpCacheOptions{Size: 0})
   422  
   423  	_, err := CompileRegex([]byte("foo.*bar"))
   424  	require.NoError(t, err)
   425  
   426  	tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.miss", nil)
   427  
   428  	_, err = CompileRegex([]byte("foo.*bar"))
   429  	require.NoError(t, err)
   430  
   431  	tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.hit", nil)
   432  }