github.com/matrixorigin/matrixone@v0.7.0/pkg/vectorize/regular/regular_substr.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package regular
    16  
    17  import (
    18  	"regexp"
    19  
    20  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    21  	"github.com/matrixorigin/matrixone/pkg/container/nulls"
    22  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    23  	"github.com/matrixorigin/matrixone/pkg/vm/process"
    24  )
    25  
    26  func RegularSubstr(expr, pat string, pos, occurrence int64, match_type string) ([]string, error) {
    27  	if pos < 1 || occurrence < 1 || pos >= int64(len(expr)) {
    28  		return nil, moerr.NewInvalidInputNoCtx("regexp_substr have invalid input")
    29  	}
    30  	//regular expression pattern
    31  	reg, err := regexp.Compile(pat)
    32  	if err != nil {
    33  		return nil, moerr.NewInvalidArgNoCtx("regexp_substr have invalid regexp pattern arg", pat)
    34  	}
    35  	//match result strings
    36  	matchRes := reg.FindAllString(expr[pos-1:], -1)
    37  	if matchRes == nil || int64(len(matchRes)) < occurrence {
    38  		return nil, nil
    39  	}
    40  	return matchRes, nil
    41  }
    42  
    43  func RegularSubstrWithReg(expr string, pat *regexp.Regexp, pos, occurrence int64, match_type string) ([]string, error) {
    44  	if pos < 1 || occurrence < 1 || pos >= int64(len(expr)) {
    45  		return nil, moerr.NewInvalidInputNoCtx("regexp_substr have invalid input")
    46  	}
    47  	//match result strings
    48  	matchRes := pat.FindAllString(expr[pos-1:], -1)
    49  	if matchRes == nil || int64(len(matchRes)) < occurrence {
    50  		return nil, nil
    51  	}
    52  	return matchRes, nil
    53  }
    54  
    55  func RegularSubstrWithArrays(expr, pat []string, pos, occ []int64, match_type []string, exprN, patN *nulls.Nulls, resultVector *vector.Vector, proc *process.Process, maxLen int) error {
    56  	rs := make([]string, maxLen)
    57  	var posValue int64
    58  	var occValue int64
    59  	if len(expr) == 1 && len(pat) == 1 {
    60  		reg, err := regexp.Compile(pat[0])
    61  		if err != nil {
    62  			return moerr.NewInvalidArgNoCtx("regexp_substr have invalid regexp pattern arg", pat)
    63  		}
    64  		for i := 0; i < maxLen; i++ {
    65  			if nulls.Contains(exprN, uint64(0)) || nulls.Contains(patN, uint64(0)) || pat[0] == "" {
    66  				nulls.Add(resultVector.Nsp, uint64(i))
    67  				continue
    68  			}
    69  			posValue, occValue = determineValuesWithTwo(pos, occ, i)
    70  			res, err := RegularSubstrWithReg(expr[0], reg, posValue, occValue, match_type[0])
    71  			if err != nil {
    72  				return err
    73  			}
    74  			if res == nil {
    75  				nulls.Add(resultVector.Nsp, uint64(i))
    76  				continue
    77  			}
    78  			rs[i] = res[occValue-1]
    79  		}
    80  		vector.AppendString(resultVector, rs, proc.Mp())
    81  	} else if len(expr) == 1 {
    82  		for i := 0; i < maxLen; i++ {
    83  			if nulls.Contains(exprN, uint64(0)) || nulls.Contains(patN, uint64(i)) || pat[i] == "" {
    84  				nulls.Add(resultVector.Nsp, uint64(i))
    85  				continue
    86  			}
    87  			posValue, occValue = determineValuesWithTwo(pos, occ, i)
    88  			res, err := RegularSubstr(expr[0], pat[i], posValue, occValue, match_type[0])
    89  			if err != nil {
    90  				return err
    91  			}
    92  			if res == nil {
    93  				nulls.Add(resultVector.Nsp, uint64(i))
    94  				continue
    95  			}
    96  			rs[i] = res[occValue-1]
    97  		}
    98  		vector.AppendString(resultVector, rs, proc.Mp())
    99  	} else if len(pat) == 1 {
   100  		reg, err := regexp.Compile(pat[0])
   101  		if err != nil {
   102  			return moerr.NewInvalidArgNoCtx("regexp_substr have invalid regexp pattern arg", pat)
   103  		}
   104  		for i := 0; i < maxLen; i++ {
   105  			if nulls.Contains(exprN, uint64(i)) || nulls.Contains(patN, uint64(0)) || pat[0] == "" {
   106  				nulls.Add(resultVector.Nsp, uint64(i))
   107  				continue
   108  			}
   109  			posValue, occValue = determineValuesWithTwo(pos, occ, i)
   110  			res, err := RegularSubstrWithReg(expr[i], reg, posValue, occValue, match_type[0])
   111  			if err != nil {
   112  				return err
   113  			}
   114  			if res == nil {
   115  				nulls.Add(resultVector.Nsp, uint64(i))
   116  				continue
   117  			}
   118  			rs[i] = res[occValue-1]
   119  		}
   120  		vector.AppendString(resultVector, rs, proc.Mp())
   121  	} else {
   122  		for i := 0; i < maxLen; i++ {
   123  			if nulls.Contains(exprN, uint64(i)) || nulls.Contains(patN, uint64(i)) || pat[i] == "" {
   124  				nulls.Add(resultVector.Nsp, uint64(i))
   125  				continue
   126  			}
   127  			posValue, occValue = determineValuesWithTwo(pos, occ, i)
   128  			res, err := RegularSubstr(expr[0], pat[i], posValue, occValue, match_type[0])
   129  			if err != nil {
   130  				return err
   131  			}
   132  			if res == nil {
   133  				nulls.Add(resultVector.Nsp, uint64(i))
   134  				continue
   135  			}
   136  			rs[i] = res[occValue-1]
   137  		}
   138  		vector.AppendString(resultVector, rs, proc.Mp())
   139  	}
   140  	return nil
   141  }
   142  
   143  func determineValuesWithTwo(pos, occ []int64, i int) (int64, int64) {
   144  	var posValue int64
   145  	var occValue int64
   146  
   147  	if len(pos) == 1 {
   148  		posValue = pos[0]
   149  	} else {
   150  		posValue = pos[i]
   151  	}
   152  
   153  	if len(occ) == 1 {
   154  		occValue = occ[0]
   155  	} else {
   156  		occValue = occ[i]
   157  	}
   158  
   159  	return posValue, occValue
   160  }