github.com/v2fly/v2ray-core/v5@v5.16.2-0.20240507031116-8191faa6e095/common/strmatcher/matchergroup_ac_automation.go (about)

     1  package strmatcher
     2  
     3  import (
     4  	"container/list"
     5  )
     6  
     7  const (
     8  	acValidCharCount = 39 // aA-zZ (26), 0-9 (10), - (1), . (1), invalid(1)
     9  	acMatchTypeCount = 3  // Full, Domain and Substr
    10  )
    11  
    12  type acEdge byte
    13  
    14  const (
    15  	acTrieEdge acEdge = 1
    16  	acFailEdge acEdge = 0
    17  )
    18  
    19  type acNode struct {
    20  	next  [acValidCharCount]uint32 // EdgeIdx -> Next NodeIdx (Next trie node or fail node)
    21  	edge  [acValidCharCount]acEdge // EdgeIdx -> Trie Edge / Fail Edge
    22  	fail  uint32                   // NodeIdx of *next matched* Substr Pattern on its fail path
    23  	match uint32                   // MatchIdx of matchers registered on this node, 0 indicates no match
    24  } // Sizeof acNode: (4+1)*acValidCharCount + <padding> + 4 + 4
    25  
    26  type acValue [acMatchTypeCount][]uint32 // MatcherType -> Registered Matcher Values
    27  
// ACAutomatonMatcherGroup is an implementation of MatcherGroup.
// It uses an AC automaton to provide support for Full, Domain and Substr matchers. Trie nodes are char based,
// and both patterns and inputs are walked from their last byte to their first.
//
// NOTICE: ACAutomatonMatcherGroup currently uses a restricted charset (LDH Subset).
// Callers must ensure that all patterns and inputs passed to it are in this charset;
// every byte outside the charset is mapped to the same "invalid" edge.
type ACAutomatonMatcherGroup struct {
	nodes  []acNode  // NodeIdx -> acNode; index 0 is the root node
	values []acValue // MatchIdx -> acValue; index 0 is a sentinel meaning "no match"
}
    37  
    38  func NewACAutomatonMatcherGroup() *ACAutomatonMatcherGroup {
    39  	ac := new(ACAutomatonMatcherGroup)
    40  	ac.addNode()       // Create root node (NodeIdx 0)
    41  	ac.addMatchEntry() // Create sentinel match entry (MatchIdx 0)
    42  	return ac
    43  }
    44  
    45  // AddFullMatcher implements MatcherGroupForFull.AddFullMatcher.
    46  func (ac *ACAutomatonMatcherGroup) AddFullMatcher(matcher FullMatcher, value uint32) {
    47  	ac.addPattern(0, matcher.Pattern(), matcher.Type(), value)
    48  }
    49  
    50  // AddDomainMatcher implements MatcherGroupForDomain.AddDomainMatcher.
    51  func (ac *ACAutomatonMatcherGroup) AddDomainMatcher(matcher DomainMatcher, value uint32) {
    52  	node := ac.addPattern(0, matcher.Pattern(), matcher.Type(), value) // For full domain match
    53  	ac.addPattern(node, ".", matcher.Type(), value)                    // For partial domain match
    54  }
    55  
    56  // AddSubstrMatcher implements MatcherGroupForSubstr.AddSubstrMatcher.
    57  func (ac *ACAutomatonMatcherGroup) AddSubstrMatcher(matcher SubstrMatcher, value uint32) {
    58  	ac.addPattern(0, matcher.Pattern(), matcher.Type(), value)
    59  }
    60  
    61  func (ac *ACAutomatonMatcherGroup) addPattern(nodeIdx uint32, pattern string, matcherType Type, value uint32) uint32 {
    62  	node := &ac.nodes[nodeIdx]
    63  	for i := len(pattern) - 1; i >= 0; i-- {
    64  		edgeIdx := acCharset[pattern[i]]
    65  		nextIdx := node.next[edgeIdx]
    66  		if nextIdx == 0 { // Add new Trie Edge
    67  			nextIdx = ac.addNode()
    68  			ac.nodes[nodeIdx].next[edgeIdx] = nextIdx
    69  			ac.nodes[nodeIdx].edge[edgeIdx] = acTrieEdge
    70  		}
    71  		nodeIdx = nextIdx
    72  		node = &ac.nodes[nodeIdx]
    73  	}
    74  	if node.match == 0 { // Add new match entry
    75  		node.match = ac.addMatchEntry()
    76  	}
    77  	ac.values[node.match][matcherType] = append(ac.values[node.match][matcherType], value)
    78  	return nodeIdx
    79  }
    80  
    81  func (ac *ACAutomatonMatcherGroup) addNode() uint32 {
    82  	ac.nodes = append(ac.nodes, acNode{})
    83  	return uint32(len(ac.nodes) - 1)
    84  }
    85  
    86  func (ac *ACAutomatonMatcherGroup) addMatchEntry() uint32 {
    87  	ac.values = append(ac.values, acValue{})
    88  	return uint32(len(ac.values) - 1)
    89  }
    90  
    91  func (ac *ACAutomatonMatcherGroup) Build() error {
    92  	fail := make([]uint32, len(ac.nodes))
    93  	queue := list.New()
    94  	for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ {
    95  		if nextIdx := ac.nodes[0].next[edgeIdx]; nextIdx != 0 {
    96  			queue.PushBack(nextIdx)
    97  		}
    98  	}
    99  	for {
   100  		front := queue.Front()
   101  		if front == nil {
   102  			break
   103  		}
   104  		queue.Remove(front)
   105  		nodeIdx := front.Value.(uint32)
   106  		node := &ac.nodes[nodeIdx]           // Current node
   107  		failNode := &ac.nodes[fail[nodeIdx]] // Fail node of currrent node
   108  		for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ {
   109  			nodeIdx := node.next[edgeIdx]     // Next node through trie edge
   110  			failIdx := failNode.next[edgeIdx] // Next node through fail edge
   111  			if nodeIdx != 0 {
   112  				queue.PushBack(nodeIdx)
   113  				fail[nodeIdx] = failIdx
   114  				if match := ac.nodes[failIdx].match; match != 0 && len(ac.values[match][Substr]) > 0 { // Fail node is a Substr match node
   115  					ac.nodes[nodeIdx].fail = failIdx
   116  				} else { // Use path compression to reduce fail path to only contain match nodes
   117  					ac.nodes[nodeIdx].fail = ac.nodes[failIdx].fail
   118  				}
   119  			} else { // Add new fail edge
   120  				node.next[edgeIdx] = failIdx
   121  				node.edge[edgeIdx] = acFailEdge
   122  			}
   123  		}
   124  	}
   125  	return nil
   126  }
   127  
   128  // Match implements MatcherGroup.Match.
   129  func (ac *ACAutomatonMatcherGroup) Match(input string) []uint32 {
   130  	suffixMatches := make([][]uint32, 0, 5)
   131  	substrMatches := make([][]uint32, 0, 5)
   132  	fullMatch := true    // fullMatch indicates no fail edge traversed so far.
   133  	node := &ac.nodes[0] // start from root node.
   134  	// 1. the match string is all through trie edge. FULL MATCH or DOMAIN
   135  	// 2. the match string is through a fail edge. NOT FULL MATCH
   136  	// 2.1 Through a fail edge, but there exists a valid node. SUBSTR
   137  	for i := len(input) - 1; i >= 0; i-- {
   138  		edge := acCharset[input[i]]
   139  		fullMatch = fullMatch && (node.edge[edge] == acTrieEdge)
   140  		node = &ac.nodes[node.next[edge]] // Advance to next node
   141  		// When entering a new node, traverse the fail path to find all possible Substr patterns:
   142  		//   1. The fail path is compressed to only contains match nodes and root node (for terminate condition).
   143  		//   2. node.fail != 0 is added here for better performance (as shown by benchmark), possibly it helps branch prediction.
   144  		if node.fail != 0 {
   145  			for failIdx, failNode := node.fail, &ac.nodes[node.fail]; failIdx != 0; failIdx, failNode = failNode.fail, &ac.nodes[failIdx] {
   146  				substrMatches = append(substrMatches, ac.values[failNode.match][Substr])
   147  			}
   148  		}
   149  		// When entering a new node, check whether this node is a match.
   150  		// For Substr matchers:
   151  		//   1. Matched in any situation, whether a failNode edge is traversed or not.
   152  		// For Domain matchers:
   153  		//   1. Should not traverse any fail edge (fullMatch).
   154  		//   2. Only check on dot separator (input[i] == '.').
   155  		if node.match != 0 {
   156  			values := ac.values[node.match]
   157  			if len(values[Substr]) > 0 {
   158  				substrMatches = append(substrMatches, values[Substr])
   159  			}
   160  			if fullMatch && input[i] == '.' && len(values[Domain]) > 0 {
   161  				suffixMatches = append(suffixMatches, values[Domain])
   162  			}
   163  		}
   164  	}
   165  	// At the end of input, check if the whole string matches a pattern.
   166  	// For Domain matchers:
   167  	//   1. Exact match on Domain Matcher works like Full Match. e.g. foo.com is a full match for domain:foo.com.
   168  	// For Full matchers:
   169  	//   1. Only when no fail edge is traversed (fullMatch).
   170  	//   2. Takes the highest priority (added at last).
   171  	if fullMatch && node.match != 0 {
   172  		values := ac.values[node.match]
   173  		if len(values[Domain]) > 0 {
   174  			suffixMatches = append(suffixMatches, values[Domain])
   175  		}
   176  		if len(values[Full]) > 0 {
   177  			suffixMatches = append(suffixMatches, values[Full])
   178  		}
   179  	}
   180  	if len(substrMatches) == 0 {
   181  		return CompositeMatchesReverse(suffixMatches)
   182  	}
   183  	return CompositeMatchesReverse(append(substrMatches, suffixMatches...))
   184  }
   185  
   186  // MatchAny implements MatcherGroup.MatchAny.
   187  func (ac *ACAutomatonMatcherGroup) MatchAny(input string) bool {
   188  	fullMatch := true
   189  	node := &ac.nodes[0]
   190  	for i := len(input) - 1; i >= 0; i-- {
   191  		edge := acCharset[input[i]]
   192  		fullMatch = fullMatch && (node.edge[edge] == acTrieEdge)
   193  		node = &ac.nodes[node.next[edge]]
   194  		if node.fail != 0 { // There is a match on this node's fail path
   195  			return true
   196  		}
   197  		if node.match != 0 { // There is a match on this node
   198  			values := ac.values[node.match]
   199  			if len(values[Substr]) > 0 { // Substr match succeeds unconditionally
   200  				return true
   201  			}
   202  			if fullMatch && input[i] == '.' && len(values[Domain]) > 0 { // Domain match only succeeds with dot separator on trie path
   203  				return true
   204  			}
   205  		}
   206  	}
   207  	return fullMatch && node.match != 0 // At the end of input, Domain and Full match will succeed if no fail edge is traversed
   208  }
   209  
   210  // Letter-Digit-Hyphen (LDH) subset (https://tools.ietf.org/html/rfc952):
   211  //   - Letters A to Z (no distinction is made between uppercase and lowercase)
   212  //   - Digits 0 to 9
   213  //   - Hyphens(-) and Periods(.)
   214  //
   215  // If for future the strmatcher are used for other scenarios than domain,
   216  // we could add a new Charset interface to represent variable charsets.
   217  var acCharset = [256]int{
   218  	'A': 1,
   219  	'a': 1,
   220  	'B': 2,
   221  	'b': 2,
   222  	'C': 3,
   223  	'c': 3,
   224  	'D': 4,
   225  	'd': 4,
   226  	'E': 5,
   227  	'e': 5,
   228  	'F': 6,
   229  	'f': 6,
   230  	'G': 7,
   231  	'g': 7,
   232  	'H': 8,
   233  	'h': 8,
   234  	'I': 9,
   235  	'i': 9,
   236  	'J': 10,
   237  	'j': 10,
   238  	'K': 11,
   239  	'k': 11,
   240  	'L': 12,
   241  	'l': 12,
   242  	'M': 13,
   243  	'm': 13,
   244  	'N': 14,
   245  	'n': 14,
   246  	'O': 15,
   247  	'o': 15,
   248  	'P': 16,
   249  	'p': 16,
   250  	'Q': 17,
   251  	'q': 17,
   252  	'R': 18,
   253  	'r': 18,
   254  	'S': 19,
   255  	's': 19,
   256  	'T': 20,
   257  	't': 20,
   258  	'U': 21,
   259  	'u': 21,
   260  	'V': 22,
   261  	'v': 22,
   262  	'W': 23,
   263  	'w': 23,
   264  	'X': 24,
   265  	'x': 24,
   266  	'Y': 25,
   267  	'y': 25,
   268  	'Z': 26,
   269  	'z': 26,
   270  	'-': 27,
   271  	'.': 28,
   272  	'0': 29,
   273  	'1': 30,
   274  	'2': 31,
   275  	'3': 32,
   276  	'4': 33,
   277  	'5': 34,
   278  	'6': 35,
   279  	'7': 36,
   280  	'8': 37,
   281  	'9': 38,
   282  }