github.com/imannamdari/v2ray-core/v5@v5.0.5/common/strmatcher/matchergroup_ac_automation.go (about) 1 package strmatcher 2 3 import ( 4 "container/list" 5 ) 6 7 const ( 8 acValidCharCount = 39 // aA-zZ (26), 0-9 (10), - (1), . (1), invalid(1) 9 acMatchTypeCount = 3 // Full, Domain and Substr 10 ) 11 12 type acEdge byte 13 14 const ( 15 acTrieEdge acEdge = 1 16 acFailEdge acEdge = 0 17 ) 18 19 type acNode struct { 20 next [acValidCharCount]uint32 // EdgeIdx -> Next NodeIdx (Next trie node or fail node) 21 edge [acValidCharCount]acEdge // EdgeIdx -> Trie Edge / Fail Edge 22 fail uint32 // NodeIdx of *next matched* Substr Pattern on its fail path 23 match uint32 // MatchIdx of matchers registered on this node, 0 indicates no match 24 } // Sizeof acNode: (4+1)*acValidCharCount + <padding> + 4 + 4 25 26 type acValue [acMatchTypeCount][]uint32 // MatcherType -> Registered Matcher Values 27 28 // ACAutoMationMatcherGroup is an implementation of MatcherGroup. 29 // It uses an AC Automata to provide support for Full, Domain and Substr matcher. Trie node is char based. 30 // 31 // NOTICE: ACAutomatonMatcherGroup currently uses a restricted charset (LDH Subset), 32 // upstream should manually in a way to ensure all patterns and inputs passed to it to be in this charset. 33 type ACAutomatonMatcherGroup struct { 34 nodes []acNode // NodeIdx -> acNode 35 values []acValue // MatchIdx -> acValue 36 } 37 38 func NewACAutomatonMatcherGroup() *ACAutomatonMatcherGroup { 39 ac := new(ACAutomatonMatcherGroup) 40 ac.addNode() // Create root node (NodeIdx 0) 41 ac.addMatchEntry() // Create sentinel match entry (MatchIdx 0) 42 return ac 43 } 44 45 // AddFullMatcher implements MatcherGroupForFull.AddFullMatcher. 46 func (ac *ACAutomatonMatcherGroup) AddFullMatcher(matcher FullMatcher, value uint32) { 47 ac.addPattern(0, matcher.Pattern(), matcher.Type(), value) 48 } 49 50 // AddDomainMatcher implements MatcherGroupForDomain.AddDomainMatcher. 
51 func (ac *ACAutomatonMatcherGroup) AddDomainMatcher(matcher DomainMatcher, value uint32) { 52 node := ac.addPattern(0, matcher.Pattern(), matcher.Type(), value) // For full domain match 53 ac.addPattern(node, ".", matcher.Type(), value) // For partial domain match 54 } 55 56 // AddSubstrMatcher implements MatcherGroupForSubstr.AddSubstrMatcher. 57 func (ac *ACAutomatonMatcherGroup) AddSubstrMatcher(matcher SubstrMatcher, value uint32) { 58 ac.addPattern(0, matcher.Pattern(), matcher.Type(), value) 59 } 60 61 func (ac *ACAutomatonMatcherGroup) addPattern(nodeIdx uint32, pattern string, matcherType Type, value uint32) uint32 { 62 node := &ac.nodes[nodeIdx] 63 for i := len(pattern) - 1; i >= 0; i-- { 64 edgeIdx := acCharset[pattern[i]] 65 nextIdx := node.next[edgeIdx] 66 if nextIdx == 0 { // Add new Trie Edge 67 nextIdx = ac.addNode() 68 ac.nodes[nodeIdx].next[edgeIdx] = nextIdx 69 ac.nodes[nodeIdx].edge[edgeIdx] = acTrieEdge 70 } 71 nodeIdx = nextIdx 72 node = &ac.nodes[nodeIdx] 73 } 74 if node.match == 0 { // Add new match entry 75 node.match = ac.addMatchEntry() 76 } 77 ac.values[node.match][matcherType] = append(ac.values[node.match][matcherType], value) 78 return nodeIdx 79 } 80 81 func (ac *ACAutomatonMatcherGroup) addNode() uint32 { 82 ac.nodes = append(ac.nodes, acNode{}) 83 return uint32(len(ac.nodes) - 1) 84 } 85 86 func (ac *ACAutomatonMatcherGroup) addMatchEntry() uint32 { 87 ac.values = append(ac.values, acValue{}) 88 return uint32(len(ac.values) - 1) 89 } 90 91 func (ac *ACAutomatonMatcherGroup) Build() error { 92 fail := make([]uint32, len(ac.nodes)) 93 queue := list.New() 94 for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ { 95 if nextIdx := ac.nodes[0].next[edgeIdx]; nextIdx != 0 { 96 queue.PushBack(nextIdx) 97 } 98 } 99 for { 100 front := queue.Front() 101 if front == nil { 102 break 103 } 104 queue.Remove(front) 105 nodeIdx := front.Value.(uint32) 106 node := &ac.nodes[nodeIdx] // Current node 107 failNode := &ac.nodes[fail[nodeIdx]] // 
Fail node of currrent node 108 for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ { 109 nodeIdx := node.next[edgeIdx] // Next node through trie edge 110 failIdx := failNode.next[edgeIdx] // Next node through fail edge 111 if nodeIdx != 0 { 112 queue.PushBack(nodeIdx) 113 fail[nodeIdx] = failIdx 114 if match := ac.nodes[failIdx].match; match != 0 && len(ac.values[match][Substr]) > 0 { // Fail node is a Substr match node 115 ac.nodes[nodeIdx].fail = failIdx 116 } else { // Use path compression to reduce fail path to only contain match nodes 117 ac.nodes[nodeIdx].fail = ac.nodes[failIdx].fail 118 } 119 } else { // Add new fail edge 120 node.next[edgeIdx] = failIdx 121 node.edge[edgeIdx] = acFailEdge 122 } 123 } 124 } 125 return nil 126 } 127 128 // Match implements MatcherGroup.Match. 129 func (ac *ACAutomatonMatcherGroup) Match(input string) []uint32 { 130 suffixMatches := make([][]uint32, 0, 5) 131 substrMatches := make([][]uint32, 0, 5) 132 fullMatch := true // fullMatch indicates no fail edge traversed so far. 133 node := &ac.nodes[0] // start from root node. 134 // 1. the match string is all through trie edge. FULL MATCH or DOMAIN 135 // 2. the match string is through a fail edge. NOT FULL MATCH 136 // 2.1 Through a fail edge, but there exists a valid node. SUBSTR 137 for i := len(input) - 1; i >= 0; i-- { 138 edge := acCharset[input[i]] 139 fullMatch = fullMatch && (node.edge[edge] == acTrieEdge) 140 node = &ac.nodes[node.next[edge]] // Advance to next node 141 // When entering a new node, traverse the fail path to find all possible Substr patterns: 142 // 1. The fail path is compressed to only contains match nodes and root node (for terminate condition). 143 // 2. node.fail != 0 is added here for better performance (as shown by benchmark), possibly it helps branch prediction. 
144 if node.fail != 0 { 145 for failIdx, failNode := node.fail, &ac.nodes[node.fail]; failIdx != 0; failIdx, failNode = failNode.fail, &ac.nodes[failIdx] { 146 substrMatches = append(substrMatches, ac.values[failNode.match][Substr]) 147 } 148 } 149 // When entering a new node, check whether this node is a match. 150 // For Substr matchers: 151 // 1. Matched in any situation, whether a failNode edge is traversed or not. 152 // For Domain matchers: 153 // 1. Should not traverse any fail edge (fullMatch). 154 // 2. Only check on dot separator (input[i] == '.'). 155 if node.match != 0 { 156 values := ac.values[node.match] 157 if len(values[Substr]) > 0 { 158 substrMatches = append(substrMatches, values[Substr]) 159 } 160 if fullMatch && input[i] == '.' && len(values[Domain]) > 0 { 161 suffixMatches = append(suffixMatches, values[Domain]) 162 } 163 } 164 } 165 // At the end of input, check if the whole string matches a pattern. 166 // For Domain matchers: 167 // 1. Exact match on Domain Matcher works like Full Match. e.g. foo.com is a full match for domain:foo.com. 168 // For Full matchers: 169 // 1. Only when no fail edge is traversed (fullMatch). 170 // 2. Takes the highest priority (added at last). 171 if fullMatch && node.match != 0 { 172 values := ac.values[node.match] 173 if len(values[Domain]) > 0 { 174 suffixMatches = append(suffixMatches, values[Domain]) 175 } 176 if len(values[Full]) > 0 { 177 suffixMatches = append(suffixMatches, values[Full]) 178 } 179 } 180 if len(substrMatches) == 0 { 181 return CompositeMatchesReverse(suffixMatches) 182 } 183 return CompositeMatchesReverse(append(substrMatches, suffixMatches...)) 184 } 185 186 // MatchAny implements MatcherGroup.MatchAny. 
187 func (ac *ACAutomatonMatcherGroup) MatchAny(input string) bool { 188 fullMatch := true 189 node := &ac.nodes[0] 190 for i := len(input) - 1; i >= 0; i-- { 191 edge := acCharset[input[i]] 192 fullMatch = fullMatch && (node.edge[edge] == acTrieEdge) 193 node = &ac.nodes[node.next[edge]] 194 if node.fail != 0 { // There is a match on this node's fail path 195 return true 196 } 197 if node.match != 0 { // There is a match on this node 198 values := ac.values[node.match] 199 if len(values[Substr]) > 0 { // Substr match succeeds unconditionally 200 return true 201 } 202 if fullMatch && input[i] == '.' && len(values[Domain]) > 0 { // Domain match only succeeds with dot separator on trie path 203 return true 204 } 205 } 206 } 207 return fullMatch && node.match != 0 // At the end of input, Domain and Full match will succeed if no fail edge is traversed 208 } 209 210 // Letter-Digit-Hyphen (LDH) subset (https://tools.ietf.org/html/rfc952): 211 // - Letters A to Z (no distinction is made between uppercase and lowercase) 212 // - Digits 0 to 9 213 // - Hyphens(-) and Periods(.) 214 // 215 // If for future the strmatcher are used for other scenarios than domain, 216 // we could add a new Charset interface to represent variable charsets. 
var acCharset = func() [256]int {
	// Every byte starts at edge index 0, which is reserved for characters
	// outside the charset.
	var table [256]int

	// Letters share an index across cases: 'A'/'a' -> 1 ... 'Z'/'z' -> 26.
	for lower := byte('a'); lower <= 'z'; lower++ {
		idx := int(lower-'a') + 1
		table[lower] = idx
		table[lower-'a'+'A'] = idx
	}

	table['-'] = 27
	table['.'] = 28

	// Digits: '0' -> 29 ... '9' -> 38.
	for digit := byte('0'); digit <= '9'; digit++ {
		table[digit] = int(digit-'0') + 29
	}
	return table
}()