git.templeos.me/xultist/go-enry/v2@v2.0.0-20230215093429-6ef3e87f47c0/internal/tokenizer/tokenize.go (about) 1 // +build !flex 2 3 package tokenizer 4 5 import ( 6 "bytes" 7 8 "github.com/go-enry/go-enry/v2/regex" 9 ) 10 11 // Tokenize returns lexical tokens from content. The tokens returned match what 12 // the Linguist library returns. At most the first ByteLimit bytes of content are tokenized. 13 // 14 // BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some 15 // differences between this function and the Linguist output. 16 func Tokenize(content []byte) []string { 17 if len(content) > ByteLimit { 18 content = content[:ByteLimit] 19 } 20 21 tokens := make([][]byte, 0, 50) 22 for _, extract := range extractTokens { 23 var extractedTokens [][]byte 24 content, extractedTokens = extract(content) 25 tokens = append(tokens, extractedTokens...) 26 } 27 28 return toString(tokens) 29 } 30 31 func toString(tokens [][]byte) []string { 32 stokens := make([]string, 0, len(tokens)) 33 for _, token := range tokens { 34 stokens = append(stokens, string(token)) 35 } 36 37 return stokens 38 } 39 40 var ( 41 extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){ 42 // The order to must be this 43 extractAndReplaceShebang, 44 extractAndReplaceSGML, 45 skipCommentsAndLiterals, 46 extractAndReplacePunctuation, 47 extractAndReplaceRegular, 48 extractAndReplaceOperator, 49 extractRemainders, 50 } 51 52 // Differences between golang regexp and oniguruma: 53 // 1. no (?s) in oniguruma - makes dot match \n 54 // 2. no (?U) in oniguruma - ungreedy * 55 // 3. (?m) implies dot matches \n in oniguruma 56 // 4. oniguruma handles \w differently - impossible, but true 57 // 58 // Workarounds: 59 // 1. (.|\n) 60 // 2. replace * with *? 61 // 3. replace . with [^\n] 62 // 4. replace \w with [0-9A-Za-z_] 63 // 64 // Original golang regexps: 65 // 66 // reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) 67 // reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) 68 // reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) 69 // reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 70 // reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) 71 // rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) 72 // reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) 73 // reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`) 74 // reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) 75 // reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) 76 // reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) 77 // reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) 78 // 79 // These regexps were converted to work in the same way for both engines: 80 // 81 reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`) 82 reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`) 83 reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) 84 reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 85 reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) 86 rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`) 87 reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) 88 reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`) 89 reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) 90 reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`) 91 reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) 92 reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) 93 94 regexToSkip = []regex.EnryRegexp{ 95 // The order must be this 96 reLiteralStringQuotes, 97 reMultilineComment, 98 reSingleLineComment, 99 reLiteralNumber, 100 } 101 ) 102 103 func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) { 104 var shebangTokens [][]byte 105 matches := reShebang.FindAllSubmatch(content, -1) 106 if matches != nil { 107 shebangTokens = make([][]byte, 0, 2) 108 for _, match := range matches { 109 shebangToken := getShebangToken(match) 110 shebangTokens = append(shebangTokens, shebangToken) 111 } 112 113 reShebang.ReplaceAll(content, []byte(` `)) 114 } 115 116 return content, shebangTokens 117 } 118 119 func getShebangToken(matchedShebang [][]byte) []byte { 120 const prefix = `SHEBANG#!` 121 var token []byte 122 for i := 1; i < len(matchedShebang); i++ { 123 if len(matchedShebang[i]) > 0 { 124 token = matchedShebang[i] 125 break 126 } 127 } 128 129 tokenShebang := append([]byte(prefix), token...) 130 return tokenShebang 131 } 132 133 func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) { 134 tokens := re.FindAll(content, -1) 135 content = re.ReplaceAll(content, []byte(` `)) 136 return content, tokens 137 } 138 139 func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) { 140 return commonExtractAndReplace(content, rePunctuation) 141 } 142 143 func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) { 144 return commonExtractAndReplace(content, reRegularToken) 145 } 146 147 func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) { 148 return commonExtractAndReplace(content, reOperators) 149 } 150 151 func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) { 152 var SGMLTokens [][]byte 153 matches := reSGML.FindAllSubmatch(content, -1) 154 if matches != nil { 155 SGMLTokens = make([][]byte, 0, 2) 156 for _, match := range matches { 157 if reSGMLComment.Match(match[0]) { 158 continue 159 } 160 161 token := append(append([]byte(nil), match[1]...), '>') 162 SGMLTokens = append(SGMLTokens, token) 163 attributes := getSGMLAttributes(match[0]) 164 SGMLTokens = append(SGMLTokens, attributes...) 165 } 166 167 content = reSGML.ReplaceAll(content, []byte(` `)) 168 } 169 170 return content, SGMLTokens 171 } 172 173 func getSGMLAttributes(SGMLTag []byte) [][]byte { 174 var attributes [][]byte 175 matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1) 176 if matches != nil { 177 attributes = make([][]byte, 0, 5) 178 for _, match := range matches { 179 if len(match[1]) != 0 { 180 attributes = append(attributes, match[1]) 181 } 182 183 if len(match[2]) != 0 { 184 loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1) 185 attributes = append(attributes, loneAttributes...) 186 } 187 } 188 } 189 190 return attributes 191 } 192 193 func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) { 194 for _, skip := range regexToSkip { 195 content = skip.ReplaceAll(content, []byte(` `)) 196 } 197 198 return content, nil 199 } 200 201 func extractRemainders(content []byte) ([]byte, [][]byte) { 202 splitted := bytes.Fields(content) 203 remainderTokens := make([][]byte, 0, len(splitted)*3) 204 for _, remainder := range splitted { 205 remainders := bytes.Split(remainder, nil) 206 remainderTokens = append(remainderTokens, remainders...) 207 } 208 209 return content, remainderTokens 210 }