github.com/rohankumardubey/go-enry@v1.7.3/internal/tokenizer/tokenize.go (about) 1 // Package tokenizer implements file tokenization used by the enry content 2 // classifier. This package is an implementation detail of enry and should not 3 // be imported by other packages. 4 package tokenizer 5 6 import ( 7 "bytes" 8 9 "gopkg.in/src-d/enry.v1/regex" 10 ) 11 12 const byteLimit = 100000 13 14 // Tokenize returns language-agnostic lexical tokens from content. The tokens 15 // returned should match what the Linguist library returns. At most the first 16 // 100KB of content are tokenized. 17 func Tokenize(content []byte) []string { 18 if len(content) > byteLimit { 19 content = content[:byteLimit] 20 } 21 22 // Copy the input so that changes wrought by the tokenization steps do not 23 // modify the caller's copy of the input. See #196. 24 content = append([]byte(nil), content...) 25 26 tokens := make([][]byte, 0, 50) 27 for _, extract := range extractTokens { 28 var extractedTokens [][]byte 29 content, extractedTokens = extract(content) 30 tokens = append(tokens, extractedTokens...) 31 } 32 33 return toString(tokens) 34 } 35 36 func toString(tokens [][]byte) []string { 37 stokens := make([]string, 0, len(tokens)) 38 for _, token := range tokens { 39 stokens = append(stokens, string(token)) 40 } 41 42 return stokens 43 } 44 45 var ( 46 extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){ 47 // The order to must be this 48 extractAndReplaceShebang, 49 extractAndReplaceSGML, 50 skipCommentsAndLiterals, 51 extractAndReplacePunctuation, 52 extractAndReplaceRegular, 53 extractAndReplaceOperator, 54 extractRemainders, 55 } 56 57 // Differences between golang regexp and oniguruma: 58 // 1. no (?s) in oniguruma - makes dot match \n 59 // 2. no (?U) in oniguruma - ungreedy * 60 // 3. (?m) implies dot matches \n in oniguruma 61 // 4. oniguruma handles \w differently - impossible, but true 62 // 63 // Workarounds: 64 // 1. (.|\n) 65 // 2. replace * with *? 66 // 3. replace . with [^\n] 67 // 4. replace \w with [0-9A-Za-z_] 68 // 69 // Original golang regexps: 70 // 71 // reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) 72 // reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) 73 // reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) 74 // reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 75 // reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) 76 // rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) 77 // reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) 78 // reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`) 79 // reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) 80 // reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) 81 // reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) 82 // reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) 83 // 84 // These regexps were converted to work in the same way for both engines: 85 // 86 reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`) 87 reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`) 88 reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) 89 reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 90 reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) 91 rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`) 92 reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) 93 reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`) 94 reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) 95 reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`) 96 reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) 97 reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) 98 99 regexToSkip = []regex.EnryRegexp{ 100 // The order must be this 101 reLiteralStringQuotes, 102 reMultilineComment, 103 reSingleLineComment, 104 reLiteralNumber, 105 } 106 ) 107 108 func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) { 109 var shebangTokens [][]byte 110 matches := reShebang.FindAllSubmatch(content, -1) 111 if matches != nil { 112 shebangTokens = make([][]byte, 0, 2) 113 for _, match := range matches { 114 shebangToken := getShebangToken(match) 115 shebangTokens = append(shebangTokens, shebangToken) 116 } 117 118 reShebang.ReplaceAll(content, []byte(` `)) 119 } 120 121 return content, shebangTokens 122 } 123 124 func getShebangToken(matchedShebang [][]byte) []byte { 125 const prefix = `SHEBANG#!` 126 var token []byte 127 for i := 1; i < len(matchedShebang); i++ { 128 if len(matchedShebang[i]) > 0 { 129 token = matchedShebang[i] 130 break 131 } 132 } 133 134 tokenShebang := append([]byte(prefix), token...) 135 return tokenShebang 136 } 137 138 func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) { 139 tokens := re.FindAll(content, -1) 140 content = re.ReplaceAll(content, []byte(` `)) 141 return content, tokens 142 } 143 144 func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) { 145 return commonExtractAndReplace(content, rePunctuation) 146 } 147 148 func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) { 149 return commonExtractAndReplace(content, reRegularToken) 150 } 151 152 func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) { 153 return commonExtractAndReplace(content, reOperators) 154 } 155 156 func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) { 157 var SGMLTokens [][]byte 158 matches := reSGML.FindAllSubmatch(content, -1) 159 if matches != nil { 160 SGMLTokens = make([][]byte, 0, 2) 161 for _, match := range matches { 162 if reSGMLComment.Match(match[0]) { 163 continue 164 } 165 166 token := append(match[1], '>') 167 SGMLTokens = append(SGMLTokens, token) 168 attributes := getSGMLAttributes(match[0]) 169 SGMLTokens = append(SGMLTokens, attributes...) 170 } 171 172 content = reSGML.ReplaceAll(content, []byte(` `)) 173 } 174 175 return content, SGMLTokens 176 } 177 178 func getSGMLAttributes(SGMLTag []byte) [][]byte { 179 var attributes [][]byte 180 matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1) 181 if matches != nil { 182 attributes = make([][]byte, 0, 5) 183 for _, match := range matches { 184 if len(match[1]) != 0 { 185 attributes = append(attributes, match[1]) 186 } 187 188 if len(match[2]) != 0 { 189 loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1) 190 attributes = append(attributes, loneAttributes...) 191 } 192 } 193 } 194 195 return attributes 196 } 197 198 func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) { 199 for _, skip := range regexToSkip { 200 content = skip.ReplaceAll(content, []byte(` `)) 201 } 202 203 return content, nil 204 } 205 206 func extractRemainders(content []byte) ([]byte, [][]byte) { 207 splitted := bytes.Fields(content) 208 remainderTokens := make([][]byte, 0, len(splitted)*3) 209 for _, remainder := range splitted { 210 remainders := bytes.Split(remainder, nil) 211 remainderTokens = append(remainderTokens, remainders...) 212 } 213 214 return content, remainderTokens 215 }