github.com/bzz/enry@v1.6.7/internal/tokenizer/tokenize.go (about) 1 package tokenizer 2 3 import ( 4 "bytes" 5 6 "gopkg.in/src-d/enry.v1/regex" 7 ) 8 9 const byteLimit = 100000 10 11 func Tokenize(content []byte) []string { 12 if len(content) > byteLimit { 13 content = content[:byteLimit] 14 } 15 16 tokens := make([][]byte, 0, 50) 17 for _, extract := range extractTokens { 18 var extractedTokens [][]byte 19 content, extractedTokens = extract(content) 20 tokens = append(tokens, extractedTokens...) 21 } 22 23 return toString(tokens) 24 } 25 26 func toString(tokens [][]byte) []string { 27 stokens := make([]string, 0, len(tokens)) 28 for _, token := range tokens { 29 stokens = append(stokens, string(token)) 30 } 31 32 return stokens 33 } 34 35 var ( 36 extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){ 37 // The order to must be this 38 extractAndReplaceShebang, 39 extractAndReplaceSGML, 40 skipCommentsAndLiterals, 41 extractAndReplacePunctuation, 42 extractAndReplaceRegular, 43 extractAndReplaceOperator, 44 extractRemainders, 45 } 46 47 // Differences between golang regexp and oniguruma: 48 // 1. no (?s) in oniguruma - makes dot match \n 49 // 2. no (?U) in oniguruma - ungreedy * 50 // 3. (?m) implies dot matches \n in oniguruma 51 // 4. oniguruma handles \w differently - impossible, but true 52 // 53 // Workarounds: 54 // 1. (.|\n) 55 // 2. replace * with *? 56 // 3. replace . with [^\n] 57 // 4. replace \w with [0-9A-Za-z_] 58 // 59 // Original golang regexps: 60 // 61 // reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`) 62 // reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`) 63 // reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`) 64 // reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 65 // reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`) 66 // rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`) 67 // reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`) 68 // reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`) 69 // reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`) 70 // reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`) 71 // reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`) 72 // reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) 73 // 74 // These regexps were converted to work in the same way for both engines: 75 // 76 reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`) 77 reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`) 78 reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) 79 reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 80 reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) 81 rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`) 82 reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) 83 reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`) 84 reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) 85 reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`) 86 reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) 87 reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) 88 89 regexToSkip = []regex.EnryRegexp{ 90 // The order must be this 91 reLiteralStringQuotes, 92 reMultilineComment, 93 reSingleLineComment, 94 reLiteralNumber, 95 } 96 ) 97 98 func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) { 99 var shebangTokens [][]byte 100 matches := reShebang.FindAllSubmatch(content, -1) 101 if matches != nil { 102 shebangTokens = make([][]byte, 0, 2) 103 for _, match := range matches { 104 shebangToken := getShebangToken(match) 105 shebangTokens = append(shebangTokens, shebangToken) 106 } 107 108 reShebang.ReplaceAll(content, []byte(` `)) 109 } 110 111 return content, shebangTokens 112 } 113 114 func getShebangToken(matchedShebang [][]byte) []byte { 115 const prefix = `SHEBANG#!` 116 var token []byte 117 for i := 1; i < len(matchedShebang); i++ { 118 if len(matchedShebang[i]) > 0 { 119 token = matchedShebang[i] 120 break 121 } 122 } 123 124 tokenShebang := append([]byte(prefix), token...) 125 return tokenShebang 126 } 127 128 func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) { 129 tokens := re.FindAll(content, -1) 130 content = re.ReplaceAll(content, []byte(` `)) 131 return content, tokens 132 } 133 134 func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) { 135 return commonExtractAndReplace(content, rePunctuation) 136 } 137 138 func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) { 139 return commonExtractAndReplace(content, reRegularToken) 140 } 141 142 func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) { 143 return commonExtractAndReplace(content, reOperators) 144 } 145 146 func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) { 147 var SGMLTokens [][]byte 148 matches := reSGML.FindAllSubmatch(content, -1) 149 if matches != nil { 150 SGMLTokens = make([][]byte, 0, 2) 151 for _, match := range matches { 152 if reSGMLComment.Match(match[0]) { 153 continue 154 } 155 156 token := append(match[1], '>') 157 SGMLTokens = append(SGMLTokens, token) 158 attributes := getSGMLAttributes(match[0]) 159 SGMLTokens = append(SGMLTokens, attributes...) 160 } 161 162 content = reSGML.ReplaceAll(content, []byte(` `)) 163 } 164 165 return content, SGMLTokens 166 } 167 168 func getSGMLAttributes(SGMLTag []byte) [][]byte { 169 var attributes [][]byte 170 matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1) 171 if matches != nil { 172 attributes = make([][]byte, 0, 5) 173 for _, match := range matches { 174 if len(match[1]) != 0 { 175 attributes = append(attributes, match[1]) 176 } 177 178 if len(match[2]) != 0 { 179 loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1) 180 attributes = append(attributes, loneAttributes...) 181 } 182 } 183 } 184 185 return attributes 186 } 187 188 func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) { 189 for _, skip := range regexToSkip { 190 content = skip.ReplaceAll(content, []byte(` `)) 191 } 192 193 return content, nil 194 } 195 196 func extractRemainders(content []byte) ([]byte, [][]byte) { 197 splitted := bytes.Fields(content) 198 remainderTokens := make([][]byte, 0, len(splitted)*3) 199 for _, remainder := range splitted { 200 remainders := bytes.Split(remainder, nil) 201 remainderTokens = append(remainderTokens, remainders...) 202 } 203 204 return content, remainderTokens 205 }