// This code belongs to package tokenizer (internal/tokenizer/tokenize.go) and
// relies on the standard library packages "bytes" and "regexp".

// byteLimit is the maximum number of leading bytes of the input that Tokenize
// looks at; anything past it is dropped before tokenizing.
const byteLimit = 100000

// Tokenize splits content into a flat list of tokens.
//
// Only the first byteLimit bytes of content are considered. The extractors in
// extractTokens are run in order; each one receives the content returned by
// the previous one and contributes zero or more tokens. The returned slice
// holds the tokens grouped by extractor (all shebang tokens first, then SGML
// tokens, and so on), not in source order.
func Tokenize(content []byte) []string {
	if len(content) > byteLimit {
		content = content[:byteLimit]
	}

	tokens := make([][]byte, 0, 50)
	for _, extract := range extractTokens {
		var extractedTokens [][]byte
		content, extractedTokens = extract(content)
		tokens = append(tokens, extractedTokens...)
	}

	return toString(tokens)
}

// toString converts every byte-slice token into a string, preserving order.
func toString(tokens [][]byte) []string {
	stokens := make([]string, 0, len(tokens))
	for _, token := range tokens {
		stokens = append(stokens, string(token))
	}

	return stokens
}

var (
	// extractTokens is the tokenizing pipeline used by Tokenize.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be this: every step works on the content left over
		// by the steps before it.
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regexp.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the patterns whose matches skipCommentsAndLiterals
	// blanks out without producing tokens.
	regexToSkip = []*regexp.Regexp{
		// The order must be this
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)

// extractAndReplaceShebang emits one "SHEBANG#!<interpreter>" token per
// shebang line matched by reShebang and returns the content with every such
// line replaced by a single space.
//
// When there is no shebang the content is returned untouched and the token
// slice is nil.
//
// NOTE(review): this function previously discarded the result of ReplaceAll,
// so shebang lines stayed in the content and were tokenized a second time by
// the later stages. Anything derived from that older output should be
// rechecked.
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
	matches := reShebang.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	shebangTokens := make([][]byte, 0, len(matches))
	for _, match := range matches {
		shebangTokens = append(shebangTokens, getShebangToken(match))
	}

	// ReplaceAll returns a modified copy; the result has to be kept, otherwise
	// the shebang is never removed.
	content = reShebang.ReplaceAll(content, []byte(` `))

	return content, shebangTokens
}

// getShebangToken builds the shebang token for one reShebang submatch: the
// literal prefix "SHEBANG#!" followed by the first non-empty capture group.
// If every capture group is empty the bare prefix is returned.
func getShebangToken(matchedShebang [][]byte) []byte {
	const prefix = `SHEBANG#!`

	var token []byte
	// Index 0 is the whole match; the capture groups start at 1.
	for _, group := range matchedShebang[1:] {
		if len(group) > 0 {
			token = group
			break
		}
	}

	return append([]byte(prefix), token...)
}

// commonExtracAndReplace returns every match of re in content as a token,
// together with a copy of content in which each match is replaced by a single
// space.
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
	tokens := re.FindAll(content, -1)
	content = re.ReplaceAll(content, []byte(` `))
	return content, tokens
}

// extractAndReplacePunctuation extracts the tokens matched by rePunctuation.
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
	return commonExtracAndReplace(content, rePunctuation)
}

// extractAndReplaceRegular extracts the tokens matched by reRegularToken.
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
	return commonExtracAndReplace(content, reRegularToken)
}

// extractAndReplaceOperator extracts the tokens matched by reOperators.
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
	return commonExtracAndReplace(content, reOperators)
}

// extractAndReplaceSGML extracts tokens from the tags matched by reSGML.
//
// For every matched tag that is not an SGML comment it emits the tag opening
// closed with '>' (for example "<a>" or "</a>") followed by the tokens that
// getSGMLAttributes finds in the tag. All matches, comments included, are
// replaced by a single space in the returned content.
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
	matches := reSGML.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	sgmlTokens := make([][]byte, 0, 2)
	for _, match := range matches {
		if reSGMLComment.Match(match[0]) {
			continue
		}

		// match[1] is a sub-slice of content. Appending to it directly could
		// write the '>' into content itself (and therefore into match[0]) when
		// the sub-slice has spare capacity, so build the token in a buffer of
		// its own.
		token := make([]byte, 0, len(match[1])+1)
		token = append(token, match[1]...)
		token = append(token, '>')

		sgmlTokens = append(sgmlTokens, token)
		sgmlTokens = append(sgmlTokens, getSGMLAttributes(match[0])...)
	}

	content = reSGML.ReplaceAll(content, []byte(` `))

	return content, sgmlTokens
}

// getSGMLAttributes returns the attribute tokens found in a single SGML tag.
//
// A "name=" attribute is emitted as is (including the '='). Any other
// whitespace-separated chunk is broken into the words matched by
// reSGMLLoneAttribute. The result is nil when reSGMLAttributes does not match.
func getSGMLAttributes(SGMLTag []byte) [][]byte {
	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
	if matches == nil {
		return nil
	}

	attributes := make([][]byte, 0, 5)
	for _, match := range matches {
		if len(match[1]) != 0 {
			attributes = append(attributes, match[1])
		}

		if len(match[2]) != 0 {
			loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
			attributes = append(attributes, loneAttributes...)
		}
	}

	return attributes
}

// skipCommentsAndLiterals replaces everything matched by the patterns in
// regexToSkip, applied in that order, with a single space. It never produces
// tokens.
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
	for _, skip := range regexToSkip {
		content = skip.ReplaceAll(content, []byte(` `))
	}

	return content, nil
}

// extractRemainders turns whatever is left in content into tokens: content is
// split on whitespace and every resulting chunk is split again into its
// individual UTF-8 sequences. The content itself is returned unchanged.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	splitted := bytes.Fields(content)
	remainderTokens := make([][]byte, 0, len(splitted)*3)
	for _, remainder := range splitted {
		// An empty separator makes Split cut after each UTF-8 sequence.
		remainders := bytes.Split(remainder, nil)
		remainderTokens = append(remainderTokens, remainders...)
	}

	return content, remainderTokens
}