github.com/rohankumardubey/draft-classic@v0.16.0/pkg/linguist/tokenizer/tokenizer.go (about) 1 // Package tokenizer is a go port of https://github.com/github/linguist/blob/master/lib/linguist/tokenizer.rb 2 // 3 // in their words: 4 // 5 // # Generic programming language tokenizer. 6 // # 7 // # Tokens are designed for use in the language bayes classifier. 8 // # It strips any data strings or comments and preserves significant 9 // # language symbols. 10 // 11 package tokenizer 12 13 import ( 14 "bufio" 15 "bytes" 16 "regexp" 17 ) 18 19 var ( 20 // ByteLimit is the maximum input length for Tokenize() 21 ByteLimit = 100000 22 23 // StartLineComments turns string slices into their regexp slice counterparts 24 // by this package's init() function. 25 StartLineComments = []string{ 26 "\"", // Vim 27 "%", // Tex 28 } 29 // SingleLineComments turns string slices into their regexp slice counterparts 30 // by this package's init() function. 31 SingleLineComments = []string{ 32 "//", // C 33 "--", // Ada, Haskell, AppleScript 34 "#", // Perl, Bash, Ruby 35 } 36 // MultiLineComments turns string slices into their regexp slice counterparts 37 // by this package's init() function. 38 MultiLineComments = [][]string{ 39 {"/*", "*/"}, // C 40 {"<!--", "-->"}, // XML 41 {"{-", "-}"}, // Haskell 42 {"(*", "*)"}, // Coq 43 {`"""`, `"""`}, // Python 44 {"'''", "'''"}, // Python 45 {"#`(", ")"}, // Perl6 46 } 47 startLineComment []*regexp.Regexp 48 beginSingleLineComment []*regexp.Regexp 49 beginMultiLineComment []*regexp.Regexp 50 endMultiLineComment []*regexp.Regexp 51 stringRegexp = regexp.MustCompile(`[^\\]*(["'` + "`])") 52 numberRegexp = regexp.MustCompile(`(0x[0-9a-f]([0-9a-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) 53 ) 54 55 func init() { 56 for _, st := range append(StartLineComments, SingleLineComments...) { 57 startLineComment = append(startLineComment, regexp.MustCompile(`^\s*`+regexp.QuoteMeta(st))) 58 } 59 for _, sl := range SingleLineComments { 60 beginSingleLineComment = append(beginSingleLineComment, regexp.MustCompile(regexp.QuoteMeta(sl))) 61 } 62 for _, ml := range MultiLineComments { 63 beginMultiLineComment = append(beginMultiLineComment, regexp.MustCompile(regexp.QuoteMeta(ml[0]))) 64 endMultiLineComment = append(endMultiLineComment, regexp.MustCompile(regexp.QuoteMeta(ml[1]))) 65 } 66 } 67 68 // FindMultiLineComment compares a given token to the start of a multiline comment 69 // and if true, returns the bool with a regex. Otherwise false and nil. 70 func FindMultiLineComment(token []byte) (matched bool, terminator *regexp.Regexp) { 71 for idx, re := range beginMultiLineComment { 72 if re.Match(token) { 73 return true, endMultiLineComment[idx] 74 } 75 } 76 return false, nil 77 } 78 79 // Tokenize is a simple tokenizer that uses bufio.Scanner to process lines and individual words 80 // and matches them against regular expressions to filter out comments, strings, and numerals 81 // in a manner very similar to github's linguist (see https://github.com/github/linguist/blob/master/lib/linguist/tokenizer.rb) 82 // 83 // The intention is to merely retrieve significant tokens from a piece of source code 84 // in order to identify the programming language using statistical analysis 85 // and NOT to be used as any part of the process of compilation whatsoever. 86 // 87 // NOTE(tso): The tokens produced by this function may be of a dubious quality due to the approach taken. 88 // Feedback and alternate implementations welcome :) 89 func Tokenize(input []byte) (tokens []string) { 90 if len(input) == 0 { 91 return tokens 92 } 93 if len(input) >= ByteLimit { 94 input = input[:ByteLimit] 95 } 96 97 var ( 98 mlStart = false // in a multiline comment 99 mlEnd *regexp.Regexp // closing token regexp 100 stringStart = false // in a string literal 101 stringEnd byte // closing token byte to be found by the String regexp 102 ) 103 104 buf := bytes.NewBuffer(input) 105 scanlines := bufio.NewScanner(buf) 106 scanlines.Split(bufio.ScanLines) 107 108 // NOTE(tso): the use of goto here is probably interchangeable with continue 109 line: 110 for scanlines.Scan() { 111 ln := scanlines.Bytes() 112 113 for _, re := range startLineComment { 114 if re.Match(ln) { 115 goto line 116 } 117 } 118 119 // NOTE(tso): bufio.Scanner.Split(bufio.ScanWords) seems to just split on whitespace 120 // this may yield inaccurate results where there is a lack of sufficient 121 // whitespace for the approaches taken here, i.e. jumping straight to the 122 // next word/line boundary. 123 lnBuffer := bytes.NewBuffer(ln) 124 scanwords := bufio.NewScanner(lnBuffer) 125 scanwords.Split(bufio.ScanWords) 126 word: 127 for scanwords.Scan() { 128 tokenBytes := scanwords.Bytes() 129 tokenString := scanwords.Text() 130 131 // find end of multi-line comment 132 if mlStart { 133 if mlEnd.Match(tokenBytes) { 134 mlStart = false 135 mlEnd = nil 136 } 137 goto word 138 } 139 140 // find end of string literal 141 if stringStart { 142 s := stringRegexp.FindSubmatch(tokenBytes) 143 if s != nil && s[1][0] == stringEnd { 144 stringStart = false 145 stringEnd = 0 146 } 147 goto word 148 } 149 150 // find single-line comment 151 for _, re := range beginSingleLineComment { 152 if re.Match(tokenBytes) { 153 goto line 154 } 155 } 156 157 // find start of multi-line comment 158 if matched, terminator := FindMultiLineComment(tokenBytes); matched { 159 mlStart = true 160 mlEnd = terminator 161 goto word 162 } 163 164 // find start of string literal 165 if s := stringRegexp.FindSubmatch(tokenBytes); s != nil { 166 stringStart = true 167 stringEnd = s[1][0] 168 goto word 169 } 170 171 // find numeric literal 172 if n := numberRegexp.Find(tokenBytes); n != nil { 173 goto word 174 } 175 176 // add valid tokens to result set 177 tokens = append(tokens, tokenString) 178 } 179 } 180 return tokens 181 }