github.com/rohankumardubey/draft-classic@v0.16.0/pkg/linguist/tokenizer/tokenizer.go

// Package tokenizer is a Go port of https://github.com/github/linguist/blob/master/lib/linguist/tokenizer.rb
//
// In their words:
//
//  # Generic programming language tokenizer.
//  #
//  # Tokens are designed for use in the language bayes classifier.
//  # It strips any data strings or comments and preserves significant
//  # language symbols.
//
package tokenizer

import (
	"bufio"
	"bytes"
	"regexp"
)

var (
	// ByteLimit is the maximum input length for Tokenize(); longer input is
	// truncated to this many bytes.
	ByteLimit = 100000

	// StartLineComments lists comment markers that comment out an entire line
	// when they appear at its start. It is compiled into a regexp slice by
	// this package's init() function.
	StartLineComments = []string{
		"\"", // Vim
		"%",  // Tex
	}
	// SingleLineComments lists comment markers that run to the end of the
	// line. It is compiled into a regexp slice by this package's init()
	// function.
	SingleLineComments = []string{
		"//", // C
		"--", // Ada, Haskell, AppleScript
		"#",  // Perl, Bash, Ruby
	}
	// MultiLineComments lists start/end marker pairs for block comments. It
	// is compiled into regexp slices by this package's init() function.
	MultiLineComments = [][]string{
		{"/*", "*/"},    // C
		{"<!--", "-->"}, // XML
		{"{-", "-}"},    // Haskell
		{"(*", "*)"},    // Coq
		{`"""`, `"""`},  // Python
		{"'''", "'''"},  // Python
		{"#`(", ")"},    // Perl6
	}
	startLineComment       []*regexp.Regexp
	beginSingleLineComment []*regexp.Regexp
	beginMultiLineComment  []*regexp.Regexp
	endMultiLineComment    []*regexp.Regexp
	stringRegexp           = regexp.MustCompile(`[^\\]*(["'` + "`])")
	numberRegexp           = regexp.MustCompile(`(0x[0-9a-f]([0-9a-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
)
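
// For illustration, here is roughly how the two regexps above behave on sample
// input (assumed examples, not taken from the original tests):
//
//	stringRegexp.FindSubmatch([]byte(`x := "hi`)) // submatch 1 is `"`, the opening quote
//	numberRegexp.Find([]byte("0x1f"))             // matches the hex literal "0x1f"
//	numberRegexp.Find([]byte("3.14f"))            // matches "3.14f", float suffix included
//	numberRegexp.Find([]byte("foo"))              // nil: no digits present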

// init compiles the comment-marker strings above into the package-level
// regexp slices used by Tokenize. Note that startLineComment covers both
// StartLineComments and SingleLineComments, anchored to the start of a line.
func init() {
	for _, st := range append(StartLineComments, SingleLineComments...) {
		startLineComment = append(startLineComment, regexp.MustCompile(`^\s*`+regexp.QuoteMeta(st)))
	}
	for _, sl := range SingleLineComments {
		beginSingleLineComment = append(beginSingleLineComment, regexp.MustCompile(regexp.QuoteMeta(sl)))
	}
	for _, ml := range MultiLineComments {
		beginMultiLineComment = append(beginMultiLineComment, regexp.MustCompile(regexp.QuoteMeta(ml[0])))
		endMultiLineComment = append(endMultiLineComment, regexp.MustCompile(regexp.QuoteMeta(ml[1])))
	}
}
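
// For illustration, the patterns produced by init() look roughly like this
// (an assumed sketch of the QuoteMeta output, not a verbatim dump):
//
//	startLineComment:       ^\s*"   ^\s*%   ^\s*//   ^\s*--   ^\s*#
//	beginSingleLineComment: //   --   #
//	beginMultiLineComment:  /\*   <!--   \{-   \(\*   """   '''   #`\(
//	endMultiLineComment:    \*/   -->   -\}   \*\)   """   '''   \)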

// FindMultiLineComment reports whether the given token matches the start of a
// multi-line comment. If it does, it returns true together with a regexp that
// matches the corresponding terminator; otherwise it returns false and nil.
func FindMultiLineComment(token []byte) (matched bool, terminator *regexp.Regexp) {
	for idx, re := range beginMultiLineComment {
		if re.Match(token) {
			return true, endMultiLineComment[idx]
		}
	}
	return false, nil
}
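
// A minimal usage sketch (assumed, for illustration):
//
//	matched, term := FindMultiLineComment([]byte("/*"))
//	// matched == true; term.Match([]byte("*/")) == true
//
//	matched, term = FindMultiLineComment([]byte("foo"))
//	// matched == false; term == nil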

// Tokenize is a simple tokenizer that uses bufio.Scanner to process lines and
// individual words, matching them against regular expressions to filter out
// comments, strings, and numerals, in a manner very similar to GitHub's
// linguist (see https://github.com/github/linguist/blob/master/lib/linguist/tokenizer.rb).
//
// The intention is merely to retrieve significant tokens from a piece of source code
// in order to identify the programming language using statistical analysis,
// and NOT to be used as any part of the process of compilation whatsoever.
//
// NOTE(tso): The tokens produced by this function may be of a dubious quality due to the approach taken.
// Feedback and alternate implementations welcome :)
func Tokenize(input []byte) (tokens []string) {
	if len(input) == 0 {
		return tokens
	}
	if len(input) >= ByteLimit {
		// cap the amount of work done on very large input
		input = input[:ByteLimit]
	}

	var (
		mlStart     = false        // in a multiline comment
		mlEnd       *regexp.Regexp // closing token regexp
		stringStart = false        // in a string literal
		stringEnd   byte           // closing quote byte to be found by stringRegexp
	)

	buf := bytes.NewBuffer(input)
	scanlines := bufio.NewScanner(buf)
	scanlines.Split(bufio.ScanLines)

	// NOTE(tso): the use of goto here is probably interchangeable with continue
line:
	for scanlines.Scan() {
		ln := scanlines.Bytes()

		// skip lines that start with a line-comment marker
		for _, re := range startLineComment {
			if re.Match(ln) {
				goto line
			}
		}

		// NOTE(tso): bufio.Scanner.Split(bufio.ScanWords) seems to just split on whitespace.
		// This may yield inaccurate results where there is a lack of sufficient
		// whitespace for the approaches taken here, i.e. jumping straight to the
		// next word/line boundary.
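		// For example (an assumed illustration): the line "x:=1+2 //sum"
		// splits into just two words, "x:=1+2" and "//sum", so the numeral
		// and the comment marker are each detected against a whole word
		// rather than at character granularity, and "x:=1+2" is dropped
		// in its entirety by the numeral check below.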
		lnBuffer := bytes.NewBuffer(ln)
		scanwords := bufio.NewScanner(lnBuffer)
		scanwords.Split(bufio.ScanWords)
	word:
		for scanwords.Scan() {
			tokenBytes := scanwords.Bytes()
			tokenString := scanwords.Text()

			// find end of multi-line comment
			if mlStart {
				if mlEnd.Match(tokenBytes) {
					mlStart = false
					mlEnd = nil
				}
				goto word
			}

			// find end of string literal
			if stringStart {
				s := stringRegexp.FindSubmatch(tokenBytes)
				if s != nil && s[1][0] == stringEnd {
					stringStart = false
					stringEnd = 0
				}
				goto word
			}
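
			// For example (an assumed illustration): in `s := "hello world"`,
			// the word `"hello` opens the literal with stringEnd == '"' and the
			// word `world"` closes it; both are skipped, while s and := survive.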

			// find single-line comment and discard the rest of the line
			for _, re := range beginSingleLineComment {
				if re.Match(tokenBytes) {
					goto line
				}
			}

			// find start of multi-line comment
			if matched, terminator := FindMultiLineComment(tokenBytes); matched {
				mlStart = true
				mlEnd = terminator
				goto word
			}

			// find start of string literal
			if s := stringRegexp.FindSubmatch(tokenBytes); s != nil {
				stringStart = true
				stringEnd = s[1][0]
				goto word
			}

			// discard tokens containing a numeric literal
			if n := numberRegexp.Find(tokenBytes); n != nil {
				goto word
			}

			// add valid tokens to result set
			tokens = append(tokens, tokenString)
		}
	}
	return tokens
}
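
// A minimal usage sketch from a caller's perspective (assumed; the exact token
// set depends on the heuristics above):
//
//	src := []byte("int main() { /* entry */ return 0; } // done")
//	fmt.Println(tokenizer.Tokenize(src))
//	// roughly: [int main() { return }]
//	// the comments and the numeric token "0;" are filtered out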