github.com/dennwc/enry@v1.6.4-0.20180424151738-42391b8e105b/internal/tokenizer/tokenize.go (about)

     1  package tokenizer
     2  
     3  import (
     4  	"bytes"
     5  	"regexp"
     6  )
     7  
// byteLimit is the maximum number of leading bytes of the input that Tokenize
// looks at; longer input is truncated to this length before tokenizing.
const byteLimit = 100000
     9  
    10  func Tokenize(content []byte) []string {
    11  	if len(content) > byteLimit {
    12  		content = content[:byteLimit]
    13  	}
    14  
    15  	tokens := make([][]byte, 0, 50)
    16  	for _, extract := range extractTokens {
    17  		var extractedTokens [][]byte
    18  		content, extractedTokens = extract(content)
    19  		tokens = append(tokens, extractedTokens...)
    20  	}
    21  
    22  	return toString(tokens)
    23  }
    24  
    25  func toString(tokens [][]byte) []string {
    26  	stokens := make([]string, 0, len(tokens))
    27  	for _, token := range tokens {
    28  		stokens = append(stokens, string(token))
    29  	}
    30  
    31  	return stokens
    32  }
    33  
var (
	// extractTokens is the ordered list of extraction steps run by Tokenize.
	// Each step receives the content left by the previous step and returns
	// the (possibly rewritten) content together with the tokens it found.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be this
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regexp.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the patterns whose matches skipCommentsAndLiterals
	// replaces with a single space, applied one after another in this order.
	regexToSkip = []*regexp.Regexp{
		// The order must be this
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)
    96  
    97  func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
    98  	var shebangTokens [][]byte
    99  	matches := reShebang.FindAllSubmatch(content, -1)
   100  	if matches != nil {
   101  		shebangTokens = make([][]byte, 0, 2)
   102  		for _, match := range matches {
   103  			shebangToken := getShebangToken(match)
   104  			shebangTokens = append(shebangTokens, shebangToken)
   105  		}
   106  
   107  		reShebang.ReplaceAll(content, []byte(` `))
   108  	}
   109  
   110  	return content, shebangTokens
   111  }
   112  
   113  func getShebangToken(matchedShebang [][]byte) []byte {
   114  	const prefix = `SHEBANG#!`
   115  	var token []byte
   116  	for i := 1; i < len(matchedShebang); i++ {
   117  		if len(matchedShebang[i]) > 0 {
   118  			token = matchedShebang[i]
   119  			break
   120  		}
   121  	}
   122  
   123  	tokenShebang := append([]byte(prefix), token...)
   124  	return tokenShebang
   125  }
   126  
   127  func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
   128  	tokens := re.FindAll(content, -1)
   129  	content = re.ReplaceAll(content, []byte(` `))
   130  	return content, tokens
   131  }
   132  
   133  func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
   134  	return commonExtracAndReplace(content, rePunctuation)
   135  }
   136  
   137  func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
   138  	return commonExtracAndReplace(content, reRegularToken)
   139  }
   140  
   141  func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
   142  	return commonExtracAndReplace(content, reOperators)
   143  }
   144  
   145  func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
   146  	var SGMLTokens [][]byte
   147  	matches := reSGML.FindAllSubmatch(content, -1)
   148  	if matches != nil {
   149  		SGMLTokens = make([][]byte, 0, 2)
   150  		for _, match := range matches {
   151  			if reSGMLComment.Match(match[0]) {
   152  				continue
   153  			}
   154  
   155  			token := append(match[1], '>')
   156  			SGMLTokens = append(SGMLTokens, token)
   157  			attributes := getSGMLAttributes(match[0])
   158  			SGMLTokens = append(SGMLTokens, attributes...)
   159  		}
   160  
   161  		content = reSGML.ReplaceAll(content, []byte(` `))
   162  	}
   163  
   164  	return content, SGMLTokens
   165  }
   166  
   167  func getSGMLAttributes(SGMLTag []byte) [][]byte {
   168  	var attributes [][]byte
   169  	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
   170  	if matches != nil {
   171  		attributes = make([][]byte, 0, 5)
   172  		for _, match := range matches {
   173  			if len(match[1]) != 0 {
   174  				attributes = append(attributes, match[1])
   175  			}
   176  
   177  			if len(match[2]) != 0 {
   178  				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
   179  				attributes = append(attributes, loneAttributes...)
   180  			}
   181  		}
   182  	}
   183  
   184  	return attributes
   185  }
   186  
   187  func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
   188  	for _, skip := range regexToSkip {
   189  		content = skip.ReplaceAll(content, []byte(` `))
   190  	}
   191  
   192  	return content, nil
   193  }
   194  
   195  func extractRemainders(content []byte) ([]byte, [][]byte) {
   196  	splitted := bytes.Fields(content)
   197  	remainderTokens := make([][]byte, 0, len(splitted)*3)
   198  	for _, remainder := range splitted {
   199  		remainders := bytes.Split(remainder, nil)
   200  		remainderTokens = append(remainderTokens, remainders...)
   201  	}
   202  
   203  	return content, remainderTokens
   204  }