github.com/bzz/enry@v1.6.7/internal/tokenizer/tokenize.go (about)

     1  package tokenizer
     2  
     3  import (
     4  	"bytes"
     5  
     6  	"gopkg.in/src-d/enry.v1/regex"
     7  )
     8  
// byteLimit is the maximum number of leading bytes of the input that Tokenize
// processes; content beyond this length is cut off before tokenization.
const byteLimit = 100000
    10  
    11  func Tokenize(content []byte) []string {
    12  	if len(content) > byteLimit {
    13  		content = content[:byteLimit]
    14  	}
    15  
    16  	tokens := make([][]byte, 0, 50)
    17  	for _, extract := range extractTokens {
    18  		var extractedTokens [][]byte
    19  		content, extractedTokens = extract(content)
    20  		tokens = append(tokens, extractedTokens...)
    21  	}
    22  
    23  	return toString(tokens)
    24  }
    25  
    26  func toString(tokens [][]byte) []string {
    27  	stokens := make([]string, 0, len(tokens))
    28  	for _, token := range tokens {
    29  		stokens = append(stokens, string(token))
    30  	}
    31  
    32  	return stokens
    33  }
    34  
var (
	// extractTokens is the ordered list of extraction steps run by Tokenize.
	// Each step receives the content returned by the previous step and
	// returns the (possibly rewritten) content together with the tokens it
	// produced.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be this
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the patterns whose matches skipCommentsAndLiterals
	// replaces with a space without emitting any token.
	regexToSkip = []regex.EnryRegexp{
		// The order must be this
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)
    97  
    98  func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
    99  	var shebangTokens [][]byte
   100  	matches := reShebang.FindAllSubmatch(content, -1)
   101  	if matches != nil {
   102  		shebangTokens = make([][]byte, 0, 2)
   103  		for _, match := range matches {
   104  			shebangToken := getShebangToken(match)
   105  			shebangTokens = append(shebangTokens, shebangToken)
   106  		}
   107  
   108  		reShebang.ReplaceAll(content, []byte(` `))
   109  	}
   110  
   111  	return content, shebangTokens
   112  }
   113  
   114  func getShebangToken(matchedShebang [][]byte) []byte {
   115  	const prefix = `SHEBANG#!`
   116  	var token []byte
   117  	for i := 1; i < len(matchedShebang); i++ {
   118  		if len(matchedShebang[i]) > 0 {
   119  			token = matchedShebang[i]
   120  			break
   121  		}
   122  	}
   123  
   124  	tokenShebang := append([]byte(prefix), token...)
   125  	return tokenShebang
   126  }
   127  
   128  func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
   129  	tokens := re.FindAll(content, -1)
   130  	content = re.ReplaceAll(content, []byte(` `))
   131  	return content, tokens
   132  }
   133  
   134  func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
   135  	return commonExtractAndReplace(content, rePunctuation)
   136  }
   137  
   138  func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
   139  	return commonExtractAndReplace(content, reRegularToken)
   140  }
   141  
   142  func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
   143  	return commonExtractAndReplace(content, reOperators)
   144  }
   145  
   146  func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
   147  	var SGMLTokens [][]byte
   148  	matches := reSGML.FindAllSubmatch(content, -1)
   149  	if matches != nil {
   150  		SGMLTokens = make([][]byte, 0, 2)
   151  		for _, match := range matches {
   152  			if reSGMLComment.Match(match[0]) {
   153  				continue
   154  			}
   155  
   156  			token := append(match[1], '>')
   157  			SGMLTokens = append(SGMLTokens, token)
   158  			attributes := getSGMLAttributes(match[0])
   159  			SGMLTokens = append(SGMLTokens, attributes...)
   160  		}
   161  
   162  		content = reSGML.ReplaceAll(content, []byte(` `))
   163  	}
   164  
   165  	return content, SGMLTokens
   166  }
   167  
   168  func getSGMLAttributes(SGMLTag []byte) [][]byte {
   169  	var attributes [][]byte
   170  	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
   171  	if matches != nil {
   172  		attributes = make([][]byte, 0, 5)
   173  		for _, match := range matches {
   174  			if len(match[1]) != 0 {
   175  				attributes = append(attributes, match[1])
   176  			}
   177  
   178  			if len(match[2]) != 0 {
   179  				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
   180  				attributes = append(attributes, loneAttributes...)
   181  			}
   182  		}
   183  	}
   184  
   185  	return attributes
   186  }
   187  
   188  func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
   189  	for _, skip := range regexToSkip {
   190  		content = skip.ReplaceAll(content, []byte(` `))
   191  	}
   192  
   193  	return content, nil
   194  }
   195  
   196  func extractRemainders(content []byte) ([]byte, [][]byte) {
   197  	splitted := bytes.Fields(content)
   198  	remainderTokens := make([][]byte, 0, len(splitted)*3)
   199  	for _, remainder := range splitted {
   200  		remainders := bytes.Split(remainder, nil)
   201  		remainderTokens = append(remainderTokens, remainders...)
   202  	}
   203  
   204  	return content, remainderTokens
   205  }