git.templeos.me/xultist/go-enry/v2@v2.0.0-20230215093429-6ef3e87f47c0/internal/tokenizer/tokenize.go

git.templeos.me/xultist/go-enry/v2@v2.0.0-20230215093429-6ef3e87f47c0/internal/tokenizer/tokenize.go (about)

     1  // +build !flex
     2  
     3  package tokenizer
     4  
     5  import (
     6  	"bytes"
     7  
     8  	"github.com/go-enry/go-enry/v2/regex"
     9  )
    10  
    11  // Tokenize returns lexical tokens from content. The tokens returned match what
    12  // the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
    13  //
    14  // BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
    15  // differences between this function and the Linguist output.
    16  func Tokenize(content []byte) []string {
    17  	if len(content) > ByteLimit {
    18  		content = content[:ByteLimit]
    19  	}
    20  
    21  	tokens := make([][]byte, 0, 50)
    22  	for _, extract := range extractTokens {
    23  		var extractedTokens [][]byte
    24  		content, extractedTokens = extract(content)
    25  		tokens = append(tokens, extractedTokens...)
    26  	}
    27  
    28  	return toString(tokens)
    29  }
    30  
    31  func toString(tokens [][]byte) []string {
    32  	stokens := make([]string, 0, len(tokens))
    33  	for _, token := range tokens {
    34  		stokens = append(stokens, string(token))
    35  	}
    36  
    37  	return stokens
    38  }
    39  
var (
	// extractTokens is the extraction pipeline run by Tokenize. Each step
	// receives the content returned by the previous step and returns the
	// (possibly rewritten) content together with the tokens it found.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be this: every step sees the content left by the one before it.
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the patterns whose matches skipCommentsAndLiterals
	// replaces with a single space, applied one after another in this order.
	regexToSkip = []regex.EnryRegexp{
		// The order must be this
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)
   102  
   103  func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
   104  	var shebangTokens [][]byte
   105  	matches := reShebang.FindAllSubmatch(content, -1)
   106  	if matches != nil {
   107  		shebangTokens = make([][]byte, 0, 2)
   108  		for _, match := range matches {
   109  			shebangToken := getShebangToken(match)
   110  			shebangTokens = append(shebangTokens, shebangToken)
   111  		}
   112  
   113  		reShebang.ReplaceAll(content, []byte(` `))
   114  	}
   115  
   116  	return content, shebangTokens
   117  }
   118  
   119  func getShebangToken(matchedShebang [][]byte) []byte {
   120  	const prefix = `SHEBANG#!`
   121  	var token []byte
   122  	for i := 1; i < len(matchedShebang); i++ {
   123  		if len(matchedShebang[i]) > 0 {
   124  			token = matchedShebang[i]
   125  			break
   126  		}
   127  	}
   128  
   129  	tokenShebang := append([]byte(prefix), token...)
   130  	return tokenShebang
   131  }
   132  
   133  func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
   134  	tokens := re.FindAll(content, -1)
   135  	content = re.ReplaceAll(content, []byte(` `))
   136  	return content, tokens
   137  }
   138  
   139  func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
   140  	return commonExtractAndReplace(content, rePunctuation)
   141  }
   142  
   143  func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
   144  	return commonExtractAndReplace(content, reRegularToken)
   145  }
   146  
   147  func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
   148  	return commonExtractAndReplace(content, reOperators)
   149  }
   150  
   151  func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
   152  	var SGMLTokens [][]byte
   153  	matches := reSGML.FindAllSubmatch(content, -1)
   154  	if matches != nil {
   155  		SGMLTokens = make([][]byte, 0, 2)
   156  		for _, match := range matches {
   157  			if reSGMLComment.Match(match[0]) {
   158  				continue
   159  			}
   160  
   161  			token := append(append([]byte(nil), match[1]...), '>')
   162  			SGMLTokens = append(SGMLTokens, token)
   163  			attributes := getSGMLAttributes(match[0])
   164  			SGMLTokens = append(SGMLTokens, attributes...)
   165  		}
   166  
   167  		content = reSGML.ReplaceAll(content, []byte(` `))
   168  	}
   169  
   170  	return content, SGMLTokens
   171  }
   172  
   173  func getSGMLAttributes(SGMLTag []byte) [][]byte {
   174  	var attributes [][]byte
   175  	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
   176  	if matches != nil {
   177  		attributes = make([][]byte, 0, 5)
   178  		for _, match := range matches {
   179  			if len(match[1]) != 0 {
   180  				attributes = append(attributes, match[1])
   181  			}
   182  
   183  			if len(match[2]) != 0 {
   184  				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
   185  				attributes = append(attributes, loneAttributes...)
   186  			}
   187  		}
   188  	}
   189  
   190  	return attributes
   191  }
   192  
   193  func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
   194  	for _, skip := range regexToSkip {
   195  		content = skip.ReplaceAll(content, []byte(` `))
   196  	}
   197  
   198  	return content, nil
   199  }
   200  
   201  func extractRemainders(content []byte) ([]byte, [][]byte) {
   202  	splitted := bytes.Fields(content)
   203  	remainderTokens := make([][]byte, 0, len(splitted)*3)
   204  	for _, remainder := range splitted {
   205  		remainders := bytes.Split(remainder, nil)
   206  		remainderTokens = append(remainderTokens, remainders...)
   207  	}
   208  
   209  	return content, remainderTokens
   210  }