github.com/rohankumardubey/go-enry@v1.7.3/internal/tokenizer/tokenize.go (about)

     1  // Package tokenizer implements file tokenization used by the enry content
     2  // classifier. This package is an implementation detail of enry and should not
     3  // be imported by other packages.
     4  package tokenizer
     5  
     6  import (
     7  	"bytes"
     8  
     9  	"gopkg.in/src-d/enry.v1/regex"
    10  )
    11  
// byteLimit caps how many bytes of input Tokenize will inspect (~100KB).
const byteLimit = 100000
    13  
    14  // Tokenize returns language-agnostic lexical tokens from content. The tokens
    15  // returned should match what the Linguist library returns. At most the first
    16  // 100KB of content are tokenized.
    17  func Tokenize(content []byte) []string {
    18  	if len(content) > byteLimit {
    19  		content = content[:byteLimit]
    20  	}
    21  
    22  	// Copy the input so that changes wrought by the tokenization steps do not
    23  	// modify the caller's copy of the input. See #196.
    24  	content = append([]byte(nil), content...)
    25  
    26  	tokens := make([][]byte, 0, 50)
    27  	for _, extract := range extractTokens {
    28  		var extractedTokens [][]byte
    29  		content, extractedTokens = extract(content)
    30  		tokens = append(tokens, extractedTokens...)
    31  	}
    32  
    33  	return toString(tokens)
    34  }
    35  
    36  func toString(tokens [][]byte) []string {
    37  	stokens := make([]string, 0, len(tokens))
    38  	for _, token := range tokens {
    39  		stokens = append(stokens, string(token))
    40  	}
    41  
    42  	return stokens
    43  }
    44  
var (
	// extractTokens is the ordered pipeline of tokenization passes. Each
	// pass receives the (possibly already rewritten) content and returns
	// the content with its matches blanked out plus the tokens it found.
	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
		// The order must be this: earlier passes consume constructs
		// (shebangs, SGML tags, literals) that later passes must not see.
		extractAndReplaceShebang,
		extractAndReplaceSGML,
		skipCommentsAndLiterals,
		extractAndReplacePunctuation,
		extractAndReplaceRegular,
		extractAndReplaceOperator,
		extractRemainders,
	}

	// Differences between golang regexp and oniguruma:
	// 1. no (?s) in oniguruma - makes dot match \n
	// 2. no (?U) in oniguruma - ungreedy *
	// 3. (?m) implies dot matches \n in oniguruma
	// 4. oniguruma handles \w differently - impossible, but true
	//
	// Workarounds:
	// 1. (.|\n)
	// 2. replace * with *?
	// 3. replace . with [^\n]
	// 4. replace \w with [0-9A-Za-z_]
	//
	// Original golang regexps:
	//
	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
	//
	// These regexps were converted to work in the same way for both engines:
	//
	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)

	// regexToSkip lists the literal/comment patterns that are blanked out
	// of the content without producing any tokens.
	regexToSkip = []regex.EnryRegexp{
		// The order must be this: string literals before comments, so a
		// comment marker inside a string is not mistaken for a comment.
		reLiteralStringQuotes,
		reMultilineComment,
		reSingleLineComment,
		reLiteralNumber,
	}
)
   107  
   108  func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
   109  	var shebangTokens [][]byte
   110  	matches := reShebang.FindAllSubmatch(content, -1)
   111  	if matches != nil {
   112  		shebangTokens = make([][]byte, 0, 2)
   113  		for _, match := range matches {
   114  			shebangToken := getShebangToken(match)
   115  			shebangTokens = append(shebangTokens, shebangToken)
   116  		}
   117  
   118  		reShebang.ReplaceAll(content, []byte(` `))
   119  	}
   120  
   121  	return content, shebangTokens
   122  }
   123  
   124  func getShebangToken(matchedShebang [][]byte) []byte {
   125  	const prefix = `SHEBANG#!`
   126  	var token []byte
   127  	for i := 1; i < len(matchedShebang); i++ {
   128  		if len(matchedShebang[i]) > 0 {
   129  			token = matchedShebang[i]
   130  			break
   131  		}
   132  	}
   133  
   134  	tokenShebang := append([]byte(prefix), token...)
   135  	return tokenShebang
   136  }
   137  
   138  func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
   139  	tokens := re.FindAll(content, -1)
   140  	content = re.ReplaceAll(content, []byte(` `))
   141  	return content, tokens
   142  }
   143  
// extractAndReplacePunctuation tokenizes punctuation characters
// (; { } ( ) [ ]) and blanks them out of content.
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, rePunctuation)
}
   147  
// extractAndReplaceRegular tokenizes regular word-like runs (identifiers,
// dotted names, paths, @/#-prefixed names) and blanks them out of content.
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, reRegularToken)
}
   151  
// extractAndReplaceOperator tokenizes operator characters (<, <<, +, -, *, /,
// %, &, &&, |, ||) and blanks them out of content.
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
	return commonExtractAndReplace(content, reOperators)
}
   155  
   156  func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
   157  	var SGMLTokens [][]byte
   158  	matches := reSGML.FindAllSubmatch(content, -1)
   159  	if matches != nil {
   160  		SGMLTokens = make([][]byte, 0, 2)
   161  		for _, match := range matches {
   162  			if reSGMLComment.Match(match[0]) {
   163  				continue
   164  			}
   165  
   166  			token := append(match[1], '>')
   167  			SGMLTokens = append(SGMLTokens, token)
   168  			attributes := getSGMLAttributes(match[0])
   169  			SGMLTokens = append(SGMLTokens, attributes...)
   170  		}
   171  
   172  		content = reSGML.ReplaceAll(content, []byte(` `))
   173  	}
   174  
   175  	return content, SGMLTokens
   176  }
   177  
   178  func getSGMLAttributes(SGMLTag []byte) [][]byte {
   179  	var attributes [][]byte
   180  	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
   181  	if matches != nil {
   182  		attributes = make([][]byte, 0, 5)
   183  		for _, match := range matches {
   184  			if len(match[1]) != 0 {
   185  				attributes = append(attributes, match[1])
   186  			}
   187  
   188  			if len(match[2]) != 0 {
   189  				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
   190  				attributes = append(attributes, loneAttributes...)
   191  			}
   192  		}
   193  	}
   194  
   195  	return attributes
   196  }
   197  
   198  func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
   199  	for _, skip := range regexToSkip {
   200  		content = skip.ReplaceAll(content, []byte(` `))
   201  	}
   202  
   203  	return content, nil
   204  }
   205  
   206  func extractRemainders(content []byte) ([]byte, [][]byte) {
   207  	splitted := bytes.Fields(content)
   208  	remainderTokens := make([][]byte, 0, len(splitted)*3)
   209  	for _, remainder := range splitted {
   210  		remainders := bytes.Split(remainder, nil)
   211  		remainderTokens = append(remainderTokens, remainders...)
   212  	}
   213  
   214  	return content, remainderTokens
   215  }