github.com/blend/go-sdk@v1.20220411.3/stringutil/split_space_quoted.go (about)

     1  /*
     2  
     3  Copyright (c) 2022 - Present. Blend Labs, Inc. All rights reserved
     4  Use of this source code is governed by a MIT license that can be found in the LICENSE file.
     5  
     6  */
     7  
     8  package stringutil
     9  
    10  import "unicode"
    11  
    12  // SplitSpaceQuoted splits a corpus on space but treats quoted strings
    13  // i.e. within `"` as being atomic chunks.
    14  func SplitSpaceQuoted(text string) (output []string) {
    15  	if len(text) == 0 {
    16  		return
    17  	}
    18  
    19  	// fsm states
    20  	const (
    21  		stateLeadingSpace    = iota
    22  		stateWord            = iota
    23  		stateIntraSpace      = iota
    24  		stateLeadingQuoted   = iota
    25  		stateIntraWordQuoted = iota
    26  	)
    27  
    28  	var state int
    29  	var word []rune
    30  	var opened rune
    31  	for _, r := range text {
    32  		switch state {
    33  		case stateLeadingSpace: //leading whitespace until quote or alpha
    34  			if !unicode.IsSpace(r) {
    35  				if isQuote(r) { // start a quoted section
    36  					opened = r
    37  					state = stateLeadingQuoted
    38  				} else {
    39  					state = stateWord
    40  					word = append(word, r)
    41  				}
    42  			}
    43  		case stateWord: // within a word
    44  			if isQuote(r) {
    45  				opened = r
    46  				word = append(word, r)
    47  				state = stateIntraWordQuoted
    48  			} else if unicode.IsSpace(r) {
    49  				if len(word) > 0 {
    50  					output = append(output, string(word))
    51  					word = nil
    52  				}
    53  				state = stateIntraSpace
    54  			} else {
    55  				word = append(word, r)
    56  			}
    57  		case stateIntraSpace: // we've seen a space after we've seen at least one word
    58  			// consume spaces until a non-space character
    59  			if !unicode.IsSpace(r) {
    60  				if isQuote(r) { // start a quoted section
    61  					opened = r
    62  					state = stateLeadingQuoted
    63  				} else {
    64  					state = stateWord
    65  					word = append(word, r)
    66  				}
    67  			}
    68  		case stateLeadingQuoted: // leading quoted section
    69  			// if we close a quoted section, switch
    70  			// back to normal word mode
    71  			if matchesQuote(opened, r) {
    72  				state = stateWord
    73  			} else {
    74  				word = append(word, r)
    75  			}
    76  		case stateIntraWordQuoted: // quoted section within a word
    77  			// if we close a quoted section, switch
    78  			// back to normal word mode
    79  			if matchesQuote(opened, r) {
    80  				state = stateWord
    81  			}
    82  			word = append(word, r)
    83  		}
    84  	}
    85  
    86  	if len(word) > 0 {
    87  		output = append(output, string(word))
    88  	}
    89  	return
    90  }
    91  
    92  func isQuote(r rune) bool {
    93  	return r == '"' ||
    94  		r == '\'' ||
    95  		r == '“' ||
    96  		r == '”' ||
    97  		r == '`' ||
    98  		r == '‘' ||
    99  		r == '’'
   100  }
   101  
   102  func matchesQuote(a, b rune) bool {
   103  	if a == '“' && b == '”' {
   104  		return true
   105  	}
   106  	if a == '”' && b == '“' {
   107  		return true
   108  	}
   109  	if a == '‘' && b == '’' {
   110  		return true
   111  	}
   112  	if a == '’' && b == '‘' {
   113  		return true
   114  	}
   115  	return a == b
   116  }