go.charczuk.com@v0.0.0-20240327042549-bc490516bd1a/sdk/stringutil/split_quoted.go (about)

     1  /*
     2  
     3  Copyright (c) 2023 - Present. Will Charczuk. All rights reserved.
     4  Use of this source code is governed by a MIT license that can be found in the LICENSE file at the root of the repository.
     5  
     6  */
     7  
     8  package stringutil
     9  
    10  // SplitQuoted splits a corpus on a given string but treats quoted strings,
    11  // specifically any text within `"` as whole values.
    12  func SplitQuoted(text, sep string) (output []string) {
    13  	if text == "" || sep == "" {
    14  		return
    15  	}
    16  
    17  	// generally we read the text rune by rune
    18  	// if we see a rune that is the start of the separator
    19  	// we consume the separator until we either miss
    20  	// or we reach the end of the separator
    21  	//
    22  	// if we miss, we move the separator contents to the normal
    23  	// accumulation working word.
    24  	//
    25  	// if we run out of separator, we collect the previous
    26  	// accumulation working word as an output.
    27  	//
    28  	// we aim generally to do a single pass through the text
    29  	// and prioritize _not_ re-reading the same rune multiple times.
    30  
    31  	// fsm states
    32  	const (
    33  		stateWord   = iota
    34  		stateSep    = iota
    35  		stateQuoted = iota
    36  	)
    37  
    38  	var state int
    39  	var working, workingSep []rune
    40  	var openingQuote rune
    41  	sepRunes := []rune(sep)
    42  	var sepIndex int
    43  	for _, r := range text {
    44  		switch state {
    45  		case stateWord:
    46  			{
    47  				if r == sepRunes[0] {
    48  					state = stateSep
    49  					workingSep = append(workingSep, r)
    50  					sepIndex = 1
    51  					continue
    52  				}
    53  
    54  				working = append(working, r)
    55  
    56  				if fieldsIsQuote(r) {
    57  					openingQuote = r
    58  					state = stateQuoted
    59  					continue
    60  				}
    61  
    62  				continue
    63  			}
    64  
    65  		case stateSep:
    66  			{
    67  				if sepIndex == len(sepRunes) {
    68  					workingSep = nil
    69  					sepIndex = 0
    70  
    71  					if len(working) > 0 {
    72  						output = append(output, string(working))
    73  					}
    74  
    75  					working = []rune{r}
    76  
    77  					if fieldsIsQuote(r) {
    78  						openingQuote = r
    79  						state = stateQuoted
    80  						continue
    81  					}
    82  
    83  					state = stateWord
    84  					continue
    85  				}
    86  
    87  				if r == sepRunes[sepIndex] {
    88  					workingSep = append(workingSep, r)
    89  					sepIndex++
    90  					continue
    91  				}
    92  
    93  				// if we have a separator miss, add
    94  				// whatever we've collected so far to the
    95  				// working word
    96  				working = append(working, workingSep...)
    97  				workingSep = nil
    98  				sepIndex = 0
    99  
   100  				working = append(working, r)
   101  
   102  				if fieldsIsQuote(r) {
   103  					openingQuote = r
   104  					state = stateQuoted
   105  					continue
   106  				}
   107  
   108  				state = stateWord
   109  				continue
   110  			}
   111  
   112  		case stateQuoted:
   113  			{
   114  				// if we hit a quote, and it matches the "opening" quote
   115  				// switch back to normal word mode
   116  				if fieldsIsQuote(r) && fieldsMatchesQuote(openingQuote, r) {
   117  					state = stateWord
   118  				}
   119  				working = append(working, r)
   120  				continue
   121  			}
   122  
   123  		}
   124  	}
   125  
   126  	if len(workingSep) > 0 {
   127  		if string(workingSep) != sep {
   128  			working = append(working, workingSep...)
   129  		}
   130  	}
   131  	if len(working) > 0 {
   132  		output = append(output, string(working))
   133  	}
   134  	return
   135  }