github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/dsref/generate.go (about)

     1  package dsref
     2  
     3  import (
     4  	"strings"
     5  	"unicode"
     6  
     7  	"golang.org/x/text/transform"
     8  	"golang.org/x/text/unicode/norm"
     9  )
    10  
// NameMaxLength is the maximum length of a name that will be generated. The limit is
// enforced by the word-conversion state machine, which compares byte lengths while
// appending words. A prefix that GenerateName prepends afterwards is not counted against it.
var NameMaxLength = 44
    13  
    14  // GenerateName converts the input into a valid dataset string, which starts with a lower-case
    15  // letter, and only has letters, digits, dashes and underscores.
    16  func GenerateName(input, prefix string) string {
    17  	// Normalize unicode by trying to convert all unicode characters to ascii.
    18  	// https://stackoverflow.com/questions/26722450/remove-diacritics-using-go
    19  	t := transform.Chain(norm.NFD, transform.RemoveFunc(isNonspacingMark), norm.NFC)
    20  	input, _, _ = transform.String(t, input)
    21  	// Trim space from the left and right
    22  	input = strings.TrimSpace(input)
    23  	// Run the state machine that converts the string to a valid dataset name
    24  	name := convertWordsStateMachine(input)
    25  	// If the dataset does not start with a lower-case letter, add the prefix. It is the
    26  	// responsibility of the caller to provide a valid prefix.
    27  	first := []rune(name)[0]
    28  	if !unicode.IsLower(first) {
    29  		name = prefix + name
    30  	}
    31  	return name
    32  }
    33  
    34  func isNonspacingMark(r rune) bool {
    35  	return unicode.Is(unicode.Mn, r)
    36  }
    37  
    38  // State is used to handle the state machine that processes strings
    39  type State int
    40  
    41  const (
    42  	// StateNone is for being outside of a word, looking at spaces
    43  	StateNone State = 0
    44  	// StateLowerWord is for looking at a word made-up of lower-case letters
    45  	StateLowerWord State = 1
    46  	// StateFirstUpper is for looking at the first upper-case letter of a word
    47  	StateFirstUpper State = 2
    48  	// StateCapsWord is for looking at a word made up of all upper-case letters
    49  	StateCapsWord State = 3
    50  	// StateNumber is for looking at a sequence of digits
    51  	StateNumber State = 4
    52  	// StatePunc is for looking at punctuation characters
    53  	StatePunc State = 5
    54  	// StateExit is set by flush in order to exit the top-level loop immediately
    55  	StateExit State = 6
    56  )
    57  
    58  // Process a string, finding each word and outputting them to a new string. Words may be separated
    59  // by spaces, or punctuation, or could be camel cased. In any case, they should be separated
    60  // by dashes or underscores in the output string. Other punctuation characters will be replaced
    61  // by dashes.
    62  func convertWordsStateMachine(input string) string {
    63  	state := StateNone
    64  	// result is the accumlated string based upon the state machine's position
    65  	result := strings.Builder{}
    66  	// next is one or more dashes (obtained from spaces or punctuation) or underscores, followed
    67  	// by characters in the curent word. Once a state change finishes that word, it will be
    68  	// added to the result by calling `flush`.
    69  	next := strings.Builder{}
    70  
    71  	// flushWord will take the word in `next` and append it to the result. It handles any per-word
    72  	// tasks, like lower-casing words, separating words with underscores, and making sure the
    73  	// result does not exceed the maximum length.
    74  	flushWord := func(nextState State, nextRune rune) {
    75  		word := strings.ToLower(next.String())
    76  
    77  		// If nothing to flush, exit early
    78  		if word == "" {
    79  			return
    80  		}
    81  
    82  		// Put an underscore between words
    83  		if result.Len() > 0 {
    84  			prev := []rune(result.String())
    85  			// Check if the previous word ended with, and the next word starts with, an alphanum.
    86  			if isAlphanum(prev[len(prev)-1]) && isAlphanum([]rune(word)[0]) {
    87  				word = "_" + word
    88  			}
    89  		}
    90  
    91  		// Check length of result
    92  		if result.Len()+len(word) > NameMaxLength {
    93  			if result.Len() == 0 {
    94  				result.WriteString(word[:NameMaxLength])
    95  			}
    96  			state = StateExit
    97  			return
    98  		}
    99  
   100  		// Add the word to the result
   101  		result.WriteString(word)
   102  		next.Reset()
   103  
   104  		// Assign next state and rune
   105  		state = nextState
   106  		next.WriteRune(nextRune)
   107  	}
   108  
   109  	//
   110  	// The state machine below is used to convert an arbitrary string into a string that is
   111  	// always a valid dataset name. The main two reasons for using a state machine instead of
   112  	// another approach is these two cases:
   113  	//   AnnualTPSReport -> annual_tps_report
   114  	//   category: climate -> category-climate
   115  	// These both require looking at multiple characters in a row in order to decide how to
   116  	// split words and replace punctuation. The state machine accomplishes this with the
   117  	// two particular states StateCapsWord and StatePunc, respectively.
   118  	//
   119  	// The basic form of the state machine that accomplishes these cases is this:
   120  	//
   121  	// +-----------+            +-----------------+            +---------------+
   122  	// | StateNone | - upper -> | StateFirstUpper | - upper -> | StateCapsWord |
   123  	// +-----------+            +-----------------+            +---------------+
   124  	//    |  |                     |                              |
   125  	//    |  |                    lower (CamelCase)              lower
   126  	//    |  |                     |                              |
   127  	//    |  |                     v                              v
   128  	//    |  |               +------------+             `split prev, goto StateLower`
   129  	//    | lower ---------> | StateLower |
   130  	//    |                  +------------+
   131  	//    |
   132  	//    |                  +-----------+
   133  	//   punc -------------> | StatePunc | - space -> `ignore space, combine with punc`
   134  	//                       +-----------+
   135  
   136  	for _, r := range input {
   137  		if r > 127 {
   138  			// Ignore non-ascii code points
   139  			continue
   140  		}
   141  
   142  		switch state {
   143  		case StateExit:
   144  			break
   145  
   146  		case StateNone:
   147  			if r == ' ' {
   148  				next.WriteRune('_')
   149  			} else if unicode.IsLower(r) {
   150  				state = StateLowerWord
   151  				next.WriteRune(r)
   152  			} else if unicode.IsUpper(r) {
   153  				state = StateFirstUpper
   154  				next.WriteRune(r)
   155  			} else if unicode.IsDigit(r) {
   156  				state = StateNumber
   157  				next.WriteRune(r)
   158  			} else if isPunc(r) {
   159  				state = StatePunc
   160  				next.WriteRune(r)
   161  			} else if r == '_' || r == '-' {
   162  				next.WriteRune('-')
   163  			}
   164  
   165  		case StateLowerWord:
   166  			if r == ' ' {
   167  				flushWord(StateNone, '_')
   168  			} else if unicode.IsLower(r) {
   169  				next.WriteRune(r)
   170  			} else if unicode.IsUpper(r) {
   171  				// Was looking at a word of lower-case characters, and now encountered a
   172  				// upper-case letter, which means the previous word is done
   173  				flushWord(StateFirstUpper, r)
   174  			} else if unicode.IsDigit(r) {
   175  				flushWord(StateNumber, r)
   176  			} else if isPunc(r) {
   177  				flushWord(StatePunc, '-')
   178  			} else if r == '_' || r == '-' {
   179  				flushWord(StateNone, r)
   180  			}
   181  
   182  		case StateFirstUpper:
   183  			if r == ' ' {
   184  				flushWord(StateNone, '_')
   185  			} else if unicode.IsLower(r) {
   186  				state = StateLowerWord
   187  				next.WriteRune(r)
   188  			} else if unicode.IsUpper(r) {
   189  				state = StateCapsWord
   190  				next.WriteRune(r)
   191  			} else if unicode.IsDigit(r) {
   192  				flushWord(StateNumber, r)
   193  			} else if isPunc(r) {
   194  				flushWord(StatePunc, '-')
   195  			} else if r == '_' || r == '-' {
   196  				flushWord(StateNone, r)
   197  			}
   198  
   199  		case StateCapsWord:
   200  			if r == ' ' {
   201  				flushWord(StateNone, '_')
   202  			} else if unicode.IsLower(r) {
   203  				// Just encounterd a series of upper-case letters (2 or more) and now see a
   204  				// lower-case letter. Split off the previous upper-case letters before the final
   205  				// one, and turn that into a word, then keep that final upper-case letter as the
   206  				// start of the next word
   207  				//
   208  				// For example, if this is the string:
   209  				//   NBCTelevisionNetwork
   210  				// We would encounter this situation when the cursor gets here
   211  				//   NBCTelevisionNetwork
   212  				//       ^
   213  				// Which would be split into this:
   214  				// 'nbc' <- previous word
   215  				// 'te'  <- next
   216  				got := []rune(next.String())
   217  				prevWord := got[:len(got)-1]
   218  				lastLetter := got[len(got)-1]
   219  				// Pull off the previous word
   220  				next.Reset()
   221  				next.WriteString(string(prevWord))
   222  				// Flush that word, start the next, which now has two characters
   223  				flushWord(StateLowerWord, lastLetter)
   224  				next.WriteRune(r)
   225  			} else if unicode.IsUpper(r) {
   226  				next.WriteRune(r)
   227  			} else if unicode.IsDigit(r) {
   228  				flushWord(StateNumber, r)
   229  			} else if isPunc(r) {
   230  				flushWord(StatePunc, '-')
   231  			} else if r == '_' || r == '-' {
   232  				flushWord(StateNone, r)
   233  			}
   234  
   235  		case StateNumber:
   236  			if r == ' ' {
   237  				flushWord(StateNone, '_')
   238  			} else if unicode.IsLower(r) {
   239  				flushWord(StateLowerWord, r)
   240  			} else if unicode.IsUpper(r) {
   241  				// Was looking at a number, and now encountered a upper-case letter, which
   242  				// means the previous word is done
   243  				flushWord(StateFirstUpper, r)
   244  			} else if unicode.IsDigit(r) {
   245  				next.WriteRune(r)
   246  			} else if isPunc(r) {
   247  				flushWord(StatePunc, '-')
   248  			} else if r == '_' || r == '-' {
   249  				flushWord(StateNone, r)
   250  			}
   251  
   252  		case StatePunc:
   253  			if r == ' ' {
   254  				// Punctuation ignores spaces after it
   255  				continue
   256  			} else if unicode.IsLower(r) {
   257  				flushWord(StateLowerWord, r)
   258  			} else if unicode.IsUpper(r) {
   259  				// Was looking at punctuation, and now encountered a upper-case letter, which
   260  				// means the previous word is done
   261  				flushWord(StateFirstUpper, r)
   262  			} else if unicode.IsDigit(r) {
   263  				flushWord(StateNumber, r)
   264  			} else if isPunc(r) {
   265  				next.WriteRune('-')
   266  			} else if r == '_' || r == '-' {
   267  				flushWord(StateNone, r)
   268  			}
   269  		}
   270  	}
   271  	// Input is finished, flush the last word
   272  	flushWord(StateNone, rune(0))
   273  	return result.String()
   274  }
   275  
   276  // PuncCharacters is the list of punctuation characters that get converted to dashes
   277  const PuncCharacters = "`~!@#$%^&*()=+[{]}\\|;:'\",<.>/?"
   278  
   279  func isPunc(r rune) bool {
   280  	return strings.IndexRune(PuncCharacters, r) != -1
   281  }
   282  
   283  func isAlphanum(r rune) bool {
   284  	return unicode.IsLetter(r) || unicode.IsDigit(r)
   285  }