github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/dsref/generate.go (about) 1 package dsref 2 3 import ( 4 "strings" 5 "unicode" 6 7 "golang.org/x/text/transform" 8 "golang.org/x/text/unicode/norm" 9 ) 10 11 // NameMaxLength is the maximum length of a name that will be generated 12 var NameMaxLength = 44 13 14 // GenerateName converts the input into a valid dataset string, which starts with a lower-case 15 // letter, and only has letters, digits, dashes and underscores. 16 func GenerateName(input, prefix string) string { 17 // Normalize unicode by trying to convert all unicode characters to ascii. 18 // https://stackoverflow.com/questions/26722450/remove-diacritics-using-go 19 t := transform.Chain(norm.NFD, transform.RemoveFunc(isNonspacingMark), norm.NFC) 20 input, _, _ = transform.String(t, input) 21 // Trim space from the left and right 22 input = strings.TrimSpace(input) 23 // Run the state machine that converts the string to a valid dataset name 24 name := convertWordsStateMachine(input) 25 // If the dataset does not start with a lower-case letter, add the prefix. It is the 26 // responsibility of the caller to provide a valid prefix. 27 first := []rune(name)[0] 28 if !unicode.IsLower(first) { 29 name = prefix + name 30 } 31 return name 32 } 33 34 func isNonspacingMark(r rune) bool { 35 return unicode.Is(unicode.Mn, r) 36 } 37 38 // State is used to handle the state machine that processes strings 39 type State int 40 41 const ( 42 // StateNone is for being outside of a word, looking at spaces 43 StateNone State = 0 44 // StateLowerWord is for looking at a word made-up of lower-case letters 45 StateLowerWord State = 1 46 // StateFirstUpper is for looking at the first upper-case letter of a word 47 StateFirstUpper State = 2 48 // StateCapsWord is for looking at a word made up of all upper-case letters 49 StateCapsWord State = 3 50 // StateNumber is for looking at a sequence of digits 51 StateNumber State = 4 52 // StatePunc is for looking at punctuation characters 53 StatePunc State = 5 54 // StateExit is set by flush in order to exit the top-level loop immediately 55 StateExit State = 6 56 ) 57 58 // Process a string, finding each word and outputting them to a new string. Words may be separated 59 // by spaces, or punctuation, or could be camel cased. In any case, they should be separated 60 // by dashes or underscores in the output string. Other punctuation characters will be replaced 61 // by dashes. 62 func convertWordsStateMachine(input string) string { 63 state := StateNone 64 // result is the accumlated string based upon the state machine's position 65 result := strings.Builder{} 66 // next is one or more dashes (obtained from spaces or punctuation) or underscores, followed 67 // by characters in the curent word. Once a state change finishes that word, it will be 68 // added to the result by calling `flush`. 69 next := strings.Builder{} 70 71 // flushWord will take the word in `next` and append it to the result. It handles any per-word 72 // tasks, like lower-casing words, separating words with underscores, and making sure the 73 // result does not exceed the maximum length. 74 flushWord := func(nextState State, nextRune rune) { 75 word := strings.ToLower(next.String()) 76 77 // If nothing to flush, exit early 78 if word == "" { 79 return 80 } 81 82 // Put an underscore between words 83 if result.Len() > 0 { 84 prev := []rune(result.String()) 85 // Check if the previous word ended with, and the next word starts with, an alphanum. 86 if isAlphanum(prev[len(prev)-1]) && isAlphanum([]rune(word)[0]) { 87 word = "_" + word 88 } 89 } 90 91 // Check length of result 92 if result.Len()+len(word) > NameMaxLength { 93 if result.Len() == 0 { 94 result.WriteString(word[:NameMaxLength]) 95 } 96 state = StateExit 97 return 98 } 99 100 // Add the word to the result 101 result.WriteString(word) 102 next.Reset() 103 104 // Assign next state and rune 105 state = nextState 106 next.WriteRune(nextRune) 107 } 108 109 // 110 // The state machine below is used to convert an arbitrary string into a string that is 111 // always a valid dataset name. The main two reasons for using a state machine instead of 112 // another approach is these two cases: 113 // AnnualTPSReport -> annual_tps_report 114 // category: climate -> category-climate 115 // These both require looking at multiple characters in a row in order to decide how to 116 // split words and replace punctuation. The state machine accomplishes this with the 117 // two particular states StateCapsWord and StatePunc, respectively. 118 // 119 // The basic form of the state machine that accomplishes these cases is this: 120 // 121 // +-----------+ +-----------------+ +---------------+ 122 // | StateNone | - upper -> | StateFirstUpper | - upper -> | StateCapsWord | 123 // +-----------+ +-----------------+ +---------------+ 124 // | | | | 125 // | | lower (CamelCase) lower 126 // | | | | 127 // | | v v 128 // | | +------------+ `split prev, goto StateLower` 129 // | lower ---------> | StateLower | 130 // | +------------+ 131 // | 132 // | +-----------+ 133 // punc -------------> | StatePunc | - space -> `ignore space, combine with punc` 134 // +-----------+ 135 136 for _, r := range input { 137 if r > 127 { 138 // Ignore non-ascii code points 139 continue 140 } 141 142 switch state { 143 case StateExit: 144 break 145 146 case StateNone: 147 if r == ' ' { 148 next.WriteRune('_') 149 } else if unicode.IsLower(r) { 150 state = StateLowerWord 151 next.WriteRune(r) 152 } else if unicode.IsUpper(r) { 153 state = StateFirstUpper 154 next.WriteRune(r) 155 } else if unicode.IsDigit(r) { 156 state = StateNumber 157 next.WriteRune(r) 158 } else if isPunc(r) { 159 state = StatePunc 160 next.WriteRune(r) 161 } else if r == '_' || r == '-' { 162 next.WriteRune('-') 163 } 164 165 case StateLowerWord: 166 if r == ' ' { 167 flushWord(StateNone, '_') 168 } else if unicode.IsLower(r) { 169 next.WriteRune(r) 170 } else if unicode.IsUpper(r) { 171 // Was looking at a word of lower-case characters, and now encountered a 172 // upper-case letter, which means the previous word is done 173 flushWord(StateFirstUpper, r) 174 } else if unicode.IsDigit(r) { 175 flushWord(StateNumber, r) 176 } else if isPunc(r) { 177 flushWord(StatePunc, '-') 178 } else if r == '_' || r == '-' { 179 flushWord(StateNone, r) 180 } 181 182 case StateFirstUpper: 183 if r == ' ' { 184 flushWord(StateNone, '_') 185 } else if unicode.IsLower(r) { 186 state = StateLowerWord 187 next.WriteRune(r) 188 } else if unicode.IsUpper(r) { 189 state = StateCapsWord 190 next.WriteRune(r) 191 } else if unicode.IsDigit(r) { 192 flushWord(StateNumber, r) 193 } else if isPunc(r) { 194 flushWord(StatePunc, '-') 195 } else if r == '_' || r == '-' { 196 flushWord(StateNone, r) 197 } 198 199 case StateCapsWord: 200 if r == ' ' { 201 flushWord(StateNone, '_') 202 } else if unicode.IsLower(r) { 203 // Just encounterd a series of upper-case letters (2 or more) and now see a 204 // lower-case letter. Split off the previous upper-case letters before the final 205 // one, and turn that into a word, then keep that final upper-case letter as the 206 // start of the next word 207 // 208 // For example, if this is the string: 209 // NBCTelevisionNetwork 210 // We would encounter this situation when the cursor gets here 211 // NBCTelevisionNetwork 212 // ^ 213 // Which would be split into this: 214 // 'nbc' <- previous word 215 // 'te' <- next 216 got := []rune(next.String()) 217 prevWord := got[:len(got)-1] 218 lastLetter := got[len(got)-1] 219 // Pull off the previous word 220 next.Reset() 221 next.WriteString(string(prevWord)) 222 // Flush that word, start the next, which now has two characters 223 flushWord(StateLowerWord, lastLetter) 224 next.WriteRune(r) 225 } else if unicode.IsUpper(r) { 226 next.WriteRune(r) 227 } else if unicode.IsDigit(r) { 228 flushWord(StateNumber, r) 229 } else if isPunc(r) { 230 flushWord(StatePunc, '-') 231 } else if r == '_' || r == '-' { 232 flushWord(StateNone, r) 233 } 234 235 case StateNumber: 236 if r == ' ' { 237 flushWord(StateNone, '_') 238 } else if unicode.IsLower(r) { 239 flushWord(StateLowerWord, r) 240 } else if unicode.IsUpper(r) { 241 // Was looking at a number, and now encountered a upper-case letter, which 242 // means the previous word is done 243 flushWord(StateFirstUpper, r) 244 } else if unicode.IsDigit(r) { 245 next.WriteRune(r) 246 } else if isPunc(r) { 247 flushWord(StatePunc, '-') 248 } else if r == '_' || r == '-' { 249 flushWord(StateNone, r) 250 } 251 252 case StatePunc: 253 if r == ' ' { 254 // Punctuation ignores spaces after it 255 continue 256 } else if unicode.IsLower(r) { 257 flushWord(StateLowerWord, r) 258 } else if unicode.IsUpper(r) { 259 // Was looking at punctuation, and now encountered a upper-case letter, which 260 // means the previous word is done 261 flushWord(StateFirstUpper, r) 262 } else if unicode.IsDigit(r) { 263 flushWord(StateNumber, r) 264 } else if isPunc(r) { 265 next.WriteRune('-') 266 } else if r == '_' || r == '-' { 267 flushWord(StateNone, r) 268 } 269 } 270 } 271 // Input is finished, flush the last word 272 flushWord(StateNone, rune(0)) 273 return result.String() 274 } 275 276 // PuncCharacters is the list of punctuation characters that get converted to dashes 277 const PuncCharacters = "`~!@#$%^&*()=+[{]}\\|;:'\",<.>/?" 278 279 func isPunc(r rune) bool { 280 return strings.IndexRune(PuncCharacters, r) != -1 281 } 282 283 func isAlphanum(r rune) bool { 284 return unicode.IsLetter(r) || unicode.IsDigit(r) 285 }