github.com/pankona/gometalinter@v2.0.11+incompatible/_linters/src/golang.org/x/text/internal/ucd/ucd.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package ucd provides a parser for Unicode Character Database files, the 6 // format of which is defined in http://www.unicode.org/reports/tr44/. See 7 // http://www.unicode.org/Public/UCD/latest/ucd/ for example files. 8 // 9 // It currently does not support substitutions of missing fields. 10 package ucd // import "golang.org/x/text/internal/ucd" 11 12 import ( 13 "bufio" 14 "bytes" 15 "errors" 16 "io" 17 "log" 18 "regexp" 19 "strconv" 20 "strings" 21 ) 22 23 // UnicodeData.txt fields. 24 const ( 25 CodePoint = iota 26 Name 27 GeneralCategory 28 CanonicalCombiningClass 29 BidiClass 30 DecompMapping 31 DecimalValue 32 DigitValue 33 NumericValue 34 BidiMirrored 35 Unicode1Name 36 ISOComment 37 SimpleUppercaseMapping 38 SimpleLowercaseMapping 39 SimpleTitlecaseMapping 40 ) 41 42 // Parse calls f for each entry in the given reader of a UCD file. It will close 43 // the reader upon return. It will call log.Fatal if any error occurred. 44 // 45 // This implements the most common usage pattern of using Parser. 46 func Parse(r io.ReadCloser, f func(p *Parser)) { 47 defer r.Close() 48 49 p := New(r) 50 for p.Next() { 51 f(p) 52 } 53 if err := p.Err(); err != nil { 54 r.Close() // os.Exit will cause defers not to be called. 55 log.Fatal(err) 56 } 57 } 58 59 // An Option is used to configure a Parser. 60 type Option func(p *Parser) 61 62 func keepRanges(p *Parser) { 63 p.keepRanges = true 64 } 65 66 var ( 67 // KeepRanges prevents the expansion of ranges. The raw ranges can be 68 // obtained by calling Range(0) on the parser. 69 KeepRanges Option = keepRanges 70 ) 71 72 // The Part option register a handler for lines starting with a '@'. The text 73 // after a '@' is available as the first field. Comments are handled as usual. 74 func Part(f func(p *Parser)) Option { 75 return func(p *Parser) { 76 p.partHandler = f 77 } 78 } 79 80 // The CommentHandler option passes comments that are on a line by itself to 81 // a given handler. 82 func CommentHandler(f func(s string)) Option { 83 return func(p *Parser) { 84 p.commentHandler = f 85 } 86 } 87 88 // A Parser parses Unicode Character Database (UCD) files. 89 type Parser struct { 90 scanner *bufio.Scanner 91 92 keepRanges bool // Don't expand rune ranges in field 0. 93 94 err error 95 comment []byte 96 field [][]byte 97 // parsedRange is needed in case Range(0) is called more than once for one 98 // field. In some cases this requires scanning ahead. 99 parsedRange bool 100 rangeStart, rangeEnd rune 101 102 partHandler func(p *Parser) 103 commentHandler func(s string) 104 } 105 106 func (p *Parser) setError(err error) { 107 if p.err == nil { 108 p.err = err 109 } 110 } 111 112 func (p *Parser) getField(i int) []byte { 113 if i >= len(p.field) { 114 return nil 115 } 116 return p.field[i] 117 } 118 119 // Err returns a non-nil error if any error occurred during parsing. 120 func (p *Parser) Err() error { 121 return p.err 122 } 123 124 // New returns a Parser for the given Reader. 125 func New(r io.Reader, o ...Option) *Parser { 126 p := &Parser{ 127 scanner: bufio.NewScanner(r), 128 } 129 for _, f := range o { 130 f(p) 131 } 132 return p 133 } 134 135 // Next parses the next line in the file. It returns true if a line was parsed 136 // and false if it reached the end of the file. 137 func (p *Parser) Next() bool { 138 if !p.keepRanges && p.rangeStart < p.rangeEnd { 139 p.rangeStart++ 140 return true 141 } 142 p.comment = nil 143 p.field = p.field[:0] 144 p.parsedRange = false 145 146 for p.scanner.Scan() { 147 b := p.scanner.Bytes() 148 if len(b) == 0 { 149 continue 150 } 151 if b[0] == '#' { 152 if p.commentHandler != nil { 153 p.commentHandler(strings.TrimSpace(string(b[1:]))) 154 } 155 continue 156 } 157 158 // Parse line 159 if i := bytes.IndexByte(b, '#'); i != -1 { 160 p.comment = bytes.TrimSpace(b[i+1:]) 161 b = b[:i] 162 } 163 if b[0] == '@' { 164 if p.partHandler != nil { 165 p.field = append(p.field, bytes.TrimSpace(b[1:])) 166 p.partHandler(p) 167 p.field = p.field[:0] 168 } 169 p.comment = nil 170 continue 171 } 172 for { 173 i := bytes.IndexByte(b, ';') 174 if i == -1 { 175 p.field = append(p.field, bytes.TrimSpace(b)) 176 break 177 } 178 p.field = append(p.field, bytes.TrimSpace(b[:i])) 179 b = b[i+1:] 180 } 181 if !p.keepRanges { 182 p.rangeStart, p.rangeEnd = p.getRange(0) 183 } 184 return true 185 } 186 p.setError(p.scanner.Err()) 187 return false 188 } 189 190 func parseRune(b []byte) (rune, error) { 191 if len(b) > 2 && b[0] == 'U' && b[1] == '+' { 192 b = b[2:] 193 } 194 x, err := strconv.ParseUint(string(b), 16, 32) 195 return rune(x), err 196 } 197 198 func (p *Parser) parseRune(b []byte) rune { 199 x, err := parseRune(b) 200 p.setError(err) 201 return x 202 } 203 204 // Rune parses and returns field i as a rune. 205 func (p *Parser) Rune(i int) rune { 206 if i > 0 || p.keepRanges { 207 return p.parseRune(p.getField(i)) 208 } 209 return p.rangeStart 210 } 211 212 // Runes interprets and returns field i as a sequence of runes. 213 func (p *Parser) Runes(i int) (runes []rune) { 214 add := func(b []byte) { 215 if b = bytes.TrimSpace(b); len(b) > 0 { 216 runes = append(runes, p.parseRune(b)) 217 } 218 } 219 for b := p.getField(i); ; { 220 i := bytes.IndexByte(b, ' ') 221 if i == -1 { 222 add(b) 223 break 224 } 225 add(b[:i]) 226 b = b[i+1:] 227 } 228 return 229 } 230 231 var ( 232 errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>") 233 234 // reRange matches one line of a legacy rune range. 235 reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$") 236 ) 237 238 // Range parses and returns field i as a rune range. A range is inclusive at 239 // both ends. If the field only has one rune, first and last will be identical. 240 // It supports the legacy format for ranges used in UnicodeData.txt. 241 func (p *Parser) Range(i int) (first, last rune) { 242 if !p.keepRanges { 243 return p.rangeStart, p.rangeStart 244 } 245 return p.getRange(i) 246 } 247 248 func (p *Parser) getRange(i int) (first, last rune) { 249 b := p.getField(i) 250 if k := bytes.Index(b, []byte("..")); k != -1 { 251 return p.parseRune(b[:k]), p.parseRune(b[k+2:]) 252 } 253 // The first field may not be a rune, in which case we may ignore any error 254 // and set the range as 0..0. 255 x, err := parseRune(b) 256 if err != nil { 257 // Disable range parsing henceforth. This ensures that an error will be 258 // returned if the user subsequently will try to parse this field as 259 // a Rune. 260 p.keepRanges = true 261 } 262 // Special case for UnicodeData that was retained for backwards compatibility. 263 if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) { 264 if p.parsedRange { 265 return p.rangeStart, p.rangeEnd 266 } 267 mf := reRange.FindStringSubmatch(p.scanner.Text()) 268 if mf == nil || !p.scanner.Scan() { 269 p.setError(errIncorrectLegacyRange) 270 return x, x 271 } 272 // Using Bytes would be more efficient here, but Text is a lot easier 273 // and this is not a frequent case. 274 ml := reRange.FindStringSubmatch(p.scanner.Text()) 275 if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] { 276 p.setError(errIncorrectLegacyRange) 277 return x, x 278 } 279 p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])]) 280 p.parsedRange = true 281 return p.rangeStart, p.rangeEnd 282 } 283 return x, x 284 } 285 286 // bools recognizes all valid UCD boolean values. 287 var bools = map[string]bool{ 288 "": false, 289 "N": false, 290 "No": false, 291 "F": false, 292 "False": false, 293 "Y": true, 294 "Yes": true, 295 "T": true, 296 "True": true, 297 } 298 299 // Bool parses and returns field i as a boolean value. 300 func (p *Parser) Bool(i int) bool { 301 b := p.getField(i) 302 for s, v := range bools { 303 if bstrEq(b, s) { 304 return v 305 } 306 } 307 p.setError(strconv.ErrSyntax) 308 return false 309 } 310 311 // Int parses and returns field i as an integer value. 312 func (p *Parser) Int(i int) int { 313 x, err := strconv.ParseInt(string(p.getField(i)), 10, 64) 314 p.setError(err) 315 return int(x) 316 } 317 318 // Uint parses and returns field i as an unsigned integer value. 319 func (p *Parser) Uint(i int) uint { 320 x, err := strconv.ParseUint(string(p.getField(i)), 10, 64) 321 p.setError(err) 322 return uint(x) 323 } 324 325 // Float parses and returns field i as a decimal value. 326 func (p *Parser) Float(i int) float64 { 327 x, err := strconv.ParseFloat(string(p.getField(i)), 64) 328 p.setError(err) 329 return x 330 } 331 332 // String parses and returns field i as a string value. 333 func (p *Parser) String(i int) string { 334 return string(p.getField(i)) 335 } 336 337 // Strings parses and returns field i as a space-separated list of strings. 338 func (p *Parser) Strings(i int) []string { 339 ss := strings.Split(string(p.getField(i)), " ") 340 for i, s := range ss { 341 ss[i] = strings.TrimSpace(s) 342 } 343 return ss 344 } 345 346 // Comment returns the comments for the current line. 347 func (p *Parser) Comment() string { 348 return string(p.comment) 349 } 350 351 var errUndefinedEnum = errors.New("ucd: undefined enum value") 352 353 // Enum interprets and returns field i as a value that must be one of the values 354 // in enum. 355 func (p *Parser) Enum(i int, enum ...string) string { 356 b := p.getField(i) 357 for _, s := range enum { 358 if bstrEq(b, s) { 359 return s 360 } 361 } 362 p.setError(errUndefinedEnum) 363 return "" 364 } 365 366 func bstrEq(b []byte, s string) bool { 367 if len(b) != len(s) { 368 return false 369 } 370 for i, c := range b { 371 if c != s[i] { 372 return false 373 } 374 } 375 return true 376 }