github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/ucd/ucd.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package ucd provides a parser for Unicode Character Database files, the 6 // format of which is defined in http://www.unicode.org/reports/tr44/. See 7 // http://www.unicode.org/Public/UCD/latest/ucd/ for example files. 8 // 9 // It currently does not support substitutions of missing fields. 10 package ucd // import "github.com/go-xe2/third/golang.org/x/text/internal/ucd" 11 12 import ( 13 "bufio" 14 "errors" 15 "fmt" 16 "io" 17 "log" 18 "regexp" 19 "strconv" 20 "strings" 21 ) 22 23 // UnicodeData.txt fields. 24 const ( 25 CodePoint = iota 26 Name 27 GeneralCategory 28 CanonicalCombiningClass 29 BidiClass 30 DecompMapping 31 DecimalValue 32 DigitValue 33 NumericValue 34 BidiMirrored 35 Unicode1Name 36 ISOComment 37 SimpleUppercaseMapping 38 SimpleLowercaseMapping 39 SimpleTitlecaseMapping 40 ) 41 42 // Parse calls f for each entry in the given reader of a UCD file. It will close 43 // the reader upon return. It will call log.Fatal if any error occurred. 44 // 45 // This implements the most common usage pattern of using Parser. 46 func Parse(r io.ReadCloser, f func(p *Parser)) { 47 defer r.Close() 48 49 p := New(r) 50 for p.Next() { 51 f(p) 52 } 53 if err := p.Err(); err != nil { 54 r.Close() // os.Exit will cause defers not to be called. 55 log.Fatal(err) 56 } 57 } 58 59 // An Option is used to configure a Parser. 60 type Option func(p *Parser) 61 62 func keepRanges(p *Parser) { 63 p.keepRanges = true 64 } 65 66 var ( 67 // KeepRanges prevents the expansion of ranges. The raw ranges can be 68 // obtained by calling Range(0) on the parser. 69 KeepRanges Option = keepRanges 70 ) 71 72 // The Part option register a handler for lines starting with a '@'. The text 73 // after a '@' is available as the first field. Comments are handled as usual. 74 func Part(f func(p *Parser)) Option { 75 return func(p *Parser) { 76 p.partHandler = f 77 } 78 } 79 80 // The CommentHandler option passes comments that are on a line by itself to 81 // a given handler. 82 func CommentHandler(f func(s string)) Option { 83 return func(p *Parser) { 84 p.commentHandler = f 85 } 86 } 87 88 // A Parser parses Unicode Character Database (UCD) files. 89 type Parser struct { 90 scanner *bufio.Scanner 91 92 keepRanges bool // Don't expand rune ranges in field 0. 93 94 err error 95 comment string 96 field []string 97 // parsedRange is needed in case Range(0) is called more than once for one 98 // field. In some cases this requires scanning ahead. 99 line int 100 parsedRange bool 101 rangeStart, rangeEnd rune 102 103 partHandler func(p *Parser) 104 commentHandler func(s string) 105 } 106 107 func (p *Parser) setError(err error, msg string) { 108 if p.err == nil && err != nil { 109 if msg == "" { 110 p.err = fmt.Errorf("ucd:line:%d: %v", p.line, err) 111 } else { 112 p.err = fmt.Errorf("ucd:line:%d:%s: %v", p.line, msg, err) 113 } 114 } 115 } 116 117 func (p *Parser) getField(i int) string { 118 if i >= len(p.field) { 119 return "" 120 } 121 return p.field[i] 122 } 123 124 // Err returns a non-nil error if any error occurred during parsing. 125 func (p *Parser) Err() error { 126 return p.err 127 } 128 129 // New returns a Parser for the given Reader. 130 func New(r io.Reader, o ...Option) *Parser { 131 p := &Parser{ 132 scanner: bufio.NewScanner(r), 133 } 134 for _, f := range o { 135 f(p) 136 } 137 return p 138 } 139 140 // Next parses the next line in the file. It returns true if a line was parsed 141 // and false if it reached the end of the file. 142 func (p *Parser) Next() bool { 143 if !p.keepRanges && p.rangeStart < p.rangeEnd { 144 p.rangeStart++ 145 return true 146 } 147 p.comment = "" 148 p.field = p.field[:0] 149 p.parsedRange = false 150 151 for p.scanner.Scan() && p.err == nil { 152 p.line++ 153 s := p.scanner.Text() 154 if s == "" { 155 continue 156 } 157 if s[0] == '#' { 158 if p.commentHandler != nil { 159 p.commentHandler(strings.TrimSpace(s[1:])) 160 } 161 continue 162 } 163 164 // Parse line 165 if i := strings.IndexByte(s, '#'); i != -1 { 166 p.comment = strings.TrimSpace(s[i+1:]) 167 s = s[:i] 168 } 169 if s[0] == '@' { 170 if p.partHandler != nil { 171 p.field = append(p.field, strings.TrimSpace(s[1:])) 172 p.partHandler(p) 173 p.field = p.field[:0] 174 } 175 p.comment = "" 176 continue 177 } 178 for { 179 i := strings.IndexByte(s, ';') 180 if i == -1 { 181 p.field = append(p.field, strings.TrimSpace(s)) 182 break 183 } 184 p.field = append(p.field, strings.TrimSpace(s[:i])) 185 s = s[i+1:] 186 } 187 if !p.keepRanges { 188 p.rangeStart, p.rangeEnd = p.getRange(0) 189 } 190 return true 191 } 192 p.setError(p.scanner.Err(), "scanner failed") 193 return false 194 } 195 196 func parseRune(b string) (rune, error) { 197 if len(b) > 2 && b[0] == 'U' && b[1] == '+' { 198 b = b[2:] 199 } 200 x, err := strconv.ParseUint(b, 16, 32) 201 return rune(x), err 202 } 203 204 func (p *Parser) parseRune(s string) rune { 205 x, err := parseRune(s) 206 p.setError(err, "failed to parse rune") 207 return x 208 } 209 210 // Rune parses and returns field i as a rune. 211 func (p *Parser) Rune(i int) rune { 212 if i > 0 || p.keepRanges { 213 return p.parseRune(p.getField(i)) 214 } 215 return p.rangeStart 216 } 217 218 // Runes interprets and returns field i as a sequence of runes. 219 func (p *Parser) Runes(i int) (runes []rune) { 220 add := func(s string) { 221 if s = strings.TrimSpace(s); len(s) > 0 { 222 runes = append(runes, p.parseRune(s)) 223 } 224 } 225 for b := p.getField(i); ; { 226 i := strings.IndexByte(b, ' ') 227 if i == -1 { 228 add(b) 229 break 230 } 231 add(b[:i]) 232 b = b[i+1:] 233 } 234 return 235 } 236 237 var ( 238 errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>") 239 240 // reRange matches one line of a legacy rune range. 241 reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$") 242 ) 243 244 // Range parses and returns field i as a rune range. A range is inclusive at 245 // both ends. If the field only has one rune, first and last will be identical. 246 // It supports the legacy format for ranges used in UnicodeData.txt. 247 func (p *Parser) Range(i int) (first, last rune) { 248 if !p.keepRanges { 249 return p.rangeStart, p.rangeStart 250 } 251 return p.getRange(i) 252 } 253 254 func (p *Parser) getRange(i int) (first, last rune) { 255 b := p.getField(i) 256 if k := strings.Index(b, ".."); k != -1 { 257 return p.parseRune(b[:k]), p.parseRune(b[k+2:]) 258 } 259 // The first field may not be a rune, in which case we may ignore any error 260 // and set the range as 0..0. 261 x, err := parseRune(b) 262 if err != nil { 263 // Disable range parsing henceforth. This ensures that an error will be 264 // returned if the user subsequently will try to parse this field as 265 // a Rune. 266 p.keepRanges = true 267 } 268 // Special case for UnicodeData that was retained for backwards compatibility. 269 if i == 0 && len(p.field) > 1 && strings.HasSuffix(p.field[1], "First>") { 270 if p.parsedRange { 271 return p.rangeStart, p.rangeEnd 272 } 273 mf := reRange.FindStringSubmatch(p.scanner.Text()) 274 p.line++ 275 if mf == nil || !p.scanner.Scan() { 276 p.setError(errIncorrectLegacyRange, "") 277 return x, x 278 } 279 // Using Bytes would be more efficient here, but Text is a lot easier 280 // and this is not a frequent case. 281 ml := reRange.FindStringSubmatch(p.scanner.Text()) 282 if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] { 283 p.setError(errIncorrectLegacyRange, "") 284 return x, x 285 } 286 p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Text()[:len(ml[1])]) 287 p.parsedRange = true 288 return p.rangeStart, p.rangeEnd 289 } 290 return x, x 291 } 292 293 // bools recognizes all valid UCD boolean values. 294 var bools = map[string]bool{ 295 "": false, 296 "N": false, 297 "No": false, 298 "F": false, 299 "False": false, 300 "Y": true, 301 "Yes": true, 302 "T": true, 303 "True": true, 304 } 305 306 // Bool parses and returns field i as a boolean value. 307 func (p *Parser) Bool(i int) bool { 308 f := p.getField(i) 309 for s, v := range bools { 310 if f == s { 311 return v 312 } 313 } 314 p.setError(strconv.ErrSyntax, "error parsing bool") 315 return false 316 } 317 318 // Int parses and returns field i as an integer value. 319 func (p *Parser) Int(i int) int { 320 x, err := strconv.ParseInt(string(p.getField(i)), 10, 64) 321 p.setError(err, "error parsing int") 322 return int(x) 323 } 324 325 // Uint parses and returns field i as an unsigned integer value. 326 func (p *Parser) Uint(i int) uint { 327 x, err := strconv.ParseUint(string(p.getField(i)), 10, 64) 328 p.setError(err, "error parsing uint") 329 return uint(x) 330 } 331 332 // Float parses and returns field i as a decimal value. 333 func (p *Parser) Float(i int) float64 { 334 x, err := strconv.ParseFloat(string(p.getField(i)), 64) 335 p.setError(err, "error parsing float") 336 return x 337 } 338 339 // String parses and returns field i as a string value. 340 func (p *Parser) String(i int) string { 341 return string(p.getField(i)) 342 } 343 344 // Strings parses and returns field i as a space-separated list of strings. 345 func (p *Parser) Strings(i int) []string { 346 ss := strings.Split(string(p.getField(i)), " ") 347 for i, s := range ss { 348 ss[i] = strings.TrimSpace(s) 349 } 350 return ss 351 } 352 353 // Comment returns the comments for the current line. 354 func (p *Parser) Comment() string { 355 return string(p.comment) 356 } 357 358 var errUndefinedEnum = errors.New("ucd: undefined enum value") 359 360 // Enum interprets and returns field i as a value that must be one of the values 361 // in enum. 362 func (p *Parser) Enum(i int, enum ...string) string { 363 f := p.getField(i) 364 for _, s := range enum { 365 if f == s { 366 return s 367 } 368 } 369 p.setError(errUndefinedEnum, "error parsing enum") 370 return "" 371 }