github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/internal/ucd/ucd.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package ucd provides a parser for Unicode Character Database files, the 6 // format of which is defined in http://www.unicode.org/reports/tr44/. See 7 // http://www.unicode.org/Public/UCD/latest/ucd/ for example files. 8 // 9 // It currently does not support substitutions of missing fields. 10 package ucd // import "golang.org/x/text/internal/ucd" 11 12 import ( 13 "bufio" 14 "bytes" 15 "errors" 16 "fmt" 17 "io" 18 "log" 19 "regexp" 20 "strconv" 21 "strings" 22 ) 23 24 // UnicodeData.txt fields. 25 const ( 26 CodePoint = iota 27 Name 28 GeneralCategory 29 CanonicalCombiningClass 30 BidiClass 31 DecompMapping 32 DecimalValue 33 DigitValue 34 NumericValue 35 BidiMirrored 36 Unicode1Name 37 ISOComment 38 SimpleUppercaseMapping 39 SimpleLowercaseMapping 40 SimpleTitlecaseMapping 41 ) 42 43 // Parse calls f for each entry in the given reader of a UCD file. It will close 44 // the reader upon return. It will call log.Fatal if any error occurred. 45 // 46 // This implements the most common usage pattern of using Parser. 47 func Parse(r io.ReadCloser, f func(p *Parser)) { 48 defer r.Close() 49 50 p := New(r) 51 for p.Next() { 52 f(p) 53 } 54 if err := p.Err(); err != nil { 55 r.Close() // os.Exit will cause defers not to be called. 56 log.Fatal(err) 57 } 58 } 59 60 // An Option is used to configure a Parser. 61 type Option func(p *Parser) 62 63 func keepRanges(p *Parser) { 64 p.keepRanges = true 65 } 66 67 var ( 68 // KeepRanges prevents the expansion of ranges. The raw ranges can be 69 // obtained by calling Range(0) on the parser. 70 KeepRanges Option = keepRanges 71 ) 72 73 // The Part option register a handler for lines starting with a '@'. The text 74 // after a '@' is available as the first field. Comments are handled as usual. 75 func Part(f func(p *Parser)) Option { 76 return func(p *Parser) { 77 p.partHandler = f 78 } 79 } 80 81 // A Parser parses Unicode Character Database (UCD) files. 82 type Parser struct { 83 scanner *bufio.Scanner 84 85 keepRanges bool // Don't expand rune ranges in field 0. 86 87 err error 88 comment []byte 89 field [][]byte 90 // parsedRange is needed in case Range(0) is called more than once for one 91 // field. In some cases this requires scanning ahead. 92 parsedRange bool 93 rangeStart, rangeEnd rune 94 95 partHandler func(p *Parser) 96 } 97 98 func (p *Parser) setError(err error) { 99 if p.err == nil { 100 p.err = err 101 } 102 } 103 104 func (p *Parser) getField(i int) []byte { 105 if i >= len(p.field) { 106 p.setError(fmt.Errorf("ucd: index of field %d out of bounds", i)) 107 return nil 108 } 109 return p.field[i] 110 } 111 112 // Err returns a non-nil error if any error occurred during parsing. 113 func (p *Parser) Err() error { 114 return p.err 115 } 116 117 // New returns a Parser for the given Reader. 118 func New(r io.Reader, o ...Option) *Parser { 119 p := &Parser{ 120 scanner: bufio.NewScanner(r), 121 } 122 for _, f := range o { 123 f(p) 124 } 125 return p 126 } 127 128 // Next parses the next line in the file. It returns true if a line was parsed 129 // and false if it reached the end of the file. 130 func (p *Parser) Next() bool { 131 if !p.keepRanges && p.rangeStart < p.rangeEnd { 132 p.rangeStart++ 133 return true 134 } 135 p.comment = nil 136 p.field = p.field[:0] 137 p.parsedRange = false 138 139 for p.scanner.Scan() { 140 b := p.scanner.Bytes() 141 if len(b) == 0 || b[0] == '#' { 142 continue 143 } 144 145 // Parse line 146 if i := bytes.IndexByte(b, '#'); i != -1 { 147 p.comment = bytes.TrimSpace(b[i+1:]) 148 b = b[:i] 149 } 150 if b[0] == '@' { 151 if p.partHandler != nil { 152 p.field = append(p.field, bytes.TrimSpace(b[1:])) 153 p.partHandler(p) 154 p.field = p.field[:0] 155 } 156 p.comment = nil 157 continue 158 } 159 for { 160 i := bytes.IndexByte(b, ';') 161 if i == -1 { 162 p.field = append(p.field, bytes.TrimSpace(b)) 163 break 164 } 165 p.field = append(p.field, bytes.TrimSpace(b[:i])) 166 b = b[i+1:] 167 } 168 if !p.keepRanges { 169 p.rangeStart, p.rangeEnd = p.getRange(0) 170 } 171 return true 172 } 173 p.setError(p.scanner.Err()) 174 return false 175 } 176 177 func parseRune(b []byte) (rune, error) { 178 if len(b) > 2 && b[0] == 'U' && b[1] == '+' { 179 b = b[2:] 180 } 181 x, err := strconv.ParseUint(string(b), 16, 32) 182 return rune(x), err 183 } 184 185 func (p *Parser) parseRune(b []byte) rune { 186 x, err := parseRune(b) 187 p.setError(err) 188 return x 189 } 190 191 // Rune parses and returns field i as a rune. 192 func (p *Parser) Rune(i int) rune { 193 if i > 0 || p.keepRanges { 194 return p.parseRune(p.getField(i)) 195 } 196 return p.rangeStart 197 } 198 199 // Runes interprets and returns field i as a sequence of runes. 200 func (p *Parser) Runes(i int) (runes []rune) { 201 add := func(b []byte) { 202 if b = bytes.TrimSpace(b); len(b) > 0 { 203 runes = append(runes, p.parseRune(b)) 204 } 205 } 206 for b := p.getField(i); ; { 207 i := bytes.IndexByte(b, ' ') 208 if i == -1 { 209 add(b) 210 break 211 } 212 add(b[:i]) 213 b = b[i+1:] 214 } 215 return 216 } 217 218 var ( 219 errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>") 220 221 // reRange matches one line of a legacy rune range. 222 reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$") 223 ) 224 225 // Range parses and returns field i as a rune range. A range is inclusive at 226 // both ends. If the field only has one rune, first and last will be identical. 227 // It supports the legacy format for ranges used in UnicodeData.txt. 228 func (p *Parser) Range(i int) (first, last rune) { 229 if !p.keepRanges { 230 return p.rangeStart, p.rangeStart 231 } 232 return p.getRange(i) 233 } 234 235 func (p *Parser) getRange(i int) (first, last rune) { 236 b := p.getField(i) 237 if k := bytes.Index(b, []byte("..")); k != -1 { 238 return p.parseRune(b[:k]), p.parseRune(b[k+2:]) 239 } 240 // The first field may not be a rune, in which case we may ignore any error 241 // and set the range as 0..0. 242 x, err := parseRune(b) 243 if err != nil { 244 // Disable range parsing henceforth. This ensures that an error will be 245 // returned if the user subsequently will try to parse this field as 246 // a Rune. 247 p.keepRanges = true 248 } 249 // Special case for UnicodeData that was retained for backwards compatibility. 250 if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) { 251 if p.parsedRange { 252 return p.rangeStart, p.rangeEnd 253 } 254 mf := reRange.FindStringSubmatch(p.scanner.Text()) 255 if mf == nil || !p.scanner.Scan() { 256 p.setError(errIncorrectLegacyRange) 257 return x, x 258 } 259 // Using Bytes would be more efficient here, but Text is a lot easier 260 // and this is not a frequent case. 261 ml := reRange.FindStringSubmatch(p.scanner.Text()) 262 if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] { 263 p.setError(errIncorrectLegacyRange) 264 return x, x 265 } 266 p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])]) 267 p.parsedRange = true 268 return p.rangeStart, p.rangeEnd 269 } 270 return x, x 271 } 272 273 // bools recognizes all valid UCD boolean values. 274 var bools = map[string]bool{ 275 "": false, 276 "N": false, 277 "No": false, 278 "F": false, 279 "False": false, 280 "Y": true, 281 "Yes": true, 282 "T": true, 283 "True": true, 284 } 285 286 // Bool parses and returns field i as a boolean value. 287 func (p *Parser) Bool(i int) bool { 288 b := p.getField(i) 289 for s, v := range bools { 290 if bstrEq(b, s) { 291 return v 292 } 293 } 294 p.setError(strconv.ErrSyntax) 295 return false 296 } 297 298 // Int parses and returns field i as an integer value. 299 func (p *Parser) Int(i int) int { 300 x, err := strconv.ParseInt(string(p.getField(i)), 10, 64) 301 p.setError(err) 302 return int(x) 303 } 304 305 // Uint parses and returns field i as an unsigned integer value. 306 func (p *Parser) Uint(i int) uint { 307 x, err := strconv.ParseUint(string(p.getField(i)), 10, 64) 308 p.setError(err) 309 return uint(x) 310 } 311 312 // Float parses and returns field i as a decimal value. 313 func (p *Parser) Float(i int) float64 { 314 x, err := strconv.ParseFloat(string(p.getField(i)), 64) 315 p.setError(err) 316 return x 317 } 318 319 // String parses and returns field i as a string value. 320 func (p *Parser) String(i int) string { 321 return string(p.getField(i)) 322 } 323 324 // Strings parses and returns field i as a space-separated list of strings. 325 func (p *Parser) Strings(i int) []string { 326 ss := strings.Split(string(p.getField(i)), " ") 327 for i, s := range ss { 328 ss[i] = strings.TrimSpace(s) 329 } 330 return ss 331 } 332 333 // Comment returns the comments for the current line. 334 func (p *Parser) Comment() string { 335 return string(p.comment) 336 } 337 338 var errUndefinedEnum = errors.New("ucd: undefined enum value") 339 340 // Enum interprets and returns field i as a value that must be one of the values 341 // in enum. 342 func (p *Parser) Enum(i int, enum ...string) string { 343 b := p.getField(i) 344 for _, s := range enum { 345 if bstrEq(b, s) { 346 return s 347 } 348 } 349 p.setError(errUndefinedEnum) 350 return "" 351 } 352 353 func bstrEq(b []byte, s string) bool { 354 if len(b) != len(s) { 355 return false 356 } 357 for i, c := range b { 358 if c != s[i] { 359 return false 360 } 361 } 362 return true 363 }