github.com/ncw/rclone@v1.48.1-0.20190724201158-a35aa1360e3e/lib/encoder/encoder.go (about) 1 /* 2 Translate file names for usage on restrictive storage systems 3 4 The restricted set of characters are mapped to a unicode equivalent version 5 (most to their FULLWIDTH variant) to increase compatability with other 6 storage systems. 7 See: http://unicode-search.net/unicode-namesearch.pl?term=FULLWIDTH 8 9 Encoders will also quote reserved characters to differentiate between 10 the raw and encoded forms. 11 */ 12 13 package encoder 14 15 import ( 16 "bytes" 17 "fmt" 18 "io" 19 "strconv" 20 "strings" 21 "unicode/utf8" 22 ) 23 24 const ( 25 // adding this to any printable ASCII character turns it into the 26 // FULLWIDTH variant 27 fullOffset = 0xFEE0 28 // the first rune of the SYMBOL FOR block for control characters 29 symbolOffset = '␀' // SYMBOL FOR NULL 30 // QuoteRune is the rune used for quoting reserved characters 31 QuoteRune = '‛' // SINGLE HIGH-REVERSED-9 QUOTATION MARK 32 // EncodeStandard contains the flags used for the Standard Encoder 33 EncodeStandard = EncodeZero | EncodeSlash | EncodeCtl | EncodeDel 34 // Standard defines the encoding that is used for paths in- and output by rclone. 35 // 36 // List of replaced characters: 37 // (0x00) -> '␀' // SYMBOL FOR NULL 38 // / (slash) -> '/' // FULLWIDTH SOLIDUS 39 Standard = MultiEncoder(EncodeStandard) 40 ) 41 42 // Possible flags for the MultiEncoder 43 const ( 44 EncodeZero uint = 0 // NUL(0x00) 45 EncodeSlash uint = 1 << iota // / 46 EncodeWin // :?"*<>| 47 EncodeBackSlash // \ 48 EncodeHashPercent // #% 49 EncodeDel // DEL(0x7F) 50 EncodeCtl // CTRL(0x01-0x1F) 51 EncodeLeftSpace // Leading SPACE 52 EncodeLeftTilde // Leading ~ 53 EncodeRightSpace // Trailing SPACE 54 EncodeRightPeriod // Trailing . 55 EncodeInvalidUtf8 // Invalid UTF-8 bytes 56 ) 57 58 // Encoder can transform names to and from the original and translated version. 59 type Encoder interface { 60 // Encode takes a raw name and substitutes any reserved characters and 61 // patterns in it 62 Encode(string) string 63 // Decode takes a name and undoes any substitutions made by Encode 64 Decode(string) string 65 66 // FromStandardPath takes a / separated path in Standard encoding 67 // and converts it to a / separated path in this encoding. 68 FromStandardPath(string) string 69 // FromStandardName takes name in Standard encoding and converts 70 // it in this encoding. 71 FromStandardName(string) string 72 // ToStandardPath takes a / separated path in this encoding 73 // and converts it to a / separated path in Standard encoding. 74 ToStandardPath(string) string 75 // ToStandardName takes name in this encoding and converts 76 // it in Standard encoding. 77 ToStandardName(string) string 78 } 79 80 // MultiEncoder is a configurable Encoder. The Encode* constants in this 81 // package can be combined using bitwise or (|) to enable handling of multiple 82 // character classes 83 type MultiEncoder uint 84 85 // Encode takes a raw name and substitutes any reserved characters and 86 // patterns in it 87 func (mask MultiEncoder) Encode(in string) string { 88 var ( 89 encodeWin = uint(mask)&EncodeWin != 0 90 encodeSlash = uint(mask)&EncodeSlash != 0 91 encodeBackSlash = uint(mask)&EncodeBackSlash != 0 92 encodeHashPercent = uint(mask)&EncodeHashPercent != 0 93 encodeDel = uint(mask)&EncodeDel != 0 94 encodeCtl = uint(mask)&EncodeCtl != 0 95 encodeLeftSpace = uint(mask)&EncodeLeftSpace != 0 96 encodeLeftTilde = uint(mask)&EncodeLeftTilde != 0 97 encodeRightSpace = uint(mask)&EncodeRightSpace != 0 98 encodeRightPeriod = uint(mask)&EncodeRightPeriod != 0 99 encodeInvalidUnicode = uint(mask)&EncodeInvalidUtf8 != 0 100 ) 101 102 // handle prefix only replacements 103 prefix := "" 104 if encodeLeftSpace && len(in) > 0 { // Leading SPACE 105 if in[0] == ' ' { 106 prefix, in = "␠", in[1:] // SYMBOL FOR SPACE 107 } else if r, l := utf8.DecodeRuneInString(in); r == '␠' { // SYMBOL FOR SPACE 108 prefix, in = string(QuoteRune)+"␠", in[l:] // SYMBOL FOR SPACE 109 } 110 } 111 if encodeLeftTilde && len(in) > 0 { // Leading ~ 112 if in[0] == '~' { 113 prefix, in = string('~'+fullOffset), in[1:] // FULLWIDTH TILDE 114 } else if r, l := utf8.DecodeRuneInString(in); r == '~'+fullOffset { 115 prefix, in = string(QuoteRune)+string('~'+fullOffset), in[l:] // FULLWIDTH TILDE 116 } 117 } 118 // handle suffix only replacements 119 suffix := "" 120 if encodeRightSpace && len(in) > 0 { // Trailing SPACE 121 if in[len(in)-1] == ' ' { 122 suffix, in = "␠", in[:len(in)-1] // SYMBOL FOR SPACE 123 } else if r, l := utf8.DecodeLastRuneInString(in); r == '␠' { 124 suffix, in = string(QuoteRune)+"␠", in[:len(in)-l] // SYMBOL FOR SPACE 125 } 126 } 127 if encodeRightPeriod && len(in) > 0 { // Trailing . 128 if in[len(in)-1] == '.' { 129 suffix, in = ".", in[:len(in)-1] // FULLWIDTH FULL STOP 130 } else if r, l := utf8.DecodeLastRuneInString(in); r == '.' { 131 suffix, in = string(QuoteRune)+".", in[:len(in)-l] // FULLWIDTH FULL STOP 132 } 133 } 134 index := 0 135 if prefix == "" && suffix == "" { 136 // find the first rune which (most likely) needs to be replaced 137 index = strings.IndexFunc(in, func(r rune) bool { 138 switch r { 139 case 0, '␀', QuoteRune, utf8.RuneError: 140 return true 141 } 142 if encodeWin { // :?"*<>| 143 switch r { 144 case '*', '<', '>', '?', ':', '|', '"', 145 '*', '<', '>', '?', ':', '|', '"': 146 return true 147 } 148 } 149 if encodeSlash { // / 150 switch r { 151 case '/', 152 '/': 153 return true 154 } 155 } 156 if encodeBackSlash { // \ 157 switch r { 158 case '\\', 159 '\': 160 return true 161 } 162 } 163 if encodeHashPercent { // #% 164 switch r { 165 case '#', '%', 166 '#', '%': 167 return true 168 } 169 } 170 if encodeDel { // DEL(0x7F) 171 switch r { 172 case rune(0x7F), '␡': 173 return true 174 } 175 } 176 if encodeCtl { // CTRL(0x01-0x1F) 177 if r >= 1 && r <= 0x1F { 178 return true 179 } else if r > symbolOffset && r <= symbolOffset+0x1F { 180 return true 181 } 182 } 183 return false 184 }) 185 } 186 // nothing to replace, return input 187 if index == -1 { 188 return in 189 } 190 191 var out bytes.Buffer 192 out.Grow(len(in) + len(prefix) + len(suffix)) 193 out.WriteString(prefix) 194 // copy the clean part of the input and skip it 195 out.WriteString(in[:index]) 196 in = in[index:] 197 198 for i, r := range in { 199 switch r { 200 case 0: 201 out.WriteRune(symbolOffset) 202 continue 203 case '␀', QuoteRune: 204 out.WriteRune(QuoteRune) 205 out.WriteRune(r) 206 continue 207 case utf8.RuneError: 208 if encodeInvalidUnicode { 209 // only encode invalid sequences and not utf8.RuneError 210 if i+3 > len(in) || in[i:i+3] != string(utf8.RuneError) { 211 _, l := utf8.DecodeRuneInString(in[i:]) 212 appendQuotedBytes(&out, in[i:i+l]) 213 continue 214 } 215 } else { 216 // append the real bytes instead of utf8.RuneError 217 _, l := utf8.DecodeRuneInString(in[i:]) 218 out.WriteString(in[i : i+l]) 219 continue 220 } 221 } 222 if encodeWin { // :?"*<>| 223 switch r { 224 case '*', '<', '>', '?', ':', '|', '"': 225 out.WriteRune(r + fullOffset) 226 continue 227 case '*', '<', '>', '?', ':', '|', '"': 228 out.WriteRune(QuoteRune) 229 out.WriteRune(r) 230 continue 231 } 232 } 233 if encodeSlash { // / 234 switch r { 235 case '/': 236 out.WriteRune(r + fullOffset) 237 continue 238 case '/': 239 out.WriteRune(QuoteRune) 240 out.WriteRune(r) 241 continue 242 } 243 } 244 if encodeBackSlash { // \ 245 switch r { 246 case '\\': 247 out.WriteRune(r + fullOffset) 248 continue 249 case '\': 250 out.WriteRune(QuoteRune) 251 out.WriteRune(r) 252 continue 253 } 254 } 255 if encodeHashPercent { // #% 256 switch r { 257 case '#', '%': 258 out.WriteRune(r + fullOffset) 259 continue 260 case '#', '%': 261 out.WriteRune(QuoteRune) 262 out.WriteRune(r) 263 continue 264 } 265 } 266 if encodeDel { // DEL(0x7F) 267 switch r { 268 case rune(0x7F): 269 out.WriteRune('␡') // SYMBOL FOR DELETE 270 continue 271 case '␡': 272 out.WriteRune(QuoteRune) 273 out.WriteRune(r) 274 continue 275 } 276 } 277 if encodeCtl { // CTRL(0x01-0x1F) 278 if r >= 1 && r <= 0x1F { 279 out.WriteRune('␀' + r) // SYMBOL FOR NULL 280 continue 281 } else if r > symbolOffset && r <= symbolOffset+0x1F { 282 out.WriteRune(QuoteRune) 283 out.WriteRune(r) 284 continue 285 } 286 } 287 out.WriteRune(r) 288 } 289 out.WriteString(suffix) 290 return out.String() 291 } 292 293 // Decode takes a name and undoes any substitutions made by Encode 294 func (mask MultiEncoder) Decode(in string) string { 295 var ( 296 encodeWin = uint(mask)&EncodeWin != 0 297 encodeSlash = uint(mask)&EncodeSlash != 0 298 encodeBackSlash = uint(mask)&EncodeBackSlash != 0 299 encodeHashPercent = uint(mask)&EncodeHashPercent != 0 300 encodeDel = uint(mask)&EncodeDel != 0 301 encodeCtl = uint(mask)&EncodeCtl != 0 302 encodeLeftSpace = uint(mask)&EncodeLeftSpace != 0 303 encodeLeftTilde = uint(mask)&EncodeLeftTilde != 0 304 encodeRightSpace = uint(mask)&EncodeRightSpace != 0 305 encodeRightPeriod = uint(mask)&EncodeRightPeriod != 0 306 encodeInvalidUnicode = uint(mask)&EncodeInvalidUtf8 != 0 307 ) 308 309 // handle prefix only replacements 310 prefix := "" 311 if r, l1 := utf8.DecodeRuneInString(in); encodeLeftSpace && r == '␠' { // SYMBOL FOR SPACE 312 prefix, in = " ", in[l1:] 313 } else if encodeLeftTilde && r == '~' { // FULLWIDTH TILDE 314 prefix, in = "~", in[l1:] 315 } else if r == QuoteRune { 316 if r, l2 := utf8.DecodeRuneInString(in[l1:]); encodeLeftSpace && r == '␠' { // SYMBOL FOR SPACE 317 prefix, in = "␠", in[l1+l2:] 318 } else if encodeLeftTilde && r == '~' { // FULLWIDTH TILDE 319 prefix, in = "~", in[l1+l2:] 320 } 321 } 322 323 // handle suffix only replacements 324 suffix := "" 325 if r, l := utf8.DecodeLastRuneInString(in); encodeRightSpace && r == '␠' { // SYMBOL FOR SPACE 326 in = in[:len(in)-l] 327 if r, l2 := utf8.DecodeLastRuneInString(in); r == QuoteRune { 328 suffix, in = "␠", in[:len(in)-l2] 329 } else { 330 suffix = " " 331 } 332 } else if encodeRightPeriod && r == '.' { // FULLWIDTH FULL STOP 333 in = in[:len(in)-l] 334 if r, l2 := utf8.DecodeLastRuneInString(in); r == QuoteRune { 335 suffix, in = ".", in[:len(in)-l2] 336 } else { 337 suffix = "." 338 } 339 } 340 index := 0 341 if prefix == "" && suffix == "" { 342 // find the first rune which (most likely) needs to be replaced 343 index = strings.IndexFunc(in, func(r rune) bool { 344 switch r { 345 case '␀', QuoteRune: 346 return true 347 } 348 if encodeWin { // :?"*<>| 349 switch r { 350 case '*', '<', '>', '?', ':', '|', '"': 351 return true 352 } 353 } 354 if encodeSlash { // / 355 switch r { 356 case '/': 357 return true 358 } 359 } 360 if encodeBackSlash { // \ 361 switch r { 362 case '\': 363 return true 364 } 365 } 366 if encodeHashPercent { // #% 367 switch r { 368 case '#', '%': 369 return true 370 } 371 } 372 if encodeDel { // DEL(0x7F) 373 switch r { 374 case '␡': 375 return true 376 } 377 } 378 if encodeCtl { // CTRL(0x01-0x1F) 379 if r > symbolOffset && r <= symbolOffset+0x1F { 380 return true 381 } 382 } 383 384 return false 385 }) 386 } 387 // nothing to replace, return input 388 if index == -1 { 389 return in 390 } 391 392 var out bytes.Buffer 393 out.Grow(len(in)) 394 out.WriteString(prefix) 395 // copy the clean part of the input and skip it 396 out.WriteString(in[:index]) 397 in = in[index:] 398 var unquote, unquoteNext, skipNext bool 399 400 for i, r := range in { 401 if skipNext { 402 skipNext = false 403 continue 404 } 405 unquote, unquoteNext = unquoteNext, false 406 switch r { 407 case '␀': // SYMBOL FOR NULL 408 if unquote { 409 out.WriteRune(r) 410 } else { 411 out.WriteRune(0) 412 } 413 continue 414 case QuoteRune: 415 if unquote { 416 out.WriteRune(r) 417 } else { 418 unquoteNext = true 419 } 420 continue 421 } 422 if encodeWin { // :?"*<>| 423 switch r { 424 case '*', '<', '>', '?', ':', '|', '"': 425 if unquote { 426 out.WriteRune(r) 427 } else { 428 out.WriteRune(r - fullOffset) 429 } 430 continue 431 } 432 } 433 if encodeSlash { // / 434 switch r { 435 case '/': // FULLWIDTH SOLIDUS 436 if unquote { 437 out.WriteRune(r) 438 } else { 439 out.WriteRune(r - fullOffset) 440 } 441 continue 442 } 443 } 444 if encodeBackSlash { // \ 445 switch r { 446 case '\': // FULLWIDTH REVERSE SOLIDUS 447 if unquote { 448 out.WriteRune(r) 449 } else { 450 out.WriteRune(r - fullOffset) 451 } 452 continue 453 } 454 } 455 if encodeHashPercent { // #% 456 switch r { 457 case '#', '%': 458 if unquote { 459 out.WriteRune(r) 460 } else { 461 out.WriteRune(r - fullOffset) 462 } 463 continue 464 } 465 } 466 if encodeDel { // DEL(0x7F) 467 switch r { 468 case '␡': // SYMBOL FOR DELETE 469 if unquote { 470 out.WriteRune(r) 471 } else { 472 out.WriteRune(0x7F) 473 } 474 continue 475 } 476 } 477 if encodeCtl { // CTRL(0x01-0x1F) 478 if r > symbolOffset && r <= symbolOffset+0x1F { 479 if unquote { 480 out.WriteRune(r) 481 } else { 482 out.WriteRune(r - symbolOffset) 483 } 484 continue 485 } 486 } 487 if unquote { 488 if encodeInvalidUnicode { 489 skipNext = appendUnquotedByte(&out, in[i:]) 490 if skipNext { 491 continue 492 } 493 } 494 out.WriteRune(QuoteRune) 495 } 496 switch r { 497 case utf8.RuneError: 498 // append the real bytes instead of utf8.RuneError 499 _, l := utf8.DecodeRuneInString(in[i:]) 500 out.WriteString(in[i : i+l]) 501 continue 502 } 503 504 out.WriteRune(r) 505 } 506 if unquoteNext { 507 out.WriteRune(QuoteRune) 508 } 509 out.WriteString(suffix) 510 return out.String() 511 } 512 513 // FromStandardPath takes a / separated path in Standard encoding 514 // and converts it to a / separated path in this encoding. 515 func (mask MultiEncoder) FromStandardPath(s string) string { 516 return FromStandardPath(mask, s) 517 } 518 519 // FromStandardName takes name in Standard encoding and converts 520 // it in this encoding. 521 func (mask MultiEncoder) FromStandardName(s string) string { 522 return FromStandardName(mask, s) 523 } 524 525 // ToStandardPath takes a / separated path in this encoding 526 // and converts it to a / separated path in Standard encoding. 527 func (mask MultiEncoder) ToStandardPath(s string) string { 528 return ToStandardPath(mask, s) 529 } 530 531 // ToStandardName takes name in this encoding and converts 532 // it in Standard encoding. 533 func (mask MultiEncoder) ToStandardName(s string) string { 534 return ToStandardName(mask, s) 535 } 536 537 func appendQuotedBytes(w io.Writer, s string) { 538 for _, b := range []byte(s) { 539 _, _ = fmt.Fprintf(w, string(QuoteRune)+"%02X", b) 540 } 541 } 542 func appendUnquotedByte(w io.Writer, s string) bool { 543 if len(s) < 2 { 544 return false 545 } 546 u, err := strconv.ParseUint(s[:2], 16, 8) 547 if err != nil { 548 return false 549 } 550 n, _ := w.Write([]byte{byte(u)}) 551 return n == 1 552 } 553 554 type identity struct{} 555 556 func (identity) Encode(in string) string { return in } 557 func (identity) Decode(in string) string { return in } 558 559 func (i identity) FromStandardPath(s string) string { 560 return FromStandardPath(i, s) 561 } 562 func (i identity) FromStandardName(s string) string { 563 return FromStandardName(i, s) 564 } 565 func (i identity) ToStandardPath(s string) string { 566 return ToStandardPath(i, s) 567 } 568 func (i identity) ToStandardName(s string) string { 569 return ToStandardName(i, s) 570 } 571 572 // Identity returns a Encoder that always returns the input value 573 func Identity() Encoder { 574 return identity{} 575 } 576 577 // FromStandardPath takes a / separated path in Standard encoding 578 // and converts it to a / separated path in the given encoding. 579 func FromStandardPath(e Encoder, s string) string { 580 if e == Standard { 581 return s 582 } 583 parts := strings.Split(s, "/") 584 encoded := make([]string, len(parts)) 585 changed := false 586 for i, p := range parts { 587 enc := FromStandardName(e, p) 588 changed = changed || enc != p 589 encoded[i] = enc 590 } 591 if !changed { 592 return s 593 } 594 return strings.Join(encoded, "/") 595 } 596 597 // FromStandardName takes name in Standard encoding and converts 598 // it in the given encoding. 599 func FromStandardName(e Encoder, s string) string { 600 if e == Standard { 601 return s 602 } 603 return e.Encode(Standard.Decode(s)) 604 } 605 606 // ToStandardPath takes a / separated path in the given encoding 607 // and converts it to a / separated path in Standard encoding. 608 func ToStandardPath(e Encoder, s string) string { 609 if e == Standard { 610 return s 611 } 612 parts := strings.Split(s, "/") 613 encoded := make([]string, len(parts)) 614 changed := false 615 for i, p := range parts { 616 dec := ToStandardName(e, p) 617 changed = changed || dec != p 618 encoded[i] = dec 619 } 620 if !changed { 621 return s 622 } 623 return strings.Join(encoded, "/") 624 } 625 626 // ToStandardName takes name in the given encoding and converts 627 // it in Standard encoding. 628 func ToStandardName(e Encoder, s string) string { 629 if e == Standard { 630 return s 631 } 632 return Standard.Encode(e.Decode(s)) 633 }