github.com/ncw/rclone@v1.48.1-0.20190724201158-a35aa1360e3e/lib/encoder/encoder.go (about)

     1  /*
     2  Translate file names for usage on restrictive storage systems
     3  
     4  The restricted set of characters are mapped to a unicode equivalent version
     5  (most to their FULLWIDTH variant) to increase compatibility with other
     6  storage systems.
     7  See: http://unicode-search.net/unicode-namesearch.pl?term=FULLWIDTH
     8  
     9  Encoders will also quote reserved characters to differentiate between
    10  the raw and encoded forms.
    11  */
    12  
    13  package encoder
    14  
    15  import (
    16  	"bytes"
    17  	"fmt"
    18  	"io"
    19  	"strconv"
    20  	"strings"
    21  	"unicode/utf8"
    22  )
    23  
const (
	// fullOffset is the distance between a printable ASCII character and
	// its FULLWIDTH variant: adding it to the ASCII code point yields the
	// FULLWIDTH one.
	fullOffset = 0xFEE0
	// symbolOffset is the first rune of the SYMBOL FOR block for control
	// characters
	symbolOffset = '␀' // SYMBOL FOR NULL
	// QuoteRune is the rune used for quoting reserved characters
	QuoteRune = '‛' // SINGLE HIGH-REVERSED-9 QUOTATION MARK
	// EncodeStandard contains the flags used for the Standard Encoder
	EncodeStandard = EncodeZero | EncodeSlash | EncodeCtl | EncodeDel
	// Standard defines the encoding that is used for paths in- and output by rclone.
	//
	// List of replaced characters:
	//     (0x00)  -> '␀' // SYMBOL FOR NULL
	//   / (slash) -> '／' // FULLWIDTH SOLIDUS
	//
	// EncodeStandard also sets EncodeCtl and EncodeDel, so the control
	// characters 0x01-0x1F and DEL (0x7F) are covered by those flags too.
	Standard = MultiEncoder(EncodeStandard)
)
    41  
// Possible flags for the MultiEncoder.
//
// EncodeZero is 0 and therefore sets no bit; Encode and Decode in this
// file translate NUL without consulting the mask. The remaining flags
// are generated with 1 << iota. iota is already 1 on the EncodeSlash line,
// so EncodeSlash is 1<<1 and every following flag is the next higher bit.
const (
	EncodeZero        uint = 0         // NUL(0x00)
	EncodeSlash       uint = 1 << iota // /
	EncodeWin                          // :?"*<>|
	EncodeBackSlash                    // \
	EncodeHashPercent                  // #%
	EncodeDel                          // DEL(0x7F)
	EncodeCtl                          // CTRL(0x01-0x1F)
	EncodeLeftSpace                    // Leading SPACE
	EncodeLeftTilde                    // Leading ~
	EncodeRightSpace                   // Trailing SPACE
	EncodeRightPeriod                  // Trailing .
	EncodeInvalidUtf8                  // Invalid UTF-8 bytes
)
    57  
// Encoder can transform names to and from the original and translated version.
//
// Encode and Decode work on a single name. The *Path methods work on
// / separated paths and the *Name methods on a single name, converting
// between this encoding and the Standard encoding.
type Encoder interface {
	// Encode takes a raw name and substitutes any reserved characters and
	// patterns in it.
	Encode(string) string
	// Decode takes a name and undoes any substitutions made by Encode.
	Decode(string) string

	// FromStandardPath takes a / separated path in Standard encoding
	// and converts it to a / separated path in this encoding.
	FromStandardPath(string) string
	// FromStandardName takes name in Standard encoding and converts
	// it in this encoding.
	FromStandardName(string) string
	// ToStandardPath takes a / separated path in this encoding
	// and converts it to a / separated path in Standard encoding.
	ToStandardPath(string) string
	// ToStandardName takes name in this encoding and converts
	// it in Standard encoding.
	ToStandardName(string) string
}
    79  
// MultiEncoder is a configurable Encoder. The Encode* constants in this
// package can be combined using bitwise or (|) to enable handling of multiple
// character classes. The value of a MultiEncoder is the resulting bit mask.
type MultiEncoder uint
    84  
    85  // Encode takes a raw name and substitutes any reserved characters and
    86  // patterns in it
    87  func (mask MultiEncoder) Encode(in string) string {
    88  	var (
    89  		encodeWin            = uint(mask)&EncodeWin != 0
    90  		encodeSlash          = uint(mask)&EncodeSlash != 0
    91  		encodeBackSlash      = uint(mask)&EncodeBackSlash != 0
    92  		encodeHashPercent    = uint(mask)&EncodeHashPercent != 0
    93  		encodeDel            = uint(mask)&EncodeDel != 0
    94  		encodeCtl            = uint(mask)&EncodeCtl != 0
    95  		encodeLeftSpace      = uint(mask)&EncodeLeftSpace != 0
    96  		encodeLeftTilde      = uint(mask)&EncodeLeftTilde != 0
    97  		encodeRightSpace     = uint(mask)&EncodeRightSpace != 0
    98  		encodeRightPeriod    = uint(mask)&EncodeRightPeriod != 0
    99  		encodeInvalidUnicode = uint(mask)&EncodeInvalidUtf8 != 0
   100  	)
   101  
   102  	// handle prefix only replacements
   103  	prefix := ""
   104  	if encodeLeftSpace && len(in) > 0 { // Leading SPACE
   105  		if in[0] == ' ' {
   106  			prefix, in = "␠", in[1:] // SYMBOL FOR SPACE
   107  		} else if r, l := utf8.DecodeRuneInString(in); r == '␠' { // SYMBOL FOR SPACE
   108  			prefix, in = string(QuoteRune)+"␠", in[l:] // SYMBOL FOR SPACE
   109  		}
   110  	}
   111  	if encodeLeftTilde && len(in) > 0 { // Leading ~
   112  		if in[0] == '~' {
   113  			prefix, in = string('~'+fullOffset), in[1:] // FULLWIDTH TILDE
   114  		} else if r, l := utf8.DecodeRuneInString(in); r == '~'+fullOffset {
   115  			prefix, in = string(QuoteRune)+string('~'+fullOffset), in[l:] // FULLWIDTH TILDE
   116  		}
   117  	}
   118  	// handle suffix only replacements
   119  	suffix := ""
   120  	if encodeRightSpace && len(in) > 0 { // Trailing SPACE
   121  		if in[len(in)-1] == ' ' {
   122  			suffix, in = "␠", in[:len(in)-1] // SYMBOL FOR SPACE
   123  		} else if r, l := utf8.DecodeLastRuneInString(in); r == '␠' {
   124  			suffix, in = string(QuoteRune)+"␠", in[:len(in)-l] // SYMBOL FOR SPACE
   125  		}
   126  	}
   127  	if encodeRightPeriod && len(in) > 0 { // Trailing .
   128  		if in[len(in)-1] == '.' {
   129  			suffix, in = ".", in[:len(in)-1] // FULLWIDTH FULL STOP
   130  		} else if r, l := utf8.DecodeLastRuneInString(in); r == '.' {
   131  			suffix, in = string(QuoteRune)+".", in[:len(in)-l] // FULLWIDTH FULL STOP
   132  		}
   133  	}
   134  	index := 0
   135  	if prefix == "" && suffix == "" {
   136  		// find the first rune which (most likely) needs to be replaced
   137  		index = strings.IndexFunc(in, func(r rune) bool {
   138  			switch r {
   139  			case 0, '␀', QuoteRune, utf8.RuneError:
   140  				return true
   141  			}
   142  			if encodeWin { // :?"*<>|
   143  				switch r {
   144  				case '*', '<', '>', '?', ':', '|', '"',
   145  					'*', '<', '>', '?', ':', '|', '"':
   146  					return true
   147  				}
   148  			}
   149  			if encodeSlash { // /
   150  				switch r {
   151  				case '/',
   152  					'/':
   153  					return true
   154  				}
   155  			}
   156  			if encodeBackSlash { // \
   157  				switch r {
   158  				case '\\',
   159  					'\':
   160  					return true
   161  				}
   162  			}
   163  			if encodeHashPercent { // #%
   164  				switch r {
   165  				case '#', '%',
   166  					'#', '%':
   167  					return true
   168  				}
   169  			}
   170  			if encodeDel { // DEL(0x7F)
   171  				switch r {
   172  				case rune(0x7F), '␡':
   173  					return true
   174  				}
   175  			}
   176  			if encodeCtl { // CTRL(0x01-0x1F)
   177  				if r >= 1 && r <= 0x1F {
   178  					return true
   179  				} else if r > symbolOffset && r <= symbolOffset+0x1F {
   180  					return true
   181  				}
   182  			}
   183  			return false
   184  		})
   185  	}
   186  	// nothing to replace, return input
   187  	if index == -1 {
   188  		return in
   189  	}
   190  
   191  	var out bytes.Buffer
   192  	out.Grow(len(in) + len(prefix) + len(suffix))
   193  	out.WriteString(prefix)
   194  	// copy the clean part of the input and skip it
   195  	out.WriteString(in[:index])
   196  	in = in[index:]
   197  
   198  	for i, r := range in {
   199  		switch r {
   200  		case 0:
   201  			out.WriteRune(symbolOffset)
   202  			continue
   203  		case '␀', QuoteRune:
   204  			out.WriteRune(QuoteRune)
   205  			out.WriteRune(r)
   206  			continue
   207  		case utf8.RuneError:
   208  			if encodeInvalidUnicode {
   209  				// only encode invalid sequences and not utf8.RuneError
   210  				if i+3 > len(in) || in[i:i+3] != string(utf8.RuneError) {
   211  					_, l := utf8.DecodeRuneInString(in[i:])
   212  					appendQuotedBytes(&out, in[i:i+l])
   213  					continue
   214  				}
   215  			} else {
   216  				// append the real bytes instead of utf8.RuneError
   217  				_, l := utf8.DecodeRuneInString(in[i:])
   218  				out.WriteString(in[i : i+l])
   219  				continue
   220  			}
   221  		}
   222  		if encodeWin { // :?"*<>|
   223  			switch r {
   224  			case '*', '<', '>', '?', ':', '|', '"':
   225  				out.WriteRune(r + fullOffset)
   226  				continue
   227  			case '*', '<', '>', '?', ':', '|', '"':
   228  				out.WriteRune(QuoteRune)
   229  				out.WriteRune(r)
   230  				continue
   231  			}
   232  		}
   233  		if encodeSlash { // /
   234  			switch r {
   235  			case '/':
   236  				out.WriteRune(r + fullOffset)
   237  				continue
   238  			case '/':
   239  				out.WriteRune(QuoteRune)
   240  				out.WriteRune(r)
   241  				continue
   242  			}
   243  		}
   244  		if encodeBackSlash { // \
   245  			switch r {
   246  			case '\\':
   247  				out.WriteRune(r + fullOffset)
   248  				continue
   249  			case '\':
   250  				out.WriteRune(QuoteRune)
   251  				out.WriteRune(r)
   252  				continue
   253  			}
   254  		}
   255  		if encodeHashPercent { // #%
   256  			switch r {
   257  			case '#', '%':
   258  				out.WriteRune(r + fullOffset)
   259  				continue
   260  			case '#', '%':
   261  				out.WriteRune(QuoteRune)
   262  				out.WriteRune(r)
   263  				continue
   264  			}
   265  		}
   266  		if encodeDel { // DEL(0x7F)
   267  			switch r {
   268  			case rune(0x7F):
   269  				out.WriteRune('␡') // SYMBOL FOR DELETE
   270  				continue
   271  			case '␡':
   272  				out.WriteRune(QuoteRune)
   273  				out.WriteRune(r)
   274  				continue
   275  			}
   276  		}
   277  		if encodeCtl { // CTRL(0x01-0x1F)
   278  			if r >= 1 && r <= 0x1F {
   279  				out.WriteRune('␀' + r) // SYMBOL FOR NULL
   280  				continue
   281  			} else if r > symbolOffset && r <= symbolOffset+0x1F {
   282  				out.WriteRune(QuoteRune)
   283  				out.WriteRune(r)
   284  				continue
   285  			}
   286  		}
   287  		out.WriteRune(r)
   288  	}
   289  	out.WriteString(suffix)
   290  	return out.String()
   291  }
   292  
   293  // Decode takes a name and undoes any substitutions made by Encode
   294  func (mask MultiEncoder) Decode(in string) string {
   295  	var (
   296  		encodeWin            = uint(mask)&EncodeWin != 0
   297  		encodeSlash          = uint(mask)&EncodeSlash != 0
   298  		encodeBackSlash      = uint(mask)&EncodeBackSlash != 0
   299  		encodeHashPercent    = uint(mask)&EncodeHashPercent != 0
   300  		encodeDel            = uint(mask)&EncodeDel != 0
   301  		encodeCtl            = uint(mask)&EncodeCtl != 0
   302  		encodeLeftSpace      = uint(mask)&EncodeLeftSpace != 0
   303  		encodeLeftTilde      = uint(mask)&EncodeLeftTilde != 0
   304  		encodeRightSpace     = uint(mask)&EncodeRightSpace != 0
   305  		encodeRightPeriod    = uint(mask)&EncodeRightPeriod != 0
   306  		encodeInvalidUnicode = uint(mask)&EncodeInvalidUtf8 != 0
   307  	)
   308  
   309  	// handle prefix only replacements
   310  	prefix := ""
   311  	if r, l1 := utf8.DecodeRuneInString(in); encodeLeftSpace && r == '␠' { // SYMBOL FOR SPACE
   312  		prefix, in = " ", in[l1:]
   313  	} else if encodeLeftTilde && r == '~' { // FULLWIDTH TILDE
   314  		prefix, in = "~", in[l1:]
   315  	} else if r == QuoteRune {
   316  		if r, l2 := utf8.DecodeRuneInString(in[l1:]); encodeLeftSpace && r == '␠' { // SYMBOL FOR SPACE
   317  			prefix, in = "␠", in[l1+l2:]
   318  		} else if encodeLeftTilde && r == '~' { // FULLWIDTH TILDE
   319  			prefix, in = "~", in[l1+l2:]
   320  		}
   321  	}
   322  
   323  	// handle suffix only replacements
   324  	suffix := ""
   325  	if r, l := utf8.DecodeLastRuneInString(in); encodeRightSpace && r == '␠' { // SYMBOL FOR SPACE
   326  		in = in[:len(in)-l]
   327  		if r, l2 := utf8.DecodeLastRuneInString(in); r == QuoteRune {
   328  			suffix, in = "␠", in[:len(in)-l2]
   329  		} else {
   330  			suffix = " "
   331  		}
   332  	} else if encodeRightPeriod && r == '.' { // FULLWIDTH FULL STOP
   333  		in = in[:len(in)-l]
   334  		if r, l2 := utf8.DecodeLastRuneInString(in); r == QuoteRune {
   335  			suffix, in = ".", in[:len(in)-l2]
   336  		} else {
   337  			suffix = "."
   338  		}
   339  	}
   340  	index := 0
   341  	if prefix == "" && suffix == "" {
   342  		// find the first rune which (most likely) needs to be replaced
   343  		index = strings.IndexFunc(in, func(r rune) bool {
   344  			switch r {
   345  			case '␀', QuoteRune:
   346  				return true
   347  			}
   348  			if encodeWin { // :?"*<>|
   349  				switch r {
   350  				case '*', '<', '>', '?', ':', '|', '"':
   351  					return true
   352  				}
   353  			}
   354  			if encodeSlash { // /
   355  				switch r {
   356  				case '/':
   357  					return true
   358  				}
   359  			}
   360  			if encodeBackSlash { // \
   361  				switch r {
   362  				case '\':
   363  					return true
   364  				}
   365  			}
   366  			if encodeHashPercent { // #%
   367  				switch r {
   368  				case '#', '%':
   369  					return true
   370  				}
   371  			}
   372  			if encodeDel { // DEL(0x7F)
   373  				switch r {
   374  				case '␡':
   375  					return true
   376  				}
   377  			}
   378  			if encodeCtl { // CTRL(0x01-0x1F)
   379  				if r > symbolOffset && r <= symbolOffset+0x1F {
   380  					return true
   381  				}
   382  			}
   383  
   384  			return false
   385  		})
   386  	}
   387  	// nothing to replace, return input
   388  	if index == -1 {
   389  		return in
   390  	}
   391  
   392  	var out bytes.Buffer
   393  	out.Grow(len(in))
   394  	out.WriteString(prefix)
   395  	// copy the clean part of the input and skip it
   396  	out.WriteString(in[:index])
   397  	in = in[index:]
   398  	var unquote, unquoteNext, skipNext bool
   399  
   400  	for i, r := range in {
   401  		if skipNext {
   402  			skipNext = false
   403  			continue
   404  		}
   405  		unquote, unquoteNext = unquoteNext, false
   406  		switch r {
   407  		case '␀': // SYMBOL FOR NULL
   408  			if unquote {
   409  				out.WriteRune(r)
   410  			} else {
   411  				out.WriteRune(0)
   412  			}
   413  			continue
   414  		case QuoteRune:
   415  			if unquote {
   416  				out.WriteRune(r)
   417  			} else {
   418  				unquoteNext = true
   419  			}
   420  			continue
   421  		}
   422  		if encodeWin { // :?"*<>|
   423  			switch r {
   424  			case '*', '<', '>', '?', ':', '|', '"':
   425  				if unquote {
   426  					out.WriteRune(r)
   427  				} else {
   428  					out.WriteRune(r - fullOffset)
   429  				}
   430  				continue
   431  			}
   432  		}
   433  		if encodeSlash { // /
   434  			switch r {
   435  			case '/': // FULLWIDTH SOLIDUS
   436  				if unquote {
   437  					out.WriteRune(r)
   438  				} else {
   439  					out.WriteRune(r - fullOffset)
   440  				}
   441  				continue
   442  			}
   443  		}
   444  		if encodeBackSlash { // \
   445  			switch r {
   446  			case '\': // FULLWIDTH REVERSE SOLIDUS
   447  				if unquote {
   448  					out.WriteRune(r)
   449  				} else {
   450  					out.WriteRune(r - fullOffset)
   451  				}
   452  				continue
   453  			}
   454  		}
   455  		if encodeHashPercent { // #%
   456  			switch r {
   457  			case '#', '%':
   458  				if unquote {
   459  					out.WriteRune(r)
   460  				} else {
   461  					out.WriteRune(r - fullOffset)
   462  				}
   463  				continue
   464  			}
   465  		}
   466  		if encodeDel { // DEL(0x7F)
   467  			switch r {
   468  			case '␡': // SYMBOL FOR DELETE
   469  				if unquote {
   470  					out.WriteRune(r)
   471  				} else {
   472  					out.WriteRune(0x7F)
   473  				}
   474  				continue
   475  			}
   476  		}
   477  		if encodeCtl { // CTRL(0x01-0x1F)
   478  			if r > symbolOffset && r <= symbolOffset+0x1F {
   479  				if unquote {
   480  					out.WriteRune(r)
   481  				} else {
   482  					out.WriteRune(r - symbolOffset)
   483  				}
   484  				continue
   485  			}
   486  		}
   487  		if unquote {
   488  			if encodeInvalidUnicode {
   489  				skipNext = appendUnquotedByte(&out, in[i:])
   490  				if skipNext {
   491  					continue
   492  				}
   493  			}
   494  			out.WriteRune(QuoteRune)
   495  		}
   496  		switch r {
   497  		case utf8.RuneError:
   498  			// append the real bytes instead of utf8.RuneError
   499  			_, l := utf8.DecodeRuneInString(in[i:])
   500  			out.WriteString(in[i : i+l])
   501  			continue
   502  		}
   503  
   504  		out.WriteRune(r)
   505  	}
   506  	if unquoteNext {
   507  		out.WriteRune(QuoteRune)
   508  	}
   509  	out.WriteString(suffix)
   510  	return out.String()
   511  }
   512  
// FromStandardPath takes a / separated path in Standard encoding
// and converts it to a / separated path in this encoding.
//
// It delegates to the package level FromStandardPath, passing mask as
// the encoder.
func (mask MultiEncoder) FromStandardPath(s string) string {
	return FromStandardPath(mask, s)
}
   518  
// FromStandardName takes name in Standard encoding and converts
// it in this encoding.
//
// It delegates to the package level FromStandardName, passing mask as
// the encoder.
func (mask MultiEncoder) FromStandardName(s string) string {
	return FromStandardName(mask, s)
}
   524  
// ToStandardPath takes a / separated path in this encoding
// and converts it to a / separated path in Standard encoding.
//
// It delegates to the package level ToStandardPath, passing mask as
// the encoder.
func (mask MultiEncoder) ToStandardPath(s string) string {
	return ToStandardPath(mask, s)
}
   530  
// ToStandardName takes name in this encoding and converts
// it in Standard encoding.
//
// It delegates to the package level ToStandardName, passing mask as
// the encoder.
func (mask MultiEncoder) ToStandardName(s string) string {
	return ToStandardName(mask, s)
}
   536  
   537  func appendQuotedBytes(w io.Writer, s string) {
   538  	for _, b := range []byte(s) {
   539  		_, _ = fmt.Fprintf(w, string(QuoteRune)+"%02X", b)
   540  	}
   541  }
   542  func appendUnquotedByte(w io.Writer, s string) bool {
   543  	if len(s) < 2 {
   544  		return false
   545  	}
   546  	u, err := strconv.ParseUint(s[:2], 16, 8)
   547  	if err != nil {
   548  		return false
   549  	}
   550  	n, _ := w.Write([]byte{byte(u)})
   551  	return n == 1
   552  }
   553  
// identity is an Encoder whose Encode and Decode return the input
// unchanged.
type identity struct{}

// Encode returns in unchanged.
func (identity) Encode(in string) string { return in }

// Decode returns in unchanged.
func (identity) Decode(in string) string { return in }

// FromStandardPath delegates to the package level FromStandardPath,
// passing i as the encoder.
func (i identity) FromStandardPath(s string) string {
	return FromStandardPath(i, s)
}

// FromStandardName delegates to the package level FromStandardName,
// passing i as the encoder.
func (i identity) FromStandardName(s string) string {
	return FromStandardName(i, s)
}

// ToStandardPath delegates to the package level ToStandardPath,
// passing i as the encoder.
func (i identity) ToStandardPath(s string) string {
	return ToStandardPath(i, s)
}

// ToStandardName delegates to the package level ToStandardName,
// passing i as the encoder.
func (i identity) ToStandardName(s string) string {
	return ToStandardName(i, s)
}
   571  
// Identity returns an Encoder whose Encode and Decode always return the
// input value.
func Identity() Encoder {
	return identity{}
}
   576  
   577  // FromStandardPath takes a / separated path in Standard encoding
   578  // and converts it to a / separated path in the given encoding.
   579  func FromStandardPath(e Encoder, s string) string {
   580  	if e == Standard {
   581  		return s
   582  	}
   583  	parts := strings.Split(s, "/")
   584  	encoded := make([]string, len(parts))
   585  	changed := false
   586  	for i, p := range parts {
   587  		enc := FromStandardName(e, p)
   588  		changed = changed || enc != p
   589  		encoded[i] = enc
   590  	}
   591  	if !changed {
   592  		return s
   593  	}
   594  	return strings.Join(encoded, "/")
   595  }
   596  
   597  // FromStandardName takes name in Standard encoding and converts
   598  // it in the given encoding.
   599  func FromStandardName(e Encoder, s string) string {
   600  	if e == Standard {
   601  		return s
   602  	}
   603  	return e.Encode(Standard.Decode(s))
   604  }
   605  
   606  // ToStandardPath takes a / separated path in the given encoding
   607  // and converts it to a / separated path in Standard encoding.
   608  func ToStandardPath(e Encoder, s string) string {
   609  	if e == Standard {
   610  		return s
   611  	}
   612  	parts := strings.Split(s, "/")
   613  	encoded := make([]string, len(parts))
   614  	changed := false
   615  	for i, p := range parts {
   616  		dec := ToStandardName(e, p)
   617  		changed = changed || dec != p
   618  		encoded[i] = dec
   619  	}
   620  	if !changed {
   621  		return s
   622  	}
   623  	return strings.Join(encoded, "/")
   624  }
   625  
   626  // ToStandardName takes name in the given encoding and converts
   627  // it in Standard encoding.
   628  func ToStandardName(e Encoder, s string) string {
   629  	if e == Standard {
   630  		return s
   631  	}
   632  	return Standard.Encode(e.Decode(s))
   633  }