kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/kytheuri/uri.go (about)

     1  /*
     2   * Copyright 2014 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Package kytheuri provides a type to represent Kythe URIs.  This package
    18  // supports parsing a Kythe URI from a string, and converting back and forth
    19  // between a Kythe URI and a Kythe VName protobuf message.
    20  package kytheuri // import "kythe.io/kythe/go/util/kytheuri"
    21  
    22  import (
    23  	"errors"
    24  	"fmt"
    25  	"path"
    26  	"strings"
    27  
    28  	cpb "kythe.io/kythe/proto/common_go_proto"
    29  	spb "kythe.io/kythe/proto/storage_go_proto"
    30  )
    31  
// Scheme is the URI scheme label for Kythe, including the trailing colon.
// Rendered URIs always begin with it; when parsing, it is an optional prefix.
const Scheme = "kythe:"
    34  
// A URI represents a parsed, unescaped Kythe URI.  A zero-valued URI is ready
// for use, representing the empty URI.
//
// In the string form, empty fields are omitted entirely.
type URI struct {
	// Signature is rendered as the fragment, after '#'.
	Signature string
	// Corpus is rendered immediately after the scheme, prefixed by "//".
	Corpus    string
	// Root is rendered as the "?root=" attribute.
	Root      string
	// Path is rendered as the "?path=" attribute.
	Path      string
	// Language is rendered as the "?lang=" attribute.
	Language  string
}
    44  
    45  // VName converts the URI to an equivalent Kythe VName protobuf message.
    46  func (u *URI) VName() *spb.VName {
    47  	if u == nil {
    48  		return new(spb.VName)
    49  	}
    50  	return &spb.VName{
    51  		Signature: u.Signature,
    52  		Corpus:    u.Corpus,
    53  		Root:      u.Root,
    54  		Path:      cleanPath(u.Path),
    55  		Language:  u.Language,
    56  	}
    57  }
    58  
    59  // CorpusPath returns the CorpusPath components of the URI.
    60  func (u *URI) CorpusPath() *cpb.CorpusPath {
    61  	if u == nil {
    62  		return new(cpb.CorpusPath)
    63  	}
    64  	return &cpb.CorpusPath{
    65  		Corpus: u.Corpus,
    66  		Root:   u.Root,
    67  		Path:   cleanPath(u.Path),
    68  	}
    69  }
    70  
    71  // String renders the Kythe URI into the standard URI string format.
    72  //
    73  // The resulting string is in canonical ordering, so if the URI was created by
    74  // parsing a string, this may return a different string from that.  However,
    75  // parsing this string will always give back the same URI.  If u == nil, it is
    76  // treated as an empty URI.
    77  func (u *URI) String() string { return u.Encode().String() }
    78  
    79  // Equal reports whether u is equal to v.
    80  func (u *URI) Equal(v *URI) bool { return u.String() == v.String() }
    81  
    82  // Encode returns an escaped "raw" Kythe URI equivalent to u.
    83  func (u *URI) Encode() *Raw {
    84  	if u == nil {
    85  		return nil
    86  	}
    87  	return &Raw{
    88  		URI: URI{
    89  			Signature: all.escape(u.Signature),
    90  			Corpus:    paths.escape(u.Corpus),
    91  			Root:      paths.escape(u.Root),
    92  			Path:      paths.escape(cleanPath(u.Path)),
    93  			Language:  all.escape(u.Language),
    94  		},
    95  	}
    96  }
    97  
// A Raw represents a parsed, "raw" Kythe URI whose field values are escaped.
// Use the Decode method to convert a *Raw to a plain *URI.
//
// The embedded URI field holds the escaped values exactly as they appear in
// the string form; ParseRaw and (*URI).Encode both produce values of this type.
type Raw struct{ URI URI }
   101  
   102  // Decode returns a *URI equivalent to r but with its field values unescaped.
   103  func (r *Raw) Decode() (*URI, error) {
   104  	u := r.URI // copy
   105  	buf := make([]byte, len(u.Signature)+len(u.Corpus)+len(u.Root)+len(u.Path)+len(u.Language))
   106  	return decode(&u, buf)
   107  }
   108  
   109  // String renders r into the standard URI string format.
   110  //
   111  // The resulting string is in canonical ordering, so if the URI was created by
   112  // parsing a string, this may return a different string from that.  However,
   113  // parsing this string will always give back the same URI.  If r == nil, it is
   114  // treated as an empty URI.
   115  func (r *Raw) String() string {
   116  	if r == nil {
   117  		return Scheme
   118  	}
   119  	var buf strings.Builder
   120  	buf.Grow(len(Scheme) +
   121  		2 + len(r.URI.Corpus) + // "//" + corpus
   122  		6 + len(r.URI.Language) + // "?lang=" + string
   123  		6 + len(r.URI.Path) + // "?path=" + string
   124  		6 + len(r.URI.Root) + // "?root=" + string
   125  		1 + len(r.URI.Signature), // "#" + string
   126  	)
   127  	buf.WriteString(Scheme)
   128  	if c := r.URI.Corpus; c != "" {
   129  		buf.WriteString("//")
   130  		buf.WriteString(c)
   131  	}
   132  
   133  	// Pack up the query arguments. Order matters here, so that we can preserve
   134  	// a canonical string format.
   135  	if s := r.URI.Language; s != "" {
   136  		buf.WriteString("?lang=")
   137  		buf.WriteString(s)
   138  	}
   139  	if s := r.URI.Path; s != "" {
   140  		buf.WriteString("?path=")
   141  		buf.WriteString(s)
   142  	}
   143  	if s := r.URI.Root; s != "" {
   144  		buf.WriteString("?root=")
   145  		buf.WriteString(s)
   146  	}
   147  
   148  	// If there is a signature, add that in as well.
   149  	if s := r.URI.Signature; s != "" {
   150  		buf.WriteByte('#')
   151  		buf.WriteString(s)
   152  	}
   153  	return buf.String()
   154  }
   155  
   156  // FromVName returns a Kythe URI for the given Kythe VName protobuf message.
   157  func FromVName(v *spb.VName) *URI {
   158  	if v == nil {
   159  		return &URI{}
   160  	}
   161  	return &URI{
   162  		Signature: v.Signature,
   163  		Corpus:    v.Corpus,
   164  		Root:      v.Root,
   165  		Path:      v.Path,
   166  		Language:  v.Language,
   167  	}
   168  }
   169  
   170  // FromCorpusPath returns a Kythe URI for the given Kythe CorpusPath protobuf message.
   171  func FromCorpusPath(cp *cpb.CorpusPath) *URI {
   172  	if cp == nil {
   173  		return &URI{}
   174  	}
   175  	return &URI{
   176  		Corpus: cp.Corpus,
   177  		Root:   cp.Root,
   178  		Path:   cp.Path,
   179  	}
   180  }
   181  
   182  // cleanPath is as path.Clean, but leaves "" alone.
   183  func cleanPath(s string) string {
   184  	if s == "" {
   185  		return s
   186  	}
   187  	return path.Clean(s)
   188  }
   189  
   190  // Partition s around the first occurrence of mark, if any.
   191  // If s has the form p mark q, returns p, q; otherwise returns s, "".
   192  func split(s string, mark byte) (prefix, suffix string) {
   193  	if i := strings.IndexByte(s, mark); i >= 0 {
   194  		return s[:i], s[i+1:]
   195  	}
   196  	return s, ""
   197  }
   198  
   199  // ParseRaw parses a Kythe URI from s, but does not unescape its fields.  Use
   200  // Parse to fully parse and unescape a URI, or call the Decode method of the
   201  // returned value.
   202  func ParseRaw(s string) (*Raw, error) {
   203  	if s == "" {
   204  		return new(Raw), nil
   205  	}
   206  
   207  	// Split off the signature from the fragment tail, if defined.
   208  	head, fragment := split(s, '#')
   209  
   210  	// Check for a scheme label.  This may be empty; but if present, it must be
   211  	// our expected scheme.
   212  	if tail := strings.TrimPrefix(head, Scheme); tail != head {
   213  		head = tail // found and removed our scheme marker
   214  	}
   215  
   216  	// Check for a bundle of attribute values.  This may be empty.
   217  	head, attrs := split(head, '?')
   218  	if tail := strings.TrimPrefix(head, "//"); tail != head {
   219  		head = tail
   220  	} else if head != "" {
   221  		return nil, errors.New("invalid URI scheme")
   222  	}
   223  
   224  	r := &Raw{
   225  		URI: URI{
   226  			Signature: fragment,
   227  			Corpus:    head,
   228  		},
   229  	}
   230  
   231  	// If there are any attributes, parse them.  We allow valid attributes to
   232  	// occur in any order, even if it is not canonical.
   233  	if attrs != "" {
   234  		if err := splitByte(attrs, '?', func(attr string) error {
   235  			name, value := split(attr, '=')
   236  			if value == "" {
   237  				return fmt.Errorf("invalid attribute: %q", attr)
   238  			}
   239  			switch name {
   240  			case "lang":
   241  				r.URI.Language = value
   242  			case "root":
   243  				r.URI.Root = value
   244  			case "path":
   245  				r.URI.Path = value
   246  			default:
   247  				return fmt.Errorf("invalid attribute: %q", name)
   248  			}
   249  			return nil
   250  		}); err != nil {
   251  			return nil, err
   252  		}
   253  	}
   254  	return r, nil
   255  }
   256  
   257  // splitByte calls f with each partition of s delimited by b or the end of the
   258  // string.  If f reports an error, the split is aborted and that error is
   259  // returned to the caller of splitByte.
   260  func splitByte(s string, b byte, f func(string) error) error {
   261  	pos := 0
   262  	for pos < len(s) {
   263  		tail := s[pos:]
   264  		i := strings.IndexByte(tail, b)
   265  		if i < 0 {
   266  			return f(tail)
   267  		} else if err := f(tail[:i]); err != nil {
   268  			return err
   269  		}
   270  		pos += i + 1
   271  	}
   272  	return nil
   273  }
   274  
   275  // Parse parses and unescapes a Kythe URI from s. If s omits a scheme label,
   276  // the "kythe" scheme is assumed.
   277  func Parse(s string) (*URI, error) {
   278  	r, err := ParseRaw(s)
   279  	if err != nil {
   280  		return nil, err
   281  	}
   282  	return decode(&r.URI, make([]byte, len(s)))
   283  }
   284  
   285  // ParseCorpusPath parses a Kythe URI and returns its CorpusPath components.
   286  func ParseCorpusPath(s string) (*cpb.CorpusPath, error) {
   287  	u, err := Parse(s)
   288  	if err != nil {
   289  		return nil, err
   290  	}
   291  	return u.CorpusPath(), nil
   292  }
   293  
   294  // decode decodes u in-place using buf as an intermediate buffer.  The caller
   295  // must ensure len(buf) is sufficient to hold the longest field.  Preallocation
   296  // reduces allocation for unescaping and saves ~200 ns/op in benchmarks.
   297  func decode(u *URI, buf []byte) (*URI, error) {
   298  	if err := unescape(&u.Signature, buf); err != nil {
   299  		return nil, fmt.Errorf("invalid signature: %v", err)
   300  	} else if err := unescape(&u.Corpus, buf); err != nil {
   301  		return nil, fmt.Errorf("invalid corpus label: %v", err)
   302  	} else if err := unescape(&u.Language, buf); err != nil {
   303  		return nil, fmt.Errorf("invalid language: %v", err)
   304  	} else if err := unescape(&u.Path, buf); err != nil {
   305  		return nil, fmt.Errorf("invalid path: %v", err)
   306  	} else if err := unescape(&u.Root, buf); err != nil {
   307  		return nil, fmt.Errorf("invalid root: %v", err)
   308  	}
   309  	return u, nil
   310  }
   311  
   312  // ToString renders the given VName into the standard string uri format.
   313  func ToString(v *spb.VName) string { return FromVName(v).String() }
   314  
   315  // ToVName parses the given string as a URI and returns an equivalent VName.
   316  func ToVName(s string) (*spb.VName, error) {
   317  	uri, err := Parse(s)
   318  	if err != nil {
   319  		return nil, err
   320  	}
   321  	return uri.VName(), nil
   322  }
   323  
   324  // MustParse returns the URI from parsing s, or panics in case of error.
   325  func MustParse(s string) *URI {
   326  	u, err := Parse(s)
   327  	if err != nil {
   328  		panic(fmt.Sprintf("Parse %q: %v", s, err))
   329  	}
   330  	return u
   331  }
   332  
   333  // Fix returns the canonical form of the given Kythe URI, if possible.
   334  func Fix(s string) (string, error) {
   335  	u, err := Parse(s)
   336  	if err != nil {
   337  		return "", err
   338  	}
   339  	return u.String(), nil
   340  }
   341  
   342  // Equal reports whether the two Kythe URI strings are equal in canonical form.
   343  // If either URI is invalid, Equal returns false.
   344  func Equal(u1, u2 string) bool {
   345  	f1, err := Fix(u1)
   346  	if err != nil {
   347  		return false
   348  	}
   349  	f2, err := Fix(u2)
   350  	if err != nil {
   351  		return false
   352  	}
   353  	return f1 == f2
   354  }