github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/dsref/parse.go

github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/dsref/parse.go (about)

     1  package dsref
     2  
     3  import (
     4  	"fmt"
     5  	"regexp"
     6  	"unicode"
     7  )
     8  
     9  // These functions parse a string to create a dsref. We refer to a "human-friendly reference"
    10  // as one with only a username and dataset name, such as "my_user/my_dataset". A "full reference"
    11  // can also contain a "concrete reference", which includes an optional datasetID plus a network
    12  // and a "commit hash".
    13  //
    14  // Parse will parse a human-friendly reference, or only a concrete reference, or a full reference.
    15  //
    16  // ParseHumanFriendly will only successfully parse a human-friendly reference, and nothing else.
    17  //
    18  // The grammar is here:
    19  //
    20  //  <dsref> = <humanFriendlyPortion> [ <concreteRef> ] | <concreteRef>
    21  //  <humanFriendlyPortion> = <validName> '/' <validName>
    22  //  <concreteRef> = '@' [ <datasetID> ] '/' <network> '/' <commitHash>
    23  //
    24  // Some examples of valid references:
    25  //     me/dataset
    26  //     username/dataset
    27  //     @/ipfs/QmSome1Commit2Hash3
    28  //     @datasetIdentifier/ipfs/QmSome1Commit2Hash3
    29  //     username/dataset@QmProfile4ID5/ipfs/QmSome1Commit2Hash3
    30  // An invalid reference:
    31  //     /ipfs/QmSome1Commit2Hash3
    32  
    33  const (
    34  	alphaNumeric       = `[a-zA-Z][\w-]*`
    35  	alphaNumericDsname = `[a-zA-Z][\w-]{0,143}`
    36  	b58IdRSA           = `Qm[0-9a-zA-Z]{0,44}`
    37  	b58IdED            = `12D[0-9a-zA-Z]{0,50}`
    38  	b32LogbookID       = `[a-z2-7]{0,52}`
    39  )
    40  
    41  var (
    42  	validName         = regexp.MustCompile(`^` + alphaNumeric)
    43  	dsNameCheck       = regexp.MustCompile(`^` + alphaNumericDsname + `$`)
    44  	concreteRef       = regexp.MustCompile(`^@(` + b32LogbookID + `|` + b58IdRSA + `|` + b58IdED + `)?\/(` + alphaNumeric + `)\/(` + b58IdRSA + `|` + b58IdED + `)`)
    45  	b58StrictCheckRSA = regexp.MustCompile(`^Qm[1-9A-HJ-NP-Za-km-z]*$`)
    46  	b58StrictCheckED  = regexp.MustCompile(`^12D[1-9A-HJ-NP-Za-km-z]*$`)
    47  	b32LowerCheck     = regexp.MustCompile(`^[a-z2-7]*$`)
    48  
    49  	// ErrEmptyRef is an error for when a reference is empty
    50  	ErrEmptyRef = fmt.Errorf("empty reference")
    51  	// ErrParseError is an error returned when parsing fails
    52  	ErrParseError = fmt.Errorf("could not parse ref")
    53  	// ErrUnexpectedChar is an error when a character is unexpected, topic string must be non-empty
    54  	ErrUnexpectedChar = fmt.Errorf("unexpected character")
    55  	// ErrNotHumanFriendly is an error returned when a reference is not human-friendly
    56  	ErrNotHumanFriendly = fmt.Errorf("unexpected character '@', ref can only have username/name")
    57  	// ErrBadCaseName is the error when a bad case is used in the dataset name
    58  	ErrBadCaseName = fmt.Errorf("dataset name may not contain any upper-case letters")
    59  	// ErrBadCaseUsername is for when a username contains upper-case letters
    60  	ErrBadCaseUsername = fmt.Errorf("username may not contain any upper-case letters")
    61  	// ErrBadCaseShouldRename is the error when a dataset should be renamed to not use upper case letters
    62  	ErrBadCaseShouldRename = fmt.Errorf("dataset name should not contain any upper-case letters, rename it to only use lower-case letters, numbers, and underscores")
    63  	// ErrDescribeValidName is an error describing a valid dataset name
    64  	ErrDescribeValidName = fmt.Errorf("dataset name must start with a lower-case letter, and only contain lower-case letters, numbers, dashes, and underscore. Maximum length is 144 characters")
    65  	// ErrDescribeValidUsername describes valid username
    66  	ErrDescribeValidUsername = fmt.Errorf("username must start with a lower-case letter, and only contain lower-case letters, numbers, dashes, and underscores")
    67  )
    68  
    69  // Parse a reference from a string
    70  func Parse(text string) (Ref, error) {
    71  	return parse(text, false)
    72  }
    73  
    74  // ParsePeerRef a reference from a string where the dataset name is non-mandatory
    75  func ParsePeerRef(text string) (Ref, error) {
    76  	return parse(text, true)
    77  }
    78  
    79  func parse(text string, peerRef bool) (Ref, error) {
    80  	var r Ref
    81  	origLength := len(text)
    82  	if origLength == 0 {
    83  		return r, ErrEmptyRef
    84  	}
    85  
    86  	remain, partial, err := parseHumanFriendlyPortion(text)
    87  	if err == nil {
    88  		text = remain
    89  		r.Username = partial.Username
    90  		r.Name = partial.Name
    91  	} else if err.Error() == needUsernameSeparatedErr && peerRef == true {
    92  		return partial, nil
    93  	} else if err == ErrUnexpectedChar {
    94  		// This error must only be returned when the topic string is non-empty, so it's safe to
    95  		// index it at position 0.
    96  		return r, NewParseError("%s at position %d: '%c'", err, len(text)-len(remain), remain[0])
    97  	} else if err != ErrParseError {
    98  		return r, err
    99  	}
   100  
   101  	remain, partial, err = parseConcreteRef(text)
   102  	if err == nil {
   103  		text = remain
   104  		r.ProfileID = partial.ProfileID
   105  		r.InitID = partial.InitID
   106  		r.Path = partial.Path
   107  	} else if err != ErrParseError {
   108  		return r, err
   109  	}
   110  
   111  	if text != "" {
   112  		pos := origLength - len(text)
   113  		return r, NewParseError("unexpected character at position %d: '%c'", pos, text[0])
   114  	}
   115  
   116  	// Dataset names are not supposed to contain upper-case characters. For now, return an error
   117  	// but also the reference; callers should display a warning, but continue to work for now.
   118  	for _, rune := range r.Name {
   119  		if unicode.IsUpper(rune) {
   120  			return r, ErrBadCaseName
   121  		}
   122  	}
   123  
   124  	return r, nil
   125  }
   126  
   127  // ParseHumanFriendly parses a reference that only has a username and a dataset name
   128  func ParseHumanFriendly(text string) (Ref, error) {
   129  	var r Ref
   130  	origLength := len(text)
   131  	if origLength == 0 {
   132  		return r, ErrEmptyRef
   133  	}
   134  
   135  	remain, partial, err := parseHumanFriendlyPortion(text)
   136  	if err == nil {
   137  		text = remain
   138  		r.Username = partial.Username
   139  		r.Name = partial.Name
   140  	} else if err != ErrParseError {
   141  		return r, err
   142  	}
   143  
   144  	if text != "" {
   145  		if text[0] == '@' {
   146  			return r, ErrNotHumanFriendly
   147  		}
   148  		pos := origLength - len(text)
   149  		return r, NewParseError("unexpected character at position %d: '%c'", pos, text[0])
   150  	}
   151  
   152  	// Dataset names are not supposed to contain upper-case characters. For now, return an error
   153  	// but also the reference; callers should display a warning, but continue to work for now.
   154  	for _, rune := range r.Name {
   155  		if unicode.IsUpper(rune) {
   156  			return r, ErrBadCaseName
   157  		}
   158  	}
   159  
   160  	return r, nil
   161  }
   162  
   163  // ParseError is an error for when a dataset reference fails to parse
   164  type ParseError struct {
   165  	Message string
   166  }
   167  
   168  // Error renders the ParseError as a string
   169  func (e *ParseError) Error() string {
   170  	return e.Message
   171  }
   172  
   173  // NewParseError returns a new ParseError, its parameters are a format string and arguments
   174  func NewParseError(template string, args ...interface{}) error {
   175  	return &ParseError{Message: fmt.Sprintf(template, args...)}
   176  }
   177  
   178  // MustParse parses a dsref from a string, or panics if it fails
   179  func MustParse(text string) Ref {
   180  	ref, err := Parse(text)
   181  	if err != nil {
   182  		panic(err)
   183  	}
   184  	return ref
   185  }
   186  
   187  // IsRefString returns whether the string parses as a valid reference
   188  func IsRefString(text string) bool {
   189  	_, err := Parse(text)
   190  	return err == nil || err == ErrBadCaseName
   191  }
   192  
   193  // IsValidName returns whether the dataset name is valid
   194  func IsValidName(text string) bool {
   195  	return dsNameCheck.Match([]byte(text))
   196  }
   197  
   198  // EnsureValidName returns nil if the name is valid, and an error otherwise
   199  func EnsureValidName(text string) error {
   200  	if !dsNameCheck.Match([]byte(text)) {
   201  		return ErrDescribeValidName
   202  	}
   203  
   204  	// Dataset names are not supposed to contain upper-case characters. For now, return an error
   205  	// but also the reference; callers should display a warning, but continue to work for now.
   206  	for _, r := range text {
   207  		if unicode.IsUpper(r) {
   208  			return ErrBadCaseName
   209  		}
   210  	}
   211  
   212  	return nil
   213  }
   214  
   215  // EnsureValidUsername is the same as EnsureValidName but returns a different error
   216  func EnsureValidUsername(text string) error {
   217  	err := EnsureValidName(text)
   218  	if err == ErrDescribeValidName {
   219  		return ErrDescribeValidUsername
   220  	}
   221  	if err == ErrBadCaseName {
   222  		return ErrBadCaseUsername
   223  	}
   224  	return err
   225  }
   226  
   227  const needUsernameSeparatedErr = "need username separated by '/' from dataset name"
   228  
   229  // parse the front of a dataset reference, the human friendly portion
   230  func parseHumanFriendlyPortion(text string) (string, Ref, error) {
   231  	var r Ref
   232  	// Parse as many alphaNumeric characters as possible for the username
   233  	match := validName.FindString(text)
   234  	if match == "" {
   235  		return text, r, ErrParseError
   236  	}
   237  	r.Username = match
   238  	text = text[len(match):]
   239  	// Check if the remaining text is empty, or there's not a slash next
   240  	if text == "" {
   241  		return text, r, NewParseError(needUsernameSeparatedErr)
   242  	} else if text[0] != '/' {
   243  		return text, r, ErrUnexpectedChar
   244  	}
   245  	text = text[1:]
   246  	// Parse as many alphaNumeric characters as possible for the dataset name
   247  	match = validName.FindString(text)
   248  	if match == "" {
   249  		return text, r, NewParseError("did not find valid dataset name")
   250  	}
   251  	r.Name = match
   252  	text = text[len(match):]
   253  	return text, r, nil
   254  }
   255  
   256  // parse the back of the dataset reference, the concrete path
   257  func parseConcreteRef(text string) (string, Ref, error) {
   258  	var r Ref
   259  	matches := concreteRef.FindStringSubmatch(text)
   260  	if matches == nil {
   261  		return text, r, ErrParseError
   262  	}
   263  	if len(matches) != 4 {
   264  		return text, r, NewParseError("unexpected number of regex matches %d", len(matches))
   265  	}
   266  	// TODO(b5): this is incorrect. We should be allowing any 2-to-4-character alphanumeric
   267  	// network identifier
   268  	if matches[2] != "mem" && matches[2] != "ipfs" {
   269  		return text, r, NewParseError("invalid network")
   270  	}
   271  	matchedLen := len(matches[0])
   272  	if matches[1] != "" {
   273  		if b58StrictCheckRSA.FindString(matches[1]) != "" || b58StrictCheckED.FindString(matches[1]) != "" {
   274  			r.ProfileID = matches[1]
   275  		} else {
   276  			if b32LowerCheck.FindString(matches[1]) == "" {
   277  				return text, r, NewParseError("datasetID contains invalid base32 characters")
   278  			}
   279  			r.InitID = matches[1]
   280  		}
   281  	}
   282  	if matches[3] != "" && b58StrictCheckRSA.FindString(matches[3]) == "" && b58StrictCheckED.FindString(matches[3]) == "" {
   283  		return text, r, NewParseError("path contains invalid base58 characters")
   284  	}
   285  	r.Path = fmt.Sprintf("/%s/%s", matches[2], matches[3])
   286  	return text[matchedLen:], r, nil
   287  }