github.com/square/finch@v0.0.0-20240412205204-6530c03e2b96/trx/trx.go (about)

     1  // Copyright 2024 Block, Inc.
     2  
     3  package trx
     4  
     5  import (
     6  	"bufio"
     7  	"fmt"
     8  	"log"
     9  	"os"
    10  	"regexp"
    11  	"strconv"
    12  	"strings"
    13  	"time"
    14  
    15  	"github.com/dustin/go-humanize"
    16  
    17  	"github.com/square/finch"
    18  	"github.com/square/finch/config"
    19  	"github.com/square/finch/data"
    20  	"github.com/square/finch/limit"
    21  )
    22  
// Statement call-type markers. NOTE(review): not referenced in this file;
// presumably consumed by callers (e.g. client) — confirm before changing values.
const (
	STMT  = byte(0x0)
	BEGIN = byte(0x1)
	END   = byte(0x2)
)

// EXPLICIT_CALL_SUFFIX is the "()" suffix on a data key (@d()) that marks an
// explicit generator call; see Calls and RowScope.
const EXPLICIT_CALL_SUFFIX = "()"

// DataKeyPattern matches a data key with or without the explicit call suffix: @d or @d().
var DataKeyPattern = regexp.MustCompile(`@[\w_-]+(?:\(\))?`)

// ExplicitCallPattern matches only data keys with the explicit call suffix: @d().
var ExplicitCallPattern = regexp.MustCompile(`@[\w_-]+\(\)`)
    33  
// Set is the complete set of transactions (and statements) for a stage.
// It is built by Load (which calls File.Load per trx file) and consumed by
// the stage for workload allocation.
type Set struct {
	Order      []string                // trx names in config order
	Statements map[string][]*Statement // keyed on trx name
	Meta       map[string]Meta         // keyed on trx name
	Data       *data.Scope             // keyed on data key (@d)
}
    41  
// Statement is one query in a transaction and all its read-only metadata.
// Most fields are set by File.statements from the query's first word and
// its "--" modifiers in the trx file.
type Statement struct {
	Trx          string        // name of the trx this statement belongs to
	Query        string        // final query text after modifier and data-key processing
	ResultSet    bool          // true if query starts with SELECT
	Prepare      bool          // true if --prepare(d) modifier given
	PrepareMulti int           // number of --copies sharing one prepared statement (set on first copy)
	Begin        bool          // true if query starts with BEGIN or START
	Commit       bool          // true if query starts with COMMIT
	Write        bool          // true if INSERT, UPDATE, DELETE, or REPLACE
	DDL          bool          // true if ALTER, CREATE, DROP, RENAME, or TRUNCATE
	Idle         time.Duration // --idle modifier duration
	Inputs       []string      // data keys (number of values)
	Outputs      []string      // data keys save-results|columns and save-insert-id
	InsertId     string        // data key (special output)
	Limit        limit.Data    // --rows, --table-size, or --database-size write limit
	Calls        []byte        // per Inputs[i]: 1 if explicit call @d(), else 0 (see Calls)
}
    60  
// Meta is read-only metadata about one trx (file).
type Meta struct {
	DDL bool // true if any statement in the trx is DDL
}
    64  
    65  // Load loads all trx files and returns a Set representing all parsed trx.
    66  // This is called from stage.Prepare since a stage comprises all trx.
    67  // The given data scope comes from compute.Server to handle globally scoped
    68  // data keys. Params are user-defined from the stage file: stage.params.
    69  // The stage uses the returned Set for workload allocation based on however
    70  // stage.workload mixes and matches trx to exec/client groups.
    71  func Load(trxFiles []config.Trx, scope *data.Scope, params map[string]string) (*Set, error) {
    72  	set := &Set{
    73  		Order:      make([]string, 0, len(trxFiles)),
    74  		Statements: map[string][]*Statement{},
    75  		Data:       scope,
    76  		Meta:       map[string]Meta{},
    77  	}
    78  	for i := range trxFiles {
    79  		if err := NewFile(trxFiles[i], set, params).Load(); err != nil {
    80  			return nil, err
    81  		}
    82  	}
    83  	return set, nil
    84  }
    85  
// ErrEOF is returned by File.line when it reads the special "-- EOF" marker,
// which tells File.Load to ignore the rest of the file.
var ErrEOF = fmt.Errorf("EOF")

// lineBuf accumulates lines of one statement until a blank line (or end of
// file) marks the end of the statement; see File.line.
type lineBuf struct {
	n      uint     // current line number in the file (1-indexed)
	str    string   // statement text so far, lines joined with spaces
	mods   []string // "--" modifiers collected for the current statement
	copyNo uint     // current copy number while expanding --copies, else 0
}
    94  
// File represents and loads one trx file. File.Load is called by the pkg func,
// trx.Load, which is called by stage.Prepare. Do not call File.Load directly
// except for testing.
type File struct {
	cfg    config.Trx        // stage.trx[]
	set    *Set              // trx set for the stage, what File.Load fills in
	params map[string]string // stage.params: user-defined value interpolation
	// -- internal parsing state
	lb      lineBuf        // save lines until a complete statement is read
	colRefs map[string]int // column ref counts to detect unused ones
	stmtNo  uint           // 1-indexed in file (not a line number; not an index into stmt)
	stmts   []*Statement   // all statements in this file
	hasDDL  bool           // true if any statement is DDL
}
   109  
   110  func NewFile(cfg config.Trx, set *Set, params map[string]string) *File {
   111  	return &File{
   112  		cfg:     cfg,
   113  		set:     set,
   114  		params:  params,
   115  		colRefs: map[string]int{},
   116  		lb:      lineBuf{mods: []string{}},
   117  		stmts:   []*Statement{},
   118  		stmtNo:  0,
   119  	}
   120  }
   121  
   122  func (f *File) Load() error {
   123  	finch.Debug("loading %s", f.cfg.File)
   124  	file, err := os.Open(f.cfg.File)
   125  	if err != nil {
   126  		return err
   127  	}
   128  	defer file.Close()
   129  
   130  	scanner := bufio.NewScanner(file)
   131  	for scanner.Scan() {
   132  		err = f.line(strings.TrimSpace(scanner.Text()))
   133  		if err != nil {
   134  			if err == ErrEOF {
   135  				break
   136  			}
   137  			return err
   138  		}
   139  	}
   140  	err = f.line("") // last line
   141  	if err != nil {
   142  		return err
   143  	}
   144  
   145  	if len(f.stmts) == 0 {
   146  		return fmt.Errorf("trx file %s has no statements; at least 1 is required", f.cfg.File)
   147  	}
   148  
   149  	noRefs := []string{}
   150  	for col, refs := range f.colRefs {
   151  		if refs > 0 {
   152  			continue
   153  		}
   154  		noRefs = append(noRefs, col)
   155  	}
   156  	if len(noRefs) > 0 {
   157  		return fmt.Errorf("saved columns not referenced: %s", strings.Join(noRefs, ", "))
   158  	}
   159  
   160  	if err := scanner.Err(); err != nil {
   161  		log.Fatal(err) // shouldn't happen
   162  	}
   163  
   164  	f.set.Order = append(f.set.Order, f.cfg.Name)
   165  	f.set.Statements[f.cfg.Name] = f.stmts
   166  	f.set.Meta[f.cfg.Name] = Meta{
   167  		DDL: f.hasDDL,
   168  	}
   169  
   170  	return nil
   171  }
   172  
   173  func (f *File) line(line string) error {
   174  	f.lb.n++
   175  
   176  	// More lines in statement
   177  	if line != "" {
   178  		finch.Debug("line %d: %s\n", f.lb.n, line)
   179  		if strings.HasPrefix(line, "-- ") {
   180  			if line == "-- EOF" {
   181  				return ErrEOF
   182  			}
   183  			mod, err := config.Vars(strings.TrimSpace(strings.TrimPrefix(line, "--")), f.params, true)
   184  			if err != nil {
   185  				return fmt.Errorf("parsing modifier '%s' on line %d: %s", line, f.lb.n, err)
   186  			}
   187  			f.lb.mods = append(f.lb.mods, mod)
   188  		} else {
   189  			f.lb.str += line + " "
   190  		}
   191  		return nil
   192  	}
   193  
   194  	// Empty lines between statements
   195  	if f.lb.str == "" {
   196  		finch.Debug("line %d: space", f.lb.n)
   197  		return nil
   198  	}
   199  
   200  	// End of statement
   201  	finch.Debug("line %d: end prev", f.lb.n)
   202  	s, err := f.statements()
   203  	if err != nil {
   204  		return fmt.Errorf("error parsing %s at line %d: %s", f.cfg.File, f.lb.n-1, err)
   205  	}
   206  	for i := range s {
   207  		finch.Debug("stmt: %+v", s[i])
   208  	}
   209  	f.stmts = append(f.stmts, s...)
   210  
   211  	f.lb.str = ""
   212  	f.lb.mods = []string{}
   213  
   214  	return nil
   215  }
   216  
// reKeyVal matches "key" or "key: value" pairs. NOTE(review): not referenced
// in this file; confirm usage elsewhere before removing.
var reKeyVal = regexp.MustCompile(`([\w_-]+)(?:\:\s*(\w+))?`)

// reCSV matches the /*!csv N template*/ directive; group 1 is the count N,
// group 2 is the template (see File.statements).
var reCSV = regexp.MustCompile(`\/\*\!csv\s+(\d+)\s+(.+)\*\/`)

// reFirstWord captures the first word of a query, e.g. SELECT or INSERT.
var reFirstWord = regexp.MustCompile(`^(\w+)`)
   220  
// statements parses the buffered statement in f.lb into one or more
// Statements. It normally returns a single Statement; the --copies N
// modifier makes it recurse to produce N copies (and --copies 0 returns
// none). Processing order: classify the query by its first word, apply
// "--" modifiers, substitute /*!copy-number*/, expand /*!csv N template*/,
// then resolve @d data keys to generators and rewrite them into the final
// query text. Called by File.line when a blank line ends a statement.
func (f *File) statements() ([]*Statement, error) {
	f.stmtNo++
	s := &Statement{
		Trx: f.cfg.Name, // trx name (trx.name or base(trx.file))
	}

	query := strings.TrimSpace(f.lb.str)
	finch.Debug("query raw: %s", query)

	// ----------------------------------------------------------------------
	// Switches: classify the statement by its first word
	// ----------------------------------------------------------------------

	// @todo regexp to extract first word
	com := strings.ToUpper(reFirstWord.FindString(query))
	switch com {
	case "SELECT":
		s.ResultSet = true
	case "BEGIN", "START":
		s.Begin = true // used to rate limit trx per second (TPS) in client/client.go
	case "COMMIT":
		s.Commit = true // used to measure TPS rate in client/client.go
	case "INSERT", "UPDATE", "DELETE", "REPLACE":
		s.Write = true
	case "ALTER", "CREATE", "DROP", "RENAME", "TRUNCATE":
		finch.Debug("DDL")
		s.DDL = true    // statement is DDL
		f.hasDDL = true // trx has DDL
	}

	// ----------------------------------------------------------------------
	// Modifiers: --prepare, --table-size, etc.
	// ----------------------------------------------------------------------

	for _, mod := range f.lb.mods {
		m := strings.Fields(mod)
		finch.Debug("mod: '%v' %#v", mod, m)
		if len(m) < 1 {
			return nil, fmt.Errorf("invalid modifier: '%s': does not match key: value (pattern match < 2)", mod)
		}
		m[0] = strings.Trim(m[0], ":") // allow "key:" as well as "key"
		switch m[0] {
		case "prepare", "prepared":
			s.Prepare = true
		case "idle":
			d, err := time.ParseDuration(m[1])
			if err != nil {
				return nil, fmt.Errorf("invalid idle modifier: '%s': %s", mod, err)
			}
			s.Idle = d
		case "rows":
			// rows N [offset]: limit writes to N rows starting at offset
			max, err := strconv.ParseUint(m[1], 10, 64)
			if err != nil {
				return nil, fmt.Errorf("invalid rows limit: %s: %s", m[1], err)
			}
			var offset uint64
			if len(m) == 3 {
				offset, err = strconv.ParseUint(m[2], 10, 64)
				if err != nil {
					return nil, fmt.Errorf("invalid rows offset: %s: %s", m[2], err)
				}
			}
			finch.Debug("write limit: %d rows (offset %d)", max, offset)
			s.Limit = limit.Or(s.Limit, limit.NewRows(int64(max), int64(offset)))
		case "table-size", "database-size":
			// table-size TABLE SIZE | database-size DB SIZE (SIZE is human, e.g. 1GB)
			if len(m) != 3 {
				return nil, fmt.Errorf("invalid %s modifier: split %d fields, expected 3: %s", m[0], len(m), mod)
			}
			max, err := humanize.ParseBytes(m[2])
			if err != nil {
				return nil, err
			}
			var lm limit.Data
			if m[0] == "table-size" {
				lm = limit.NewSize(max, m[2], "", m[1])
			} else { // database-size
				lm = limit.NewSize(max, m[2], m[1], "")
			}
			s.Limit = limit.Or(s.Limit, lm)
		case "save-insert-id":
			// @todo check len(m)
			if s.ResultSet {
				return nil, fmt.Errorf("save-insert-id not allowed on SELECT")
			}
			finch.Debug("save-insert-id")
			dataKey, err := f.column(0, m[1])
			if err != nil {
				return nil, err
			}
			s.InsertId = dataKey
			s.Outputs = append(s.Outputs, dataKey)
		case "save-columns":
			// @todo check len(m)
			for i, col := range m[1:] {
				// @todo split csv (handle "col1,col2" instead of "col1, col2")
				dataKey, err := f.column(i, col)
				if err != nil {
					return nil, err
				}
				s.Outputs = append(s.Outputs, dataKey)
			}
		case "copies":
			// copies N: replicate this statement N times by re-parsing it N
			// times (recursion below) with copyNo set for /*!copy-number*/.
			n, err := strconv.Atoi(m[1])
			if err != nil {
				return nil, fmt.Errorf("copies: %s invalid: %s", m[1], err)
			}
			if n < 0 {
				return nil, fmt.Errorf("copies: %s invalid: must be >= 0", m[1])
			}
			if n == 0 {
				return nil, nil // zero copies: drop the statement entirely
			}
			if n == 1 {
				continue // one copy: just this statement, nothing to do
			}
			// If prepared and the query has no /*!copy-number*/, all copies
			// are identical, so they can share one multi-prepared statement.
			prepareMulti := false
			// Strip the copies modifier so the recursion below doesn't loop.
			mods := make([]string, 0, len(f.lb.mods)-1)
			for _, mod := range f.lb.mods {
				if strings.HasPrefix(mod, "copies") {
					continue
				}
				if strings.HasPrefix(mod, "prepare") && !strings.Contains(query, finch.COPY_NUMBER) {
					prepareMulti = true
				}
				mods = append(mods, mod)
			}
			f.lb.mods = mods
			f.stmtNo-- // recursion re-increments for copy 1
			multi := make([]*Statement, n)
			for i := 0; i < n; i++ {
				finch.Debug("copy %d of %d", i+1, n)
				f.lb.copyNo = uint(i + 1)
				ms, err := f.statements() // recurse
				if err != nil {
					return nil, fmt.Errorf("during copy recurse: %s", err)
				}
				multi[i] = ms[0]
			}
			if prepareMulti {
				multi[0].PrepareMulti = n // only first copy records the count
			}
			f.lb.copyNo = 0
			return multi, nil
		default:
			return nil, fmt.Errorf("unknown modifier: %s: '%s'", m[0], mod)
		}
	}

	// ----------------------------------------------------------------------
	// Replace /*!copy-number*/
	// ----------------------------------------------------------------------
	query = strings.ReplaceAll(query, finch.COPY_NUMBER, fmt.Sprintf("%d", f.lb.copyNo))

	// ----------------------------------------------------------------------
	// Expand CSV /*!csv N template*/
	// ----------------------------------------------------------------------
	csvTemplate := ""
	m := reCSV.FindStringSubmatch(query)
	if len(m) > 0 {
		n, err := strconv.ParseInt(m[1], 10, 32)
		if err != nil {
			return nil, fmt.Errorf("invalid number of CSV values in %s: %s", m[0], err)
		}
		vals := make([]string, n)
		csvTemplate = strings.TrimSpace(m[2])

		keys := map[string]bool{}
		for _, name := range DataKeyPattern.FindAllString(csvTemplate, -1) {
			// Trim to look up data key in config because @ is not valid YAML.
			// The @ will be put back later because all other code expects it.
			name = cfgKey(name)
			dataCfg, ok := f.cfg.Data[name] // config.stage.trx[].data
			if !ok {
				return nil, fmt.Errorf("%s not configured: trx file uses %s but this data key is not configured in the stage file", name, name)
			}

			// @d in a CSV template defaults to row scope
			if dataCfg.Scope == "" {
				dataCfg.Scope = finch.SCOPE_ROW
				f.cfg.Data[name] = dataCfg
			}

			// Save row scoped @d in CSV template, ignore other scopes
			if dataCfg.Scope == finch.SCOPE_ROW {
				keys["@"+name] = true
			}
		}

		// Change first row scoped @d -> @d() so it generates new values per row
		csvTemplateScoped := RowScope(keys, csvTemplate)
		finch.Debug("csv %d %s -> %s", n, csvTemplate, csvTemplateScoped)

		// Expand template, e.g. 3 (@d) -> (@d), (@d), (@d)
		for i := int64(0); i < n; i++ {
			vals[i] = csvTemplateScoped
		}
		csv := strings.Join(vals, ", ")
		query = reCSV.ReplaceAllLiteralString(query, csv)
	}

	// ----------------------------------------------------------------------
	// Data keys: @d -> data.Generator
	// ----------------------------------------------------------------------
	dataKeys := DataKeyPattern.FindAllString(query, -1)
	finch.Debug("data keys: %v", dataKeys)
	if len(dataKeys) == 0 {
		s.Query = query
		return []*Statement{s}, nil // no data key, return early
	}
	s.Inputs = dataKeys

	// Record which keys are explicit calls (@d()), then strip the () suffix
	// from the query text; Inputs are stripped one by one in the loop below.
	s.Calls = Calls(s.Inputs)
	query = ExplicitCallPattern.ReplaceAllStringFunc(query, func(s string) string {
		return strings.TrimSuffix(s, EXPLICIT_CALL_SUFFIX)
	})

	dataFormats := map[string]string{} // keyed on data name
	for i, name := range s.Inputs {
		// Remove () from @d()
		name = strings.TrimSuffix(name, EXPLICIT_CALL_SUFFIX)
		s.Inputs[i] = name

		var g data.Generator
		var err error

		if k, ok := f.set.Data.Keys[name]; ok && k.Column >= 0 {
			// Key is a saved column (see File.column): count the reference
			// so Load can detect saved-but-unreferenced columns.
			f.colRefs[name]++
			g = k.Generator
		} else if name == "@PREV" {
			// @PREV reuses the generator of the nearest preceding non-@PREV key.
			if i == 0 {
				return nil, fmt.Errorf("no @PREV data generator")
			}
			for p := i - 1; p >= 0; p-- {
				finch.Debug("%s <- %s", dataKeys[p], dataKeys[i])
				if dataKeys[p] == "@PREV" {
					continue
				}
				g = f.set.Data.Keys[dataKeys[p]].Generator
				break
			}
		} else {
			if k, ok = f.set.Data.Keys[name]; ok {
				// Key already has a generator from an earlier statement.
				g = k.Generator
			} else {
				// First use of this key: make its generator from the stage
				// config and register it in the stage-wide data scope.
				dataCfg, ok := f.cfg.Data[cfgKey(name)] // config.stage.trx[].data
				if !ok {
					return nil, fmt.Errorf("%s not configured: trx file uses %s but this data key is not configured in the stage file", name, name)
				}
				finch.Debug("make data generator: %s %s scope: %s", dataCfg.Generator, name, dataCfg.Scope)

				if dataCfg.Scope == "" {
					dataCfg.Scope = finch.SCOPE_STATEMENT
					f.cfg.Data[name] = dataCfg
				}

				g, err = data.Make(
					dataCfg.Generator, // e.g. "auto-inc"
					name,              // @d
					dataCfg.Params,    // trx[].data.params, generator-specific
				)
				if err != nil {
					return nil, err
				}
				f.set.Data.Keys[name] = data.Key{
					Name:      name,
					Trx:       f.cfg.Name,
					Line:      f.lb.n - 1,
					Statement: f.stmtNo,
					Column:    -1,
					Scope:     dataCfg.Scope,
					Generator: g,
				}
				finch.Debug("%#v", k)
			}
		}

		// Prepared statements use ? placeholders; otherwise use the
		// generator's own format string for inline substitution.
		if s.Prepare {
			dataFormats[name] = "?"
		} else {
			_, dataFormats[name] = g.Format()
		}
	}

	// Rewrite each @d in the query text to its format (? or generator format).
	replacements := make([]string, len(dataFormats)*2) // *2 because key + value
	i := 0
	for k, v := range dataFormats {
		replacements[i] = k
		replacements[i+1] = v
		i += 2
	}
	finch.Debug("replacements: %v", replacements)
	r := strings.NewReplacer(replacements...)
	s.Query = r.Replace(query)

	// Caller debug prints full Statement
	return []*Statement{s}, nil
}
   518  
// column registers one saved column (from save-columns or save-insert-id)
// as a data key in the stage-wide scope and returns the key name. colNo is
// the column's position in the result set. The no-op column "_" gets the
// shared no-op generator and is registered once globally. A column may be
// saved only once per scope; re-saving is an error. Columns without config
// default to a trx-scoped "column" generator with non-quoted values.
func (f *File) column(colNo int, col string) (string, error) {
	// Strip a trailing comma from "col1, col2"-style lists and any whitespace.
	col = strings.TrimSpace(strings.TrimSuffix(col, ","))
	finch.Debug("col %s %d", col, colNo)

	// If no-op column "_"?
	if col == finch.NOOP_COLUMN {
		if _, ok := f.set.Data.Keys[finch.NOOP_COLUMN]; !ok {
			// Register the shared no-op key on first use only.
			f.set.Data.Keys[finch.NOOP_COLUMN] = data.Key{
				Name:      finch.NOOP_COLUMN,
				Trx:       f.cfg.Name,
				Line:      f.lb.n - 1,
				Statement: f.stmtNo,
				Column:    colNo,
				Scope:     finch.SCOPE_GLOBAL,
				Generator: data.Noop,
			}
			finch.Debug("%#v", f.set.Data.Keys[finch.NOOP_COLUMN])
		}
		finch.Debug("saved no-op col %s @ %d", col, colNo)
		return finch.NOOP_COLUMN, nil
	}

	// A saved column may only be declared once.
	if k, ok := f.set.Data.Keys[col]; ok {
		return "", fmt.Errorf("duplicated saved column: %s (first use: %s)", col, k)
	}

	dataCfg, ok := f.cfg.Data[cfgKey(col)] // config.stage.trx.*.data
	if !ok {
		// No config for this column: default to a trx-scoped column
		// generator with no params (non-quoted value).
		dataCfg = config.Data{
			Name:      col,
			Generator: "column",
			Scope:     finch.SCOPE_TRX,
		}
		fmt.Printf("No data params for column %s (%s line %d), default to non-quoted value\n", col, f.cfg.Name, f.lb.n-1)
	}

	g, err := data.Make("column", col, dataCfg.Params)
	if err != nil {
		return "", err
	}
	// Start the reference count at zero; statements() increments it when the
	// column is used, and File.Load errors on columns still at zero.
	f.colRefs[col] = 0
	f.set.Data.Keys[col] = data.Key{
		Name:      col,
		Trx:       f.cfg.Name,
		Line:      f.lb.n - 1,
		Statement: f.stmtNo,
		Column:    colNo,
		Scope:     dataCfg.Scope,
		Generator: g,
	}
	finch.Debug("%#v", f.set.Data.Keys[col])
	return col, nil
}
   572  
   573  func Calls(dataKeys []string) []byte {
   574  	calls := make([]byte, len(dataKeys))
   575  	for i, name := range dataKeys {
   576  		if strings.HasSuffix(name, EXPLICIT_CALL_SUFFIX) {
   577  			calls[i] = 1
   578  		}
   579  	}
   580  	finch.Debug("calls: %v", calls)
   581  	return calls
   582  }
   583  
   584  // RowScope changes every first occurrence of the keys from @d to @d()
   585  // in csvTemplate. So "(@d, @d)" -> "(@d(), @d)". The explicit call @d()
   586  // makes @d row scoped because each row will call @d again. This is called
   587  // when the /*!csv N template */ is being processed (see reCSV).
   588  func RowScope(keys map[string]bool, csvTemplate string) string {
   589  	csvDataKeys := DataKeyPattern.FindAllString(csvTemplate, -1)
   590  KEY:
   591  	for dataKey := range keys { // row scoped keys
   592  		for _, k := range csvDataKeys { // all keys in csvTemplate
   593  			if !strings.HasPrefix(k, dataKey) {
   594  				continue // not the row scoped key we're looking for
   595  			}
   596  			// This is first occurrence of row scoped key in csvTemplate.
   597  			// Add () suffix if not already set.
   598  			if !strings.HasSuffix(k, EXPLICIT_CALL_SUFFIX) {
   599  				csvTemplate = strings.Replace(csvTemplate, k, k+EXPLICIT_CALL_SUFFIX, 1) // 1=only first occurrence
   600  			}
   601  			continue KEY // only check/change first occurrence, so this row scoped key is done
   602  		}
   603  	}
   604  	return csvTemplate
   605  }
   606  
   607  func cfgKey(s string) string {
   608  	return strings.Trim(s, "@"+EXPLICIT_CALL_SUFFIX)
   609  }