github.com/m3db/m3@v1.5.0/src/metrics/carbon/parser.go (about)

     1  // Copyright (c) 2019 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package carbon
    22  
    23  import (
    24  	"bufio"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"math"
    29  	"strconv"
    30  	"strings"
    31  	"time"
    32  	"unicode/utf8"
    33  
    34  	"github.com/m3db/m3/src/x/instrument"
    35  	"github.com/m3db/m3/src/x/unsafe"
    36  
    37  	"go.uber.org/zap"
    38  )
    39  
const (
	// Lower-case spellings that ParseRemainder treats as NaN values (the value
	// token is lower-cased before being compared against these).
	negativeNanStr = "-nan"
	nanStr         = "nan"

	// Arguments handed to the strconv formatting and parsing functions used
	// in this file.
	floatFormatByte = 'f'
	floatPrecision  = -1
	intBitSize      = 64
	floatBitSize    = 64
	intBase         = 10

	// Initial capacity and maximum size of the bufio.Scanner buffer set up by
	// NewScanner.
	initScannerBufferSize = 2 << 15 // 65536 bytes (64KiB)
	maxScannerBufferSize  = 2 << 17 // 262144 bytes (256KiB)
)
    53  
var (
	// errInvalidLine is returned when a line cannot be split into a name, a
	// value and a timestamp.
	errInvalidLine = errors.New("invalid line")
	// errNotUTF8 is returned when the name or the remainder of a line is not
	// valid UTF-8.
	errNotUTF8     = errors.New("not valid UTF8 string")
	// mathNan is the value reported for "nan" / "-nan" value tokens.
	mathNan        = math.NaN()
)
    59  
// Metric represents a carbon metric: a named value at a point in time.
type Metric struct {
	// Name is the metric name (the first token of a carbon line) as raw bytes.
	Name []byte
	// Time is the metric timestamp. Parsed lines carry it as whole Unix
	// seconds, and ToLine writes it back with second resolution.
	Time time.Time
	// Val is the metric value; it is NaN for "nan" / "-nan" inputs.
	Val  float64
}
    66  
    67  // ToLine converts the carbon Metric struct to a line.
    68  func (m *Metric) ToLine() string {
    69  	return string(m.Name) + " " + strconv.FormatFloat(m.Val, floatFormatByte, floatPrecision, floatBitSize) +
    70  		" " + strconv.FormatInt(m.Time.Unix(), intBase) + "\n"
    71  }
    72  
// ParsePacket parses a carbon packet (a sequence of newline-separated lines)
// and returns the successfully parsed metrics together with the number of
// malformed lines. The metrics are appended to a new, empty slice, so the
// returned slice is non-nil even when no metric was parsed.
func ParsePacket(packet []byte) ([]Metric, int) {
	return parsePacket([]Metric{}, packet)
}
    77  
// ParseAndAppendPacket does the same thing as ParsePacket, but appends the
// parsed metrics to the caller-supplied mets slice instead of a new one, which
// lets the caller pool and reuse the []Metric. It returns the (possibly
// reallocated) slice and the number of malformed lines.
func ParseAndAppendPacket(mets []Metric, packet []byte) ([]Metric, int) {
	return parsePacket(mets, packet)
}
    83  
    84  func parsePacket(mets []Metric, packet []byte) ([]Metric, int) {
    85  	var malformed, prevIdx, i int
    86  	for i = 0; i < len(packet); i++ {
    87  		if packet[i] == '\n' {
    88  			if (i - prevIdx) > 1 {
    89  				name, timestamp, value, err := Parse(packet[prevIdx:i])
    90  				if err == nil {
    91  					mets = append(mets, Metric{
    92  						Name: name,
    93  						Time: timestamp,
    94  						Val:  value,
    95  					})
    96  				} else {
    97  					malformed++
    98  				}
    99  			}
   100  			prevIdx = i + 1
   101  		}
   102  	}
   103  
   104  	if (i - prevIdx) > 1 {
   105  		name, timestamp, value, err := Parse(packet[prevIdx:i])
   106  		if err == nil {
   107  			mets = append(mets, Metric{
   108  				Name: name,
   109  				Time: timestamp,
   110  				Val:  value,
   111  			})
   112  		} else {
   113  			malformed++
   114  		}
   115  	}
   116  
   117  	return mets, malformed
   118  }
   119  
   120  // ParseName parses out the name portion of a string and returns the
   121  // name and the remaining portion of the line.
   122  func ParseName(line []byte) (name []byte, rest []byte, err error) {
   123  	firstSepIdx := -1
   124  	for i := 0; i < len(line); i++ {
   125  		if line[i] == ' ' && !(i != 0 && line[i-1] == ' ') {
   126  			firstSepIdx = i
   127  			break
   128  		}
   129  	}
   130  
   131  	if firstSepIdx == -1 {
   132  		err = errInvalidLine
   133  		return
   134  	}
   135  
   136  	name = line[:firstSepIdx]
   137  	if len(name) == 0 {
   138  		err = errInvalidLine
   139  		return
   140  	}
   141  	if !utf8.Valid(name) {
   142  		err = errNotUTF8
   143  		return
   144  	}
   145  
   146  	nonSpaceIdx := firstSepIdx + 1
   147  	for nonSpaceIdx < len(line) && line[nonSpaceIdx] == ' ' {
   148  		nonSpaceIdx++
   149  	}
   150  
   151  	rest = line[nonSpaceIdx:]
   152  	return
   153  }
   154  
   155  // ParseRemainder parses a line's components (name and remainder) and returns
   156  // all but the name and returns the timestamp of the metric, its value, the
   157  // time it was received and any error encountered.
   158  func ParseRemainder(rest []byte) (timestamp time.Time, value float64, err error) {
   159  	if !utf8.Valid(rest) {
   160  		err = errNotUTF8
   161  		return
   162  	}
   163  
   164  	// Determine the start and end offsets for the value.
   165  	valStart, valEnd := parseWordOffsets(rest)
   166  	if valStart == -1 || valEnd == -1 || valEnd >= len(rest) {
   167  		// If we couldn't determine the offsets, or the end of the value is also
   168  		// the end of the line, then this is an invalid line.
   169  		err = errInvalidLine
   170  		return
   171  	}
   172  
   173  	// Found valid offsets for the value, try and parse it into a float. Note that
   174  	// we use unsafe.WithString() so that we can use standard library functions
   175  	// without allocating a string.
   176  	unsafe.WithString(rest, func(s string) {
   177  		if val := strings.ToLower(s[valStart:valEnd]); val == negativeNanStr || val == nanStr {
   178  			value = mathNan
   179  		} else {
   180  			value, err = strconv.ParseFloat(s[valStart:valEnd], floatBitSize)
   181  		}
   182  	})
   183  	if err != nil {
   184  		return
   185  	}
   186  
   187  	// Determine the start and end offsets for the timestamp (seconds).
   188  	rest = rest[valEnd:]
   189  	secStart, secEnd := parseWordOffsets(rest)
   190  
   191  	if secStart == -1 || secEnd == -1 || secEnd != len(rest) {
   192  		// If we couldn't determine the offsets, or the end of the the timestamp
   193  		// is not the end of the line (I.E there are still characters after the end
   194  		// of the timestamp), then this is an invalid line.
   195  		err = errInvalidLine
   196  		return
   197  	}
   198  
   199  	// Found valid offsets for the timestamp, try and parse it into an integer. Note that
   200  	// we use unsafe.WithString() so that we can use standard library functions without
   201  	// allocating a string.
   202  	var tsInSecs int64
   203  	unsafe.WithString(rest, func(s string) {
   204  		tsInSecs, err = strconv.ParseInt(s[secStart:secEnd], intBase, intBitSize)
   205  		if err != nil {
   206  			err = fmt.Errorf("invalid timestamp %s: %v", rest[secStart:secEnd], err)
   207  		}
   208  	})
   209  	if err != nil {
   210  		return
   211  	}
   212  	timestamp = time.Unix(tsInSecs, 0)
   213  
   214  	return
   215  }
   216  
   217  // Parse parses a carbon line into the corresponding parts.
   218  func Parse(line []byte) (name []byte, timestamp time.Time, value float64, err error) {
   219  	var rest []byte
   220  	name, rest, err = ParseName(line)
   221  	if err != nil {
   222  		return
   223  	}
   224  
   225  	timestamp, value, err = ParseRemainder(rest)
   226  	return
   227  }
   228  
// A Scanner is used to scan carbon lines from an underlying io.Reader.
type Scanner struct {
	// scanner yields the raw lines that Scan hands to Parse.
	scanner   *bufio.Scanner
	// timestamp, path and value hold the results of the most recent Parse call
	// made by Scan; they are exposed through Metric.
	timestamp time.Time
	path      []byte
	value     float64

	// The number of malformed metrics encountered.
	MalformedCount int

	// iOpts provides the logger used to report malformed lines.
	iOpts instrument.Options
}
   241  
// NewScanner creates a new carbon scanner that reads newline-delimited carbon
// lines from r and uses the logger from iOpts to report malformed lines.
func NewScanner(r io.Reader, iOpts instrument.Options) *Scanner {
	s := bufio.NewScanner(r)

	// Force the scanner to use a large buffer upfront to reduce the number of
	// syscalls that occur if the io.Reader is backed by something that requires
	// I/O (like a TCP connection). The buffer starts with a capacity of
	// initScannerBufferSize and maxScannerBufferSize is passed as the maximum
	// buffer size (see bufio.Scanner.Buffer).
	// TODO(rartoul): Make this configurable.
	s.Buffer(make([]byte, 0, initScannerBufferSize), maxScannerBufferSize)

	// Split the input into lines.
	s.Split(bufio.ScanLines)
	return &Scanner{scanner: s, iOpts: iOpts}
}
   255  
   256  // Scan scans for the next carbon metric. Malformed metrics are skipped but counted.
   257  func (s *Scanner) Scan() bool {
   258  	for {
   259  		if !s.scanner.Scan() {
   260  			return false
   261  		}
   262  
   263  		var err error
   264  		if s.path, s.timestamp, s.value, err = Parse(s.scanner.Bytes()); err != nil {
   265  			s.iOpts.Logger().Error("error trying to scan malformed carbon line",
   266  				zap.String("line", string(s.path)), zap.Error(err))
   267  			s.MalformedCount++
   268  			continue
   269  		}
   270  
   271  		return true
   272  	}
   273  }
   274  
// Metric returns the path, timestamp, and value of the last parsed metric.
// The path is a sub-slice of the bytes returned by the underlying
// bufio.Scanner, so it should be copied if it needs to outlive the next call
// to Scan.
func (s *Scanner) Metric() ([]byte, time.Time, float64) {
	return s.path, s.timestamp, s.value
}
   279  
// Err returns the error, if any, reported by the underlying bufio.Scanner.
// Malformed lines are not reported here; they are counted in MalformedCount.
func (s *Scanner) Err() error { return s.scanner.Err() }
   282  
   283  // parseWordOffsets scans through b searching for the start and end offsets
   284  // of the next "word" (ignores spaces on either side), returning offsets
   285  // such that b[start:end] will return the complete word with no spaces. Note
   286  // that the function will tolerate any number of spaces on either side.
   287  func parseWordOffsets(b []byte) (int, int) {
   288  	valStart := -1
   289  	for i := 0; i < len(b); i++ {
   290  		charByte := b[i]
   291  		if valStart == -1 && charByte != ' ' {
   292  			valStart = i
   293  			break
   294  		}
   295  	}
   296  
   297  	valEnd := valStart
   298  	reachedEnd := true
   299  	for i := valStart + 1; i < len(b); i++ {
   300  		valEnd = i
   301  
   302  		charByte := b[i]
   303  		if charByte == ' ' {
   304  			reachedEnd = false
   305  			break
   306  		}
   307  	}
   308  	if reachedEnd {
   309  		valEnd = valEnd + 1
   310  	}
   311  
   312  	return valStart, valEnd
   313  }