github.com/m3db/m3@v1.5.0/src/metrics/carbon/parser.go (about) 1 // Copyright (c) 2019 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package carbon 22 23 import ( 24 "bufio" 25 "errors" 26 "fmt" 27 "io" 28 "math" 29 "strconv" 30 "strings" 31 "time" 32 "unicode/utf8" 33 34 "github.com/m3db/m3/src/x/instrument" 35 "github.com/m3db/m3/src/x/unsafe" 36 37 "go.uber.org/zap" 38 ) 39 40 const ( 41 negativeNanStr = "-nan" 42 nanStr = "nan" 43 44 floatFormatByte = 'f' 45 floatPrecision = -1 46 intBitSize = 64 47 floatBitSize = 64 48 intBase = 10 49 50 initScannerBufferSize = 2 << 15 // 64KiB 51 maxScannerBufferSize = 2 << 17 // 256KiB (0.25MiB) 52 ) 53 54 var ( 55 errInvalidLine = errors.New("invalid line") 56 errNotUTF8 = errors.New("not valid UTF8 string") 57 mathNan = math.NaN() 58 ) 59 60 // Metric represents a carbon metric. 
61 type Metric struct { 62 Name []byte 63 Time time.Time 64 Val float64 65 } 66 67 // ToLine converts the carbon Metric struct to a line. 68 func (m *Metric) ToLine() string { 69 return string(m.Name) + " " + strconv.FormatFloat(m.Val, floatFormatByte, floatPrecision, floatBitSize) + 70 " " + strconv.FormatInt(m.Time.Unix(), intBase) + "\n" 71 } 72 73 // ParsePacket parses a carbon packet and returns the metrics and number of malformed lines. 74 func ParsePacket(packet []byte) ([]Metric, int) { 75 return parsePacket([]Metric{}, packet) 76 } 77 78 // ParseAndAppendPacket does the same thing as parse packet, but it allows the caller to pass 79 // in the []Metric to facilitate pooling. 80 func ParseAndAppendPacket(mets []Metric, packet []byte) ([]Metric, int) { 81 return parsePacket(mets, packet) 82 } 83 84 func parsePacket(mets []Metric, packet []byte) ([]Metric, int) { 85 var malformed, prevIdx, i int 86 for i = 0; i < len(packet); i++ { 87 if packet[i] == '\n' { 88 if (i - prevIdx) > 1 { 89 name, timestamp, value, err := Parse(packet[prevIdx:i]) 90 if err == nil { 91 mets = append(mets, Metric{ 92 Name: name, 93 Time: timestamp, 94 Val: value, 95 }) 96 } else { 97 malformed++ 98 } 99 } 100 prevIdx = i + 1 101 } 102 } 103 104 if (i - prevIdx) > 1 { 105 name, timestamp, value, err := Parse(packet[prevIdx:i]) 106 if err == nil { 107 mets = append(mets, Metric{ 108 Name: name, 109 Time: timestamp, 110 Val: value, 111 }) 112 } else { 113 malformed++ 114 } 115 } 116 117 return mets, malformed 118 } 119 120 // ParseName parses out the name portion of a string and returns the 121 // name and the remaining portion of the line. 
122 func ParseName(line []byte) (name []byte, rest []byte, err error) { 123 firstSepIdx := -1 124 for i := 0; i < len(line); i++ { 125 if line[i] == ' ' && !(i != 0 && line[i-1] == ' ') { 126 firstSepIdx = i 127 break 128 } 129 } 130 131 if firstSepIdx == -1 { 132 err = errInvalidLine 133 return 134 } 135 136 name = line[:firstSepIdx] 137 if len(name) == 0 { 138 err = errInvalidLine 139 return 140 } 141 if !utf8.Valid(name) { 142 err = errNotUTF8 143 return 144 } 145 146 nonSpaceIdx := firstSepIdx + 1 147 for nonSpaceIdx < len(line) && line[nonSpaceIdx] == ' ' { 148 nonSpaceIdx++ 149 } 150 151 rest = line[nonSpaceIdx:] 152 return 153 } 154 155 // ParseRemainder parses a line's components (name and remainder) and returns 156 // all but the name and returns the timestamp of the metric, its value, the 157 // time it was received and any error encountered. 158 func ParseRemainder(rest []byte) (timestamp time.Time, value float64, err error) { 159 if !utf8.Valid(rest) { 160 err = errNotUTF8 161 return 162 } 163 164 // Determine the start and end offsets for the value. 165 valStart, valEnd := parseWordOffsets(rest) 166 if valStart == -1 || valEnd == -1 || valEnd >= len(rest) { 167 // If we couldn't determine the offsets, or the end of the value is also 168 // the end of the line, then this is an invalid line. 169 err = errInvalidLine 170 return 171 } 172 173 // Found valid offsets for the value, try and parse it into a float. Note that 174 // we use unsafe.WithString() so that we can use standard library functions 175 // without allocating a string. 176 unsafe.WithString(rest, func(s string) { 177 if val := strings.ToLower(s[valStart:valEnd]); val == negativeNanStr || val == nanStr { 178 value = mathNan 179 } else { 180 value, err = strconv.ParseFloat(s[valStart:valEnd], floatBitSize) 181 } 182 }) 183 if err != nil { 184 return 185 } 186 187 // Determine the start and end offsets for the timestamp (seconds). 
188 rest = rest[valEnd:] 189 secStart, secEnd := parseWordOffsets(rest) 190 191 if secStart == -1 || secEnd == -1 || secEnd != len(rest) { 192 // If we couldn't determine the offsets, or the end of the the timestamp 193 // is not the end of the line (I.E there are still characters after the end 194 // of the timestamp), then this is an invalid line. 195 err = errInvalidLine 196 return 197 } 198 199 // Found valid offsets for the timestamp, try and parse it into an integer. Note that 200 // we use unsafe.WithString() so that we can use standard library functions without 201 // allocating a string. 202 var tsInSecs int64 203 unsafe.WithString(rest, func(s string) { 204 tsInSecs, err = strconv.ParseInt(s[secStart:secEnd], intBase, intBitSize) 205 if err != nil { 206 err = fmt.Errorf("invalid timestamp %s: %v", rest[secStart:secEnd], err) 207 } 208 }) 209 if err != nil { 210 return 211 } 212 timestamp = time.Unix(tsInSecs, 0) 213 214 return 215 } 216 217 // Parse parses a carbon line into the corresponding parts. 218 func Parse(line []byte) (name []byte, timestamp time.Time, value float64, err error) { 219 var rest []byte 220 name, rest, err = ParseName(line) 221 if err != nil { 222 return 223 } 224 225 timestamp, value, err = ParseRemainder(rest) 226 return 227 } 228 229 // A Scanner is used to scan carbon lines from an underlying io.Reader. 230 type Scanner struct { 231 scanner *bufio.Scanner 232 timestamp time.Time 233 path []byte 234 value float64 235 236 // The number of malformed metrics encountered. 237 MalformedCount int 238 239 iOpts instrument.Options 240 } 241 242 // NewScanner creates a new carbon scanner. 243 func NewScanner(r io.Reader, iOpts instrument.Options) *Scanner { 244 s := bufio.NewScanner(r) 245 246 // Force the scanner to use a large buffer upfront to reduce the number of 247 // syscalls that occur if the io.Reader is backed by something that requires 248 // I/O (like a TCP connection). 249 // TODO(rartoul): Make this configurable. 
250 s.Buffer(make([]byte, 0, initScannerBufferSize), maxScannerBufferSize) 251 252 s.Split(bufio.ScanLines) 253 return &Scanner{scanner: s, iOpts: iOpts} 254 } 255 256 // Scan scans for the next carbon metric. Malformed metrics are skipped but counted. 257 func (s *Scanner) Scan() bool { 258 for { 259 if !s.scanner.Scan() { 260 return false 261 } 262 263 var err error 264 if s.path, s.timestamp, s.value, err = Parse(s.scanner.Bytes()); err != nil { 265 s.iOpts.Logger().Error("error trying to scan malformed carbon line", 266 zap.String("line", string(s.path)), zap.Error(err)) 267 s.MalformedCount++ 268 continue 269 } 270 271 return true 272 } 273 } 274 275 // Metric returns the path, timestamp, and value of the last parsed metric. 276 func (s *Scanner) Metric() ([]byte, time.Time, float64) { 277 return s.path, s.timestamp, s.value 278 } 279 280 // Err returns any errors in the scan. 281 func (s *Scanner) Err() error { return s.scanner.Err() } 282 283 // parseWordOffsets scans through b searching for the start and end offsets 284 // of the next "word" (ignores spaces on either side), returning offsets 285 // such that b[start:end] will return the complete word with no spaces. Note 286 // that the function will tolerate any number of spaces on either side. 287 func parseWordOffsets(b []byte) (int, int) { 288 valStart := -1 289 for i := 0; i < len(b); i++ { 290 charByte := b[i] 291 if valStart == -1 && charByte != ' ' { 292 valStart = i 293 break 294 } 295 } 296 297 valEnd := valStart 298 reachedEnd := true 299 for i := valStart + 1; i < len(b); i++ { 300 valEnd = i 301 302 charByte := b[i] 303 if charByte == ' ' { 304 reachedEnd = false 305 break 306 } 307 } 308 if reachedEnd { 309 valEnd = valEnd + 1 310 } 311 312 return valStart, valEnd 313 }