github.com/XiaoMi/Gaea@v1.2.5/mysql/sql_fingerprint.go (about) 1 // Copyright (c) 2014-2015, Percona LLC and/or its affiliates. All rights reserved. 2 // This program is free software: you can redistribute it and/or modify 3 // it under the terms of the GNU Affero General Public License as published by 4 // the Free Software Foundation, either version 3 of the License, or 5 // (at your option) any later version. 6 // This program is distributed in the hope that it will be useful, 7 // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 // GNU Affero General Public License for more details. 10 // You should have received a copy of the GNU Affero General Public License 11 // along with this program. If not, see <http://www.gnu.org/licenses/> 12 13 // Copyright 2016 The kingshard Authors. All rights reserved. 14 // 15 // Licensed under the Apache License, Version 2.0 (the "License"): you may 16 // not use this file except in compliance with the License. You may obtain 17 // a copy of the License at 18 // 19 // http://www.apache.org/licenses/LICENSE-2.0 20 // 21 // Unless required by applicable law or agreed to in writing, software 22 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 23 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 24 // License for the specific language governing permissions and limitations 25 // under the License. 26 27 // Package mysql query provides functions to transform queries. 28 package mysql 29 30 import ( 31 "crypto/md5" 32 "fmt" 33 "strings" 34 ) 35 36 const ( 37 unknown byte = iota 38 inWord // \S+ 39 inNumber // [0-9a-fA-Fx.-] 40 inSpace // space, tab, \r, \n 41 inOp // [=<>!] (usually precedes a number) 42 opOrNumber // + in 2 + 2 or +3e-9 43 inQuote // '...' or "..." 44 subOrOLC // - or start of -- comment 45 inDash // -- begins a one-line comment if followed by space 46 inOLC // -- comment (at least one space after dash is required) 47 divOrMLC // / operator or start of /* comment */ 48 mlcOrMySQLCode // /* comment */ or /*! MySQL-specific code */ 49 inMLC // /* comment */ 50 inValues // VALUES (1), ..., (N) 51 moreValuesOrUnknown // , (2nd+) or ON DUPLICATE KEY or end of query 52 orderBy // ORDER BY 53 onDupeKeyUpdate // ON DUPLICATE KEY UPDATE 54 inNumberInWord // e.g. db23 55 ) 56 57 var stateName = map[byte]string{ 58 0: "unknown", 59 1: "inWord", 60 2: "inNumber", 61 3: "inSpace", 62 4: "inOp", 63 5: "opOrNumber", 64 6: "inQuote", 65 7: "subOrOLC", 66 8: "inDash", 67 9: "inOLC", 68 10: "divOrMLC", 69 11: "mlcOrMySQLCode", 70 12: "inMLC", 71 13: "inValues", 72 14: "moreValuesOrUnknown", 73 15: "orderBy", 74 16: "onDupeKeyUpdate", 75 17: "inNumberInWord", 76 } 77 78 // Debug prints very verbose tracing information to STDOUT. 79 var Debug = false 80 81 // ReplaceNumbersInWords enables replacing numbers in words. For example: 82 // `SELECT c FROM org235.t` -> `SELECT c FROM org?.t`. For more examples 83 // look at test query_test.go/TestFingerprintWithNumberInDbName. 84 var ReplaceNumbersInWords = false 85 86 // GetFingerprint returns the canonical form of q. The primary transformations are: 87 // - Replace values with ? 88 // - Collapse whitespace 89 // - Remove comments 90 // - Lowercase everything 91 // Additional trasnformations are performed which change the syntax of the 92 // original query without affecting its performance characteristics. For 93 // example, "ORDER BY col ASC" is the same as "ORDER BY col", so "ASC" in the 94 // fingerprint is removed. 95 func GetFingerprint(q string) string { 96 q += " " // need range to run off end of original query 97 prevWord := "" 98 f := make([]byte, len(q)+1) 99 fi := 0 100 pr := rune(0) // previous rune 101 s := unknown // current state 102 sqlState := unknown 103 quoteChar := rune(0) 104 cpFromOffset := 0 105 cpToOffset := 0 106 addSpace := false 107 escape := false 108 parOpen := 0 109 parOpenTotal := 0 110 valueNo := 0 111 firstPar := 0 112 113 for qi, r := range q { 114 if Debug { 115 fmt.Printf("\n%d:%d %s/%s [%d:%d] %x %q\n", qi, fi, stateName[s], stateName[sqlState], cpFromOffset, cpToOffset, r, r) 116 } 117 118 //1. Skip parts of the query for certain states. 119 120 if s == inQuote { 121 // We're in a 'quoted value' or "quoted value". The quoted value 122 // ends at the first non-escaped matching quote character (' or "). 123 if r != quoteChar { 124 // The only char inside a quoted value we need to track is \, 125 // the escape char. This allows us to tell that the 2nd ' in 126 // '\'' is escaped, not the ending quote char. 127 if escape { 128 if Debug { 129 fmt.Println("Ignore quoted literal") 130 } 131 escape = false 132 } else if r == '\\' { 133 if Debug { 134 fmt.Println("Escape") 135 } 136 escape = true 137 } else { 138 if Debug { 139 fmt.Println("Ignore quoted value") 140 } 141 } 142 } else if escape { 143 // \' or \" 144 if Debug { 145 fmt.Println("Quote literal") 146 } 147 escape = false 148 } else { 149 // 'foo' -> ? 150 // "foo" -> ? 151 if Debug { 152 fmt.Println("Quote end") 153 } 154 escape = false 155 156 // qi = the closing quote char, so +1 to ensure we don't copy 157 // anything before this, i.e. quoted value is done, move on. 158 cpFromOffset = qi + 1 159 160 if sqlState == inValues { 161 // ('Hello world!', ...) -> VALUES (, ...) 162 // The inValues state uses this state to skip quoted values, 163 // so we don't replace them with ?; the inValues blocks will 164 // replace the entire value list with ?+. 165 s = inValues 166 } else { 167 f[fi] = '?' 168 fi++ 169 s = unknown 170 } 171 } 172 continue 173 } else if s == inNumberInWord { 174 // Replaces number in words with ? 175 // e.g. `db37` to `db?` 176 // Parser can fall into inNumberInWord only if 177 // option ReplaceNumbersInWords is turned on 178 if r >= '0' && r <= '9' { 179 if Debug { 180 fmt.Println("Ignore digit in word") 181 } 182 continue 183 } 184 // 123 -> ?, 0xff -> ?, 1e-9 -> ?, etc. 185 if Debug { 186 fmt.Println("Number in word end") 187 } 188 f[fi] = '?' 189 fi++ 190 cpFromOffset = qi 191 if isSpace(r) { 192 s = unknown 193 } else { 194 s = inWord 195 } 196 } else if s == inNumber { 197 // We're in a number which can be something simple like 123 or 198 // something trickier like 1e-9 or 0xFF. The pathological case is 199 // like 12ff: this is valid hex number and a valid ident (e.g. table 200 // name). We can't detect this; the best we can do is realize that 201 // 12ffz is not a number because of the z. 202 if (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F') || r == '.' || r == 'x' || r == '-' { 203 if Debug { 204 fmt.Println("Ignore digit") 205 } 206 continue 207 } 208 if (r >= 'g' && r <= 'z') || (r >= 'G' && r <= 'Z') || r == '_' { 209 if Debug { 210 fmt.Println("Not a number") 211 } 212 cpToOffset = qi 213 s = inWord 214 } else { 215 // 123 -> ?, 0xff -> ?, 1e-9 -> ?, etc. 216 if Debug { 217 fmt.Println("Number end") 218 } 219 f[fi] = '?' 220 fi++ 221 cpFromOffset = qi 222 cpToOffset = qi 223 s = unknown 224 } 225 } else if s == inValues { 226 // We're in the (val1),...,(valN) after IN or VALUE[S]. A single 227 // () value ends when the parenthesis are balanced, but... 228 if r == ')' { 229 parOpen-- 230 parOpenTotal++ 231 if Debug { 232 fmt.Println("Close parenthesis", parOpen) 233 } 234 } else if r == '(' { 235 parOpen++ 236 if Debug { 237 fmt.Println("Open parenthesis", parOpen) 238 } 239 if parOpen == 1 { 240 firstPar = qi 241 } 242 } else if r == '\'' || r == '"' { 243 // VALUES ('Hello world!') -> enter inQuote state to skip 244 // the quoted value so ')' in 'This ) is a trick' doesn't 245 // balance an outer parenthesis. 246 if Debug { 247 fmt.Println("Quote begin") 248 } 249 s = inQuote 250 quoteChar = r 251 continue 252 } else if isSpace(r) { 253 if Debug { 254 fmt.Println("Space") 255 } 256 continue 257 } 258 if parOpen > 0 { 259 // Parenthesis are not balanced yet; i.e. haven't reached 260 // closing ) for this value. 261 continue 262 } 263 if parOpenTotal == 0 { 264 // SELECT value FROM t 265 if Debug { 266 fmt.Println("Literal values not VALUES()") 267 } 268 s = inWord 269 continue 270 } 271 // (<anything>) -> (?+) only for first value 272 if Debug { 273 fmt.Println("Values end") 274 } 275 valueNo++ 276 if valueNo == 1 { 277 if qi-firstPar > 1 { 278 copy(f[fi:fi+4], "(?+)") 279 fi += 4 280 } else { 281 // INSERT INTO t VALUES () 282 copy(f[fi:fi+2], "()") 283 fi += 2 284 } 285 firstPar = 0 286 } 287 // ... the difficult part is that there may be other values, e.g. 288 // (1), (2), (3). So we enter the following state. The values list 289 // ends when the next char is not a comma. 290 s = moreValuesOrUnknown 291 pr = r 292 cpFromOffset = qi + 1 293 parOpenTotal = 0 294 continue 295 } else if s == inMLC { 296 // We're in a /* mutli-line comments */. Skip and ignore it all. 297 if pr == '*' && r == '/' { 298 // /* foo */ -> (nothing) 299 if Debug { 300 fmt.Println("Multi-line comment end") 301 } 302 s = unknown 303 } else { 304 if Debug { 305 fmt.Println("Ignore multi-line comment content") 306 } 307 } 308 continue 309 } else if s == mlcOrMySQLCode { 310 // We're at the start of either a /* multi-line comment */ or some 311 // /*![version] some MySQL-specific code */. The ! after the /* 312 // determines which one. 313 if r != '!' { 314 if Debug { 315 fmt.Println("Multi-line comment") 316 } 317 s = inMLC 318 continue 319 } else { 320 // /*![version] SQL_NO_CACHE */ -> /*![version] SQL_NO_CACHE */ (no change) 321 if Debug { 322 fmt.Println("MySQL-specific code") 323 } 324 s = inWord 325 } 326 } else if s == inOLC { 327 // We're in a -- one line comment. A space after -- is required. 328 // It ends at the end of the line, but there can be more query after 329 // it like: 330 // SELECT * -- comment 331 // FROM t 332 // is really "SELECT * FROM t". 333 if r == 0x0A { // newline 334 if Debug { 335 fmt.Println("One-line comment end") 336 } 337 s = unknown 338 } 339 continue 340 } else if isSpace(r) && isSpace(pr) { 341 // All space is collapsed into a single space, so if this char is 342 // a space and the previous was too, then skip the extra space. 343 if Debug { 344 fmt.Println("Skip space") 345 } 346 // +1 here ensures we actually skip the extra space in certain 347 // cases like "select \n-- bar\n foo". When a part of the query 348 // triggers a copy of preceding chars, if the only preceding char 349 // is a space then it's incorrectly copied, but +1 sets cpFromOffset 350 // to the same offset as the trigger char, thus avoiding the copy. 351 // For example in that ^ query, the offsets are: 352 // 0 's' 353 // 1 'e' 354 // 2 'l' 355 // 3 'e' 356 // 4 'c' 357 // 5 't' 358 // 6 ' ' 359 // 7 '\n' 360 // 8 '-' 361 // After copying 'select ', we are here @ 7 and intend to skip the 362 // newline. Next, the '-' @ 8 triggers a copy of any preceding 363 // chars. So here if we set cpFromOffset = 7 then 7:8 is copied, 364 // the newline, but setting cpFromOffset = 7 + 1 is 8:8 and so 365 // nothing is copied as we want. Actually, cpToOffset is still 6 366 // in this case, but 8:6 avoids the copy too. 367 cpFromOffset = qi + 1 368 pr = r 369 continue 370 } 371 372 //2. Change state based on rune and current state. 373 374 switch { 375 case r >= 0x30 && r <= 0x39: // 0-9 376 switch s { 377 case opOrNumber: 378 if Debug { 379 fmt.Println("+/-First digit") 380 } 381 cpToOffset = qi - 1 382 s = inNumber 383 case inOp: 384 if Debug { 385 fmt.Println("First digit after operator") 386 } 387 cpToOffset = qi 388 s = inNumber 389 case inWord: 390 if pr == '(' { 391 if Debug { 392 fmt.Println("Number in function") 393 } 394 cpToOffset = qi 395 s = inNumber 396 } else if pr == ',' { 397 // foo,4 -- 4 may be a number literal or a word/ident 398 if Debug { 399 fmt.Println("Number or word") 400 } 401 s = inNumber 402 cpToOffset = qi 403 } else { 404 if Debug { 405 fmt.Println("Number in word") 406 } 407 if ReplaceNumbersInWords { 408 s = inNumberInWord 409 cpToOffset = qi 410 } 411 } 412 default: 413 if Debug { 414 fmt.Println("Number literal") 415 } 416 s = inNumber 417 cpToOffset = qi 418 } 419 case isSpace(r): 420 if s == unknown { 421 if Debug { 422 fmt.Println("Lost in space") 423 } 424 if fi > 0 && !isSpace(rune(f[fi-1])) { 425 if Debug { 426 fmt.Println("Add space") 427 } 428 f[fi] = ' ' 429 fi++ 430 // This is a common case: a space after skipping something, 431 // e.g. col = 'foo'<space>. We want only the first space, 432 // so advance cpFromOffset to whatever is after the space 433 // and if it's more space then space skipping block will 434 // handle it. 435 cpFromOffset = qi + 1 436 } 437 } else if s == inDash { 438 if Debug { 439 fmt.Println("One-line comment begin") 440 } 441 s = inOLC 442 if cpToOffset > 2 { 443 cpToOffset = qi - 2 444 addSpace = true 445 } 446 } else if s == moreValuesOrUnknown { 447 if Debug { 448 fmt.Println("Space after values") 449 } 450 if valueNo == 1 { 451 f[fi] = ' ' 452 fi++ 453 } 454 } else { 455 if Debug { 456 fmt.Println("Word end") 457 } 458 word := strings.ToLower(q[cpFromOffset:qi]) 459 // Only match USE if it is the first word in the query, otherwise, 460 // it could be a USE INDEX 461 if word == "use" && prevWord == "" { 462 return "use ?" 463 } else if (word == "null" && (prevWord != "is" && prevWord != "not")) || word == "null," { 464 if Debug { 465 fmt.Println("NULL as value") 466 } 467 f[fi] = '?' 468 fi++ 469 if word[len(word)-1] == ',' { 470 f[fi] = ',' 471 fi++ 472 } 473 f[fi] = ' ' 474 fi++ 475 cpFromOffset = qi + 1 476 } else if prevWord == "order" && word == "by" { 477 if Debug { 478 fmt.Println("ORDER BY begin") 479 } 480 sqlState = orderBy 481 } else if sqlState == orderBy && wordIn(word, "asc", "asc,", "asc ") { 482 if Debug { 483 fmt.Println("ORDER BY ASC") 484 } 485 cpFromOffset = qi 486 if word[len(word)-1] == ',' { 487 fi-- 488 f[fi] = ',' 489 f[fi+1] = ' ' 490 fi += 2 491 } 492 } else if prevWord == "key" && word == "update" { 493 if Debug { 494 fmt.Println("ON DUPLICATE KEY UPDATE begin") 495 } 496 sqlState = onDupeKeyUpdate 497 } 498 s = inSpace 499 cpToOffset = qi 500 addSpace = true 501 } 502 case r == '\'' || r == '"': 503 if pr != '\\' { 504 if s != inQuote { 505 if Debug { 506 fmt.Println("Quote begin") 507 } 508 s = inQuote 509 quoteChar = r 510 cpToOffset = qi 511 if pr == 'x' || pr == 'b' { 512 if Debug { 513 fmt.Println("Hex/binary value") 514 } 515 // We're at the first quote char of x'0F' 516 // (or b'0101', etc.), so -2 for the quote char and 517 // the x or b char to copy anything before and up to 518 // this value. 519 cpToOffset = -2 520 } 521 } 522 } 523 case r == '=' || r == '<' || r == '>' || r == '!': 524 if Debug { 525 fmt.Println("Operator") 526 } 527 if s != inWord && s != inOp { 528 cpFromOffset = qi 529 } 530 s = inOp 531 case r == '/': 532 if Debug { 533 fmt.Println("Op or multi-line comment") 534 } 535 s = divOrMLC 536 case r == '*' && s == divOrMLC: 537 if Debug { 538 fmt.Println("Multi-line comment or MySQL-specific code") 539 } 540 s = mlcOrMySQLCode 541 case r == '+': 542 if Debug { 543 fmt.Println("Operator or number") 544 } 545 s = opOrNumber 546 case r == '-': 547 if pr == '-' { 548 if Debug { 549 fmt.Println("Dash") 550 } 551 s = inDash 552 } else { 553 if Debug { 554 fmt.Println("Operator or number") 555 } 556 s = opOrNumber 557 } 558 case r == '.': 559 if s == inNumber || s == inOp { 560 if Debug { 561 fmt.Println("Floating point number") 562 } 563 s = inNumber 564 cpToOffset = qi 565 } 566 case r == '(': 567 if prevWord == "call" { 568 // 'CALL foo(...)' -> 'call foo' 569 if Debug { 570 fmt.Println("CALL sp_name") 571 } 572 return "call " + q[cpFromOffset:qi] 573 } else if sqlState != onDupeKeyUpdate && (((s == inSpace || s == moreValuesOrUnknown) && (prevWord == "value" || prevWord == "values" || prevWord == "in")) || wordIn(q[cpFromOffset:qi], "value", "values", "in")) { 574 // VALUE(, VALUE (, VALUES(, VALUES (, IN(, or IN( 575 // but not after ON DUPLICATE KEY UPDATE 576 if Debug { 577 fmt.Println("Values begin") 578 } 579 s = inValues 580 sqlState = inValues 581 parOpen = 1 582 firstPar = qi 583 if valueNo == 0 { 584 cpToOffset = qi 585 } 586 } else if s != inWord { 587 if Debug { 588 fmt.Println("Random (") 589 } 590 valueNo = 0 591 cpFromOffset = qi 592 s = inWord 593 } 594 case r == ',' && s == moreValuesOrUnknown: 595 if Debug { 596 fmt.Println("More values") 597 } 598 case r == ':' && prevWord == "administrator": 599 // 'administrator command: Init DB' -> 'administrator command: Init DB' (no change) 600 if Debug { 601 fmt.Println("Admin cmd") 602 } 603 return q[0 : len(q)-1] // original query minus the trailing space we added 604 case r == '#': 605 if Debug { 606 fmt.Println("One-line comment begin") 607 } 608 s = inOLC 609 default: 610 if s != inWord && s != inOp { 611 // If in a word or operator then keep copying the query, else 612 // previous chars were being ignored for some reasons but now 613 // we should start copying again, so set cpFromOffset. Example: 614 // col=NOW(). 'col' will be set to copy, but then '=' will put 615 // us in inOp state which, if a value follows, will trigger a 616 // copy of "col=", but "NOW()" is not a value so "N" is caught 617 // here and since s=inOp still we do not copy yet (this block is 618 // is not entered). 619 if Debug { 620 fmt.Println("Random character") 621 } 622 valueNo = 0 623 cpFromOffset = qi 624 625 if sqlState == inValues { 626 // Values are comma-separated, so the first random char 627 // marks the end of the VALUE() or IN() list. 628 if Debug { 629 fmt.Println("No more values") 630 } 631 sqlState = unknown 632 } 633 } 634 s = inWord 635 } 636 637 /** 638 * 3. Copy a slice of the query into the fingerprint. 639 */ 640 641 if cpToOffset > cpFromOffset { 642 l := cpToOffset - cpFromOffset 643 prevWord = strings.ToLower(q[cpFromOffset:cpToOffset]) 644 if Debug { 645 fmt.Printf("copy '%s' (%d:%d, %d:%d) %d\n", prevWord, fi, fi+l, cpFromOffset, cpToOffset, l) 646 } 647 copy(f[fi:fi+l], prevWord) 648 fi += l 649 cpFromOffset = cpToOffset 650 if wordIn(prevWord, "in", "value", "values") && sqlState != onDupeKeyUpdate { 651 // IN () -> in(?+) 652 // VALUES () -> values(?+) 653 addSpace = false 654 s = inValues 655 sqlState = inValues 656 } else if addSpace { 657 if Debug { 658 fmt.Println("Add space") 659 } 660 f[fi] = ' ' 661 fi++ 662 cpFromOffset++ 663 addSpace = false 664 } 665 } 666 pr = r 667 } 668 669 // Remove trailing spaces. 670 for fi > 0 && isSpace(rune(f[fi-1])) { 671 fi-- 672 } 673 674 // Return the fingerprint. 675 return string(f[0:fi]) 676 } 677 678 func isSpace(r rune) bool { 679 return r == 0x20 || r == 0x09 || r == 0x0D || r == 0x0A 680 } 681 682 func wordIn(q string, words ...string) bool { 683 q = strings.ToLower(q) 684 for _, word := range words { 685 if q == word { 686 return true 687 } 688 } 689 return false 690 } 691 692 // GetMd5 returns the MD5 checksum of fingerprint. 693 func GetMd5(fingerPrint string) string { 694 data := []byte(fingerPrint) 695 return fmt.Sprintf("%x", md5.Sum(data)) 696 } 697 698 // GetFingerprintOperation return fingerprint's operation 699 func GetFingerprintOperation(fingerprint string) string { 700 strList := strings.SplitN(fingerprint, " ", 2) 701 if len(strList) == 0 { 702 return "" 703 } 704 return strList[0] 705 }