modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_porter.c (about) 1 /* 2 ** 2006 September 30 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** Implementation of the full-text-search tokenizer that implements 13 ** a Porter stemmer. 14 */ 15 16 /* 17 ** The code in this file is only compiled if: 18 ** 19 ** * The FTS3 module is being built as an extension 20 ** (in which case SQLITE_CORE is not defined), or 21 ** 22 ** * The FTS3 module is being built into the core of 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). 24 */ 25 #include "fts3Int.h" 26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 27 28 #include <assert.h> 29 #include <stdlib.h> 30 #include <stdio.h> 31 #include <string.h> 32 33 #include "fts3_tokenizer.h" 34 35 /* 36 ** Class derived from sqlite3_tokenizer 37 */ 38 typedef struct porter_tokenizer { 39 sqlite3_tokenizer base; /* Base class */ 40 } porter_tokenizer; 41 42 /* 43 ** Class derived from sqlite3_tokenizer_cursor 44 */ 45 typedef struct porter_tokenizer_cursor { 46 sqlite3_tokenizer_cursor base; 47 const char *zInput; /* input we are tokenizing */ 48 int nInput; /* size of the input */ 49 int iOffset; /* current position in zInput */ 50 int iToken; /* index of next token to be returned */ 51 char *zToken; /* storage for current token */ 52 int nAllocated; /* space allocated to zToken buffer */ 53 } porter_tokenizer_cursor; 54 55 56 /* 57 ** Create a new tokenizer instance. 58 */ 59 static int porterCreate( 60 int argc, const char * const *argv, 61 sqlite3_tokenizer **ppTokenizer 62 ){ 63 porter_tokenizer *t; 64 65 UNUSED_PARAMETER(argc); 66 UNUSED_PARAMETER(argv); 67 68 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); 69 if( t==NULL ) return SQLITE_NOMEM; 70 memset(t, 0, sizeof(*t)); 71 *ppTokenizer = &t->base; 72 return SQLITE_OK; 73 } 74 75 /* 76 ** Destroy a tokenizer 77 */ 78 static int porterDestroy(sqlite3_tokenizer *pTokenizer){ 79 sqlite3_free(pTokenizer); 80 return SQLITE_OK; 81 } 82 83 /* 84 ** Prepare to begin tokenizing a particular string. The input 85 ** string to be tokenized is zInput[0..nInput-1]. A cursor 86 ** used to incrementally tokenize this string is returned in 87 ** *ppCursor. 88 */ 89 static int porterOpen( 90 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 91 const char *zInput, int nInput, /* String to be tokenized */ 92 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 93 ){ 94 porter_tokenizer_cursor *c; 95 96 UNUSED_PARAMETER(pTokenizer); 97 98 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); 99 if( c==NULL ) return SQLITE_NOMEM; 100 101 c->zInput = zInput; 102 if( zInput==0 ){ 103 c->nInput = 0; 104 }else if( nInput<0 ){ 105 c->nInput = (int)strlen(zInput); 106 }else{ 107 c->nInput = nInput; 108 } 109 c->iOffset = 0; /* start tokenizing at the beginning */ 110 c->iToken = 0; 111 c->zToken = NULL; /* no space allocated, yet. */ 112 c->nAllocated = 0; 113 114 *ppCursor = &c->base; 115 return SQLITE_OK; 116 } 117 118 /* 119 ** Close a tokenization cursor previously opened by a call to 120 ** porterOpen() above. 121 */ 122 static int porterClose(sqlite3_tokenizer_cursor *pCursor){ 123 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; 124 sqlite3_free(c->zToken); 125 sqlite3_free(c); 126 return SQLITE_OK; 127 } 128 /* 129 ** Vowel or consonant 130 */ 131 static const char cType[] = { 132 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 133 1, 1, 1, 2, 1 134 }; 135 136 /* 137 ** isConsonant() and isVowel() determine if their first character in 138 ** the string they point to is a consonant or a vowel, according 139 ** to Porter ruls. 140 ** 141 ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. 142 ** 'Y' is a consonant unless it follows another consonant, 143 ** in which case it is a vowel. 144 ** 145 ** In these routine, the letters are in reverse order. So the 'y' rule 146 ** is that 'y' is a consonant unless it is followed by another 147 ** consonent. 148 */ 149 static int isVowel(const char*); 150 static int isConsonant(const char *z){ 151 int j; 152 char x = *z; 153 if( x==0 ) return 0; 154 assert( x>='a' && x<='z' ); 155 j = cType[x-'a']; 156 if( j<2 ) return j; 157 return z[1]==0 || isVowel(z + 1); 158 } 159 static int isVowel(const char *z){ 160 int j; 161 char x = *z; 162 if( x==0 ) return 0; 163 assert( x>='a' && x<='z' ); 164 j = cType[x-'a']; 165 if( j<2 ) return 1-j; 166 return isConsonant(z + 1); 167 } 168 169 /* 170 ** Let any sequence of one or more vowels be represented by V and let 171 ** C be sequence of one or more consonants. Then every word can be 172 ** represented as: 173 ** 174 ** [C] (VC){m} [V] 175 ** 176 ** In prose: A word is an optional consonant followed by zero or 177 ** vowel-consonant pairs followed by an optional vowel. "m" is the 178 ** number of vowel consonant pairs. This routine computes the value 179 ** of m for the first i bytes of a word. 180 ** 181 ** Return true if the m-value for z is 1 or more. In other words, 182 ** return true if z contains at least one vowel that is followed 183 ** by a consonant. 184 ** 185 ** In this routine z[] is in reverse order. So we are really looking 186 ** for an instance of a consonant followed by a vowel. 187 */ 188 static int m_gt_0(const char *z){ 189 while( isVowel(z) ){ z++; } 190 if( *z==0 ) return 0; 191 while( isConsonant(z) ){ z++; } 192 return *z!=0; 193 } 194 195 /* Like mgt0 above except we are looking for a value of m which is 196 ** exactly 1 197 */ 198 static int m_eq_1(const char *z){ 199 while( isVowel(z) ){ z++; } 200 if( *z==0 ) return 0; 201 while( isConsonant(z) ){ z++; } 202 if( *z==0 ) return 0; 203 while( isVowel(z) ){ z++; } 204 if( *z==0 ) return 1; 205 while( isConsonant(z) ){ z++; } 206 return *z==0; 207 } 208 209 /* Like mgt0 above except we are looking for a value of m>1 instead 210 ** or m>0 211 */ 212 static int m_gt_1(const char *z){ 213 while( isVowel(z) ){ z++; } 214 if( *z==0 ) return 0; 215 while( isConsonant(z) ){ z++; } 216 if( *z==0 ) return 0; 217 while( isVowel(z) ){ z++; } 218 if( *z==0 ) return 0; 219 while( isConsonant(z) ){ z++; } 220 return *z!=0; 221 } 222 223 /* 224 ** Return TRUE if there is a vowel anywhere within z[0..n-1] 225 */ 226 static int hasVowel(const char *z){ 227 while( isConsonant(z) ){ z++; } 228 return *z!=0; 229 } 230 231 /* 232 ** Return TRUE if the word ends in a double consonant. 233 ** 234 ** The text is reversed here. So we are really looking at 235 ** the first two characters of z[]. 236 */ 237 static int doubleConsonant(const char *z){ 238 return isConsonant(z) && z[0]==z[1]; 239 } 240 241 /* 242 ** Return TRUE if the word ends with three letters which 243 ** are consonant-vowel-consonent and where the final consonant 244 ** is not 'w', 'x', or 'y'. 245 ** 246 ** The word is reversed here. So we are really checking the 247 ** first three letters and the first one cannot be in [wxy]. 248 */ 249 static int star_oh(const char *z){ 250 return 251 isConsonant(z) && 252 z[0]!='w' && z[0]!='x' && z[0]!='y' && 253 isVowel(z+1) && 254 isConsonant(z+2); 255 } 256 257 /* 258 ** If the word ends with zFrom and xCond() is true for the stem 259 ** of the word that preceeds the zFrom ending, then change the 260 ** ending to zTo. 261 ** 262 ** The input word *pz and zFrom are both in reverse order. zTo 263 ** is in normal order. 264 ** 265 ** Return TRUE if zFrom matches. Return FALSE if zFrom does not 266 ** match. Not that TRUE is returned even if xCond() fails and 267 ** no substitution occurs. 268 */ 269 static int stem( 270 char **pz, /* The word being stemmed (Reversed) */ 271 const char *zFrom, /* If the ending matches this... (Reversed) */ 272 const char *zTo, /* ... change the ending to this (not reversed) */ 273 int (*xCond)(const char*) /* Condition that must be true */ 274 ){ 275 char *z = *pz; 276 while( *zFrom && *zFrom==*z ){ z++; zFrom++; } 277 if( *zFrom!=0 ) return 0; 278 if( xCond && !xCond(z) ) return 1; 279 while( *zTo ){ 280 *(--z) = *(zTo++); 281 } 282 *pz = z; 283 return 1; 284 } 285 286 /* 287 ** This is the fallback stemmer used when the porter stemmer is 288 ** inappropriate. The input word is copied into the output with 289 ** US-ASCII case folding. If the input word is too long (more 290 ** than 20 bytes if it contains no digits or more than 6 bytes if 291 ** it contains digits) then word is truncated to 20 or 6 bytes 292 ** by taking 10 or 3 bytes from the beginning and end. 293 */ 294 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ 295 int i, mx, j; 296 int hasDigit = 0; 297 for(i=0; i<nIn; i++){ 298 char c = zIn[i]; 299 if( c>='A' && c<='Z' ){ 300 zOut[i] = c - 'A' + 'a'; 301 }else{ 302 if( c>='0' && c<='9' ) hasDigit = 1; 303 zOut[i] = c; 304 } 305 } 306 mx = hasDigit ? 3 : 10; 307 if( nIn>mx*2 ){ 308 for(j=mx, i=nIn-mx; i<nIn; i++, j++){ 309 zOut[j] = zOut[i]; 310 } 311 i = j; 312 } 313 zOut[i] = 0; 314 *pnOut = i; 315 } 316 317 318 /* 319 ** Stem the input word zIn[0..nIn-1]. Store the output in zOut. 320 ** zOut is at least big enough to hold nIn bytes. Write the actual 321 ** size of the output word (exclusive of the '\0' terminator) into *pnOut. 322 ** 323 ** Any upper-case characters in the US-ASCII character set ([A-Z]) 324 ** are converted to lower case. Upper-case UTF characters are 325 ** unchanged. 326 ** 327 ** Words that are longer than about 20 bytes are stemmed by retaining 328 ** a few bytes from the beginning and the end of the word. If the 329 ** word contains digits, 3 bytes are taken from the beginning and 330 ** 3 bytes from the end. For long words without digits, 10 bytes 331 ** are taken from each end. US-ASCII case folding still applies. 332 ** 333 ** If the input word contains not digits but does characters not 334 ** in [a-zA-Z] then no stemming is attempted and this routine just 335 ** copies the input into the input into the output with US-ASCII 336 ** case folding. 337 ** 338 ** Stemming never increases the length of the word. So there is 339 ** no chance of overflowing the zOut buffer. 340 */ 341 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ 342 int i, j; 343 char zReverse[28]; 344 char *z, *z2; 345 if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){ 346 /* The word is too big or too small for the porter stemmer. 347 ** Fallback to the copy stemmer */ 348 copy_stemmer(zIn, nIn, zOut, pnOut); 349 return; 350 } 351 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ 352 char c = zIn[i]; 353 if( c>='A' && c<='Z' ){ 354 zReverse[j] = c + 'a' - 'A'; 355 }else if( c>='a' && c<='z' ){ 356 zReverse[j] = c; 357 }else{ 358 /* The use of a character not in [a-zA-Z] means that we fallback 359 ** to the copy stemmer */ 360 copy_stemmer(zIn, nIn, zOut, pnOut); 361 return; 362 } 363 } 364 memset(&zReverse[sizeof(zReverse)-5], 0, 5); 365 z = &zReverse[j+1]; 366 367 368 /* Step 1a */ 369 if( z[0]=='s' ){ 370 if( 371 !stem(&z, "sess", "ss", 0) && 372 !stem(&z, "sei", "i", 0) && 373 !stem(&z, "ss", "ss", 0) 374 ){ 375 z++; 376 } 377 } 378 379 /* Step 1b */ 380 z2 = z; 381 if( stem(&z, "dee", "ee", m_gt_0) ){ 382 /* Do nothing. The work was all in the test */ 383 }else if( 384 (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) 385 && z!=z2 386 ){ 387 if( stem(&z, "ta", "ate", 0) || 388 stem(&z, "lb", "ble", 0) || 389 stem(&z, "zi", "ize", 0) ){ 390 /* Do nothing. The work was all in the test */ 391 }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){ 392 z++; 393 }else if( m_eq_1(z) && star_oh(z) ){ 394 *(--z) = 'e'; 395 } 396 } 397 398 /* Step 1c */ 399 if( z[0]=='y' && hasVowel(z+1) ){ 400 z[0] = 'i'; 401 } 402 403 /* Step 2 */ 404 switch( z[1] ){ 405 case 'a': 406 if( !stem(&z, "lanoita", "ate", m_gt_0) ){ 407 stem(&z, "lanoit", "tion", m_gt_0); 408 } 409 break; 410 case 'c': 411 if( !stem(&z, "icne", "ence", m_gt_0) ){ 412 stem(&z, "icna", "ance", m_gt_0); 413 } 414 break; 415 case 'e': 416 stem(&z, "rezi", "ize", m_gt_0); 417 break; 418 case 'g': 419 stem(&z, "igol", "log", m_gt_0); 420 break; 421 case 'l': 422 if( !stem(&z, "ilb", "ble", m_gt_0) 423 && !stem(&z, "illa", "al", m_gt_0) 424 && !stem(&z, "iltne", "ent", m_gt_0) 425 && !stem(&z, "ile", "e", m_gt_0) 426 ){ 427 stem(&z, "ilsuo", "ous", m_gt_0); 428 } 429 break; 430 case 'o': 431 if( !stem(&z, "noitazi", "ize", m_gt_0) 432 && !stem(&z, "noita", "ate", m_gt_0) 433 ){ 434 stem(&z, "rota", "ate", m_gt_0); 435 } 436 break; 437 case 's': 438 if( !stem(&z, "msila", "al", m_gt_0) 439 && !stem(&z, "ssenevi", "ive", m_gt_0) 440 && !stem(&z, "ssenluf", "ful", m_gt_0) 441 ){ 442 stem(&z, "ssensuo", "ous", m_gt_0); 443 } 444 break; 445 case 't': 446 if( !stem(&z, "itila", "al", m_gt_0) 447 && !stem(&z, "itivi", "ive", m_gt_0) 448 ){ 449 stem(&z, "itilib", "ble", m_gt_0); 450 } 451 break; 452 } 453 454 /* Step 3 */ 455 switch( z[0] ){ 456 case 'e': 457 if( !stem(&z, "etaci", "ic", m_gt_0) 458 && !stem(&z, "evita", "", m_gt_0) 459 ){ 460 stem(&z, "ezila", "al", m_gt_0); 461 } 462 break; 463 case 'i': 464 stem(&z, "itici", "ic", m_gt_0); 465 break; 466 case 'l': 467 if( !stem(&z, "laci", "ic", m_gt_0) ){ 468 stem(&z, "luf", "", m_gt_0); 469 } 470 break; 471 case 's': 472 stem(&z, "ssen", "", m_gt_0); 473 break; 474 } 475 476 /* Step 4 */ 477 switch( z[1] ){ 478 case 'a': 479 if( z[0]=='l' && m_gt_1(z+2) ){ 480 z += 2; 481 } 482 break; 483 case 'c': 484 if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){ 485 z += 4; 486 } 487 break; 488 case 'e': 489 if( z[0]=='r' && m_gt_1(z+2) ){ 490 z += 2; 491 } 492 break; 493 case 'i': 494 if( z[0]=='c' && m_gt_1(z+2) ){ 495 z += 2; 496 } 497 break; 498 case 'l': 499 if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){ 500 z += 4; 501 } 502 break; 503 case 'n': 504 if( z[0]=='t' ){ 505 if( z[2]=='a' ){ 506 if( m_gt_1(z+3) ){ 507 z += 3; 508 } 509 }else if( z[2]=='e' ){ 510 if( !stem(&z, "tneme", "", m_gt_1) 511 && !stem(&z, "tnem", "", m_gt_1) 512 ){ 513 stem(&z, "tne", "", m_gt_1); 514 } 515 } 516 } 517 break; 518 case 'o': 519 if( z[0]=='u' ){ 520 if( m_gt_1(z+2) ){ 521 z += 2; 522 } 523 }else if( z[3]=='s' || z[3]=='t' ){ 524 stem(&z, "noi", "", m_gt_1); 525 } 526 break; 527 case 's': 528 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ 529 z += 3; 530 } 531 break; 532 case 't': 533 if( !stem(&z, "eta", "", m_gt_1) ){ 534 stem(&z, "iti", "", m_gt_1); 535 } 536 break; 537 case 'u': 538 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ 539 z += 3; 540 } 541 break; 542 case 'v': 543 case 'z': 544 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ 545 z += 3; 546 } 547 break; 548 } 549 550 /* Step 5a */ 551 if( z[0]=='e' ){ 552 if( m_gt_1(z+1) ){ 553 z++; 554 }else if( m_eq_1(z+1) && !star_oh(z+1) ){ 555 z++; 556 } 557 } 558 559 /* Step 5b */ 560 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ 561 z++; 562 } 563 564 /* z[] is now the stemmed word in reverse order. Flip it back 565 ** around into forward order and return. 566 */ 567 *pnOut = i = (int)strlen(z); 568 zOut[i] = 0; 569 while( *z ){ 570 zOut[--i] = *(z++); 571 } 572 } 573 574 /* 575 ** Characters that can be part of a token. We assume any character 576 ** whose value is greater than 0x80 (any UTF character) can be 577 ** part of a token. In other words, delimiters all must have 578 ** values of 0x7f or lower. 579 */ 580 static const char porterIdChar[] = { 581 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 582 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 583 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 584 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ 585 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 586 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ 587 }; 588 #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30])) 589 590 /* 591 ** Extract the next token from a tokenization cursor. The cursor must 592 ** have been opened by a prior call to porterOpen(). 593 */ 594 static int porterNext( 595 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ 596 const char **pzToken, /* OUT: *pzToken is the token text */ 597 int *pnBytes, /* OUT: Number of bytes in token */ 598 int *piStartOffset, /* OUT: Starting offset of token */ 599 int *piEndOffset, /* OUT: Ending offset of token */ 600 int *piPosition /* OUT: Position integer of token */ 601 ){ 602 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; 603 const char *z = c->zInput; 604 605 while( c->iOffset<c->nInput ){ 606 int iStartOffset, ch; 607 608 /* Scan past delimiter characters */ 609 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){ 610 c->iOffset++; 611 } 612 613 /* Count non-delimiter characters. */ 614 iStartOffset = c->iOffset; 615 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ 616 c->iOffset++; 617 } 618 619 if( c->iOffset>iStartOffset ){ 620 int n = c->iOffset-iStartOffset; 621 if( n>c->nAllocated ){ 622 char *pNew; 623 c->nAllocated = n+20; 624 pNew = sqlite3_realloc(c->zToken, c->nAllocated); 625 if( !pNew ) return SQLITE_NOMEM; 626 c->zToken = pNew; 627 } 628 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); 629 *pzToken = c->zToken; 630 *piStartOffset = iStartOffset; 631 *piEndOffset = c->iOffset; 632 *piPosition = c->iToken++; 633 return SQLITE_OK; 634 } 635 } 636 return SQLITE_DONE; 637 } 638 639 /* 640 ** The set of routines that implement the porter-stemmer tokenizer 641 */ 642 static const sqlite3_tokenizer_module porterTokenizerModule = { 643 0, 644 porterCreate, 645 porterDestroy, 646 porterOpen, 647 porterClose, 648 porterNext, 649 0 650 }; 651 652 /* 653 ** Allocate a new porter tokenizer. Return a pointer to the new 654 ** tokenizer in *ppModule 655 */ 656 void sqlite3Fts3PorterTokenizerModule( 657 sqlite3_tokenizer_module const**ppModule 658 ){ 659 *ppModule = &porterTokenizerModule; 660 } 661 662 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */