modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts1/fulltext.c (about) 1 /* The author disclaims copyright to this source code. 2 * 3 * This is an SQLite module implementing full-text search. 4 */ 5 6 #include <assert.h> 7 #if !defined(__APPLE__) 8 #include <malloc.h> 9 #else 10 #include <stdlib.h> 11 #endif 12 #include <stdio.h> 13 #include <string.h> 14 #include <ctype.h> 15 16 #include "fulltext.h" 17 #include "ft_hash.h" 18 #include "tokenizer.h" 19 #include "sqlite3.h" 20 #include "sqlite3ext.h" 21 SQLITE_EXTENSION_INIT1 22 23 /* utility functions */ 24 25 /* We encode variable-length integers in little-endian order using seven bits 26 * per byte as follows: 27 ** 28 ** KEY: 29 ** A = 0xxxxxxx 7 bits of data and one flag bit 30 ** B = 1xxxxxxx 7 bits of data and one flag bit 31 ** 32 ** 7 bits - A 33 ** 14 bits - BA 34 ** 21 bits - BBA 35 ** and so on. 36 */ 37 38 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ 39 #define VARINT_MAX 10 40 41 /* Write a 64-bit variable-length integer to memory starting at p[0]. 42 * The length of data written will be between 1 and VARINT_MAX bytes. 43 * The number of bytes written is returned. */ 44 static int putVarint(char *p, sqlite_int64 v){ 45 unsigned char *q = (unsigned char *) p; 46 sqlite_uint64 vu = v; 47 do{ 48 *q++ = (unsigned char) ((vu & 0x7f) | 0x80); 49 vu >>= 7; 50 }while( vu!=0 ); 51 q[-1] &= 0x7f; /* turn off high bit in final byte */ 52 assert( q - (unsigned char *)p <= VARINT_MAX ); 53 return (int) (q - (unsigned char *)p); 54 } 55 56 /* Read a 64-bit variable-length integer from memory starting at p[0]. 57 * Return the number of bytes read, or 0 on error. 58 * The value is stored in *v. 
*/ 59 static int getVarint(const char *p, sqlite_int64 *v){ 60 const unsigned char *q = (const unsigned char *) p; 61 sqlite_uint64 x = 0, y = 1; 62 while( (*q & 0x80) == 0x80 ){ 63 x += y * (*q++ & 0x7f); 64 y <<= 7; 65 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ 66 assert( 0 ); 67 return 0; 68 } 69 } 70 x += y * (*q++); 71 *v = (sqlite_int64) x; 72 return (int) (q - (unsigned char *)p); 73 } 74 75 static int getVarint32(const char *p, int *pi){ 76 sqlite_int64 i; 77 int ret = getVarint(p, &i); 78 *pi = (int) i; 79 assert( *pi==i ); 80 return ret; 81 } 82 83 /*** Document lists *** 84 * 85 * A document list holds a sorted list of varint-encoded document IDs. 86 * 87 * A doclist with type DL_POSITIONS_OFFSETS is stored like this: 88 * 89 * array { 90 * varint docid; 91 * array { 92 * varint position; (delta from previous position plus 1, or 0 for end) 93 * varint startOffset; (delta from previous startOffset) 94 * varint endOffset; (delta from startOffset) 95 * } 96 * } 97 * 98 * Here, array { X } means zero or more occurrences of X, adjacent in memory. 99 * 100 * A doclist with type DL_POSITIONS is like the above, but holds only docids 101 * and positions without offset information. 102 * 103 * A doclist with type DL_DOCIDS is like the above, but holds only docids 104 * without positions or offset information. 105 * 106 * On disk, every document list has positions and offsets, so we don't bother 107 * to serialize a doclist's type. 108 * 109 * We don't yet delta-encode document IDs; doing so will probably be a 110 * modest win. 111 * 112 * NOTE(shess) I've thought of a slightly (1%) better offset encoding. 113 * After the first offset, estimate the next offset by using the 114 * current token position and the previous token position and offset, 115 * offset to handle some variance. So the estimate would be 116 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded 117 * as normal. 
Offsets more than 64 chars from the estimate are 118 * encoded as the delta to the previous start offset + 128. An 119 * additional tiny increment can be gained by using the end offset of 120 * the previous token to make the estimate a tiny bit more precise. 121 */ 122 123 typedef enum DocListType { 124 DL_DOCIDS, /* docids only */ 125 DL_POSITIONS, /* docids + positions */ 126 DL_POSITIONS_OFFSETS /* docids + positions + offsets */ 127 } DocListType; 128 129 typedef struct DocList { 130 char *pData; 131 int nData; 132 DocListType iType; 133 int iLastPos; /* the last position written */ 134 int iLastOffset; /* the last start offset written */ 135 } DocList; 136 137 /* Initialize a new DocList to hold the given data. */ 138 static void docListInit(DocList *d, DocListType iType, 139 const char *pData, int nData){ 140 d->nData = nData; 141 if( nData>0 ){ 142 d->pData = malloc(nData); 143 memcpy(d->pData, pData, nData); 144 } else { 145 d->pData = NULL; 146 } 147 d->iType = iType; 148 d->iLastPos = 0; 149 d->iLastOffset = 0; 150 } 151 152 /* Create a new dynamically-allocated DocList. */ 153 static DocList *docListNew(DocListType iType){ 154 DocList *d = (DocList *) malloc(sizeof(DocList)); 155 docListInit(d, iType, 0, 0); 156 return d; 157 } 158 159 static void docListDestroy(DocList *d){ 160 free(d->pData); 161 #ifndef NDEBUG 162 memset(d, 0x55, sizeof(*d)); 163 #endif 164 } 165 166 static void docListDelete(DocList *d){ 167 docListDestroy(d); 168 free(d); 169 } 170 171 static char *docListEnd(DocList *d){ 172 return d->pData + d->nData; 173 } 174 175 /* Append a varint to a DocList's data. 
*/ 176 static void appendVarint(DocList *d, sqlite_int64 i){ 177 char c[VARINT_MAX]; 178 int n = putVarint(c, i); 179 d->pData = realloc(d->pData, d->nData + n); 180 memcpy(d->pData + d->nData, c, n); 181 d->nData += n; 182 } 183 184 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ 185 appendVarint(d, iDocid); 186 d->iLastPos = 0; 187 } 188 189 /* Add a position to the last position list in a doclist. */ 190 static void docListAddPos(DocList *d, int iPos){ 191 assert( d->iType>=DL_POSITIONS ); 192 appendVarint(d, iPos-d->iLastPos+1); 193 d->iLastPos = iPos; 194 } 195 196 static void docListAddPosOffset(DocList *d, int iPos, 197 int iStartOffset, int iEndOffset){ 198 assert( d->iType==DL_POSITIONS_OFFSETS ); 199 docListAddPos(d, iPos); 200 appendVarint(d, iStartOffset-d->iLastOffset); 201 d->iLastOffset = iStartOffset; 202 appendVarint(d, iEndOffset-iStartOffset); 203 } 204 205 /* Terminate the last position list in the given doclist. */ 206 static void docListAddEndPos(DocList *d){ 207 appendVarint(d, 0); 208 } 209 210 typedef struct DocListReader { 211 DocList *pDoclist; 212 char *p; 213 int iLastPos; /* the last position read */ 214 } DocListReader; 215 216 static void readerInit(DocListReader *r, DocList *pDoclist){ 217 r->pDoclist = pDoclist; 218 if( pDoclist!=NULL ){ 219 r->p = pDoclist->pData; 220 } 221 r->iLastPos = 0; 222 } 223 224 static int readerAtEnd(DocListReader *pReader){ 225 return pReader->p >= docListEnd(pReader->pDoclist); 226 } 227 228 /* Peek at the next docid without advancing the read pointer. */ 229 static sqlite_int64 peekDocid(DocListReader *pReader){ 230 sqlite_int64 ret; 231 assert( !readerAtEnd(pReader) ); 232 getVarint(pReader->p, &ret); 233 return ret; 234 } 235 236 /* Read the next docid. 
*/ 237 static sqlite_int64 readDocid(DocListReader *pReader){ 238 sqlite_int64 ret; 239 assert( !readerAtEnd(pReader) ); 240 pReader->p += getVarint(pReader->p, &ret); 241 pReader->iLastPos = 0; 242 return ret; 243 } 244 245 /* Read the next position from a position list. 246 * Returns the position, or -1 at the end of the list. */ 247 static int readPosition(DocListReader *pReader){ 248 int i; 249 int iType = pReader->pDoclist->iType; 250 assert( iType>=DL_POSITIONS ); 251 assert( !readerAtEnd(pReader) ); 252 253 pReader->p += getVarint32(pReader->p, &i); 254 if( i==0 ){ 255 pReader->iLastPos = -1; 256 return -1; 257 } 258 pReader->iLastPos += ((int) i)-1; 259 if( iType>=DL_POSITIONS_OFFSETS ){ 260 /* Skip over offsets, ignoring them for now. */ 261 int iStart, iEnd; 262 pReader->p += getVarint32(pReader->p, &iStart); 263 pReader->p += getVarint32(pReader->p, &iEnd); 264 } 265 return pReader->iLastPos; 266 } 267 268 /* Skip past the end of a position list. */ 269 static void skipPositionList(DocListReader *pReader){ 270 while( readPosition(pReader)!=-1 ) 271 ; 272 } 273 274 /* Skip over a docid, including its position list if the doclist has 275 * positions. */ 276 static void skipDocument(DocListReader *pReader){ 277 readDocid(pReader); 278 if( pReader->pDoclist->iType >= DL_POSITIONS ){ 279 skipPositionList(pReader); 280 } 281 } 282 283 static sqlite_int64 firstDocid(DocList *d){ 284 DocListReader r; 285 readerInit(&r, d); 286 return readDocid(&r); 287 } 288 289 /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; 290 * otherwise pUpdate, which must contain only the single docid [iDocid], is 291 * inserted (if not present) or updated (if already present). 
*/ 292 static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ 293 int modified = 0; 294 DocListReader reader; 295 char *p; 296 297 if( pUpdate!=NULL ){ 298 assert( d->iType==pUpdate->iType); 299 assert( iDocid==firstDocid(pUpdate) ); 300 } 301 302 readerInit(&reader, d); 303 while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ 304 skipDocument(&reader); 305 } 306 307 p = reader.p; 308 /* Delete if there is a matching element. */ 309 if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ 310 skipDocument(&reader); 311 memmove(p, reader.p, docListEnd(d) - reader.p); 312 d->nData -= (reader.p - p); 313 modified = 1; 314 } 315 316 /* Insert if indicated. */ 317 if( pUpdate!=NULL ){ 318 int iDoclist = p-d->pData; 319 docListAddEndPos(pUpdate); 320 321 d->pData = realloc(d->pData, d->nData+pUpdate->nData); 322 p = d->pData + iDoclist; 323 324 memmove(p+pUpdate->nData, p, docListEnd(d) - p); 325 memcpy(p, pUpdate->pData, pUpdate->nData); 326 d->nData += pUpdate->nData; 327 modified = 1; 328 } 329 330 return modified; 331 } 332 333 /* Split the second half of doclist d into a separate doclist d2. Returns 1 334 * if successful, or 0 if d contains a single document and hence can't be 335 * split. */ 336 static int docListSplit(DocList *d, DocList *d2){ 337 const char *pSplitPoint = d->pData + d->nData / 2; 338 DocListReader reader; 339 340 readerInit(&reader, d); 341 while( reader.p<pSplitPoint ){ 342 skipDocument(&reader); 343 } 344 if( readerAtEnd(&reader) ) return 0; 345 docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); 346 d->nData = reader.p - d->pData; 347 d->pData = realloc(d->pData, d->nData); 348 return 1; 349 } 350 351 /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked 352 * on-disk doclist, resulting in another in-memory DocList [out]. [in] 353 * and [out] may or may not store position information according to the 354 * caller's wishes. The on-disk doclist always comes with positions. 
355 * 356 * The caller must read each chunk of the on-disk doclist in succession and 357 * pass it to mergeBlock(). 358 * 359 * If [in] has positions, then the merge output contains only documents with 360 * matching positions in the two input doclists. If [in] does not have 361 * positions, then the merge output contains all documents common to the two 362 * input doclists. 363 * 364 * If [in] is NULL, then the on-disk doclist is copied to [out] directly. 365 * 366 * A merge is performed using an integer [iOffset] provided by the caller. 367 * [iOffset] is subtracted from each position in the on-disk doclist for the 368 * purpose of position comparison; this is helpful in implementing phrase 369 * searches. 370 * 371 * A DocListMerge is not yet able to propagate offsets through query 372 * processing; we should add that capability soon. 373 */ 374 typedef struct DocListMerge { 375 DocListReader in; 376 DocList *pOut; 377 int iOffset; 378 } DocListMerge; 379 380 static void mergeInit(DocListMerge *m, 381 DocList *pIn, int iOffset, DocList *pOut){ 382 readerInit(&m->in, pIn); 383 m->pOut = pOut; 384 m->iOffset = iOffset; 385 386 /* can't handle offsets yet */ 387 assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); 388 assert( pOut->iType <= DL_POSITIONS ); 389 } 390 391 /* A helper function for mergeBlock(), below. Merge the position lists 392 * pointed to by m->in and pBlockReader. 393 * If the merge matches, write [iDocid] to m->pOut; if m->pOut 394 * has positions then write all matching positions as well. 
*/ 395 static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, 396 DocListReader *pBlockReader){ 397 int block_pos = readPosition(pBlockReader); 398 int in_pos = readPosition(&m->in); 399 int match = 0; 400 while( block_pos!=-1 || in_pos!=-1 ){ 401 if( block_pos-m->iOffset==in_pos ){ 402 if( !match ){ 403 docListAddDocid(m->pOut, iDocid); 404 match = 1; 405 } 406 if( m->pOut->iType >= DL_POSITIONS ){ 407 docListAddPos(m->pOut, in_pos); 408 } 409 block_pos = readPosition(pBlockReader); 410 in_pos = readPosition(&m->in); 411 } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ 412 block_pos = readPosition(pBlockReader); 413 } else { 414 in_pos = readPosition(&m->in); 415 } 416 } 417 if( m->pOut->iType >= DL_POSITIONS && match ){ 418 docListAddEndPos(m->pOut); 419 } 420 } 421 422 /* Merge one block of an on-disk doclist into a DocListMerge. */ 423 static void mergeBlock(DocListMerge *m, DocList *pBlock){ 424 DocListReader blockReader; 425 assert( pBlock->iType >= DL_POSITIONS ); 426 readerInit(&blockReader, pBlock); 427 while( !readerAtEnd(&blockReader) ){ 428 sqlite_int64 iDocid = readDocid(&blockReader); 429 if( m->in.pDoclist!=NULL ){ 430 while( 1 ){ 431 if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ 432 if( peekDocid(&m->in)>=iDocid ) break; 433 skipDocument(&m->in); 434 } 435 if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ 436 skipPositionList(&blockReader); /* skip this docid in the block */ 437 continue; 438 } 439 readDocid(&m->in); 440 } 441 /* We have a document match. */ 442 if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ 443 /* We don't need to do a poslist merge. */ 444 docListAddDocid(m->pOut, iDocid); 445 if( m->pOut->iType >= DL_POSITIONS ){ 446 /* Copy all positions to the output doclist. 
*/ 447 while( 1 ){ 448 int pos = readPosition(&blockReader); 449 if( pos==-1 ) break; 450 docListAddPos(m->pOut, pos); 451 } 452 docListAddEndPos(m->pOut); 453 } else skipPositionList(&blockReader); 454 continue; 455 } 456 mergePosList(m, iDocid, &blockReader); 457 } 458 } 459 460 static char *string_dup_n(const char *s, int n){ 461 char *str = malloc(n + 1); 462 memcpy(str, s, n); 463 str[n] = '\0'; 464 return str; 465 } 466 467 /* Duplicate a string; the caller must free() the returned string. 468 * (We don't use strdup() since it's not part of the standard C library and 469 * may not be available everywhere.) */ 470 static char *string_dup(const char *s){ 471 return string_dup_n(s, strlen(s)); 472 } 473 474 /* Format a string, replacing each occurrence of the % character with 475 * zName. This may be more convenient than sqlite_mprintf() 476 * when one string is used repeatedly in a format string. 477 * The caller must free() the returned string. */ 478 static char *string_format(const char *zFormat, const char *zName){ 479 const char *p; 480 size_t len = 0; 481 size_t nName = strlen(zName); 482 char *result; 483 char *r; 484 485 /* first compute length needed */ 486 for(p = zFormat ; *p ; ++p){ 487 len += (*p=='%' ? 
nName : 1); 488 } 489 len += 1; /* for null terminator */ 490 491 r = result = malloc(len); 492 for(p = zFormat; *p; ++p){ 493 if( *p=='%' ){ 494 memcpy(r, zName, nName); 495 r += nName; 496 } else { 497 *r++ = *p; 498 } 499 } 500 *r++ = '\0'; 501 assert( r == result + len ); 502 return result; 503 } 504 505 static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ 506 char *zCommand = string_format(zFormat, zName); 507 int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); 508 free(zCommand); 509 return rc; 510 } 511 512 static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, 513 const char *zFormat){ 514 char *zCommand = string_format(zFormat, zName); 515 int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); 516 free(zCommand); 517 return rc; 518 } 519 520 /* end utility functions */ 521 522 #define QUERY_GENERIC 0 523 #define QUERY_FULLTEXT 1 524 525 #define CHUNK_MAX 1024 526 527 typedef enum fulltext_statement { 528 CONTENT_INSERT_STMT, 529 CONTENT_SELECT_STMT, 530 CONTENT_DELETE_STMT, 531 532 TERM_SELECT_STMT, 533 TERM_CHUNK_SELECT_STMT, 534 TERM_INSERT_STMT, 535 TERM_UPDATE_STMT, 536 TERM_DELETE_STMT, 537 538 MAX_STMT /* Always at end! */ 539 } fulltext_statement; 540 541 /* These must exactly match the enum above. */ 542 /* TODO(adam): Is there some risk that a statement (in particular, 543 ** pTermSelectStmt) will be used in two cursors at once, e.g. if a 544 ** query joins a virtual table to itself? If so perhaps we should 545 ** move some of these to the cursor object. 546 */ 547 static const char *fulltext_zStatement[MAX_STMT] = { 548 /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", 549 /* CONTENT_SELECT */ "select content from %_content where rowid = ?", 550 /* CONTENT_DELETE */ "delete from %_content where rowid = ?", 551 552 /* TERM_SELECT */ 553 "select rowid, doclist from %_term where term = ? 
and first = ?", 554 /* TERM_CHUNK_SELECT */ 555 "select max(first) from %_term where term = ? and first <= ?", 556 /* TERM_INSERT */ 557 "insert into %_term (term, first, doclist) values (?, ?, ?)", 558 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", 559 /* TERM_DELETE */ "delete from %_term where rowid = ?", 560 }; 561 562 typedef struct fulltext_vtab { 563 sqlite3_vtab base; 564 sqlite3 *db; 565 const char *zName; /* virtual table name */ 566 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ 567 568 /* Precompiled statements which we keep as long as the table is 569 ** open. 570 */ 571 sqlite3_stmt *pFulltextStatements[MAX_STMT]; 572 } fulltext_vtab; 573 574 typedef struct fulltext_cursor { 575 sqlite3_vtab_cursor base; 576 int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ 577 578 sqlite3_stmt *pStmt; 579 580 int eof; 581 582 /* The following is used only when iCursorType == QUERY_FULLTEXT. */ 583 DocListReader result; 584 } fulltext_cursor; 585 586 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ 587 return (fulltext_vtab *) c->base.pVtab; 588 } 589 590 static sqlite3_module fulltextModule; /* forward declaration */ 591 592 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. 593 ** If the indicated statement has never been prepared, it is prepared 594 ** and cached, otherwise the cached version is reset. 
595 */ 596 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, 597 sqlite3_stmt **ppStmt){ 598 assert( iStmt<MAX_STMT ); 599 if( v->pFulltextStatements[iStmt]==NULL ){ 600 int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], 601 fulltext_zStatement[iStmt]); 602 if( rc!=SQLITE_OK ) return rc; 603 } else { 604 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); 605 if( rc!=SQLITE_OK ) return rc; 606 } 607 608 *ppStmt = v->pFulltextStatements[iStmt]; 609 return SQLITE_OK; 610 } 611 612 /* Step the indicated statement, handling errors SQLITE_BUSY (by 613 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring 614 ** bindings to the new statement). 615 ** TODO(adam): We should extend this function so that it can work with 616 ** statements declared locally, not only globally cached statements. 617 */ 618 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, 619 sqlite3_stmt **ppStmt){ 620 int rc; 621 sqlite3_stmt *s = *ppStmt; 622 assert( iStmt<MAX_STMT ); 623 assert( s==v->pFulltextStatements[iStmt] ); 624 625 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ 626 sqlite3_stmt *pNewStmt; 627 628 if( rc==SQLITE_BUSY ) continue; 629 if( rc!=SQLITE_ERROR ) return rc; 630 631 rc = sqlite3_reset(s); 632 if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; 633 634 v->pFulltextStatements[iStmt] = NULL; /* Still in s */ 635 rc = sql_get_statement(v, iStmt, &pNewStmt); 636 if( rc!=SQLITE_OK ) goto err; 637 *ppStmt = pNewStmt; 638 639 rc = sqlite3_transfer_bindings(s, pNewStmt); 640 if( rc!=SQLITE_OK ) goto err; 641 642 rc = sqlite3_finalize(s); 643 if( rc!=SQLITE_OK ) return rc; 644 s = pNewStmt; 645 } 646 return rc; 647 648 err: 649 sqlite3_finalize(s); 650 return rc; 651 } 652 653 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. 654 ** Useful for statements like UPDATE, where we expect no results. 
655 */ 656 static int sql_single_step_statement(fulltext_vtab *v, 657 fulltext_statement iStmt, 658 sqlite3_stmt **ppStmt){ 659 int rc = sql_step_statement(v, iStmt, ppStmt); 660 return (rc==SQLITE_DONE) ? SQLITE_OK : rc; 661 } 662 663 /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ 664 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, 665 const char *zContent, int nContent){ 666 sqlite3_stmt *s; 667 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); 668 if( rc!=SQLITE_OK ) return rc; 669 670 rc = sqlite3_bind_value(s, 1, rowid); 671 if( rc!=SQLITE_OK ) return rc; 672 673 rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); 674 if( rc!=SQLITE_OK ) return rc; 675 676 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); 677 } 678 679 /* select content from %_content where rowid = [iRow] 680 * The caller must delete the returned string. */ 681 static int content_select(fulltext_vtab *v, sqlite_int64 iRow, 682 char **pzContent){ 683 sqlite3_stmt *s; 684 int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); 685 if( rc!=SQLITE_OK ) return rc; 686 687 rc = sqlite3_bind_int64(s, 1, iRow); 688 if( rc!=SQLITE_OK ) return rc; 689 690 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); 691 if( rc!=SQLITE_ROW ) return rc; 692 693 *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); 694 695 /* We expect only one row. We must execute another sqlite3_step() 696 * to complete the iteration; otherwise the table will remain locked. 
*/ 697 rc = sqlite3_step(s); 698 if( rc==SQLITE_DONE ) return SQLITE_OK; 699 700 free(*pzContent); 701 return rc; 702 } 703 704 /* delete from %_content where rowid = [iRow ] */ 705 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ 706 sqlite3_stmt *s; 707 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); 708 if( rc!=SQLITE_OK ) return rc; 709 710 rc = sqlite3_bind_int64(s, 1, iRow); 711 if( rc!=SQLITE_OK ) return rc; 712 713 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); 714 } 715 716 /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] 717 * If found, returns SQLITE_OK; the caller must free the returned doclist. 718 * If no rows found, returns SQLITE_ERROR. */ 719 static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, 720 sqlite_int64 iFirst, 721 sqlite_int64 *rowid, 722 DocList *out){ 723 sqlite3_stmt *s; 724 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); 725 if( rc!=SQLITE_OK ) return rc; 726 727 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); 728 if( rc!=SQLITE_OK ) return rc; 729 730 rc = sqlite3_bind_int64(s, 2, iFirst); 731 if( rc!=SQLITE_OK ) return rc; 732 733 rc = sql_step_statement(v, TERM_SELECT_STMT, &s); 734 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 735 736 *rowid = sqlite3_column_int64(s, 0); 737 docListInit(out, DL_POSITIONS_OFFSETS, 738 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); 739 740 /* We expect only one row. We must execute another sqlite3_step() 741 * to complete the iteration; otherwise the table will remain locked. */ 742 rc = sqlite3_step(s); 743 return rc==SQLITE_DONE ? SQLITE_OK : rc; 744 } 745 746 /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] 747 * If found, returns SQLITE_ROW and result in *piResult; if the query returns 748 * NULL (meaning no row found) returns SQLITE_DONE. 
749 */ 750 static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, 751 sqlite_int64 iFirst, sqlite_int64 *piResult){ 752 sqlite3_stmt *s; 753 int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); 754 if( rc!=SQLITE_OK ) return rc; 755 756 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); 757 if( rc!=SQLITE_OK ) return rc; 758 759 rc = sqlite3_bind_int64(s, 2, iFirst); 760 if( rc!=SQLITE_OK ) return rc; 761 762 rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); 763 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 764 765 switch( sqlite3_column_type(s, 0) ){ 766 case SQLITE_NULL: 767 rc = SQLITE_DONE; 768 break; 769 case SQLITE_INTEGER: 770 *piResult = sqlite3_column_int64(s, 0); 771 break; 772 default: 773 return SQLITE_ERROR; 774 } 775 /* We expect only one row. We must execute another sqlite3_step() 776 * to complete the iteration; otherwise the table will remain locked. */ 777 if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; 778 return rc; 779 } 780 781 /* insert into %_term (term, first, doclist) 782 values ([zTerm], [iFirst], [doclist]) */ 783 static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, 784 sqlite_int64 iFirst, DocList *doclist){ 785 sqlite3_stmt *s; 786 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); 787 if( rc!=SQLITE_OK ) return rc; 788 789 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); 790 if( rc!=SQLITE_OK ) return rc; 791 792 rc = sqlite3_bind_int64(s, 2, iFirst); 793 if( rc!=SQLITE_OK ) return rc; 794 795 rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); 796 if( rc!=SQLITE_OK ) return rc; 797 798 return sql_single_step_statement(v, TERM_INSERT_STMT, &s); 799 } 800 801 /* update %_term set doclist = [doclist] where rowid = [rowid] */ 802 static int term_update(fulltext_vtab *v, sqlite_int64 rowid, 803 DocList *doclist){ 804 sqlite3_stmt *s; 805 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); 806 if( rc!=SQLITE_OK ) 
return rc; 807 808 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, 809 SQLITE_STATIC); 810 if( rc!=SQLITE_OK ) return rc; 811 812 rc = sqlite3_bind_int64(s, 2, rowid); 813 if( rc!=SQLITE_OK ) return rc; 814 815 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); 816 } 817 818 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ 819 sqlite3_stmt *s; 820 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); 821 if( rc!=SQLITE_OK ) return rc; 822 823 rc = sqlite3_bind_int64(s, 1, rowid); 824 if( rc!=SQLITE_OK ) return rc; 825 826 return sql_single_step_statement(v, TERM_DELETE_STMT, &s); 827 } 828 829 static void fulltext_vtab_destroy(fulltext_vtab *v){ 830 int iStmt; 831 832 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ 833 if( v->pFulltextStatements[iStmt]!=NULL ){ 834 sqlite3_finalize(v->pFulltextStatements[iStmt]); 835 v->pFulltextStatements[iStmt] = NULL; 836 } 837 } 838 839 if( v->pTokenizer!=NULL ){ 840 v->pTokenizer->pModule->xDestroy(v->pTokenizer); 841 v->pTokenizer = NULL; 842 } 843 844 free((void *) v->zName); 845 free(v); 846 } 847 848 /* Current interface: 849 ** argv[0] - module name 850 ** argv[1] - database name 851 ** argv[2] - table name 852 ** argv[3] - tokenizer name (optional, a sensible default is provided) 853 ** argv[4..] - passed to tokenizer (optional based on tokenizer) 854 **/ 855 static int fulltextConnect( 856 sqlite3 *db, 857 void *pAux, 858 int argc, 859 const char * const *argv, 860 sqlite3_vtab **ppVTab, 861 char **pzErr 862 ){ 863 int rc; 864 fulltext_vtab *v; 865 sqlite3_tokenizer_module *m = NULL; 866 867 assert( argc>=3 ); 868 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); 869 /* sqlite will initialize v->base */ 870 v->db = db; 871 v->zName = string_dup(argv[2]); 872 v->pTokenizer = NULL; 873 874 if( argc==3 ){ 875 get_simple_tokenizer_module(&m); 876 } else { 877 /* TODO(shess) For now, add new tokenizers as else if clauses. 
*/ 878 if( !strcmp(argv[3], "simple") ){ 879 get_simple_tokenizer_module(&m); 880 } else { 881 assert( "unrecognized tokenizer"==NULL ); 882 } 883 } 884 885 /* TODO(shess) Since tokenization impacts the index, the parameters 886 ** to the tokenizer need to be identical when a persistent virtual 887 ** table is re-created. One solution would be a meta-table to track 888 ** such information in the database. Then we could verify that the 889 ** information is identical on subsequent creates. 890 */ 891 /* TODO(shess) Why isn't argv already (const char **)? */ 892 rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); 893 if( rc!=SQLITE_OK ) return rc; 894 v->pTokenizer->pModule = m; 895 896 /* TODO: verify the existence of backing tables foo_content, foo_term */ 897 898 rc = sqlite3_declare_vtab(db, "create table x(content text)"); 899 if( rc!=SQLITE_OK ) return rc; 900 901 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); 902 903 *ppVTab = &v->base; 904 return SQLITE_OK; 905 } 906 907 static int fulltextCreate( 908 sqlite3 *db, 909 void *pAux, 910 int argc, 911 const char * const *argv, 912 sqlite3_vtab **ppVTab, 913 char **pzErr 914 ){ 915 int rc; 916 assert( argc>=3 ); 917 918 /* The %_content table holds the text of each full-text item, with 919 ** the rowid used as the docid. 920 ** 921 ** The %_term table maps each term to a document list blob 922 ** containing elements sorted by ascending docid, each element 923 ** encoded as: 924 ** 925 ** docid varint-encoded 926 ** token count varint-encoded 927 ** "count" token elements (poslist): 928 ** position varint-encoded as delta from previous position 929 ** start offset varint-encoded as delta from previous start offset 930 ** end offset varint-encoded as delta from start offset 931 ** 932 ** Additionally, doclist blobs can be chunked into multiple rows, 933 ** using "first" to order the blobs. "first" is simply the first 934 ** docid in the blob. 
935 */ 936 /* 937 ** NOTE(shess) That last sentence is incorrect in the face of 938 ** deletion, which can leave a doclist that doesn't contain the 939 ** first from that row. I _believe_ this does not matter to the 940 ** operation of the system, but it might be reasonable to update 941 ** appropriately in case this assumption becomes more important. 942 */ 943 rc = sql_exec(db, argv[2], 944 "create table %_content(content text);" 945 "create table %_term(term text, first integer, doclist blob);" 946 "create index %_index on %_term(term, first)"); 947 if( rc!=SQLITE_OK ) return rc; 948 949 return fulltextConnect(db, pAux, argc, argv, ppVTab, pzErr); 950 } 951 952 /* Decide how to handle an SQL query. 953 * At the moment, MATCH queries can include implicit boolean ANDs; we 954 * haven't implemented phrase searches or OR yet. */ 955 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ 956 int i; 957 958 for(i=0; i<pInfo->nConstraint; ++i){ 959 const struct sqlite3_index_constraint *pConstraint; 960 pConstraint = &pInfo->aConstraint[i]; 961 if( pConstraint->iColumn==0 && 962 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && 963 pConstraint->usable ){ /* a full-text search */ 964 pInfo->aConstraintUsage[i].argvIndex = 1; 965 pInfo->aConstraintUsage[i].omit = 1; 966 pInfo->idxNum = QUERY_FULLTEXT; 967 pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ 968 return SQLITE_OK; 969 } 970 } 971 pInfo->idxNum = QUERY_GENERIC; 972 return SQLITE_OK; 973 } 974 975 static int fulltextDisconnect(sqlite3_vtab *pVTab){ 976 fulltext_vtab_destroy((fulltext_vtab *)pVTab); 977 return SQLITE_OK; 978 } 979 980 static int fulltextDestroy(sqlite3_vtab *pVTab){ 981 fulltext_vtab *v = (fulltext_vtab *)pVTab; 982 983 int rc = sql_exec(v->db, v->zName, 984 "drop table %_content; drop table %_term"); 985 if( rc!=SQLITE_OK ) return rc; 986 987 fulltext_vtab_destroy((fulltext_vtab *)pVTab); 988 return SQLITE_OK; 989 } 990 991 static int fulltextOpen(sqlite3_vtab 
*pVTab, sqlite3_vtab_cursor **ppCursor){ 992 fulltext_cursor *c; 993 994 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); 995 /* sqlite will initialize c->base */ 996 *ppCursor = &c->base; 997 998 return SQLITE_OK; 999 } 1000 1001 static int fulltextClose(sqlite3_vtab_cursor *pCursor){ 1002 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1003 sqlite3_finalize(c->pStmt); 1004 if( c->result.pDoclist!=NULL ){ 1005 docListDelete(c->result.pDoclist); 1006 } 1007 free(c); 1008 return SQLITE_OK; 1009 } 1010 1011 static int fulltextNext(sqlite3_vtab_cursor *pCursor){ 1012 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1013 sqlite_int64 iDocid; 1014 int rc; 1015 1016 switch( c->iCursorType ){ 1017 case QUERY_GENERIC: 1018 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 1019 rc = sqlite3_step(c->pStmt); 1020 switch( rc ){ 1021 case SQLITE_ROW: 1022 c->eof = 0; 1023 return SQLITE_OK; 1024 case SQLITE_DONE: 1025 c->eof = 1; 1026 return SQLITE_OK; 1027 default: 1028 c->eof = 1; 1029 return rc; 1030 } 1031 case QUERY_FULLTEXT: 1032 rc = sqlite3_reset(c->pStmt); 1033 if( rc!=SQLITE_OK ) return rc; 1034 1035 if( readerAtEnd(&c->result)){ 1036 c->eof = 1; 1037 return SQLITE_OK; 1038 } 1039 iDocid = readDocid(&c->result); 1040 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); 1041 if( rc!=SQLITE_OK ) return rc; 1042 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 1043 rc = sqlite3_step(c->pStmt); 1044 if( rc==SQLITE_ROW ){ /* the case we expect */ 1045 c->eof = 0; 1046 return SQLITE_OK; 1047 } 1048 /* an error occurred; abort */ 1049 return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 1050 default: 1051 assert( 0 ); 1052 return SQLITE_ERROR; /* not reached */ 1053 } 1054 } 1055 1056 static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, 1057 sqlite3_stmt **ppStmt){ 1058 int rc; 1059 if( *ppStmt ){ 1060 rc = sqlite3_reset(*ppStmt); 1061 } else { 1062 rc = sql_prepare(v->db, v->zName, ppStmt, 1063 "select doclist from %_term where term = ? 
order by first"); 1064 } 1065 if( rc!=SQLITE_OK ) return rc; 1066 1067 rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); 1068 if( rc!=SQLITE_OK ) return rc; 1069 1070 return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ 1071 } 1072 1073 /* Read the posting list for [zTerm]; AND it with the doclist [in] to 1074 * produce the doclist [out], using the given offset [iOffset] for phrase 1075 * matching. 1076 * (*pSelect) is used to hold an SQLite statement used inside this function; 1077 * the caller should initialize *pSelect to NULL before the first call. 1078 */ 1079 static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, 1080 const char *zTerm, 1081 DocList *pIn, int iOffset, DocList *out){ 1082 int rc; 1083 DocListMerge merge; 1084 1085 if( pIn!=NULL && !pIn->nData ){ 1086 /* If [pIn] is already empty, there's no point in reading the 1087 * posting list to AND it in; return immediately. */ 1088 return SQLITE_OK; 1089 } 1090 1091 rc = term_select_doclist(v, zTerm, -1, pSelect); 1092 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; 1093 1094 mergeInit(&merge, pIn, iOffset, out); 1095 while( rc==SQLITE_ROW ){ 1096 DocList block; 1097 docListInit(&block, DL_POSITIONS_OFFSETS, 1098 sqlite3_column_blob(*pSelect, 0), 1099 sqlite3_column_bytes(*pSelect, 0)); 1100 mergeBlock(&merge, &block); 1101 docListDestroy(&block); 1102 1103 rc = sqlite3_step(*pSelect); 1104 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ 1105 return rc; 1106 } 1107 } 1108 1109 return SQLITE_OK; 1110 } 1111 1112 typedef struct QueryTerm { 1113 int is_phrase; /* true if this term begins a new phrase */ 1114 const char *zTerm; 1115 } QueryTerm; 1116 1117 /* A parsed query. 
1118 * 1119 * As an example, parsing the query ["four score" years "new nation"] will 1120 * yield a Query with 5 terms: 1121 * "four", is_phrase = 1 1122 * "score", is_phrase = 0 1123 * "years", is_phrase = 1 1124 * "new", is_phrase = 1 1125 * "nation", is_phrase = 0 1126 */ 1127 typedef struct Query { 1128 int nTerms; 1129 QueryTerm *pTerm; 1130 } Query; 1131 1132 static void query_add(Query *q, int is_phrase, const char *zTerm){ 1133 QueryTerm *t; 1134 ++q->nTerms; 1135 q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); 1136 t = &q->pTerm[q->nTerms - 1]; 1137 t->is_phrase = is_phrase; 1138 t->zTerm = zTerm; 1139 } 1140 1141 static void query_free(Query *q){ 1142 int i; 1143 for(i = 0; i < q->nTerms; ++i){ 1144 free((void *) q->pTerm[i].zTerm); 1145 } 1146 free(q->pTerm); 1147 } 1148 1149 static int tokenize_segment(sqlite3_tokenizer *pTokenizer, 1150 const char *zQuery, int in_phrase, 1151 Query *pQuery){ 1152 sqlite3_tokenizer_module *pModule = pTokenizer->pModule; 1153 sqlite3_tokenizer_cursor *pCursor; 1154 int is_first = 1; 1155 1156 int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); 1157 if( rc!=SQLITE_OK ) return rc; 1158 pCursor->pTokenizer = pTokenizer; 1159 1160 while( 1 ){ 1161 const char *zToken; 1162 int nToken, iStartOffset, iEndOffset, dummy_pos; 1163 1164 rc = pModule->xNext(pCursor, 1165 &zToken, &nToken, 1166 &iStartOffset, &iEndOffset, 1167 &dummy_pos); 1168 if( rc!=SQLITE_OK ) break; 1169 query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); 1170 is_first = 0; 1171 } 1172 1173 return pModule->xClose(pCursor); 1174 } 1175 1176 /* Parse a query string, yielding a Query object. 
*/ 1177 static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ 1178 char *zQuery1 = string_dup(zQuery); 1179 int in_phrase = 0; 1180 char *s = zQuery1; 1181 pQuery->nTerms = 0; 1182 pQuery->pTerm = NULL; 1183 1184 while( *s ){ 1185 char *t = s; 1186 while( *t ){ 1187 if( *t=='"' ){ 1188 *t++ = '\0'; 1189 break; 1190 } 1191 ++t; 1192 } 1193 if( *s ){ 1194 tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); 1195 } 1196 s = t; 1197 in_phrase = !in_phrase; 1198 } 1199 1200 free(zQuery1); 1201 return SQLITE_OK; 1202 } 1203 1204 /* Perform a full-text query; return a list of documents in [pResult]. */ 1205 static int fulltext_query(fulltext_vtab *v, const char *zQuery, 1206 DocList **pResult){ 1207 Query q; 1208 int phrase_start = -1; 1209 int i; 1210 sqlite3_stmt *pSelect = NULL; 1211 DocList *d = NULL; 1212 1213 int rc = parse_query(v, zQuery, &q); 1214 if( rc!=SQLITE_OK ) return rc; 1215 1216 /* Merge terms. */ 1217 for(i = 0 ; i < q.nTerms ; ++i){ 1218 /* In each merge step, we need to generate positions whenever we're 1219 * processing a phrase which hasn't ended yet. */ 1220 int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; 1221 DocList *next = docListNew(need_positions ? 
DL_POSITIONS : DL_DOCIDS); 1222 if( q.pTerm[i].is_phrase ){ 1223 phrase_start = i; 1224 } 1225 rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); 1226 if( rc!=SQLITE_OK ) break; 1227 if( d!=NULL ){ 1228 docListDelete(d); 1229 } 1230 d = next; 1231 } 1232 1233 sqlite3_finalize(pSelect); 1234 query_free(&q); 1235 *pResult = d; 1236 return rc; 1237 } 1238 1239 static int fulltextFilter(sqlite3_vtab_cursor *pCursor, 1240 int idxNum, const char *idxStr, 1241 int argc, sqlite3_value **argv){ 1242 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1243 fulltext_vtab *v = cursor_vtab(c); 1244 int rc; 1245 const char *zStatement; 1246 1247 c->iCursorType = idxNum; 1248 switch( idxNum ){ 1249 case QUERY_GENERIC: 1250 zStatement = "select rowid, content from %_content"; 1251 break; 1252 1253 case QUERY_FULLTEXT: /* full-text search */ 1254 { 1255 const char *zQuery = (const char *)sqlite3_value_text(argv[0]); 1256 DocList *pResult; 1257 assert( argc==1 ); 1258 rc = fulltext_query(v, zQuery, &pResult); 1259 if( rc!=SQLITE_OK ) return rc; 1260 readerInit(&c->result, pResult); 1261 zStatement = "select rowid, content from %_content where rowid = ?"; 1262 break; 1263 } 1264 1265 default: 1266 assert( 0 ); 1267 } 1268 1269 rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); 1270 if( rc!=SQLITE_OK ) return rc; 1271 1272 return fulltextNext(pCursor); 1273 } 1274 1275 static int fulltextEof(sqlite3_vtab_cursor *pCursor){ 1276 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1277 return c->eof; 1278 } 1279 1280 static int fulltextColumn(sqlite3_vtab_cursor *pCursor, 1281 sqlite3_context *pContext, int idxCol){ 1282 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1283 const char *s; 1284 1285 assert( idxCol==0 ); 1286 s = (const char *) sqlite3_column_text(c->pStmt, 1); 1287 sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); 1288 1289 return SQLITE_OK; 1290 } 1291 1292 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ 
1293 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1294 1295 *pRowid = sqlite3_column_int64(c->pStmt, 0); 1296 return SQLITE_OK; 1297 } 1298 1299 /* Build a hash table containing all terms in zText. */ 1300 static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, 1301 const char *zText, sqlite_int64 iDocid){ 1302 sqlite3_tokenizer_cursor *pCursor; 1303 const char *pToken; 1304 int nTokenBytes; 1305 int iStartOffset, iEndOffset, iPosition; 1306 1307 int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); 1308 if( rc!=SQLITE_OK ) return rc; 1309 1310 pCursor->pTokenizer = pTokenizer; 1311 HashInit(terms, HASH_STRING, 1); 1312 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, 1313 &pToken, &nTokenBytes, 1314 &iStartOffset, &iEndOffset, 1315 &iPosition) ){ 1316 DocList *p; 1317 1318 /* Positions can't be negative; we use -1 as a terminator internally. */ 1319 if( iPosition<0 ) { 1320 rc = SQLITE_ERROR; 1321 goto err; 1322 } 1323 1324 p = HashFind(terms, pToken, nTokenBytes); 1325 if( p==NULL ){ 1326 p = docListNew(DL_POSITIONS_OFFSETS); 1327 docListAddDocid(p, iDocid); 1328 HashInsert(terms, pToken, nTokenBytes, p); 1329 } 1330 docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); 1331 } 1332 1333 err: 1334 /* TODO(shess) Check return? Should this be able to cause errors at 1335 ** this point? Actually, same question about sqlite3_finalize(), 1336 ** though one could argue that failure there means that the data is 1337 ** not durable. *ponder* 1338 */ 1339 pTokenizer->pModule->xClose(pCursor); 1340 return rc; 1341 } 1342 /* Update the %_terms table to map the term [zTerm] to the given rowid. 
*/
/* index_insert_term() records the positions in [p] for document [iDocid]
 * under the term [zTerm, nTerm].
 *
 * If term_chunk_select() finds no row for the term/docid (SQLITE_DONE), a
 * fresh doclist is built and inserted as a new row.  Otherwise the row it
 * selected is loaded, updated, and written back; when the updated blob
 * exceeds CHUNK_MAX it is split and the split-off part is inserted as an
 * additional row keyed by its first docid.
 *
 * Returns SQLITE_OK on success, SQLITE_ERROR if the chunk lookup yields
 * anything other than SQLITE_ROW/SQLITE_DONE, or the error code of the
 * failing term_*() helper. */
static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm,
                             sqlite_int64 iDocid, DocList *p){
  sqlite_int64 iChunkFirst;
  sqlite_int64 iTermRow;
  DocList postings;
  int rc;

  rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iChunkFirst);

  if( rc==SQLITE_DONE ){
    /* No existing row to extend: start from an empty doclist. */
    docListInit(&postings, DL_POSITIONS_OFFSETS, 0, 0);
    if( !docListUpdate(&postings, iDocid, p) ){
      return SQLITE_OK;
    }
    rc = term_insert(v, zTerm, nTerm, iDocid, &postings);
    docListDestroy(&postings);
    return rc;
  }
  if( rc!=SQLITE_ROW ){
    return SQLITE_ERROR;
  }

  /* This word is in the index; add this document ID to its blob. */
  rc = term_select(v, zTerm, nTerm, iChunkFirst, &iTermRow, &postings);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  if( docListUpdate(&postings, iDocid, p) ){
    /* If the blob is too big, split it in half. */
    if( postings.nData>CHUNK_MAX ){
      DocList splitOff;
      if( docListSplit(&postings, &splitOff) ){
        rc = term_insert(v, zTerm, nTerm, firstDocid(&splitOff), &splitOff);
        docListDestroy(&splitOff);
      }
    }
    /* rc is still SQLITE_OK here unless inserting the split-off part
     * failed; only then is the write-back skipped. */
    if( rc==SQLITE_OK ){
      rc = term_update(v, iTermRow, &postings);
    }
  }

  docListDestroy(&postings);
  return rc;
}

/* Insert a row into the full-text index; set *piRowid to be the ID of the
 * new row.
*/ 1386 static int index_insert(fulltext_vtab *v, 1387 sqlite3_value *pRequestRowid, const char *zText, 1388 sqlite_int64 *piRowid){ 1389 Hash terms; /* maps term string -> PosList */ 1390 HashElem *e; 1391 1392 int rc = content_insert(v, pRequestRowid, zText, -1); 1393 if( rc!=SQLITE_OK ) return rc; 1394 *piRowid = sqlite3_last_insert_rowid(v->db); 1395 1396 if( !zText ) return SQLITE_OK; /* nothing to index */ 1397 1398 rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); 1399 if( rc!=SQLITE_OK ) return rc; 1400 1401 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1402 DocList *p = HashData(e); 1403 rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); 1404 if( rc!=SQLITE_OK ) break; 1405 } 1406 1407 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1408 DocList *p = HashData(e); 1409 docListDelete(p); 1410 } 1411 HashClear(&terms); 1412 return rc; 1413 } 1414 1415 static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, 1416 sqlite_int64 iDocid){ 1417 sqlite_int64 iFirst; 1418 sqlite_int64 iIndexRow; 1419 DocList doclist; 1420 1421 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); 1422 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; 1423 1424 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); 1425 if( rc!=SQLITE_OK ) return rc; 1426 1427 if( docListUpdate(&doclist, iDocid, NULL) ){ 1428 if( doclist.nData>0 ){ 1429 rc = term_update(v, iIndexRow, &doclist); 1430 } else { /* empty posting list */ 1431 rc = term_delete(v, iIndexRow); 1432 } 1433 } 1434 docListDestroy(&doclist); 1435 return rc; 1436 } 1437 1438 /* Delete a row from the full-text index. 
*/ 1439 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ 1440 char *zText; 1441 Hash terms; 1442 HashElem *e; 1443 1444 int rc = content_select(v, iRow, &zText); 1445 if( rc!=SQLITE_OK ) return rc; 1446 1447 rc = build_terms(&terms, v->pTokenizer, zText, iRow); 1448 free(zText); 1449 if( rc!=SQLITE_OK ) return rc; 1450 1451 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1452 rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); 1453 if( rc!=SQLITE_OK ) break; 1454 } 1455 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1456 DocList *p = HashData(e); 1457 docListDelete(p); 1458 } 1459 HashClear(&terms); 1460 1461 return content_delete(v, iRow); 1462 } 1463 1464 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, 1465 sqlite_int64 *pRowid){ 1466 fulltext_vtab *v = (fulltext_vtab *) pVtab; 1467 1468 if( nArg<2 ){ 1469 return index_delete(v, sqlite3_value_int64(ppArg[0])); 1470 } 1471 1472 if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ 1473 return SQLITE_ERROR; /* an update; not yet supported */ 1474 } 1475 1476 assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ 1477 return index_insert(v, ppArg[1], 1478 (const char *)sqlite3_value_text(ppArg[2]), pRowid); 1479 } 1480 1481 static sqlite3_module fulltextModule = { 1482 0, 1483 fulltextCreate, 1484 fulltextConnect, 1485 fulltextBestIndex, 1486 fulltextDisconnect, 1487 fulltextDestroy, 1488 fulltextOpen, 1489 fulltextClose, 1490 fulltextFilter, 1491 fulltextNext, 1492 fulltextEof, 1493 fulltextColumn, 1494 fulltextRowid, 1495 fulltextUpdate 1496 }; 1497 1498 int fulltext_init(sqlite3 *db){ 1499 return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); 1500 } 1501 1502 #if !SQLITE_CORE 1503 #ifdef _WIN32 1504 __declspec(dllexport) 1505 #endif 1506 int sqlite3_fulltext_init(sqlite3 *db, char **pzErrMsg, 1507 const sqlite3_api_routines *pApi){ 1508 SQLITE_EXTENSION_INIT2(pApi) 1509 return fulltext_init(db); 1510 } 1511 #endif