github.com/ledgerwatch/erigon-lib@v1.0.0/sais/gsa/gsacak.c (about) 1 // vim: noai:ts=2:sw=2 2 3 #include "gsacak.h" 4 5 // set only the highest bit as 1, i.e. 1000... 6 //const unsigned int EMPTY_k=((unsigned int)1)<<(sizeof(unsigned int)*8-1); 7 const uint_t EMPTY_k=((uint_t)1)<<(sizeof(uint_t)*8-1); 8 9 // get s[i] at a certain level 10 #define chr(i) (cs==sizeof(int_t)?((int_t*)s)[i]:(cs==sizeof(int_text)?((int_text*)s)[i]:((unsigned char *)s)[i])) 11 12 #define true 1 13 #define false 0 14 15 #define DEPTH 0 // compute time and size of reduced problem for each recursion call 16 #define PHASES 0 // compute time for each phase 17 #define RMQ_L 2 //variants = (1, trivial) (2, using Gog's stack) 18 #define RMQ_S 2 //variants = (1, trivial) (2, using Gog's stack) 19 20 #define STACK_SIZE_L 894 //to use 10Kb of working space 21 #define STACK_SIZE_S 894 //to use 10Kb of working space 22 23 #define EMPTY_STRING 0 //check if there is an empty string in the input collection 24 25 typedef struct _pair{ 26 uint_t idx; 27 int_t lcp; 28 } t_pair_k; 29 30 int compare_k (const void * a, const void * b){ 31 if(*(const uint_t *)a < *(const uint_t *)b) return -1; 32 if(*(const uint_t *)a > *(const uint_t *)b) return 1; 33 return 0; 34 } 35 36 void stack_push_k(t_pair_k* STACK, int_t *top, uint_t idx, int_t lcp){ 37 38 STACK[*top].idx=idx; 39 STACK[*top].lcp=lcp; 40 41 (*top)++; 42 } 43 44 void compute_lcp_phi_sparse(int_t *s, uint_t *SA1, 45 uint_t *RA, int_t *LCP, int_t *PLCP, 46 uint_t n1, int cs, uint_t separator) { 47 48 uint_t i; 49 50 PLCP[SA1[0]]=0;//PLCP* (lms) is stored in PLCP array 51 for(i=1; i<n1; i++) 52 PLCP[SA1[i]] = LCP[i]; 53 54 LCP[SA1[0]]=0;//PHI is stored in LCP array 55 for(i=1; i<n1; i++) 56 LCP[SA1[i]] = SA1[i-1]; //RA[SA1[i-1]]; 57 58 int_t l=0; //q=0; 59 for(i=0; i<n1-1;i++){ 60 if(chr(RA[i])==separator) continue; 61 62 l = max(PLCP[i], l);//consider the LCP-value of the lms-substrings 63 64 while(chr(RA[i]+l)==chr(RA[LCP[i]]+l) && !(chr(RA[i]+l) == separator && chr(RA[LCP[i]]+l)==separator) ) ++l; 65 PLCP[i]=l; 66 67 if(LCP[i]==n1-1) l -= RA[i+1]-RA[i]; 68 else l -= max(RA[i+1]-RA[i], RA[LCP[i]+1]-RA[LCP[i]]);//LCP[i] stores the distance of i-th suffix to its successor 69 } 70 71 LCP[0]=0; 72 for(i=1; i<n1;i++) LCP[i]=PLCP[SA1[i]]; 73 74 } 75 76 /*****************************************************************************/ 77 78 void getBuckets_k(int_t *s, 79 uint_t *bkt, uint_t n, 80 unsigned int K, int end, int cs) { 81 uint_t i, sum=0; 82 83 // clear all buckets . 84 for(i=0; i<K; i++) bkt[i]=0; 85 86 // compute the size of each bucket . 87 for(i=0; i<n; i++) bkt[chr(i)]++; 88 89 for(i=0; i<K; i++) { 90 sum+=bkt[i]; 91 bkt[i]=end ? sum-1 : sum-bkt[i]; 92 } 93 } 94 95 /*****************************************************************************/ 96 97 void putSuffix0(uint_t *SA, 98 int_t *s, uint_t *bkt, 99 uint_t n, unsigned int K, int_t n1, int cs) { 100 uint_t i, j; 101 102 // find the end of each bucket. 103 getBuckets_k(s, bkt, n, K, true, cs); 104 105 // put the suffixes into their buckets. 106 for(i=n1-1; i>0; i--) { 107 j=SA[i]; SA[i]=0; 108 SA[bkt[chr(j)]--]=j; 109 } 110 SA[0]=n-1; // set the single sentinel suffix. 111 } 112 113 void putSuffix0_generalized(uint_t *SA, 114 uint_t *s, uint_t *bkt, 115 uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { 116 uint_t i, j; 117 118 // find the end of each bucket. 119 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 120 121 int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] 122 123 // put the suffixes into their buckets. 124 for(i=n1-1; i>0; i--) { 125 j=SA[i]; SA[i]=0; 126 SA[bkt[chr(j)]--]=j; 127 } 128 129 // SA[0]=n-1; // set the single sentinel suffix. 130 131 SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] 132 133 } 134 135 void putSuffix0_generalized_LCP(uint_t *SA, int_t *LCP, 136 uint_t *s, uint_t *bkt, 137 uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { 138 uint_t i, j; 139 int_t l; 140 141 // find the end of each bucket. 142 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 143 144 int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] 145 146 // put the suffixes into their buckets. 147 for(i=n1-1; i>0; i--) { 148 j=SA[i]; SA[i]=U_MAX; 149 l=LCP[i]; LCP[i]=0; 150 151 SA[bkt[chr(j)]]=j; 152 LCP[bkt[chr(j)]--]=l; 153 } 154 155 // SA[0]=n-1; // set the single sentinel suffix. 156 157 SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] 158 159 } 160 161 void putSuffix0_generalized_DA(uint_t *SA, int_da *DA, 162 uint_t *s, uint_t *bkt, 163 uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { 164 uint_t i, j; 165 166 // find the end of each bucket. 167 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 168 169 int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] 170 171 // put the suffixes into their buckets. 172 for(i=n1-1; i>0; i--) { 173 j=SA[i]; SA[i]=0; 174 SA[bkt[chr(j)]]=j; 175 DA[bkt[chr(j)]--]=DA[i]; 176 } 177 178 // SA[0]=n-1; // set the single sentinel suffix. 179 180 SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] 181 DA[tmp]= (int_da) tmp-1; 182 183 } 184 185 186 void putSuffix0_generalized_LCP_DA(uint_t *SA, int_t *LCP, int_da *DA, 187 uint_t *s, uint_t *bkt, 188 uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { 189 uint_t i, j; 190 int_t l; 191 192 // find the end of each bucket. 193 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 194 195 int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] 196 197 // put the suffixes into their buckets. 198 for(i=n1-1; i>0; i--) { 199 j=SA[i]; SA[i]=U_MAX; 200 l=LCP[i]; LCP[i]=0; 201 202 SA[bkt[chr(j)]]=j; 203 DA[bkt[chr(j)]]=DA[i]; 204 LCP[bkt[chr(j)]--]=l; 205 } 206 207 // SA[0]=n-1; // set the single sentinel suffix. 208 209 SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] 210 DA[tmp]= (int_da) tmp-1; 211 } 212 213 /*****************************************************************************/ 214 215 void induceSAl0(uint_t *SA, 216 int_t *s, uint_t *bkt, 217 uint_t n, unsigned int K, int_t suffix, int cs) { 218 uint_t i, j; 219 220 // find the head of each bucket. 221 getBuckets_k(s, bkt, n, K, false, cs); 222 223 bkt[0]++; // skip the virtual sentinel. 224 for(i=0; i<n; i++) 225 if(SA[i]>0) { 226 j=SA[i]-1; 227 if(chr(j)>=chr(j+1)) { 228 SA[bkt[chr(j)]]=j; 229 bkt[chr(j)]++; 230 if(!suffix && i>0) SA[i]=0; 231 } 232 } 233 } 234 235 void induceSAs0(uint_t *SA, 236 int_t *s, uint_t *bkt, 237 uint_t n, unsigned int K, int_t suffix, int cs) { 238 uint_t i, j; 239 240 // find the end of each bucket. 241 getBuckets_k(s, bkt, n, K, true, cs); 242 243 for(i=n-1; i>0; i--) 244 if(SA[i]>0) { 245 j=SA[i]-1; 246 if(chr(j)<=chr(j+1) && bkt[chr(j)]<i) { 247 SA[bkt[chr(j)]]=j; 248 bkt[chr(j)]--; 249 if(!suffix) SA[i]=0; 250 } 251 } 252 } 253 /*****************************************************************************/ 254 255 void induceSAl0_generalized(uint_t *SA, 256 uint_t *s, uint_t *bkt, 257 uint_t n, unsigned int K, int_t suffix, int cs, uint_t separator) { 258 uint_t i, j; 259 260 // find the head of each bucket. 261 getBuckets_k((int_t*)s, bkt, n, K, false, cs); 262 263 bkt[0]++; // skip the virtual sentinel. 264 for(i=0; i<n; i++) 265 if(SA[i]>0) { 266 j=SA[i]-1; 267 if(chr(j)>=chr(j+1) ) { 268 if(chr(j)!=separator)//gsa-is 269 SA[bkt[chr(j)]++]=j; 270 if(!suffix && i>0) SA[i]=0; 271 } 272 } 273 } 274 275 void induceSAs0_generalized(uint_t *SA, 276 uint_t *s, uint_t *bkt, 277 uint_t n, uint_t K, int_t suffix, int cs, uint_t separator) { 278 uint_t i, j; 279 280 // find the end of each bucket. 281 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 282 283 for(i=n-1; i>0; i--) 284 if(SA[i]>0) { 285 j=SA[i]-1; 286 if(chr(j)<=chr(j+1) && bkt[chr(j)]<i) { 287 if(chr(j)!=separator) 288 SA[bkt[chr(j)]--]=j; 289 if(!suffix) SA[i]=0; 290 } 291 } 292 } 293 294 /*****************************************************************************/ 295 296 void induceSAl0_generalized_LCP(uint_t *SA, int_t *LCP, 297 uint_t *s, uint_t *bkt, 298 uint_t n, unsigned int K, int cs, uint_t separator) { 299 uint_t i, j; 300 301 for(i=0;i<K;i++) 302 if(bkt[i]+1<n) if(SA[bkt[i]+1]!=U_MAX) LCP[bkt[i]+1]=I_MIN; 303 304 // find the head of each bucket. 305 getBuckets_k((int_t*)s, bkt, n, K, false, cs); 306 307 for(i=0;i<K;i++) 308 if(bkt[i]<n) LCP[bkt[i]]=-2; 309 310 #if RMQ_L == 1 311 int_t *M=(int_t *)malloc(sizeof(int_t)*K); 312 for(i=0;i<K;i++){ 313 M[i]=I_MAX; 314 } 315 #elif RMQ_L == 2 316 uint_t* last_occ = (uint_t*) malloc(K*sizeof(uint_t)); 317 uint_t* tmp = (uint_t*) malloc(K*sizeof(uint_t)); 318 319 t_pair_k* STACK = (t_pair_k*) malloc((STACK_SIZE_L+2)*sizeof(t_pair_k)); 320 int_t top = 0; 321 stack_push_k(STACK, &top, 0, -1);//init 322 for(i=0;i<K;i++) last_occ[i]=0; 323 #endif 324 325 #if DEBUG 326 printf("inducing..\n"); 327 for(i=0; i<n; i++) 328 printf("%" PRIdN "\t", SA[i]+1); 329 printf("\n"); 330 printf("LCP\n"); 331 for(i=0; i<n; i++) 332 printf("%" PRIdN "\t", LCP[i]); 333 printf("\n\n"); 334 #endif 335 336 bkt[0]++; // skip the virtual sentinel. 337 for(i=0; i<n; i++){ 338 if(SA[i]!=U_MAX){ 339 340 if(LCP[i]==I_MIN){ //is a L/S-seam position 341 int_t l=0; 342 if(SA[bkt[chr(SA[i])]-1]<n-1) 343 while(chr(SA[i]+l)==chr(SA[bkt[chr(SA[i])]-1]+l))++l; 344 LCP[i]=l; 345 } 346 #if RMQ_L == 1 347 uint_t k; 348 for(k=0; k<K; k++) if(M[k]>LCP[i]) M[k] = max(0,LCP[i]); 349 #elif RMQ_L == 2 350 int_t min_lcp=0; 351 uint_t last; 352 353 if(!SA[i]) last = 0; 354 else{ 355 last = last_occ[chr(SA[i]-1)]; 356 last_occ[chr(SA[i]-1)] = i+1; 357 } 358 359 int_t lcp=max(0,LCP[i]); 360 while(STACK[(top)-1].lcp>=lcp) (top)--; 361 362 stack_push_k(STACK, &top, i+1, lcp); 363 j = top-1; 364 365 while(STACK[j].idx>last) j--; 366 min_lcp=STACK[(j+1)].lcp; 367 368 #endif 369 370 if(SA[i]>0) { 371 j=SA[i]-1; 372 if(chr(j)>=chr(j+1)) 373 if(chr(j)!=separator){//gsa-is 374 SA[bkt[chr(j)]]=j; 375 376 #if RMQ_L == 1 377 LCP[bkt[chr(j)]]+=M[chr(j)]+1; 378 M[chr(j)] = I_MAX; 379 #elif RMQ_L == 2 380 LCP[bkt[chr(j)]]+=min_lcp+1; 381 #endif 382 383 bkt[chr(j)]++; 384 } 385 if(bkt[chr(SA[i])]-1<i){ //if is LMS-type 386 if(chr(SA[i])!=separator) 387 SA[i]=U_MAX; 388 } 389 390 } 391 #if RMQ_L == 2 392 if(top>STACK_SIZE_L){//if stack is full 393 394 int_t j; 395 memcpy(tmp, last_occ, K*sizeof(uint_t)); 396 qsort(tmp, K, sizeof(uint_t), compare_k); 397 398 int_t curr=1, end=1; 399 STACK[top].idx=U_MAX; 400 for(j=0;j<K; j++){ 401 402 if(STACK[end-1].idx < tmp[j]+1){ 403 404 while(STACK[curr].idx<tmp[j]+1) curr++; 405 406 if(curr<top){ 407 stack_push_k(STACK, &end, STACK[curr].idx, STACK[curr].lcp); 408 curr++; 409 } 410 } 411 } 412 413 if(end>=STACK_SIZE_L){ 414 fprintf(stderr,"ERROR: induceSAl0_LCP\n"); 415 exit(1); 416 } 417 418 top = end; 419 } 420 #endif 421 } 422 } 423 #if RMQ_L == 1 424 free(M); 425 #elif RMQ_L == 2 426 free(STACK); 427 free(last_occ); 428 free(tmp); 429 #endif 430 431 } 432 433 void induceSAs0_generalized_LCP(uint_t *SA, int_t* LCP, 434 uint_t *s, uint_t *bkt, 435 uint_t n, uint_t K, int cs, uint_t separator) { 436 uint_t i, j; 437 438 // find the end of each bucket. 439 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 440 441 #if RMQ_S == 1 442 int_t *M=(int_t *)malloc(sizeof(int_t)*K); 443 for(i=0;i<K;i++) M[i]=I_MAX; 444 #elif RMQ_S == 2 445 uint_t* last_occ = (uint_t*) malloc(K*sizeof(uint_t)); 446 uint_t* tmp = (uint_t*) malloc(K*sizeof(uint_t)); 447 448 t_pair_k* STACK = (t_pair_k*) malloc((STACK_SIZE_S+2)*sizeof(t_pair_k)); 449 int_t top = 0; 450 stack_push_k(STACK, &top, n, -1); //init 451 for(i=0;i<K;i++) last_occ[i]=n-1; 452 #endif 453 454 for(i=n-1; i>0; i--){ 455 if(SA[i]>0) { 456 j=SA[i]-1; 457 if(chr(j)<=chr(j+1) && bkt[chr(j)]<i)// induce S-type 458 if(chr(j)!=separator){ 459 SA[bkt[chr(j)]]=j; 460 461 #if RMQ_S == 1 462 if(LCP[bkt[chr(j)]+1]>=0) 463 LCP[bkt[chr(j)]+1]=M[chr(j)]+1; 464 465 if(LCP[bkt[chr(j)]]>0) 466 LCP[bkt[chr(j)]]=I_MAX; 467 468 #elif RMQ_S == 2 469 int_t min = I_MAX, end = top-1; 470 471 int_t last=last_occ[chr(j)]; 472 while(STACK[end].idx<=last) end--; 473 474 min=STACK[(end+1)].lcp; 475 last_occ[chr(j)] = i; 476 477 if(LCP[bkt[chr(j)]+1]>=0) 478 LCP[bkt[chr(j)]+1]=min+1; 479 #endif 480 481 482 #if RMQ_S == 1 483 M[chr(j)] = I_MAX; 484 #endif 485 486 bkt[chr(j)]--; 487 488 if(SA[bkt[chr(j)]]!=U_MAX) {//L/S-seam 489 int_t l=0; 490 while(chr(SA[bkt[chr(j)]+1]+l)==chr(SA[bkt[chr(j)]]+l))++l; 491 LCP[bkt[chr(j)]+1]=l; 492 } 493 } 494 } 495 496 if(LCP[i]<0) LCP[i]=0; 497 #if RMQ_S == 1 498 int_t k; 499 for(k=0; k<K; k++) if(M[k]>LCP[i]) M[k] = LCP[i]; 500 #elif RMQ_S == 2 501 502 int_t lcp=max(0,LCP[i]); 503 504 while(STACK[(top)-1].lcp>=lcp) (top)--; 505 stack_push_k(STACK, &top, i, lcp); 506 507 if(top>=STACK_SIZE_S){ 508 509 int_t j; 510 memcpy(tmp, last_occ, K*sizeof(uint_t)); 511 qsort(tmp, K, sizeof(uint_t), compare_k); 512 513 int_t curr=1, end=1; 514 515 for(j=K-1;j>=0; j--){ 516 517 if(tmp[j] < STACK[end-1].idx){ 518 519 while(STACK[curr].idx>tmp[j] && curr < top) curr++; 520 if(curr>=top) break; 521 stack_push_k(STACK, &end, STACK[curr].idx, STACK[curr].lcp); 522 curr++; 523 } 524 } 525 526 if(end>=STACK_SIZE_S){ 527 fprintf(stderr,"ERROR: induceSAl0_LCP\n"); 528 exit(1); 529 } 530 top = end; 531 } 532 #endif 533 534 } 535 536 LCP[0]=0; 537 538 //variant 1 539 #if RMQ_S == 1 540 free(M); 541 #elif RMQ_S == 2 542 free(STACK); 543 free(last_occ); 544 free(tmp); 545 #endif 546 } 547 548 549 /*****************************************************************************/ 550 551 void induceSAl0_generalized_DA(uint_t *SA, int_da* DA, 552 uint_t *s, uint_t *bkt, 553 uint_t n, unsigned int K, int cs, uint_t separator) { 554 uint_t i, j; 555 556 // find the head of each bucket. 557 getBuckets_k((int_t*)s, bkt, n, K, false, cs); 558 559 bkt[0]++; // skip the virtual sentinel. 560 for(i=0; i<n; i++) 561 if(SA[i]>0) { 562 j=SA[i]-1; 563 if(chr(j)>=chr(j+1) ) { 564 if(chr(j)!=separator){//gsa-is 565 SA[bkt[chr(j)]]=j; 566 DA[bkt[chr(j)]++]=DA[i]; 567 } 568 } 569 } 570 } 571 572 void induceSAs0_generalized_DA(uint_t *SA, int_da* DA, 573 uint_t *s, uint_t *bkt, 574 uint_t n, uint_t K, int cs, uint_t separator) { 575 uint_t i, j; 576 577 // find the end of each bucket. 578 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 579 580 for(i=n-1; i>0; i--) 581 if(SA[i]>0) { 582 j=SA[i]-1; 583 if(chr(j)<=chr(j+1) && bkt[chr(j)]<i) { 584 if(chr(j)!=separator){ 585 SA[bkt[chr(j)]]=j; 586 DA[bkt[chr(j)]--]=DA[i]; 587 } 588 } 589 } 590 } 591 592 /*****************************************************************************/ 593 594 void induceSAl0_generalized_LCP_DA(uint_t *SA, int_t *LCP, int_da *DA, 595 uint_t *s, uint_t *bkt, 596 uint_t n, unsigned int K, int cs, uint_t separator) { 597 uint_t i, j; 598 599 for(i=0;i<K;i++) 600 if(bkt[i]+1<n) if(SA[bkt[i]+1]!=U_MAX) LCP[bkt[i]+1]=I_MIN; 601 602 // find the head of each bucket. 603 getBuckets_k((int_t*)s, bkt, n, K, false, cs); 604 605 for(i=0;i<K;i++) 606 if(bkt[i]<n) LCP[bkt[i]]=-2; 607 608 #if RMQ_L == 1 609 int_t *M=(int_t *)malloc(sizeof(int_t)*K); 610 for(i=0;i<K;i++){ 611 M[i]=I_MAX; 612 } 613 #elif RMQ_L == 2 614 uint_t* last_occ = (uint_t*) malloc(K*sizeof(uint_t)); 615 uint_t* tmp = (uint_t*) malloc(K*sizeof(uint_t)); 616 617 t_pair_k* STACK = (t_pair_k*) malloc((STACK_SIZE_L+2)*sizeof(t_pair_k)); 618 int_t top = 0; 619 stack_push_k(STACK, &top, 0, -1);//init 620 for(i=0;i<K;i++) last_occ[i]=0; 621 #endif 622 623 #if DEBUG 624 printf("inducing..\n"); 625 for(i=0; i<n; i++) 626 printf("%" PRIdN "\t", SA[i]+1); 627 printf("\n"); 628 printf("LCP\n"); 629 for(i=0; i<n; i++) 630 printf("%" PRIdN "\t", LCP[i]); 631 printf("\n\n"); 632 #endif 633 634 bkt[0]++; // skip the virtual sentinel. 635 for(i=0; i<n; i++){ 636 if(SA[i]!=U_MAX){ 637 638 if(LCP[i]==I_MIN){ //is a L/S-seam position 639 int_t l=0; 640 if(SA[bkt[chr(SA[i])]-1]<n-1) 641 while(chr(SA[i]+l)==chr(SA[bkt[chr(SA[i])]-1]+l))++l; 642 LCP[i]=l; 643 } 644 #if RMQ_L == 1 645 uint_t k; 646 for(k=0; k<K; k++) if(M[k]>LCP[i]) M[k] = max(0,LCP[i]); 647 #elif RMQ_L == 2 648 int_t min_lcp=0; 649 uint_t last; 650 651 if(!SA[i]) last = 0; 652 else{ 653 last = last_occ[chr(SA[i]-1)]; 654 last_occ[chr(SA[i]-1)] = i+1; 655 } 656 657 int_t lcp=max(0,LCP[i]); 658 while(STACK[(top)-1].lcp>=lcp) (top)--; 659 660 stack_push_k(STACK, &top, i+1, lcp); 661 j = top-1; 662 663 while(STACK[j].idx>last) j--; 664 min_lcp=STACK[(j+1)].lcp; 665 666 #endif 667 668 if(SA[i]>0) { 669 j=SA[i]-1; 670 if(chr(j)>=chr(j+1)) 671 if(chr(j)!=separator){//gsa-is 672 SA[bkt[chr(j)]]=j; 673 DA[bkt[chr(j)]]=DA[i]; 674 675 #if RMQ_L == 1 676 LCP[bkt[chr(j)]]+=M[chr(j)]+1; 677 M[chr(j)] = I_MAX; 678 #elif RMQ_L == 2 679 LCP[bkt[chr(j)]]+=min_lcp+1; 680 #endif 681 682 bkt[chr(j)]++; 683 } 684 if(bkt[chr(SA[i])]-1<i){ //if is LMS-type 685 if(chr(SA[i])!=separator) 686 SA[i]=U_MAX; 687 } 688 } 689 #if RMQ_L == 2 690 if(top>STACK_SIZE_L){//if stack is full 691 692 int_t j; 693 memcpy(tmp, last_occ, K*sizeof(uint_t)); 694 qsort(tmp, K, sizeof(uint_t), compare_k); 695 696 int_t curr=1, end=1; 697 STACK[top].idx=U_MAX; 698 for(j=0;j<K; j++){ 699 700 if(STACK[end-1].idx < tmp[j]+1){ 701 702 while(STACK[curr].idx<tmp[j]+1) curr++; 703 704 if(curr<top){ 705 stack_push_k(STACK, &end, STACK[curr].idx, STACK[curr].lcp); 706 curr++; 707 } 708 } 709 } 710 711 if(end>=STACK_SIZE_L){ 712 fprintf(stderr,"ERROR: induceSAl0_LCP\n"); 713 exit(1); 714 } 715 716 top = end; 717 } 718 #endif 719 } 720 } 721 #if RMQ_L == 1 722 free(M); 723 #elif RMQ_L == 2 724 free(STACK); 725 free(last_occ); 726 free(tmp); 727 #endif 728 729 } 730 731 void induceSAs0_generalized_LCP_DA(uint_t *SA, int_t* LCP, int_da* DA, 732 uint_t *s, uint_t *bkt, 733 uint_t n, uint_t K, int cs, uint_t separator) { 734 uint_t i, j; 735 736 // find the end of each bucket. 737 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 738 739 #if RMQ_S == 1 740 int_t *M=(int_t *)malloc(sizeof(int_t)*K); 741 for(i=0;i<K;i++) M[i]=I_MAX; 742 #elif RMQ_S == 2 743 uint_t* last_occ = (uint_t*) malloc(K*sizeof(uint_t)); 744 uint_t* tmp = (uint_t*) malloc(K*sizeof(uint_t)); 745 746 t_pair_k* STACK = (t_pair_k*) malloc((STACK_SIZE_S+2)*sizeof(t_pair_k)); 747 int_t top = 0; 748 stack_push_k(STACK, &top, n, -1); //init 749 for(i=0;i<K;i++) last_occ[i]=n-1; 750 #endif 751 752 for(i=n-1; i>0; i--){ 753 if(SA[i]>0) { 754 j=SA[i]-1; 755 if(chr(j)<=chr(j+1) && bkt[chr(j)]<i)// induce S-type 756 if(chr(j)!=separator){ 757 SA[bkt[chr(j)]]=j; 758 DA[bkt[chr(j)]]=DA[i]; 759 760 #if RMQ_S == 1 761 if(LCP[bkt[chr(j)]+1]>=0) 762 LCP[bkt[chr(j)]+1]=M[chr(j)]+1; 763 764 if(LCP[bkt[chr(j)]]>0) 765 LCP[bkt[chr(j)]]=I_MAX; 766 767 #elif RMQ_S == 2 768 int_t min = I_MAX, end = top-1; 769 770 int_t last=last_occ[chr(j)]; 771 while(STACK[end].idx<=last) end--; 772 773 min=STACK[(end+1)].lcp; 774 last_occ[chr(j)] = i; 775 776 if(LCP[bkt[chr(j)]+1]>=0) 777 LCP[bkt[chr(j)]+1]=min+1; 778 #endif 779 780 781 #if RMQ_S == 1 782 M[chr(j)] = I_MAX; 783 #endif 784 785 bkt[chr(j)]--; 786 787 if(SA[bkt[chr(j)]]!=U_MAX) {//L/S-seam 788 int_t l=0; 789 while(chr(SA[bkt[chr(j)]+1]+l)==chr(SA[bkt[chr(j)]]+l))++l; 790 LCP[bkt[chr(j)]+1]=l; 791 } 792 } 793 } 794 795 if(LCP[i]<0) LCP[i]=0; 796 #if RMQ_S == 1 797 int_t k; 798 for(k=0; k<K; k++) if(M[k]>LCP[i]) M[k] = LCP[i]; 799 #elif RMQ_S == 2 800 801 int_t lcp=max(0,LCP[i]); 802 803 while(STACK[(top)-1].lcp>=lcp) (top)--; 804 stack_push_k(STACK, &top, i, lcp); 805 806 if(top>=STACK_SIZE_S){ 807 808 int_t j; 809 memcpy(tmp, last_occ, K*sizeof(uint_t)); 810 qsort(tmp, K, sizeof(uint_t), compare_k); 811 812 int_t curr=1, end=1; 813 814 for(j=K-1;j>=0; j--){ 815 816 if(tmp[j] < STACK[end-1].idx){ 817 818 while(STACK[curr].idx>tmp[j] && curr < top) curr++; 819 if(curr>=top) break; 820 stack_push_k(STACK, &end, STACK[curr].idx, STACK[curr].lcp); 821 curr++; 822 } 823 } 824 825 if(end>=STACK_SIZE_S){ 826 fprintf(stderr,"ERROR: induceSAl0_LCP\n"); 827 exit(1); 828 } 829 top = end; 830 } 831 #endif 832 833 } 834 835 LCP[0]=0; 836 837 //variant 1 838 #if RMQ_S == 1 839 free(M); 840 #elif RMQ_S == 2 841 free(STACK); 842 free(last_occ); 843 free(tmp); 844 #endif 845 } 846 847 848 /*****************************************************************************/ 849 850 851 void putSubstr0(uint_t *SA, 852 int_t *s, uint_t *bkt, 853 uint_t n, unsigned int K, int cs) { 854 uint_t i, cur_t, succ_t; 855 856 // find the end of each bucket. 857 getBuckets_k(s, bkt, n, K, true, cs); 858 859 // set each item in SA as empty. 860 for(i=0; i<n; i++) SA[i]=0; 861 862 succ_t=0; // s[n-2] must be L-type. 863 for(i=n-2; i>0; i--) { 864 cur_t=(chr(i-1)<chr(i) || 865 (chr(i-1)==chr(i) && succ_t==1) 866 )?1:0; 867 if(cur_t==0 && succ_t==1) SA[bkt[chr(i)]--]=i; 868 succ_t=cur_t; 869 } 870 871 // set the single sentinel LMS-substring. 872 SA[0]=n-1; 873 } 874 875 /*****************************************************************************/ 876 void putSubstr0_generalized(uint_t *SA, 877 uint_t *s, uint_t *bkt, 878 uint_t n, unsigned int K, int cs, uint_t separator) { 879 uint_t i, cur_t, succ_t; 880 881 // find the end of each bucket. 882 getBuckets_k((int_t*)s, bkt, n, K, true, cs); 883 884 // set each item in SA as empty. 885 for(i=0; i<n; i++) SA[i]=0; 886 887 // gsa-is 888 int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] 889 890 SA[0]=n-1;// set the single sentinel LMS-substring. 891 892 SA[tmp] = SA[0]-1; // insert the last separator at the end of bkt[separator] 893 894 int_t p=n-2; 895 896 succ_t=0; // s[n-2] must be L-type. 897 for(i=n-2; i>0; i--) { 898 cur_t=(chr(i-1)<chr(i) || 899 (chr(i-1)==chr(i) && succ_t==1) 900 )?1:0; 901 if(cur_t==0 && succ_t==1){ 902 903 if(chr(i)==separator) 904 SA[++bkt[chr(p)]]=0; // removes LMS-positions that induces separator suffixes 905 906 SA[bkt[chr(i)]--]=i; 907 p=i; 908 } 909 succ_t=cur_t; 910 } 911 } 912 /*****************************************************************************/ 913 914 void putSuffix1(int_t *SA, int_t *s, int_t n1, int cs) { 915 int_t i, j, pos=n1, cur, pre=-1; 916 917 for(i=n1-1; i>0; i--) { 918 j=SA[i]; SA[i]=EMPTY_k; 919 cur=chr(j); 920 if(cur!=pre) { 921 pre=cur; pos=cur; 922 } 923 SA[pos--]=j; 924 } 925 } 926 927 void induceSAl1(int_t *SA, int_t *s, 928 int_t n, int_t suffix, int cs) { 929 int_t h, i, j, step=1; 930 931 for(i=0; i<n; i+=step) { 932 step=1; j=SA[i]-1; 933 if(SA[i]<=0) continue; 934 int_t c=chr(j), c1=chr(j+1); 935 int_t isL=c>=c1; 936 if(!isL) continue; 937 938 // s[j] is L-type. 939 940 int_t d=SA[c]; 941 if(d>=0) { 942 // SA[c] is borrowed by the left 943 // neighbor bucket. 944 // shift-left the items in the 945 // left neighbor bucket. 946 int_t foo, bar; 947 foo=SA[c]; 948 for(h=c-1; SA[h]>=0||SA[h]==EMPTY_k; h--) 949 { bar=SA[h]; SA[h]=foo; foo=bar; } 950 SA[h]=foo; 951 if(h<i) step=0; 952 953 d=EMPTY_k; 954 } 955 956 if(d==EMPTY_k) { // SA[c] is empty. 957 if(c<n-1 && SA[c+1]==EMPTY_k) { 958 SA[c]=-1; // init the counter. 959 SA[c+1]=j; 960 } 961 else 962 SA[c]=j; // a size-1 bucket. 963 } 964 else { // SA[c] is reused as a counter. 965 int_t pos=c-d+1; 966 if(pos>n-1 || SA[pos]!=EMPTY_k) { 967 // we are running into the right 968 // neighbor bucket. 969 // shift-left one step the items 970 // of bucket(SA, S, j). 971 for(h=0; h<-d; h++) 972 SA[c+h]=SA[c+h+1]; 973 pos--; 974 if(c<i) step=0; 975 } 976 else 977 SA[c]--; 978 979 SA[pos]=j; 980 } 981 982 int_t c2; 983 int_t isL1=(j+1<n-1) && (c1>(c2=chr(j+2)) || (c1==c2 && c1<i)); // is s[SA[i]] L-type? 984 if((!suffix || !isL1) && i>0) { 985 int_t i1=(step==0)?i-1:i; 986 SA[i1]=EMPTY_k; 987 } 988 } 989 990 // scan to shift-left the items in each bucket 991 // with its head being reused as a counter. 992 for(i=1; i<n; i++) { 993 j=SA[i]; 994 if(j<0 && j!=EMPTY_k) { // is SA[i] a counter? 995 for(h=0; h<-j; h++) 996 SA[i+h]=SA[i+h+1]; 997 SA[i+h]=EMPTY_k; 998 } 999 } 1000 } 1001 1002 void induceSAs1(int_t *SA, int_t *s, 1003 int_t n, int_t suffix, int cs) { 1004 int_t h, i, j, step=1; 1005 1006 for(i=n-1; i>0; i-=step) { 1007 step=1; j=SA[i]-1; 1008 if(SA[i]<=0) continue; 1009 int_t c=chr(j), c1=chr(j+1); 1010 int_t isS=(c<c1) || (c==c1 && c>i); 1011 if(!isS) continue; 1012 1013 // s[j] is S-type 1014 1015 int_t d=SA[c]; 1016 if(d>=0) { 1017 // SA[c] is borrowed by the right 1018 // neighbor bucket. 1019 // shift-right the items in the 1020 // right neighbor bucket. 1021 int_t foo, bar; 1022 foo=SA[c]; 1023 for(h=c+1; SA[h]>=0||SA[h]==EMPTY_k; h++) 1024 { bar=SA[h]; SA[h]=foo; foo=bar; } 1025 SA[h]=foo; 1026 if(h>i) step=0; 1027 1028 d=EMPTY_k; 1029 } 1030 1031 if(d==EMPTY_k) { // SA[c] is empty. 1032 if(SA[c-1]==EMPTY_k) { 1033 SA[c]=-1; // init the counter. 1034 SA[c-1]=j; 1035 } 1036 else 1037 SA[c]=j; // a size-1 bucket. 1038 } 1039 else { // SA[c] is reused as a counter. 1040 int_t pos=c+d-1; 1041 if(SA[pos]!=EMPTY_k) { 1042 // we are running into the left 1043 // neighbor bucket. 1044 // shift-right one step the items 1045 // of bucket(SA, S, j). 1046 for(h=0; h<-d; h++) 1047 SA[c-h]=SA[c-h-1]; 1048 pos++; 1049 if(c>i) step=0; 1050 } 1051 else 1052 SA[c]--; 1053 1054 SA[pos]=j; 1055 } 1056 1057 if(!suffix) { 1058 int_t i1=(step==0)?i+1:i; 1059 SA[i1]=EMPTY_k; 1060 } 1061 } 1062 1063 // scan to shift-right the items in each bucket 1064 // with its head being reused as a counter. 1065 if(!suffix) 1066 for(i=n-1; i>0; i--) { 1067 j=SA[i]; 1068 if(j<0 && j!=EMPTY_k) { // is SA[i] a counter? 1069 for(h=0; h<-j; h++) 1070 SA[i-h]=SA[i-h-1]; 1071 SA[i-h]=EMPTY_k; 1072 } 1073 } 1074 } 1075 1076 void putSubstr1(int_t *SA, int_t *s, int_t n, int cs) { 1077 int_t h, i, j; 1078 1079 for(i=0; i<n; i++) SA[i]=EMPTY_k; 1080 1081 int_t c, c1, t, t1; 1082 c1=chr(n-2); 1083 t1=0; 1084 for(i=n-2; i>0; i--) { 1085 c=c1; t=t1; 1086 c1=chr(i-1); 1087 t1=c1<c || (c1==c && t); 1088 if(t && !t1) { 1089 if(SA[c]>=0) { 1090 // SA[c] is borrowed by the right 1091 // neighbor bucket. 1092 // shift-right the items in the 1093 // right neighbor bucket. 1094 int_t foo, bar; 1095 foo=SA[c]; 1096 for(h=c+1; SA[h]>=0; h++) 1097 { bar=SA[h]; SA[h]=foo; foo=bar; } 1098 SA[h]=foo; 1099 1100 SA[c]=EMPTY_k; 1101 } 1102 1103 int_t d=SA[c]; 1104 if(d==EMPTY_k) { // SA[c] is empty. 1105 if(SA[c-1]==EMPTY_k) { 1106 SA[c]=-1; // init the counter. 1107 SA[c-1]=i; 1108 } 1109 else 1110 SA[c]=i; // a size-1 bucket. 1111 } 1112 else { // SA[c] is reused as a counter 1113 int_t pos=c+d-1; 1114 if(SA[pos]!=EMPTY_k) { 1115 // we are running into the left 1116 // neighbor bucket. 1117 // shift-right one step the items 1118 // of bucket(SA, S, i). 1119 for(h=0; h<-d; h++) 1120 SA[c-h]=SA[c-h-1]; 1121 pos++; 1122 } 1123 else 1124 SA[c]--; 1125 1126 SA[pos]=i; 1127 } 1128 } 1129 } 1130 1131 // scan to shift-right the items in each bucket 1132 // with its head being reused as a counter. 1133 for(i=n-1; i>0; i--) { 1134 j=SA[i]; 1135 if(j<0 && j!=EMPTY_k) { // is SA[i] a counter? 1136 for(h=0; h<-j; h++) 1137 SA[i-h]=SA[i-h-1]; 1138 SA[i-h]=EMPTY_k; 1139 } 1140 } 1141 1142 // put the single sentinel LMS-substring. 1143 SA[0]=n-1; 1144 } 1145 1146 uint_t getLengthOfLMS(int_t *s, 1147 uint_t n, int level, uint_t x, int cs) { 1148 if(x==n-1) return 1; 1149 1150 uint_t dist=0, i=1; 1151 while(1) { 1152 if(chr(x+i)<chr(x+i-1)) break; 1153 i++; 1154 } 1155 while(1) { 1156 if(x+i>n-1 || chr(x+i)>chr(x+i-1)) break; 1157 if(x+i==n-1 || chr(x+i)<chr(x+i-1)) dist=i; 1158 i++; 1159 } 1160 1161 return dist+1; 1162 } 1163 1164 uint_t nameSubstr(uint_t *SA, 1165 int_t *s, uint_t *s1, uint_t n, 1166 uint_t m, uint_t n1, int level, int cs) { 1167 uint_t i, j, cur_t, succ_t; 1168 1169 // init the name array buffer 1170 for(i=n1; i<n; i++) SA[i]=EMPTY_k; 1171 1172 // scan to compute the interim s1 1173 uint_t name=0, name_ctr=0; 1174 uint_t pre_pos=n-1, pre_len=0; 1175 for(i=0; i<n1; i++) { 1176 int diff=false; 1177 uint_t len, pos=SA[i]; 1178 1179 uint_t d; 1180 len=getLengthOfLMS(s, n, level, pos, cs); 1181 if(len!=pre_len) diff=true; 1182 else 1183 for(d=0; d<len; d++) 1184 if(pos+d==n-1 || pre_pos+d==n-1 || 1185 chr(pos+d)!=chr(pre_pos+d)) { 1186 diff=true; break; 1187 } 1188 1189 if(diff) { 1190 name=i; name_ctr++; 1191 SA[name]=1; // a new name. 1192 pre_pos=pos; pre_len=len; 1193 } 1194 else 1195 SA[name]++; // count this name. 1196 1197 SA[n1+pos/2]=name; 1198 } 1199 1200 // compact the interim s1 sparsely stored 1201 // in SA[n1, n-1] into SA[m-n1, m-1]. 1202 for(i=n-1, j=m-1; i>=n1; i--) 1203 if(SA[i]!=EMPTY_k) SA[j--]=SA[i]; 1204 1205 // rename each S-type character of the 1206 // interim s1 as the end of its bucket 1207 // to produce the final s1. 1208 succ_t=1; 1209 for(i=n1-1; i>0; i--) { 1210 int_t ch=s1[i], ch1=s1[i-1]; 1211 cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0; 1212 if(cur_t==1) { 1213 s1[i-1]+=SA[s1[i-1]]-1; 1214 } 1215 succ_t=cur_t; 1216 } 1217 1218 return name_ctr; 1219 } 1220 1221 /*****************************************************************************/ 1222 1223 uint_t nameSubstr_generalized(uint_t *SA, 1224 uint_t *s, uint_t *s1, uint_t n, 1225 uint_t m, uint_t n1, int level, int cs, uint_t separator) { 1226 uint_t i, j, cur_t, succ_t; 1227 1228 // init the name array buffer 1229 for(i=n1; i<n; i++) SA[i]=EMPTY_k; 1230 1231 // scan to compute the interim s1 1232 uint_t name=0, name_ctr=0; 1233 uint_t pre_pos=n-1, pre_len=0; 1234 for(i=0; i<n1; i++) { 1235 int diff=false; 1236 uint_t len, pos=SA[i]; 1237 1238 uint_t d; 1239 len=getLengthOfLMS((int_t*)s, n, level, pos, cs); 1240 if(len!=pre_len) diff=true; 1241 else 1242 for(d=0; d<len; d++) 1243 if(pos+d==n-1 || pre_pos+d==n-1 || 1244 chr(pos+d)!=chr(pre_pos+d) || 1245 (chr(pos+d)==separator && chr(pre_pos+d)==separator)){ 1246 diff=true; break; 1247 } 1248 1249 if(diff) { 1250 name=i; name_ctr++; 1251 SA[name]=1; // a new name. 1252 pre_pos=pos; pre_len=len; 1253 } 1254 else 1255 SA[name]++; // count this name. 1256 1257 SA[n1+pos/2]=name; 1258 } 1259 1260 // compact the interim s1 sparsely stored 1261 // in SA[n1, n-1] into SA[m-n1, m-1]. 1262 for(i=n-1, j=m-1; i>=n1; i--) 1263 if(SA[i]!=EMPTY_k) SA[j--]=SA[i]; 1264 1265 // rename each S-type character of the 1266 // interim s1 as the end of its bucket 1267 // to produce the final s1. 1268 succ_t=1; 1269 for(i=n1-1; i>0; i--) { 1270 int_t ch=s1[i], ch1=s1[i-1]; 1271 cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0; 1272 if(cur_t==1) { 1273 s1[i-1]+=SA[s1[i-1]]-1; 1274 } 1275 succ_t=cur_t; 1276 } 1277 1278 return name_ctr; 1279 } 1280 1281 /*****************************************************************************/ 1282 1283 uint_t nameSubstr_generalized_LCP(uint_t *SA, int_t *LCP, 1284 uint_t *s, uint_t *s1, uint_t n, 1285 uint_t m, uint_t n1, int level, int cs, uint_t separator) { 1286 uint_t i, j, cur_t, succ_t; 1287 1288 // init the name array buffer 1289 for(i=n1; i<n; i++) SA[i]=EMPTY_k; 1290 1291 // scan to compute the interim s1 1292 uint_t name=0, name_ctr=0; 1293 uint_t pre_pos=n-1, pre_len=0; 1294 for(i=0; i<n1; i++) { 1295 int diff=false; 1296 uint_t len, pos=SA[i]; 1297 1298 uint_t d; 1299 len=getLengthOfLMS((int_t*)s, n, level, pos, cs); 1300 if(len!=pre_len) diff=true; 1301 else{ 1302 for(d=0; d<len; d++){ 1303 if(pos+d==n-1 || pre_pos+d==n-1 || 1304 chr(pos+d)!=chr(pre_pos+d) || 1305 (chr(pos+d)==separator && chr(pre_pos+d)==separator)){ 1306 diff=true; break; 1307 } 1308 } 1309 LCP[i]=d; 1310 } 1311 1312 if(diff) { 1313 name=i; name_ctr++; 1314 SA[name]=1; // a new name. 1315 pre_pos=pos; pre_len=len; 1316 } 1317 else 1318 SA[name]++; // count this name. 1319 1320 SA[n1+pos/2]=name; 1321 } 1322 1323 // compact the interim s1 sparsely stored 1324 // in SA[n1, n-1] into SA[m-n1, m-1]. 1325 for(i=n-1, j=m-1; i>=n1; i--) 1326 if(SA[i]!=EMPTY_k) SA[j--]=SA[i]; 1327 1328 // rename each S-type character of the 1329 // interim s1 as the end of its bucket 1330 // to produce the final s1. 1331 succ_t=1; 1332 for(i=n1-1; i>0; i--) { 1333 int_t ch=s1[i], ch1=s1[i-1]; 1334 cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0; 1335 if(cur_t==1) { 1336 s1[i-1]+=SA[s1[i-1]]-1; 1337 } 1338 succ_t=cur_t; 1339 } 1340 1341 return name_ctr; 1342 } 1343 1344 /*****************************************************************************/ 1345 1346 void getSAlms(uint_t *SA, 1347 int_t *s, 1348 uint_t *s1, uint_t n, 1349 uint_t n1, int level, int cs) { 1350 uint_t i, j, cur_t, succ_t; 1351 1352 j=n1-1; s1[j--]=n-1; 1353 succ_t=0; // s[n-2] must be L-type 1354 for(i=n-2; i>0; i--) { 1355 cur_t=(chr(i-1)<chr(i) || 1356 (chr(i-1)==chr(i) && succ_t==1))?1:0; 1357 if(cur_t==0 && succ_t==1) s1[j--]=i; 1358 succ_t=cur_t; 1359 } 1360 } 1361 1362 1363 void getSAlms_DA(uint_t *SA, int_da* DA, 1364 int_t *s, 1365 uint_t *s1, uint_t n, 1366 uint_t n1, int level, int cs, uint_t separator) { 1367 uint_t i, j, cur_t, succ_t; 1368 1369 /**/ 1370 int_t k=0; 1371 for(i=n-2; i>0; i--) if(chr(i)==separator)k++; 1372 DA[n1-1]=(int_da) k; 1373 /**/ 1374 1375 j=n1-1; s1[j--]=n-1; 1376 succ_t=0; // s[n-2] must be L-type 1377 for(i=n-2; i>0; i--) { 1378 1379 if(chr(i)==separator)k--; 1380 1381 cur_t=(chr(i-1)<chr(i) || 1382 (chr(i-1)==chr(i) && succ_t==1))?1:0; 1383 if(cur_t==0 && succ_t==1){//LMS-suffix 1384 s1[j]=i; 1385 DA[j]=k; 1386 j--; 1387 } 1388 succ_t=cur_t; 1389 } 1390 1391 } 1392 1393 1394 /*****************************************************************************/ 1395 1396 int_t SACA_K(int_t *s, uint_t *SA, 1397 uint_t n, unsigned int K, 1398 uint_t m, int cs, int level) { 1399 uint_t i; 1400 uint_t *bkt=NULL; 1401 1402 #if PHASES 1403 time_t t_start_phase = 0.0; 1404 clock_t c_start_phase = 0.0; 1405 #endif 1406 1407 #if DEPTH 1408 time_t t_start = time(NULL); 1409 clock_t c_start = clock(); 1410 #endif 1411 1412 #if PHASES 1413 if(!level){ 1414 t_start_phase = time(NULL); 1415 c_start_phase = clock(); 1416 } 1417 #endif 1418 1419 // stage 1: reduce the problem by at least 1/2. 1420 if(level==0) { 1421 1422 bkt=(uint_t *)malloc(sizeof(int_t)*K); 1423 putSubstr0(SA, s, bkt, n, K, cs); 1424 1425 induceSAl0(SA, s, bkt, n, K, false, cs); 1426 induceSAs0(SA, s, bkt, n, K, false, cs); 1427 1428 } 1429 else { 1430 putSubstr1((int_t *)SA, (int_t *)s,(int_t)n, cs); 1431 induceSAl1((int_t *)SA, (int_t *)s, n ,false, cs); 1432 induceSAs1((int_t *)SA, (int_t *)s, n, false, cs); 1433 } 1434 1435 // now, all the LMS-substrings are sorted and 1436 // stored sparsely in SA. 1437 1438 // compact all the sorted substrings into 1439 // the first n1 items of SA. 1440 // 2*n1 must be not larger than n. 1441 uint_t n1=0; 1442 for(i=0; i<n; i++) 1443 if((!level&&SA[i]>0) || (level&&((int_t *)SA)[i]>0)) 1444 SA[n1++]=SA[i]; 1445 1446 uint_t *SA1=SA, *s1=SA+m-n1; 1447 uint_t name_ctr; 1448 name_ctr=nameSubstr(SA,s,s1,n,m,n1,level,cs); 1449 1450 #if PHASES 1451 if(!level){ 1452 printf("phase 1:\n"); 1453 time_stop(t_start_phase, c_start_phase); 1454 } 1455 #endif 1456 1457 #if PHASES 1458 if(!level){ 1459 t_start_phase = time(NULL); 1460 c_start_phase = clock(); 1461 } 1462 #endif 1463 1464 // stage 2: solve the reduced problem. 1465 int_t depth=1; 1466 // recurse if names are not yet unique. 1467 if(name_ctr<n1) 1468 depth += SACA_K((int_t*)s1, SA1, 1469 n1, 0, m-n1, sizeof(int_t), level+1); 1470 else // get the suffix array of s1 directly. 1471 for(i=0; i<n1; i++) SA1[s1[i]]=i; 1472 1473 // stage 3: induce SA(S) from SA(S1). 1474 1475 getSAlms(SA, s, s1, n, n1, level, cs); 1476 1477 for(i=0; i<n1; i++) SA[i]=s1[SA[i]]; 1478 for(i=n1; i<n; i++) SA[i]=level?EMPTY_k:0; 1479 1480 if(level==0) { 1481 putSuffix0(SA, s, bkt, n, K, n1, cs); 1482 1483 #if PHASES 1484 printf("phase 2:\n"); 1485 time_stop(t_start_phase, c_start_phase); 1486 #endif 1487 1488 #if PHASES 1489 t_start_phase = time(NULL); 1490 c_start_phase = clock(); 1491 #endif 1492 1493 induceSAl0(SA, s, bkt, n, K, true, cs); 1494 1495 #if PHASES 1496 printf("phase 3:\n"); 1497 time_stop(t_start_phase, c_start_phase); 1498 #endif 1499 1500 #if PHASES 1501 t_start_phase = time(NULL); 1502 c_start_phase = clock(); 1503 #endif 1504 1505 induceSAs0(SA, s, bkt, n, K, true, cs); 1506 free(bkt); 1507 1508 #if PHASES 1509 printf("phase 4:\n"); 1510 time_stop(t_start_phase, c_start_phase); 1511 #endif 1512 } 1513 else { 1514 putSuffix1((int_t *)SA, (int_t *)s, n1, cs); 1515 induceSAl1((int_t *)SA, (int_t *)s, n, true, cs); 1516 induceSAs1((int_t *)SA, (int_t *)s, n, true, cs); 1517 } 1518 1519 #if DEPTH 1520 printf("depth %" PRIdN ":\nname_ctr = %" PRIdN ", n1 =%" PRIdN ", n = %" PRIdN "\n", depth, name_ctr, n1, n); 1521 time_stop(t_start, c_start); 1522 #endif 1523 1524 return depth; 1525 } 1526 1527 /*****************************************************************************/ 1528 1529 int_t gSACA_K(uint_t *s, uint_t *SA, 1530 uint_t n, unsigned int K, 1531 int cs, uint_t separator, int level) { 1532 uint_t i; 1533 uint_t *bkt=NULL; 1534 uint_t m=n; 1535 1536 #if PHASES 1537 time_t t_start_phase = 0.0; 1538 clock_t c_start_phase = 0.0; 1539 #endif 1540 1541 #if DEPTH 1542 time_t t_start = time(NULL); 1543 clock_t c_start = clock(); 1544 #endif 1545 1546 #if PHASES 1547 t_start_phase = time(NULL); 1548 c_start_phase = clock(); 1549 #endif 1550 // stage 1: reduce the problem by at least 1/2. 1551 1552 bkt=(uint_t *)malloc(sizeof(int_t)*K); 1553 1554 putSubstr0_generalized(SA, s, bkt, n, K, cs, separator); 1555 1556 induceSAl0_generalized(SA, s, bkt, n, K, false, cs, separator); 1557 induceSAs0_generalized(SA, s, bkt, n, K, false, cs, separator); 1558 1559 // insert separator suffixes in their buckets 1560 // bkt[separator]=1; // gsa-is 1561 for(i=n-3; i>0; i--) 1562 if(chr(i)==separator) 1563 SA[bkt[chr(i)]--]=i; 1564 1565 // now, all the LMS-substrings are sorted and 1566 // stored sparsely in SA. 1567 1568 // compact all the sorted substrings into 1569 // the first n1 items of SA. 1570 // 2*n1 must be not larger than n. 1571 uint_t n1=0; 1572 for(i=0; i<n; i++) 1573 if((SA[i]>0)) 1574 SA[n1++]=SA[i]; 1575 1576 uint_t *SA1=SA, *s1=SA+m-n1; 1577 uint_t name_ctr; 1578 name_ctr=nameSubstr_generalized(SA,s,s1,n,m,n1,level,cs,separator); 1579 1580 #if PHASES 1581 printf("phase 1:\n"); 1582 time_stop(t_start_phase, c_start_phase); 1583 #endif 1584 1585 #if PHASES 1586 t_start_phase = time(NULL); 1587 c_start_phase = clock(); 1588 #endif 1589 1590 // stage 2: solve the reduced problem. 1591 int_t depth=1; 1592 // recurse if names are not yet unique. 1593 if(name_ctr<n1) 1594 depth += SACA_K((int_t*)s1, SA1, 1595 n1, 0, m-n1, sizeof(int_t), level+1); 1596 else // get the suffix array of s1 directly. 1597 for(i=0; i<n1; i++) SA1[s1[i]]=i; 1598 1599 // stage 3: induce SA(S) from SA(S1). 1600 1601 getSAlms(SA, (int_t*)s, s1, n, n1, level, cs); 1602 1603 for(i=0; i<n1; i++) SA[i]=s1[SA[i]]; 1604 for(i=n1; i<n; i++) SA[i]=0; 1605 1606 putSuffix0_generalized(SA, s, bkt, n, K, n1, cs, separator); 1607 1608 #if PHASES 1609 printf("phase 2:\n"); 1610 time_stop(t_start_phase, c_start_phase); 1611 #endif 1612 1613 #if PHASES 1614 t_start_phase = time(NULL); 1615 c_start_phase = clock(); 1616 #endif 1617 1618 induceSAl0_generalized(SA, s, bkt, n, K, true, cs, separator); 1619 1620 #if PHASES 1621 printf("phase 3:\n"); 1622 time_stop(t_start_phase, c_start_phase); 1623 #endif 1624 1625 #if PHASES 1626 t_start_phase = time(NULL); 1627 c_start_phase = clock(); 1628 #endif 1629 1630 induceSAs0_generalized(SA, s, bkt, n, K, true, cs, separator); 1631 1632 free(bkt); 1633 1634 #if DEPTH 1635 printf("depth %" PRIdN ":\nname_ctr = %" PRIdN ", n1 =%" PRIdN ", n = %" PRIdN "\n", depth, name_ctr, n1, n); 1636 time_stop(t_start, c_start); 1637 #endif 1638 1639 #if PHASES 1640 printf("phase 4:\n"); 1641 time_stop(t_start_phase, c_start_phase); 1642 #endif 1643 1644 return depth; 1645 } 1646 1647 /*****************************************************************************/ 1648 1649 int_t gSACA_K_LCP(uint_t *s, uint_t *SA, int_t *LCP, 1650 uint_t n, unsigned int K, 1651 int cs, uint_t separator, int level) { 1652 uint_t i; 1653 uint_t *bkt=NULL; 1654 uint_t m=n; 1655 1656 #if PHASES 1657 time_t t_start_phase = 0.0; 1658 clock_t c_start_phase = 0.0; 1659 #endif 1660 1661 #if DEPTH 1662 time_t t_start = time(NULL); 1663 clock_t c_start = clock(); 1664 #endif 1665 1666 #if PHASES 1667 t_start_phase = time(NULL); 1668 c_start_phase = clock(); 1669 #endif 1670 // stage 1: reduce the problem by at least 1/2. 1671 1672 bkt=(uint_t *)malloc(sizeof(int_t)*K); 1673 putSubstr0_generalized(SA, s, bkt, n, K, cs, separator); 1674 1675 #if DEBUG 1676 printf("bucket LMS-subs\n"); 1677 for(i=0; i<n; i++) 1678 printf("%" PRIdN "\t", SA[i]+1); 1679 printf("\n"); 1680 #endif 1681 1682 induceSAl0_generalized(SA, s, bkt, n, K, false, cs, separator); 1683 1684 #if DEBUG 1685 printf("L-type\n"); 1686 for(i=0; i<n; i++) 1687 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 1688 else printf("-1\t"); 1689 printf("\n"); 1690 #endif 1691 1692 induceSAs0_generalized(SA, s, bkt, n, K, false, cs, separator); 1693 1694 #if DEBUG 1695 printf("S-type\n"); 1696 for(i=0; i<n; i++) 1697 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 1698 else printf("-1\t"); 1699 printf("\n"); 1700 #endif 1701 1702 // insert separator suffixes in their buckets 1703 // bkt[separator]=1; // gsa-is 1704 for(i=n-3; i>0; i--) 1705 if(chr(i)==separator) 1706 SA[bkt[chr(i)]--]=i; 1707 1708 #if DEBUG 1709 printf("S-type (separators)\n"); 1710 for(i=0; i<n; i++) 1711 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 1712 else printf("-1\t"); 1713 printf("\n"); 1714 #endif 1715 1716 // now, all the LMS-substrings are sorted and 1717 // stored sparsely in SA. 1718 1719 // compact all the sorted substrings into 1720 // the first n1 items of SA. 1721 // 2*n1 must be not larger than n. 1722 uint_t n1=0; 1723 for(i=0; i<n; i++) 1724 if((SA[i]>0)) 1725 SA[n1++]=SA[i]; 1726 1727 uint_t *SA1=SA, *s1=SA+m-n1; 1728 uint_t name_ctr; 1729 1730 #if DEBUG 1731 printf("\nSA\n"); 1732 for(i=0; i<n; i++) 1733 printf("%" PRIdN "\t", SA[i]+1); 1734 printf("\n"); 1735 printf("LCP\n"); 1736 for(i=0; i<n; i++) 1737 printf("%" PRIdN "\t", LCP[i]); 1738 printf("\n\n"); 1739 #endif 1740 1741 name_ctr=nameSubstr_generalized_LCP(SA,LCP,s,s1,n,m,n1,level,cs,separator); 1742 1743 #if DEBUG 1744 printf("nameSubstr:\n"); 1745 printf("SA\n"); 1746 for(i=0; i<n; i++) 1747 printf("%" PRIdN "\t", SA[i]+1); 1748 printf("\n"); 1749 printf("LCP\n"); 1750 for(i=0; i<n; i++) 1751 printf("%" PRIdN "\t", LCP[i]); 1752 printf("\n\n"); 1753 #endif 1754 1755 #if PHASES 1756 printf("phase 1:\n"); 1757 time_stop(t_start_phase, c_start_phase); 1758 #endif 1759 1760 #if PHASES 1761 t_start_phase = time(NULL); 1762 c_start_phase = clock(); 1763 #endif 1764 1765 // stage 2: solve the reduced problem. 1766 int_t depth=1; 1767 // recurse if names are not yet unique. 1768 if(name_ctr<n1) 1769 depth += SACA_K((int_t*)s1, SA1, 1770 n1, 0, m-n1, sizeof(int_t), level+1); 1771 else // get the suffix array of s1 directly. 1772 for(i=0; i<n1; i++) SA1[s1[i]]=i; 1773 1774 // stage 3: induce SA(S) from SA(S1). 1775 #if DEBUG 1776 printf("recursive:\n"); 1777 printf("SA\n"); 1778 for(i=0; i<n; i++) 1779 printf("%" PRIdN "\t", SA[i]+1); 1780 printf("\n"); 1781 printf("LCP\n"); 1782 for(i=0; i<n; i++) 1783 printf("%" PRIdN "\t", LCP[i]); 1784 printf("\n\n"); 1785 #endif 1786 1787 getSAlms(SA, (int_t*)s, s1, n, n1, level, cs); 1788 1789 #if DEBUG 1790 printf("getSAlms:\n"); 1791 printf("SA\n"); 1792 for(i=0; i<n; i++) 1793 printf("%" PRIdN "\t", SA[i]+1); 1794 printf("\n"); 1795 printf("LCP\n"); 1796 for(i=0; i<n; i++) 1797 printf("%" PRIdN "\t", LCP[i]); 1798 printf("\n\n"); 1799 #endif 1800 1801 uint_t *RA=s1; 1802 int_t *PLCP=LCP+m-n1;//PHI is stored in PLCP array 1803 1804 //compute the LCP of consecutive LMS-suffixes 1805 compute_lcp_phi_sparse((int_t*)s, SA1, RA, LCP, PLCP, n1, cs, separator); 1806 1807 #if DEBUG 1808 printf("\nPHI-algorithm:\n"); 1809 printf("--\nSA1\n"); 1810 for(i=0; i<n1; i++)//SA1 1811 printf("%" PRIdN "\t", SA1[i]+1); 1812 printf("| "); 1813 for(i=0; i<n1; i++)//s1 1814 printf("%" PRIdN "\t", RA[i]+1); 1815 printf("\n"); 1816 printf("LCP\n"); 1817 for(i=0; i<n1; i++) 1818 printf("%" PRIdN "\t", LCP[i]); 1819 printf("| "); 1820 for(i=0; i<n1; i++) 1821 printf("%" PRIdN "\t", PLCP[i]); 1822 printf("\n"); 1823 #endif 1824 1825 for(i=0; i<n1; i++) SA[i]=s1[SA[i]]; 1826 for(i=n1; i<n; i++) SA[i]=U_MAX; 1827 for(i=n1;i<n;i++) LCP[i]=0; 1828 1829 #if DEBUG 1830 printf("\nstage 3:\n\n"); 1831 printf("mapping back:\n"); 1832 printf("SA\n"); 1833 for(i=0; i<n; i++) 1834 printf("%" PRIdN "\t", SA[i]+1); 1835 printf("\n"); 1836 printf("LCP\n"); 1837 for(i=0; i<n; i++) 1838 printf("%" PRIdN "\t", LCP[i]); 1839 printf("\n\n"); 1840 #endif 1841 1842 /* 1843 if(!lcp_array_check((unsigned char*)s, SA1, LCP, n, cs, separator))printf("isNotLCP (lms)!!\n"); 1844 else printf("isLCP (lms)!!\n"); 1845 */ 1846 1847 putSuffix0_generalized_LCP(SA, LCP, s, bkt, n, K, n1, cs, separator); 1848 1849 #if PHASES 1850 if(!level){ 1851 printf("phase 2:\n"); 1852 time_stop(t_start_phase, c_start_phase); 1853 } 1854 #endif 1855 1856 #if PHASES 1857 if(!level){ 1858 t_start_phase = time(NULL); 1859 c_start_phase = clock(); 1860 } 1861 #endif 1862 1863 1864 #if DEBUG 1865 printf("SA (mapped)\n"); 1866 for(i=0; i<n; i++) 1867 printf("%" PRIdN "\t", SA[i]+1); 1868 printf("\n"); 1869 printf("LCP\n"); 1870 for(i=0; i<n; i++) 1871 printf("%" PRIdN "\t", LCP[i]); 1872 printf("\n\n"); 1873 #endif 1874 1875 induceSAl0_generalized_LCP(SA, LCP, s, bkt, n, K, cs, separator); 1876 1877 #if DEBUG 1878 printf("L-type\n"); 1879 for(i=0; i<n; i++) 1880 printf("%" PRIdN "\t", SA[i]+1); 1881 printf("\n"); 1882 printf("LCP\n"); 1883 for(i=0; i<n; i++) 1884 printf("%" PRIdN "\t", LCP[i]); 1885 printf("\n\n"); 1886 #endif 1887 1888 #if PHASES 1889 if(!level){ 1890 printf("phase 3:\n"); 1891 time_stop(t_start_phase, c_start_phase); 1892 } 1893 #endif 1894 1895 #if PHASES 1896 if(!level){ 1897 t_start_phase = time(NULL); 1898 c_start_phase = clock(); 1899 } 1900 #endif 1901 1902 induceSAs0_generalized_LCP(SA, LCP, s, bkt, n, K, cs, separator); 1903 1904 #if DEBUG 1905 printf("S-type\n"); 1906 for(i=0; i<n; i++) 1907 printf("%" PRIdN "\t", SA[i]+1); 1908 printf("\n"); 1909 printf("LCP\n"); 1910 for(i=0; i<n; i++) 1911 printf("%" PRIdN "\t", LCP[i]); 1912 printf("\n\n"); 1913 #endif 1914 free(bkt); 1915 1916 #if DEPTH 1917 printf("depth %" PRIdN ":\nname_ctr = %" PRIdN ", n1 =%" PRIdN ", n = %" PRIdN "\n", depth, name_ctr, n1, n); 1918 time_stop(t_start, c_start); 1919 #endif 1920 1921 #if PHASES 1922 if(!level){ 1923 printf("phase 4:\n"); 1924 time_stop(t_start_phase, c_start_phase); 1925 } 1926 #endif 1927 1928 return depth; 1929 } 1930 1931 /*****************************************************************************/ 1932 1933 int_t gSACA_K_DA(uint_t *s, uint_t *SA, int_da *DA, 1934 uint_t n, unsigned int K, 1935 int cs, uint_t separator, int level) { 1936 uint_t i; 1937 uint_t *bkt=NULL; 1938 uint_t m=n; 1939 1940 #if PHASES 1941 time_t t_start_phase = 0.0; 1942 clock_t c_start_phase = 0.0; 1943 #endif 1944 1945 #if DEPTH 1946 time_t t_start = time(NULL); 1947 clock_t c_start = clock(); 1948 #endif 1949 1950 #if PHASES 1951 t_start_phase = time(NULL); 1952 c_start_phase = clock(); 1953 #endif 1954 // stage 1: reduce the problem by at least 1/2. 1955 1956 bkt=(uint_t *)malloc(sizeof(int_t)*K); 1957 putSubstr0_generalized(SA, s, bkt, n, K, cs, separator); 1958 1959 #if DEBUG 1960 printf("bucket LMS-subs\n"); 1961 for(i=0; i<n; i++) 1962 printf("%" PRIdN "\t", SA[i]+1); 1963 printf("\n"); 1964 #endif 1965 1966 induceSAl0_generalized(SA, s, bkt, n, K, false, cs, separator); 1967 1968 #if DEBUG 1969 printf("L-type\n"); 1970 for(i=0; i<n; i++) 1971 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 1972 else printf("-1\t"); 1973 printf("\n"); 1974 #endif 1975 1976 induceSAs0_generalized(SA, s, bkt, n, K, false, cs, separator); 1977 1978 #if DEBUG 1979 printf("S-type\n"); 1980 for(i=0; i<n; i++) 1981 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 1982 else printf("-1\t"); 1983 printf("\n"); 1984 #endif 1985 1986 // insert separator suffixes in their buckets 1987 // bkt[separator]=1; // gsa-is 1988 for(i=n-3; i>0; i--) 1989 if(chr(i)==separator) 1990 SA[bkt[chr(i)]--]=i; 1991 1992 #if DEBUG 1993 printf("S-type (separators)\n"); 1994 for(i=0; i<n; i++) 1995 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 1996 else printf("-1\t"); 1997 printf("\n"); 1998 #endif 1999 2000 // now, all the LMS-substrings are sorted and 2001 // stored sparsely in SA. 2002 2003 // compact all the sorted substrings into 2004 // the first n1 items of SA. 2005 // 2*n1 must be not larger than n. 2006 uint_t n1=0; 2007 for(i=0; i<n; i++) 2008 if((SA[i]>0)) 2009 SA[n1++]=SA[i]; 2010 2011 uint_t *SA1=SA, *s1=SA+m-n1; 2012 uint_t name_ctr; 2013 2014 #if DEBUG 2015 printf("\nSA\n"); 2016 for(i=0; i<n; i++) 2017 printf("%" PRIdN "\t", SA[i]+1); 2018 printf("\n\n"); 2019 #endif 2020 2021 name_ctr=nameSubstr_generalized(SA,s,s1,n,m,n1,level,cs,separator); 2022 2023 #if DEBUG 2024 printf("nameSubstr:\n"); 2025 printf("SA\n"); 2026 for(i=0; i<n; i++) 2027 printf("%" PRIdN "\t", SA[i]+1); 2028 printf("\n\n"); 2029 #endif 2030 2031 #if PHASES 2032 printf("phase 1:\n"); 2033 time_stop(t_start_phase, c_start_phase); 2034 #endif 2035 2036 #if PHASES 2037 t_start_phase = time(NULL); 2038 c_start_phase = clock(); 2039 #endif 2040 2041 // stage 2: solve the reduced problem. 2042 int_t depth=1; 2043 // recurse if names are not yet unique. 2044 if(name_ctr<n1) 2045 depth += SACA_K((int_t*)s1, SA1, 2046 n1, 0, m-n1, sizeof(int_t), level+1); 2047 else // get the suffix array of s1 directly. 2048 for(i=0; i<n1; i++) SA1[s1[i]]=i; 2049 2050 // stage 3: induce SA(S) from SA(S1). 2051 #if DEBUG 2052 printf("recursive:\n"); 2053 printf("SA\n"); 2054 for(i=0; i<n; i++) 2055 printf("%" PRIdN "\t", SA[i]+1); 2056 printf("\n\n"); 2057 #endif 2058 2059 int_da *d1=DA+m-n1; 2060 2061 getSAlms_DA(SA, d1, (int_t*)s, s1, n, n1, level, cs, separator); 2062 2063 #if DEBUG 2064 printf("getSAlms:\n"); 2065 printf("SA\n"); 2066 for(i=0; i<n; i++) 2067 printf("%" PRIdN "\t", SA[i]+1); 2068 printf("\n"); 2069 printf("DA\n"); 2070 for(i=0; i<n; i++) 2071 printf("%" PRIdA "\t", DA[i]); 2072 printf("\n\n"); 2073 #endif 2074 2075 /**/ 2076 for(i=0; i<n1; i++) {DA[i]=d1[SA[i]]; SA[i]=s1[SA[i]];} 2077 for(i=n1; i<n; i++) {SA[i]=0; DA[i]=0;} 2078 /**/ 2079 2080 #if DEBUG 2081 printf("\nstage 3:\n\n"); 2082 printf("mapping back:\n"); 2083 printf("SA\n"); 2084 for(i=0; i<n; i++) 2085 printf("%" PRIdN "\t", SA[i]+1); 2086 printf("\n"); 2087 printf("DA\n"); 2088 for(i=0; i<n; i++) 2089 printf("%" PRIdA "\t", DA[i]); 2090 printf("\n\n"); 2091 #endif 2092 2093 /**/ 2094 putSuffix0_generalized_DA(SA, DA, s, bkt, n, K, n1, cs, separator); 2095 2096 /**/ 2097 2098 #if PHASES 2099 if(!level){ 2100 printf("phase 2:\n"); 2101 time_stop(t_start_phase, c_start_phase); 2102 } 2103 #endif 2104 2105 #if PHASES 2106 if(!level){ 2107 t_start_phase = time(NULL); 2108 c_start_phase = clock(); 2109 } 2110 #endif 2111 2112 2113 #if DEBUG 2114 printf("SA (mapped)\n"); 2115 for(i=0; i<n; i++) 2116 printf("%" PRIdN "\t", SA[i]+1); 2117 printf("\n"); 2118 printf("DA\n"); 2119 for(i=0; i<n; i++) 2120 printf("%" PRIdA "\t", DA[i]); 2121 printf("\n\n"); 2122 #endif 2123 2124 /**/ 2125 induceSAl0_generalized_DA(SA, DA, s, bkt, n, K, cs, separator); 2126 /**/ 2127 2128 #if DEBUG 2129 printf("L-type\n"); 2130 for(i=0; i<n; i++) 2131 printf("%" PRIdN "\t", SA[i]+1); 2132 printf("\n"); 2133 printf("DA\n"); 2134 for(i=0; i<n; i++) 2135 printf("%" PRIdA "\t", DA[i]); 2136 printf("\n\n"); 2137 #endif 2138 2139 #if PHASES 2140 if(!level){ 2141 printf("phase 3:\n"); 2142 time_stop(t_start_phase, c_start_phase); 2143 } 2144 #endif 2145 2146 #if PHASES 2147 if(!level){ 2148 t_start_phase = time(NULL); 2149 c_start_phase = clock(); 2150 } 2151 #endif 2152 2153 /**/ 2154 induceSAs0_generalized_DA(SA, DA, s, bkt, n, K, cs, separator); 2155 /**/ 2156 2157 #if DEBUG 2158 printf("S-type\n"); 2159 for(i=0; i<n; i++) 2160 printf("%" PRIdN "\t", SA[i]+1); 2161 printf("\n"); 2162 printf("DA\n"); 2163 for(i=0; i<n; i++) 2164 printf("%" PRIdA "\t", DA[i]); 2165 printf("\n\n"); 2166 #endif 2167 free(bkt); 2168 2169 #if DEPTH 2170 printf("depth %" PRIdN ":\nname_ctr = %" PRIdN ", n1 =%" PRIdN ", n = %" PRIdN "\n", depth, name_ctr, n1, n); 2171 time_stop(t_start, c_start); 2172 #endif 2173 2174 #if PHASES 2175 if(!level){ 2176 printf("phase 4:\n"); 2177 time_stop(t_start_phase, c_start_phase); 2178 } 2179 #endif 2180 2181 return depth; 2182 } 2183 2184 /*****************************************************************************/ 2185 2186 int_t gSACA_K_LCP_DA(uint_t *s, uint_t *SA, int_t *LCP, int_da *DA, 2187 uint_t n, unsigned int K, 2188 int cs, uint_t separator, int level) { 2189 uint_t i; 2190 uint_t *bkt=NULL; 2191 uint_t m=n; 2192 2193 #if PHASES 2194 time_t t_start_phase = 0.0; 2195 clock_t c_start_phase = 0.0; 2196 #endif 2197 2198 #if DEPTH 2199 time_t t_start = time(NULL); 2200 clock_t c_start = clock(); 2201 #endif 2202 2203 #if PHASES 2204 t_start_phase = time(NULL); 2205 c_start_phase = clock(); 2206 #endif 2207 // stage 1: reduce the problem by at least 1/2. 2208 2209 bkt=(uint_t *)malloc(sizeof(int_t)*K); 2210 putSubstr0_generalized(SA, s, bkt, n, K, cs, separator); 2211 2212 #if DEBUG 2213 printf("bucket LMS-subs\n"); 2214 for(i=0; i<n; i++) 2215 printf("%" PRIdN "\t", SA[i]+1); 2216 printf("\n"); 2217 #endif 2218 2219 induceSAl0_generalized(SA, s, bkt, n, K, false, cs, separator); 2220 2221 #if DEBUG 2222 printf("L-type\n"); 2223 for(i=0; i<n; i++) 2224 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 2225 else printf("-1\t"); 2226 printf("\n"); 2227 #endif 2228 2229 induceSAs0_generalized(SA, s, bkt, n, K, false, cs, separator); 2230 2231 #if DEBUG 2232 printf("S-type\n"); 2233 for(i=0; i<n; i++) 2234 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 2235 else printf("-1\t"); 2236 printf("\n"); 2237 #endif 2238 2239 // insert separator suffixes in their buckets 2240 // bkt[separator]=1; // gsa-is 2241 for(i=n-3; i>0; i--) 2242 if(chr(i)==separator) 2243 SA[bkt[chr(i)]--]=i; 2244 2245 #if DEBUG 2246 printf("S-type (separators)\n"); 2247 for(i=0; i<n; i++) 2248 if(SA[i]!=0) printf("%" PRIdN "\t", SA[i]+1); 2249 else printf("-1\t"); 2250 printf("\n"); 2251 #endif 2252 2253 // now, all the LMS-substrings are sorted and 2254 // stored sparsely in SA. 2255 2256 // compact all the sorted substrings into 2257 // the first n1 items of SA. 2258 // 2*n1 must be not larger than n. 2259 uint_t n1=0; 2260 for(i=0; i<n; i++) 2261 if((SA[i]>0)) 2262 SA[n1++]=SA[i]; 2263 2264 uint_t *SA1=SA, *s1=SA+m-n1; 2265 uint_t name_ctr; 2266 2267 #if DEBUG 2268 printf("\nSA\n"); 2269 for(i=0; i<n; i++) 2270 printf("%" PRIdN "\t", SA[i]+1); 2271 printf("\n"); 2272 printf("LCP\n"); 2273 for(i=0; i<n; i++) 2274 printf("%" PRIdN "\t", LCP[i]); 2275 printf("\n\n"); 2276 #endif 2277 2278 name_ctr=nameSubstr_generalized_LCP(SA,LCP,s,s1,n,m,n1,level,cs,separator); 2279 2280 #if DEBUG 2281 printf("nameSubstr:\n"); 2282 printf("SA\n"); 2283 for(i=0; i<n; i++) 2284 printf("%" PRIdN "\t", SA[i]+1); 2285 printf("\n"); 2286 printf("LCP\n"); 2287 for(i=0; i<n; i++) 2288 printf("%" PRIdN "\t", LCP[i]); 2289 printf("\n\n"); 2290 #endif 2291 2292 #if PHASES 2293 printf("phase 1:\n"); 2294 time_stop(t_start_phase, c_start_phase); 2295 #endif 2296 2297 #if PHASES 2298 t_start_phase = time(NULL); 2299 c_start_phase = clock(); 2300 #endif 2301 2302 // stage 2: solve the reduced problem. 2303 int_t depth=1; 2304 // recurse if names are not yet unique. 2305 if(name_ctr<n1) 2306 depth += SACA_K((int_t*)s1, SA1, 2307 n1, 0, m-n1, sizeof(int_t), level+1); 2308 else // get the suffix array of s1 directly. 2309 for(i=0; i<n1; i++) SA1[s1[i]]=i; 2310 2311 // stage 3: induce SA(S) from SA(S1). 2312 #if DEBUG 2313 printf("recursive:\n"); 2314 printf("SA\n"); 2315 for(i=0; i<n; i++) 2316 printf("%" PRIdN "\t", SA[i]+1); 2317 printf("\n"); 2318 printf("LCP\n"); 2319 for(i=0; i<n; i++) 2320 printf("%" PRIdN "\t", LCP[i]); 2321 printf("\n\n"); 2322 #endif 2323 2324 int_da *d1=DA+m-n1; 2325 2326 getSAlms_DA(SA, d1, (int_t*)s, s1, n, n1, level, cs, separator); 2327 2328 2329 //FELIPE getSAlms(SA, (int_t*)s, s1, n, n1, level, cs); 2330 2331 #if DEBUG 2332 printf("getSAlms:\n"); 2333 printf("SA\n"); 2334 for(i=0; i<n; i++) 2335 printf("%" PRIdN "\t", SA[i]+1); 2336 printf("\n"); 2337 printf("LCP\n"); 2338 for(i=0; i<n; i++) 2339 printf("%" PRIdN "\t", LCP[i]); 2340 printf("\n"); 2341 printf("DA\n"); 2342 for(i=0; i<n; i++) 2343 printf("%" PRIdA "\t", DA[i]); 2344 printf("\n\n"); 2345 #endif 2346 2347 uint_t *RA=s1; 2348 int_t *PLCP=LCP+m-n1;//PHI is stored in PLCP array 2349 2350 //compute the LCP of consecutive LMS-suffixes 2351 compute_lcp_phi_sparse((int_t*)s, SA1, RA, LCP, PLCP, n1, cs, separator); 2352 2353 #if DEBUG 2354 printf("\nPHI-algorithm:\n"); 2355 printf("--\nSA1\n"); 2356 for(i=0; i<n1; i++)//SA1 2357 printf("%" PRIdN "\t", SA1[i]+1); 2358 printf("| "); 2359 for(i=0; i<n1; i++)//s1 2360 printf("%" PRIdN "\t", RA[i]+1); 2361 printf("\n"); 2362 printf("LCP\n"); 2363 for(i=0; i<n1; i++) 2364 printf("%" PRIdN "\t", LCP[i]); 2365 printf("| "); 2366 for(i=0; i<n1; i++) 2367 printf("%" PRIdN "\t", PLCP[i]); 2368 printf("\n"); 2369 #endif 2370 2371 for(i=0; i<n1; i++) {DA[i]=d1[SA[i]]; SA[i]=s1[SA[i]];} 2372 for(i=n1; i<n; i++) {SA[i]=U_MAX; DA[i]=0;} 2373 for(i=n1;i<n;i++) LCP[i]=0; 2374 2375 //DA for(i=n1; i<n; i++) {SA[i]=0; DA[i]=-1;} 2376 2377 #if DEBUG 2378 printf("\nstage 3:\n\n"); 2379 printf("mapping back:\n"); 2380 printf("SA\n"); 2381 for(i=0; i<n; i++) 2382 printf("%" PRIdN "\t", SA[i]+1); 2383 printf("\n"); 2384 printf("LCP\n"); 2385 for(i=0; i<n; i++) 2386 printf("%" PRIdN "\t", LCP[i]); 2387 printf("\n"); 2388 printf("DA\n"); 2389 for(i=0; i<n; i++) 2390 printf("%" PRIdA "\t", DA[i]); 2391 printf("\n\n"); 2392 #endif 2393 2394 /* 2395 if(!lcp_array_check((unsigned char*)s, SA1, LCP, n, cs, separator))printf("isNotLCP (lms)!!\n"); 2396 else printf("isLCP (lms)!!\n"); 2397 */ 2398 2399 putSuffix0_generalized_LCP_DA(SA, LCP, DA, s, bkt, n, K, n1, cs, separator); 2400 2401 #if PHASES 2402 if(!level){ 2403 printf("phase 2:\n"); 2404 time_stop(t_start_phase, c_start_phase); 2405 } 2406 #endif 2407 2408 #if PHASES 2409 if(!level){ 2410 t_start_phase = time(NULL); 2411 c_start_phase = clock(); 2412 } 2413 #endif 2414 2415 2416 #if DEBUG 2417 printf("SA (mapped)\n"); 2418 for(i=0; i<n; i++) 2419 printf("%" PRIdN "\t", SA[i]+1); 2420 printf("\n"); 2421 printf("LCP\n"); 2422 for(i=0; i<n; i++) 2423 printf("%" PRIdN "\t", LCP[i]); 2424 printf("\n\n"); 2425 #endif 2426 2427 induceSAl0_generalized_LCP_DA(SA, LCP, DA, s, bkt, n, K, cs, separator); 2428 2429 #if DEBUG 2430 printf("L-type\n"); 2431 for(i=0; i<n; i++) 2432 printf("%" PRIdN "\t", SA[i]+1); 2433 printf("\n"); 2434 printf("LCP\n"); 2435 for(i=0; i<n; i++) 2436 printf("%" PRIdN "\t", LCP[i]); 2437 printf("\n"); 2438 printf("DA\n"); 2439 for(i=0; i<n; i++) 2440 printf("%" PRIdA "\t", DA[i]); 2441 printf("\n\n"); 2442 #endif 2443 2444 #if PHASES 2445 if(!level){ 2446 printf("phase 3:\n"); 2447 time_stop(t_start_phase, c_start_phase); 2448 } 2449 #endif 2450 2451 #if PHASES 2452 if(!level){ 2453 t_start_phase = time(NULL); 2454 c_start_phase = clock(); 2455 } 2456 #endif 2457 2458 induceSAs0_generalized_LCP_DA(SA, LCP, DA, s, bkt, n, K, cs, separator); 2459 2460 #if DEBUG 2461 printf("S-type\n"); 2462 for(i=0; i<n; i++) 2463 printf("%" PRIdN "\t", SA[i]+1); 2464 printf("\n"); 2465 printf("LCP\n"); 2466 for(i=0; i<n; i++) 2467 printf("%" PRIdN "\t", LCP[i]); 2468 printf("\n"); 2469 printf("DA\n"); 2470 for(i=0; i<n; i++) 2471 printf("%" PRIdA "\t", DA[i]); 2472 printf("\n\n"); 2473 #endif 2474 free(bkt); 2475 2476 #if DEPTH 2477 printf("depth %" PRIdN ":\nname_ctr = %" PRIdN ", n1 =%" PRIdN ", n = %" PRIdN "\n", depth, name_ctr, n1, n); 2478 time_stop(t_start, c_start); 2479 #endif 2480 2481 #if PHASES 2482 if(!level){ 2483 printf("phase 4:\n"); 2484 time_stop(t_start_phase, c_start_phase); 2485 } 2486 #endif 2487 2488 return depth; 2489 } 2490 2491 /*****************************************************************************/ 2492 2493 2494 int sacak(unsigned char *s, uint_t *SA, uint_t n){ 2495 if((s == NULL) || (SA == NULL) || (n < 0)) return -1; 2496 return SACA_K((int_t*)s, (uint_t*)SA, n, 256, n, sizeof(char), 0); 2497 } 2498 2499 int sacak_int(int_text *s, uint_t *SA, uint_t n, uint_t k){ 2500 if((s == NULL) || (SA == NULL) || (n < 0)) return -1; 2501 return SACA_K((int_t*)s, (uint_t*)SA, n, k, n, sizeof(int_text), 0); 2502 } 2503 2504 int gsacak(unsigned char *s, uint_t *SA, int_t *LCP, int_da *DA, uint_t n){ 2505 2506 if((s == NULL) || (SA == NULL) || (n < 0)) return -1; 2507 2508 #if EMPTY_STRING 2509 for(i=0; i<n-1; i++) if(s[i]==1 && s[i+1]==1) return -2; 2510 #endif 2511 2512 if((LCP == NULL) && (DA == NULL)) 2513 return gSACA_K((uint_t*)s, SA, n, 256, sizeof(char), 1, 0); 2514 else if (DA == NULL) 2515 return gSACA_K_LCP((uint_t*)s, SA, LCP, n, 256, sizeof(char), 1, 0); 2516 else if (LCP == NULL) 2517 return gSACA_K_DA((uint_t*)s, SA, DA, n, 256, sizeof(char), 1, 0); 2518 else 2519 return gSACA_K_LCP_DA((uint_t*)s, SA, LCP, DA, n, 256, sizeof(char), 1, 0); 2520 } 2521 2522 int gsacak_int(int_text *s, uint_t *SA, int_t *LCP, int_da *DA, uint_t n, uint_t k){ 2523 2524 if((s == NULL) || (SA == NULL) || (n < 0)) return -1; 2525 2526 if((LCP == NULL) && (DA == NULL)) 2527 return gSACA_K((uint_t*)s, SA, n, k, sizeof(int_text), 1, 0); 2528 else if (DA == NULL) 2529 return gSACA_K_LCP((uint_t*)s, SA, LCP, n, k, sizeof(int_text), 1, 0); 2530 else if (LCP == NULL) 2531 return gSACA_K_DA((uint_t*)s, SA, DA, n, k, sizeof(int_text), 1, 0); 2532 else 2533 return gSACA_K_LCP_DA((uint_t*)s, SA, LCP, DA, n, k, sizeof(int_text), 1, 0); 2534 } 2535 2536 /*****************************************************************************/