github.com/SaadMTSA/goreporter@v0.0.0-20200505121753-0437ee0c8f64/linters/copycheck/suffixtree/suffixtree.go (about)

     1  package suffixtree
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"log"
     7  	"math"
     8  	"strings"
     9  )
    10  
// infinity is the sentinel end position given to transitions created by
// fork: such a transition has no fixed end, and ActEnd resolves it to the
// last index of the data currently held by the tree.
const infinity = math.MaxInt32
    12  
// Pos denotes a position (index) in the tree's data slice. It is a 32-bit
// signed integer, so positions are limited to the int32 range.
type Pos int32
    15  
// Token is a single element of the sequence indexed by the suffix tree.
// Within this package two tokens are treated as the same symbol when their
// Val results are equal.
type Token interface {
	// Val returns the integer value that identifies the token.
	Val() int
}
    19  
// STree is a struct representing a suffix tree. It stores the tokens
// appended so far through Update, the root state, an auxiliary state that
// New installs as the root's suffix link, and the active point from which
// the next update step starts.
type STree struct {
	data     []Token
	root     *state
	auxState *state // auxiliary state; canonize maps it back onto the root

	// active point: the reference pair (s, (start, end)) that update and
	// canonize read and advance
	s          *state
	start, end Pos
}
    30  
    31  // New creates new suffix tree.
    32  func New() *STree {
    33  	t := new(STree)
    34  	t.data = make([]Token, 0, 50)
    35  	t.root = newState(t)
    36  	t.auxState = newState(t)
    37  	t.root.linkState = t.auxState
    38  	t.s = t.root
    39  	return t
    40  }
    41  
    42  // Update refreshes the suffix tree to by new data.
    43  func (t *STree) Update(data ...Token) {
    44  	t.data = append(t.data, data...)
    45  	for range data {
    46  		t.update()
    47  		t.s, t.start = t.canonize(t.s, t.start, t.end)
    48  		t.end++
    49  	}
    50  }
    51  
    52  // update transforms suffix tree T(n) to T(n+1).
    53  func (t *STree) update() {
    54  	oldr := t.root
    55  
    56  	// (s, (start, end)) is the canonical reference pair for the active point
    57  	s := t.s
    58  	start, end := t.start, t.end
    59  	var r *state
    60  	for {
    61  		var endPoint bool
    62  		r, endPoint = t.testAndSplit(s, start, end-1)
    63  		if endPoint {
    64  			break
    65  		}
    66  		r.fork(end)
    67  		if oldr != t.root {
    68  			oldr.linkState = r
    69  		}
    70  		oldr = r
    71  		s, start = t.canonize(s.linkState, start, end-1)
    72  	}
    73  	if oldr != t.root {
    74  		oldr.linkState = r
    75  	}
    76  
    77  	// update active point
    78  	t.s = s
    79  	t.start = start
    80  }
    81  
    82  // testAndSplit tests whether a state with canonical ref. pair
    83  // (s, (start, end)) is the end point, that is, a state that have
    84  // a c-transition. If not, then state (exs, (start, end)) is made
    85  // explicit (if not already so).
    86  func (t *STree) testAndSplit(s *state, start, end Pos) (exs *state, endPoint bool) {
    87  	c := t.data[t.end]
    88  	if start <= end {
    89  		tr := s.findTran(t.data[start])
    90  		splitPoint := tr.start + end - start + 1
    91  		if t.data[splitPoint].Val() == c.Val() {
    92  			return s, true
    93  		}
    94  		// make the (s, (start, end)) state explicit
    95  		newSt := newState(s.tree)
    96  		newSt.addTran(splitPoint, tr.end, tr.state)
    97  		tr.end = splitPoint - 1
    98  		tr.state = newSt
    99  		return newSt, false
   100  	}
   101  	if s == t.auxState || s.findTran(c) != nil {
   102  		return s, true
   103  	}
   104  	return s, false
   105  }
   106  
   107  // canonize returns updated state and start position for ref. pair
   108  // (s, (start, end)) of state r so the new ref. pair is canonical,
   109  // that is, referenced from the closest explicit ancestor of r.
   110  func (t *STree) canonize(s *state, start, end Pos) (*state, Pos) {
   111  	if s == t.auxState {
   112  		s, start = t.root, start+1
   113  	}
   114  	if start > end {
   115  		return s, start
   116  	}
   117  
   118  	var tr *tran
   119  	for {
   120  		if start <= end {
   121  			tr = s.findTran(t.data[start])
   122  			if tr == nil {
   123  				log.Fatal(fmt.Sprintf("there should be some transition for '%d' at %d",
   124  					t.data[start].Val(), start))
   125  			}
   126  		}
   127  		if tr.end-tr.start > end-start {
   128  			break
   129  		}
   130  		start += tr.end - tr.start + 1
   131  		s = tr.state
   132  	}
   133  	if s == nil {
   134  		log.Fatal("there should always be some suffix link resolution")
   135  	}
   136  	return s, start
   137  }
   138  
   139  func (t *STree) At(p Pos) Token {
   140  	if p < 0 || p >= Pos(len(t.data)) {
   141  		log.Fatal("position out of bounds")
   142  	}
   143  	return t.data[p]
   144  }
   145  
   146  func (t *STree) String() string {
   147  	buf := new(bytes.Buffer)
   148  	printState(buf, t.root, 0)
   149  	return buf.String()
   150  }
   151  
   152  func printState(buf *bytes.Buffer, s *state, ident int) {
   153  	for _, tr := range s.trans {
   154  		fmt.Fprint(buf, strings.Repeat("  ", ident))
   155  		fmt.Fprintf(buf, "* (%d, %d)\n", tr.start, tr.ActEnd())
   156  		printState(buf, tr.state, ident+1)
   157  	}
   158  }
   159  
// state is an explicit state of the suffix tree. It keeps a reference to
// the owning tree, the transitions leaving the state, and linkState, the
// link that update follows to reach the next state to process.
type state struct {
	tree      *STree
	trans     []*tran
	linkState *state
}
   166  
   167  func newState(t *STree) *state {
   168  	return &state{
   169  		tree:      t,
   170  		trans:     make([]*tran, 0),
   171  		linkState: nil,
   172  	}
   173  }
   174  
   175  func (s *state) addTran(start, end Pos, r *state) {
   176  	s.trans = append(s.trans, newTran(start, end, r))
   177  }
   178  
   179  // fork creates a new branch from the state s.
   180  func (s *state) fork(i Pos) *state {
   181  	r := newState(s.tree)
   182  	s.addTran(i, infinity, r)
   183  	return r
   184  }
   185  
   186  // findTran finds c-transition.
   187  func (s *state) findTran(c Token) *tran {
   188  	for _, tran := range s.trans {
   189  		if s.tree.data[tran.start].Val() == c.Val() {
   190  			return tran
   191  		}
   192  	}
   193  	return nil
   194  }
   195  
// tran represents a state's transition: it spans positions start through
// end of the tree's data and leads to state. An end equal to infinity marks
// a transition without a fixed end; ActEnd resolves it against the current
// data length.
type tran struct {
	start, end Pos
	state      *state
}
   201  
   202  func newTran(start, end Pos, s *state) *tran {
   203  	return &tran{start, end, s}
   204  }
   205  
   206  func (t *tran) len() int {
   207  	return int(t.end - t.start + 1)
   208  }
   209  
   210  // ActEnd returns actual end position as consistent with
   211  // the actual length of the data in the STree.
   212  func (t *tran) ActEnd() Pos {
   213  	if t.end == infinity {
   214  		return Pos(len(t.state.tree.data)) - 1
   215  	}
   216  	return t.end
   217  }