github.com/zach-klippenstein/go@v0.0.0-20150108044943-fcfbeb3adf58/src/index/suffixarray/suffixarray.go

github.com/zach-klippenstein/go@v0.0.0-20150108044943-fcfbeb3adf58/src/index/suffixarray/suffixarray.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package suffixarray implements substring search in logarithmic time using
     6  // an in-memory suffix array.
     7  //
     8  // Example use:
     9  //
    10  //	// create index for some data
    11  //	index := suffixarray.New(data)
    12  //
    13  //	// lookup byte slice s
    14  //	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
    15  //	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
    16  //
    17  package suffixarray
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"io"
    23  	"regexp"
    24  	"sort"
    25  )
    26  
    27  // Index implements a suffix array for fast substring search.
    28  type Index struct {
    29  	data []byte
    30  	sa   []int // suffix array for data; len(sa) == len(data)
    31  }
    32  
    33  // New creates a new Index for data.
    34  // Index creation time is O(N*log(N)) for N = len(data).
    35  func New(data []byte) *Index {
    36  	return &Index{data, qsufsort(data)}
    37  }
    38  
    39  // writeInt writes an int x to w using buf to buffer the write.
    40  func writeInt(w io.Writer, buf []byte, x int) error {
    41  	binary.PutVarint(buf, int64(x))
    42  	_, err := w.Write(buf[0:binary.MaxVarintLen64])
    43  	return err
    44  }
    45  
    46  // readInt reads an int x from r using buf to buffer the read and returns x.
    47  func readInt(r io.Reader, buf []byte) (int, error) {
    48  	_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
    49  	x, _ := binary.Varint(buf)
    50  	return int(x), err
    51  }
    52  
    53  // writeSlice writes data[:n] to w and returns n.
    54  // It uses buf to buffer the write.
    55  func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
    56  	// encode as many elements as fit into buf
    57  	p := binary.MaxVarintLen64
    58  	for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
    59  		p += binary.PutUvarint(buf[p:], uint64(data[n]))
    60  	}
    61  
    62  	// update buffer size
    63  	binary.PutVarint(buf, int64(p))
    64  
    65  	// write buffer
    66  	_, err = w.Write(buf[0:p])
    67  	return
    68  }
    69  
    70  // readSlice reads data[:n] from r and returns n.
    71  // It uses buf to buffer the read.
    72  func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
    73  	// read buffer size
    74  	var size int
    75  	size, err = readInt(r, buf)
    76  	if err != nil {
    77  		return
    78  	}
    79  
    80  	// read buffer w/o the size
    81  	if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
    82  		return
    83  	}
    84  
    85  	// decode as many elements as present in buf
    86  	for p := binary.MaxVarintLen64; p < size; n++ {
    87  		x, w := binary.Uvarint(buf[p:])
    88  		data[n] = int(x)
    89  		p += w
    90  	}
    91  
    92  	return
    93  }
    94  
    95  const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
    96  
    97  // Read reads the index from r into x; x must not be nil.
    98  func (x *Index) Read(r io.Reader) error {
    99  	// buffer for all reads
   100  	buf := make([]byte, bufSize)
   101  
   102  	// read length
   103  	n, err := readInt(r, buf)
   104  	if err != nil {
   105  		return err
   106  	}
   107  
   108  	// allocate space
   109  	if 2*n < cap(x.data) || cap(x.data) < n {
   110  		// new data is significantly smaller or larger then
   111  		// existing buffers - allocate new ones
   112  		x.data = make([]byte, n)
   113  		x.sa = make([]int, n)
   114  	} else {
   115  		// re-use existing buffers
   116  		x.data = x.data[0:n]
   117  		x.sa = x.sa[0:n]
   118  	}
   119  
   120  	// read data
   121  	if _, err := io.ReadFull(r, x.data); err != nil {
   122  		return err
   123  	}
   124  
   125  	// read index
   126  	for sa := x.sa; len(sa) > 0; {
   127  		n, err := readSlice(r, buf, sa)
   128  		if err != nil {
   129  			return err
   130  		}
   131  		sa = sa[n:]
   132  	}
   133  	return nil
   134  }
   135  
   136  // Write writes the index x to w.
   137  func (x *Index) Write(w io.Writer) error {
   138  	// buffer for all writes
   139  	buf := make([]byte, bufSize)
   140  
   141  	// write length
   142  	if err := writeInt(w, buf, len(x.data)); err != nil {
   143  		return err
   144  	}
   145  
   146  	// write data
   147  	if _, err := w.Write(x.data); err != nil {
   148  		return err
   149  	}
   150  
   151  	// write index
   152  	for sa := x.sa; len(sa) > 0; {
   153  		n, err := writeSlice(w, buf, sa)
   154  		if err != nil {
   155  			return err
   156  		}
   157  		sa = sa[n:]
   158  	}
   159  	return nil
   160  }
   161  
   162  // Bytes returns the data over which the index was created.
   163  // It must not be modified.
   164  //
   165  func (x *Index) Bytes() []byte {
   166  	return x.data
   167  }
   168  
   169  func (x *Index) at(i int) []byte {
   170  	return x.data[x.sa[i]:]
   171  }
   172  
   173  // lookupAll returns a slice into the matching region of the index.
   174  // The runtime is O(log(N)*len(s)).
   175  func (x *Index) lookupAll(s []byte) []int {
   176  	// find matching suffix index range [i:j]
   177  	// find the first index where s would be the prefix
   178  	i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
   179  	// starting at i, find the first index at which s is not a prefix
   180  	j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
   181  	return x.sa[i:j]
   182  }
   183  
   184  // Lookup returns an unsorted list of at most n indices where the byte string s
   185  // occurs in the indexed data. If n < 0, all occurrences are returned.
   186  // The result is nil if s is empty, s is not found, or n == 0.
   187  // Lookup time is O(log(N)*len(s) + len(result)) where N is the
   188  // size of the indexed data.
   189  //
   190  func (x *Index) Lookup(s []byte, n int) (result []int) {
   191  	if len(s) > 0 && n != 0 {
   192  		matches := x.lookupAll(s)
   193  		if n < 0 || len(matches) < n {
   194  			n = len(matches)
   195  		}
   196  		// 0 <= n <= len(matches)
   197  		if n > 0 {
   198  			result = make([]int, n)
   199  			copy(result, matches)
   200  		}
   201  	}
   202  	return
   203  }
   204  
   205  // FindAllIndex returns a sorted list of non-overlapping matches of the
   206  // regular expression r, where a match is a pair of indices specifying
   207  // the matched slice of x.Bytes(). If n < 0, all matches are returned
   208  // in successive order. Otherwise, at most n matches are returned and
   209  // they may not be successive. The result is nil if there are no matches,
   210  // or if n == 0.
   211  //
   212  func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
   213  	// a non-empty literal prefix is used to determine possible
   214  	// match start indices with Lookup
   215  	prefix, complete := r.LiteralPrefix()
   216  	lit := []byte(prefix)
   217  
   218  	// worst-case scenario: no literal prefix
   219  	if prefix == "" {
   220  		return r.FindAllIndex(x.data, n)
   221  	}
   222  
   223  	// if regexp is a literal just use Lookup and convert its
   224  	// result into match pairs
   225  	if complete {
   226  		// Lookup returns indices that may belong to overlapping matches.
   227  		// After eliminating them, we may end up with fewer than n matches.
   228  		// If we don't have enough at the end, redo the search with an
   229  		// increased value n1, but only if Lookup returned all the requested
   230  		// indices in the first place (if it returned fewer than that then
   231  		// there cannot be more).
   232  		for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
   233  			indices := x.Lookup(lit, n1)
   234  			if len(indices) == 0 {
   235  				return
   236  			}
   237  			sort.Ints(indices)
   238  			pairs := make([]int, 2*len(indices))
   239  			result = make([][]int, len(indices))
   240  			count := 0
   241  			prev := 0
   242  			for _, i := range indices {
   243  				if count == n {
   244  					break
   245  				}
   246  				// ignore indices leading to overlapping matches
   247  				if prev <= i {
   248  					j := 2 * count
   249  					pairs[j+0] = i
   250  					pairs[j+1] = i + len(lit)
   251  					result[count] = pairs[j : j+2]
   252  					count++
   253  					prev = i + len(lit)
   254  				}
   255  			}
   256  			result = result[0:count]
   257  			if len(result) >= n || len(indices) != n1 {
   258  				// found all matches or there's no chance to find more
   259  				// (n and n1 can be negative)
   260  				break
   261  			}
   262  		}
   263  		if len(result) == 0 {
   264  			result = nil
   265  		}
   266  		return
   267  	}
   268  
   269  	// regexp has a non-empty literal prefix; Lookup(lit) computes
   270  	// the indices of possible complete matches; use these as starting
   271  	// points for anchored searches
   272  	// (regexp "^" matches beginning of input, not beginning of line)
   273  	r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
   274  
   275  	// same comment about Lookup applies here as in the loop above
   276  	for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
   277  		indices := x.Lookup(lit, n1)
   278  		if len(indices) == 0 {
   279  			return
   280  		}
   281  		sort.Ints(indices)
   282  		result = result[0:0]
   283  		prev := 0
   284  		for _, i := range indices {
   285  			if len(result) == n {
   286  				break
   287  			}
   288  			m := r.FindIndex(x.data[i:]) // anchored search - will not run off
   289  			// ignore indices leading to overlapping matches
   290  			if m != nil && prev <= i {
   291  				m[0] = i // correct m
   292  				m[1] += i
   293  				result = append(result, m)
   294  				prev = m[1]
   295  			}
   296  		}
   297  		if len(result) >= n || len(indices) != n1 {
   298  			// found all matches or there's no chance to find more
   299  			// (n and n1 can be negative)
   300  			break
   301  		}
   302  	}
   303  	if len(result) == 0 {
   304  		result = nil
   305  	}
   306  	return
   307  }