github.com/jmigpin/editor@v1.6.0/util/iout/iorw/index.go

github.com/jmigpin/editor@v1.6.0/util/iout/iorw/index.go (about)

     1  package iorw
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"unicode"
     8  
     9  	"golang.org/x/text/runes"
    10  	"golang.org/x/text/transform"
    11  	"golang.org/x/text/unicode/norm"
    12  )
    13  
// NOTE: considered golang.org/x/text/search.Matcher, but it does not implement backwards index search and lacks some of the option flexibility needed here.
    15  
    16  //----------
    17  
    18  func Index(r ReaderAt, i int, sep []byte, ignoreCase bool) (int, int, error) {
    19  	ctx := context.Background()
    20  	opt := &IndexOpt{IgnoreCase: ignoreCase}
    21  	return IndexCtx(ctx, r, i, sep, opt)
    22  }
    23  
    24  // Returns (-1, 0, nil) if not found.
    25  func IndexCtx(ctx context.Context, r ReaderAt, i int, sep []byte, opt *IndexOpt) (index int, n int, _ error) {
    26  	return indexCtx2(ctx, r, i, sep, -1, opt)
    27  }
    28  func indexCtx2(ctx context.Context, r ReaderAt, i int, sep []byte, chunk int, opt *IndexOpt) (index int, n int, _ error) {
    29  
    30  	pfcFn := prepareForCompareFn(opt)
    31  	sep, sepN, err := pfcFn(sep)
    32  	if err != nil {
    33  		return 0, 0, err // TODO: continue?
    34  	}
    35  
    36  	chunk, err = setupChunkSize(chunk, sepN, opt)
    37  	if err != nil {
    38  		return 0, 0, err
    39  	}
    40  
    41  	max := r.Max()
    42  	for k := i; k < max; k += chunk - (sepN - 1) {
    43  		c := chunk
    44  		if c > max-k {
    45  			c = max - k
    46  		}
    47  
    48  		j, n, err := runIndexFn(bytes.Index, r, k, c, sep, pfcFn, opt)
    49  		if err != nil || j >= 0 {
    50  			return j, n, err
    51  		}
    52  
    53  		// check context cancelation
    54  		if err := ctx.Err(); err != nil {
    55  			return -1, 0, err
    56  		}
    57  	}
    58  
    59  	return -1, 0, nil
    60  }
    61  
    62  //----------
    63  
    64  func LastIndex(r ReaderAt, i int, sep []byte, ignoreCase bool) (int, int, error) {
    65  	ctx := context.Background()
    66  	opt := &IndexOpt{IgnoreCase: ignoreCase}
    67  	return LastIndexCtx(ctx, r, i, sep, opt)
    68  }
    69  
    70  // Returns (-1, 0, nil) if not found.
    71  func LastIndexCtx(ctx context.Context, r ReaderAt, i int, sep []byte, opt *IndexOpt) (int, int, error) {
    72  	return lastIndexCtx2(ctx, r, i, sep, -1, opt)
    73  }
    74  func lastIndexCtx2(ctx context.Context, r ReaderAt, i int, sep []byte, chunk int, opt *IndexOpt) (index int, n int, _ error) {
    75  
    76  	pfcFn := prepareForCompareFn(opt)
    77  	sep, sepN, err := pfcFn(sep)
    78  	if err != nil {
    79  		return 0, 0, err // TODO: continue?
    80  	}
    81  
    82  	chunk, err = setupChunkSize(chunk, len(sep), opt)
    83  	if err != nil {
    84  		return 0, 0, err
    85  	}
    86  
    87  	min := r.Min()
    88  	for k := i; k > min; k -= chunk - (sepN - 1) {
    89  		c := chunk
    90  		if c > k-min {
    91  			c = k - min
    92  		}
    93  
    94  		j, n, err := runIndexFn(bytes.LastIndex, r, k-c, c, sep, pfcFn, opt)
    95  		if err != nil || j >= 0 {
    96  			return j, n, err
    97  		}
    98  
    99  		// check context cancelation
   100  		if err := ctx.Err(); err != nil {
   101  			return -1, 0, err
   102  		}
   103  	}
   104  
   105  	return -1, 0, nil
   106  }
   107  
   108  //----------
   109  
   110  func runIndexFn(indexFn func(s, sep []byte) int, r ReaderAt, i, n int, sep []byte, pfcFn pfcType, opt *IndexOpt) (int, int, error) {
   111  	p, err := r.ReadFastAt(i, n)
   112  	if err != nil {
   113  		return 0, 0, err
   114  	}
   115  	p2, _, err := pfcFn(p) // prepare for compare
   116  	if err != nil {
   117  		return 0, 0, err // TODO: continue?
   118  	}
   119  	j := indexFn(p2, sep) // can be used by index/lastindex
   120  	if j >= 0 {
   121  		n := len(sep)
   122  		if opt.IgnoringDiacritics() {
   123  			j, n = correctRunesPos(p, p2, sep, j)
   124  		}
   125  		return i + j, n, nil
   126  	}
   127  	return -1, 0, nil
   128  }
   129  
   130  //----------
   131  //----------
   132  //----------
   133  
   134  type IndexOpt struct {
   135  	IgnoreCase           bool
   136  	IgnoreCaseDiacritics bool // also lower the case of diacritics (slow)
   137  	IgnoreDiacritics     bool
   138  }
   139  
   140  func (opt *IndexOpt) IgnoringDiacritics() bool {
   141  	return opt.IgnoreCaseDiacritics || opt.IgnoreDiacritics
   142  }
   143  
   144  //----------
   145  //----------
   146  //----------
   147  
   148  type pfcType func([]byte) (result []byte, nSrcBytesRead int, _ error)
   149  
   150  func prepareForCompareFn(opt *IndexOpt) pfcType {
   151  	w := []transform.Transformer{}
   152  	if opt.IgnoreCase {
   153  		tla := &toLowerAscii{lowerDiacritics: opt.IgnoreCaseDiacritics}
   154  		w = append(w, tla)
   155  	}
   156  	if opt.IgnoreDiacritics {
   157  		// https://go.dev/blog/normalization
   158  		w = append(w,
   159  			norm.NFD, // decompose
   160  			runes.Remove(runes.In(unicode.Mn)),
   161  			norm.NFC, // compose
   162  		)
   163  	}
   164  	t := transform.Chain(w...) // ok if w is empty
   165  	return func(b []byte) ([]byte, int, error) {
   166  		return transform.Bytes(t, b)
   167  	}
   168  }
   169  
   170  //----------
   171  
   172  type toLowerAscii struct {
   173  	lowerDiacritics bool
   174  }
   175  
   176  // implement transform.Transformer
   177  func (tla *toLowerAscii) Reset() {}
   178  func (tla *toLowerAscii) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   179  
   180  	if tla.lowerDiacritics {
   181  		// ~8x slower
   182  		// 'áb' will match 'ÁB' but not 'ab'
   183  		b := bytes.ToLower(src)
   184  		n := copy(dst, b)
   185  		return n, len(b), nil
   186  	}
   187  
   188  	min := len(src)
   189  	if min > len(dst) {
   190  		min = len(dst)
   191  	}
   192  	for i := 0; i < min; i++ {
   193  		c := src[i]
   194  		if 'A' <= c && c <= 'Z' {
   195  			dst[i] = c + ('a' - 'A')
   196  		} else {
   197  			dst[i] = c
   198  		}
   199  	}
   200  	return min, min, nil
   201  }
   202  
   203  //----------
   204  //----------
   205  //----------
   206  
   207  const chunkSize = 32 * 1024
   208  
   209  func setupChunkSize(chunkN, sepN int, opt *IndexOpt) (int, error) {
   210  	cN := chunkN
   211  	autoChunk := cN <= 0
   212  	if autoChunk {
   213  		cN = chunkSize
   214  	}
   215  	if opt.IgnoringDiacritics() {
   216  		// because the src contains diacritics, need a big enough chunk size to search a src equal to the separator but full of diacritics. Here we give N extra bytes for each sep byte.
   217  		sepN *= 4
   218  	}
   219  	if cN < sepN {
   220  		if !autoChunk {
   221  			return 0, fmt.Errorf("chunk smaller then sepN: %v, %v", chunkN, sepN)
   222  		}
   223  		cN = sepN
   224  	}
   225  	return cN, nil
   226  }
   227  
   228  //----------
   229  
   230  func correctRunesPos(src, norm, sep []byte, j int) (int, int) {
   231  	// correct j
   232  	runes1 := []rune(string(norm[:j])) // n runes before j
   233  	runes2 := []rune(string(src))      // runes from original p
   234  	// n bytes before j from original p
   235  	if len(runes1) <= len(runes2) {
   236  		j = len(string(runes2[:len(runes1)]))
   237  	}
   238  
   239  	n := len(sep)
   240  	// correct n
   241  	runes3 := []rune(string(sep))  // n runes in sep
   242  	runes4 := runes2[len(runes1):] // runes from original p after j
   243  	// n bytes in original p
   244  	if len(runes3) <= len(runes4) {
   245  		n = len(string(runes4[:len(runes3)]))
   246  	}
   247  	return j, n
   248  }