github.com/jmigpin/editor@v1.6.0/util/iout/iorw/index.go (about) 1 package iorw 2 3 import ( 4 "bytes" 5 "context" 6 "fmt" 7 "unicode" 8 9 "golang.org/x/text/runes" 10 "golang.org/x/text/transform" 11 "golang.org/x/text/unicode/norm" 12 ) 13 14 // NOTE: considerered golang.org/x/text/search.Matcher but index backwards search is not implemented, as well as some options flexibility 15 16 //---------- 17 18 func Index(r ReaderAt, i int, sep []byte, ignoreCase bool) (int, int, error) { 19 ctx := context.Background() 20 opt := &IndexOpt{IgnoreCase: ignoreCase} 21 return IndexCtx(ctx, r, i, sep, opt) 22 } 23 24 // Returns (-1, 0, nil) if not found. 25 func IndexCtx(ctx context.Context, r ReaderAt, i int, sep []byte, opt *IndexOpt) (index int, n int, _ error) { 26 return indexCtx2(ctx, r, i, sep, -1, opt) 27 } 28 func indexCtx2(ctx context.Context, r ReaderAt, i int, sep []byte, chunk int, opt *IndexOpt) (index int, n int, _ error) { 29 30 pfcFn := prepareForCompareFn(opt) 31 sep, sepN, err := pfcFn(sep) 32 if err != nil { 33 return 0, 0, err // TODO: continue? 34 } 35 36 chunk, err = setupChunkSize(chunk, sepN, opt) 37 if err != nil { 38 return 0, 0, err 39 } 40 41 max := r.Max() 42 for k := i; k < max; k += chunk - (sepN - 1) { 43 c := chunk 44 if c > max-k { 45 c = max - k 46 } 47 48 j, n, err := runIndexFn(bytes.Index, r, k, c, sep, pfcFn, opt) 49 if err != nil || j >= 0 { 50 return j, n, err 51 } 52 53 // check context cancelation 54 if err := ctx.Err(); err != nil { 55 return -1, 0, err 56 } 57 } 58 59 return -1, 0, nil 60 } 61 62 //---------- 63 64 func LastIndex(r ReaderAt, i int, sep []byte, ignoreCase bool) (int, int, error) { 65 ctx := context.Background() 66 opt := &IndexOpt{IgnoreCase: ignoreCase} 67 return LastIndexCtx(ctx, r, i, sep, opt) 68 } 69 70 // Returns (-1, 0, nil) if not found. 71 func LastIndexCtx(ctx context.Context, r ReaderAt, i int, sep []byte, opt *IndexOpt) (int, int, error) { 72 return lastIndexCtx2(ctx, r, i, sep, -1, opt) 73 } 74 func lastIndexCtx2(ctx context.Context, r ReaderAt, i int, sep []byte, chunk int, opt *IndexOpt) (index int, n int, _ error) { 75 76 pfcFn := prepareForCompareFn(opt) 77 sep, sepN, err := pfcFn(sep) 78 if err != nil { 79 return 0, 0, err // TODO: continue? 80 } 81 82 chunk, err = setupChunkSize(chunk, len(sep), opt) 83 if err != nil { 84 return 0, 0, err 85 } 86 87 min := r.Min() 88 for k := i; k > min; k -= chunk - (sepN - 1) { 89 c := chunk 90 if c > k-min { 91 c = k - min 92 } 93 94 j, n, err := runIndexFn(bytes.LastIndex, r, k-c, c, sep, pfcFn, opt) 95 if err != nil || j >= 0 { 96 return j, n, err 97 } 98 99 // check context cancelation 100 if err := ctx.Err(); err != nil { 101 return -1, 0, err 102 } 103 } 104 105 return -1, 0, nil 106 } 107 108 //---------- 109 110 func runIndexFn(indexFn func(s, sep []byte) int, r ReaderAt, i, n int, sep []byte, pfcFn pfcType, opt *IndexOpt) (int, int, error) { 111 p, err := r.ReadFastAt(i, n) 112 if err != nil { 113 return 0, 0, err 114 } 115 p2, _, err := pfcFn(p) // prepare for compare 116 if err != nil { 117 return 0, 0, err // TODO: continue? 118 } 119 j := indexFn(p2, sep) // can be used by index/lastindex 120 if j >= 0 { 121 n := len(sep) 122 if opt.IgnoringDiacritics() { 123 j, n = correctRunesPos(p, p2, sep, j) 124 } 125 return i + j, n, nil 126 } 127 return -1, 0, nil 128 } 129 130 //---------- 131 //---------- 132 //---------- 133 134 type IndexOpt struct { 135 IgnoreCase bool 136 IgnoreCaseDiacritics bool // also lower the case of diacritics (slow) 137 IgnoreDiacritics bool 138 } 139 140 func (opt *IndexOpt) IgnoringDiacritics() bool { 141 return opt.IgnoreCaseDiacritics || opt.IgnoreDiacritics 142 } 143 144 //---------- 145 //---------- 146 //---------- 147 148 type pfcType func([]byte) (result []byte, nSrcBytesRead int, _ error) 149 150 func prepareForCompareFn(opt *IndexOpt) pfcType { 151 w := []transform.Transformer{} 152 if opt.IgnoreCase { 153 tla := &toLowerAscii{lowerDiacritics: opt.IgnoreCaseDiacritics} 154 w = append(w, tla) 155 } 156 if opt.IgnoreDiacritics { 157 // https://go.dev/blog/normalization 158 w = append(w, 159 norm.NFD, // decompose 160 runes.Remove(runes.In(unicode.Mn)), 161 norm.NFC, // compose 162 ) 163 } 164 t := transform.Chain(w...) // ok if w is empty 165 return func(b []byte) ([]byte, int, error) { 166 return transform.Bytes(t, b) 167 } 168 } 169 170 //---------- 171 172 type toLowerAscii struct { 173 lowerDiacritics bool 174 } 175 176 // implement transform.Transformer 177 func (tla *toLowerAscii) Reset() {} 178 func (tla *toLowerAscii) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 179 180 if tla.lowerDiacritics { 181 // ~8x slower 182 // 'áb' will match 'ÁB' but not 'ab' 183 b := bytes.ToLower(src) 184 n := copy(dst, b) 185 return n, len(b), nil 186 } 187 188 min := len(src) 189 if min > len(dst) { 190 min = len(dst) 191 } 192 for i := 0; i < min; i++ { 193 c := src[i] 194 if 'A' <= c && c <= 'Z' { 195 dst[i] = c + ('a' - 'A') 196 } else { 197 dst[i] = c 198 } 199 } 200 return min, min, nil 201 } 202 203 //---------- 204 //---------- 205 //---------- 206 207 const chunkSize = 32 * 1024 208 209 func setupChunkSize(chunkN, sepN int, opt *IndexOpt) (int, error) { 210 cN := chunkN 211 autoChunk := cN <= 0 212 if autoChunk { 213 cN = chunkSize 214 } 215 if opt.IgnoringDiacritics() { 216 // because the src contains diacritics, need a big enough chunk size to search a src equal to the separator but full of diacritics. Here we give N extra bytes for each sep byte. 217 sepN *= 4 218 } 219 if cN < sepN { 220 if !autoChunk { 221 return 0, fmt.Errorf("chunk smaller then sepN: %v, %v", chunkN, sepN) 222 } 223 cN = sepN 224 } 225 return cN, nil 226 } 227 228 //---------- 229 230 func correctRunesPos(src, norm, sep []byte, j int) (int, int) { 231 // correct j 232 runes1 := []rune(string(norm[:j])) // n runes before j 233 runes2 := []rune(string(src)) // runes from original p 234 // n bytes before j from original p 235 if len(runes1) <= len(runes2) { 236 j = len(string(runes2[:len(runes1)])) 237 } 238 239 n := len(sep) 240 // correct n 241 runes3 := []rune(string(sep)) // n runes in sep 242 runes4 := runes2[len(runes1):] // runes from original p after j 243 // n bytes in original p 244 if len(runes3) <= len(runes4) { 245 n = len(string(runes4[:len(runes3)])) 246 } 247 return j, n 248 }