github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/csi/csi.go (about) 1 // Copyright ©2015 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package csi implements CSIv1 and CSIv2 coordinate sorted indexing. 6 package csi 7 8 import ( 9 "errors" 10 "sort" 11 12 "github.com/Schaudge/hts/bgzf" 13 "github.com/Schaudge/hts/bgzf/index" 14 ) 15 16 var csiMagic = [3]byte{'C', 'S', 'I'} 17 18 const ( 19 // DefaultShift is the default minimum shift setting for a CSI. 20 DefaultShift = 14 21 22 // DefaultDepth is the default index depth for a CSI. 23 DefaultDepth = 5 24 ) 25 26 const nextBinShift = 3 27 28 // MinimumShiftFor returns the lowest minimum shift value that can be used to index 29 // the given maximum position with the given index depth. 30 func MinimumShiftFor(max int64, depth uint32) (uint32, bool) { 31 for shift := uint32(0); shift < 32; shift++ { 32 if validIndexPos(int(max), shift, depth) { 33 return shift, true 34 } 35 } 36 return 0, false 37 } 38 39 // MinimumDepthFor returns the lowest depth value that can be used to index 40 // the given maximum position with the given index minimum shift. 41 func MinimumDepthFor(max int64, shift uint32) (uint32, bool) { 42 for depth := uint32(0); depth < 32; depth++ { 43 if validIndexPos(int(max), shift, depth) { 44 return depth, true 45 } 46 } 47 return 0, false 48 } 49 50 func validIndexPos(i int, minShift, depth uint32) bool { // 0-based. 51 return -1 <= i && i <= (1<<(minShift+depth*nextBinShift)-1)-1 52 } 53 54 // New returns a CSI index with the given minimum shift and depth. 55 // The returned index defaults to CSI version 2. 56 func New(minShift, depth int) *Index { 57 if minShift == 0 { 58 minShift = DefaultShift 59 } 60 if depth == 0 { 61 depth = DefaultDepth 62 } 63 return &Index{Version: 0x2, minShift: uint32(minShift), depth: uint32(depth)} 64 } 65 66 // Index implements coordinate sorted indexing. 67 type Index struct { 68 Auxilliary []byte 69 Version byte 70 71 refs []refIndex 72 unmapped *uint64 73 74 minShift uint32 75 depth uint32 76 77 isSorted bool 78 lastRecord int 79 } 80 81 type refIndex struct { 82 bins []bin 83 stats *index.ReferenceStats 84 } 85 86 type bin struct { 87 bin uint32 88 left bgzf.Offset 89 records uint64 90 chunks []bgzf.Chunk 91 } 92 93 // NumRefs returns the number of references in the index. 94 func (i *Index) NumRefs() int { 95 return len(i.refs) 96 } 97 98 // ReferenceStats returns the index statistics for the given reference and true 99 // if the statistics are valid. 100 func (i *Index) ReferenceStats(id int) (stats index.ReferenceStats, ok bool) { 101 s := i.refs[id].stats 102 if s == nil { 103 return index.ReferenceStats{}, false 104 } 105 return *s, true 106 } 107 108 // Unmapped returns the number of unmapped reads and true if the count is valid. 109 func (i *Index) Unmapped() (n uint64, ok bool) { 110 if i.unmapped == nil { 111 return 0, false 112 } 113 return *i.unmapped, true 114 } 115 116 // Record wraps types that may be indexed by an Index. 117 type Record interface { 118 RefID() int 119 Start() int 120 End() int 121 } 122 123 // Add records the Record as having being located at the given chunk with the given 124 // mapping and placement status. 125 func (i *Index) Add(r Record, c bgzf.Chunk, mapped, placed bool) error { 126 if !validIndexPos(r.Start(), i.minShift, i.depth) || !validIndexPos(r.End(), i.minShift, i.depth) { 127 return errors.New("csi: attempt to add record outside indexable range") 128 } 129 130 if i.unmapped == nil { 131 i.unmapped = new(uint64) 132 } 133 if !placed { 134 *i.unmapped++ 135 return nil 136 } 137 138 rid := r.RefID() 139 if rid < len(i.refs)-1 { 140 return errors.New("csi: attempt to add record out of reference ID sort order") 141 } 142 if rid == len(i.refs) { 143 i.refs = append(i.refs, refIndex{}) 144 i.lastRecord = 0 145 } else if rid > len(i.refs) { 146 refs := make([]refIndex, rid+1) 147 copy(refs, i.refs) 148 i.refs = refs 149 i.lastRecord = 0 150 } 151 ref := &i.refs[rid] 152 153 // Record bin information. 154 b := reg2bin(int64(r.Start()), int64(r.End()), i.minShift, i.depth) 155 for i, bin := range ref.bins { 156 if bin.bin == b { 157 for j, chunk := range ref.bins[i].chunks { 158 if vOffset(chunk.End) > vOffset(c.Begin) { 159 ref.bins[i].chunks[j].End = c.End 160 ref.bins[i].records++ 161 goto found 162 } 163 } 164 ref.bins[i].records++ 165 ref.bins[i].chunks = append(ref.bins[i].chunks, c) 166 goto found 167 } 168 } 169 i.isSorted = false // TODO(kortschak) Consider making use of this more effectively for bin search. 170 ref.bins = append(ref.bins, bin{ 171 bin: b, 172 left: c.Begin, 173 records: 1, 174 chunks: []bgzf.Chunk{c}, 175 }) 176 found: 177 178 if r.Start() < i.lastRecord { 179 return errors.New("csi: attempt to add record out of position sort order") 180 } 181 i.lastRecord = r.Start() 182 183 // Record index stats. 184 if ref.stats == nil { 185 ref.stats = &index.ReferenceStats{ 186 Chunk: c, 187 } 188 } else { 189 ref.stats.Chunk.End = c.End 190 } 191 if mapped { 192 ref.stats.Mapped++ 193 } else { 194 ref.stats.Unmapped++ 195 } 196 197 return nil 198 } 199 200 // Chunks returns a []bgzf.Chunk that corresponds to the given interval. 201 func (i *Index) Chunks(rid int, beg, end int) []bgzf.Chunk { 202 if rid < 0 || rid >= len(i.refs) { 203 return nil 204 } 205 i.sort() 206 ref := i.refs[rid] 207 208 // Collect candidate chunks according to a scheme modified 209 // from the one described in the SAM spec under section 5 210 // Indexing BAM. 211 var chunks []bgzf.Chunk 212 for _, bin := range reg2bins(int64(beg), int64(end), i.minShift, i.depth) { 213 b := uint32(bin) 214 c := sort.Search(len(ref.bins), func(i int) bool { return ref.bins[i].bin >= b }) 215 if c < len(ref.bins) && ref.bins[c].bin == b { 216 left := vOffset(ref.bins[c].left) 217 for _, chunk := range ref.bins[c].chunks { 218 if vOffset(chunk.End) > left { 219 chunks = append(chunks, chunk) 220 } 221 } 222 } 223 } 224 225 // Sort and merge overlaps. 226 if !sort.IsSorted(byBeginOffset(chunks)) { 227 sort.Sort(byBeginOffset(chunks)) 228 } 229 230 return adjacent(chunks) 231 } 232 233 var adjacent = index.Adjacent 234 235 func (i *Index) sort() { 236 if !i.isSorted { 237 for _, ref := range i.refs { 238 sort.Sort(byBinNumber(ref.bins)) 239 for _, bin := range ref.bins { 240 sort.Sort(byBeginOffset(bin.chunks)) 241 } 242 } 243 i.isSorted = true 244 } 245 } 246 247 // MergeChunks applies the given MergeStrategy to all bins in the Index. 248 func (i *Index) MergeChunks(s index.MergeStrategy) { 249 if s == nil { 250 return 251 } 252 for _, ref := range i.refs { 253 for b, bin := range ref.bins { 254 if !sort.IsSorted(byBeginOffset(bin.chunks)) { 255 sort.Sort(byBeginOffset(bin.chunks)) 256 } 257 ref.bins[b].chunks = s(bin.chunks) 258 if !sort.IsSorted(byBeginOffset(bin.chunks)) { 259 sort.Sort(byBeginOffset(bin.chunks)) 260 } 261 } 262 } 263 } 264 265 func makeOffset(vOff uint64) bgzf.Offset { 266 return bgzf.Offset{ 267 File: int64(vOff >> 16), 268 Block: uint16(vOff), 269 } 270 } 271 272 func isZero(o bgzf.Offset) bool { 273 return o == bgzf.Offset{} 274 } 275 276 func vOffset(o bgzf.Offset) int64 { 277 return o.File<<16 | int64(o.Block) 278 } 279 280 type byBinNumber []bin 281 282 func (b byBinNumber) Len() int { return len(b) } 283 func (b byBinNumber) Less(i, j int) bool { return b[i].bin < b[j].bin } 284 func (b byBinNumber) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 285 286 type byBeginOffset []bgzf.Chunk 287 288 func (c byBeginOffset) Len() int { return len(c) } 289 func (c byBeginOffset) Less(i, j int) bool { return vOffset(c[i].Begin) < vOffset(c[j].Begin) } 290 func (c byBeginOffset) Swap(i, j int) { c[i], c[j] = c[j], c[i] } 291 292 // calculate bin given an alignment covering [beg,end) (zero-based, half-close-half-open) 293 func reg2bin(beg, end int64, minShift, depth uint32) uint32 { 294 end-- 295 s := minShift 296 t := uint32(((1 << (depth * nextBinShift)) - 1) / 7) 297 for level := depth; level > 0; level-- { 298 offset := beg >> s 299 if offset == end>>s { 300 return t + uint32(offset) 301 } 302 s += nextBinShift 303 t -= 1 << (level * nextBinShift) 304 } 305 return 0 306 } 307 308 // calculate the list of bins that may overlap with region [beg,end) (zero-based) 309 func reg2bins(beg, end int64, minShift, depth uint32) []uint32 { 310 end-- 311 var list []uint32 312 s := minShift + depth*nextBinShift 313 for level, t := uint32(0), uint32(0); level <= depth; level++ { 314 b := t + uint32(beg>>s) 315 e := t + uint32(end>>s) 316 for i := b; i <= e; i++ { 317 list = append(list, i) 318 } 319 s -= nextBinShift 320 t += 1 << (level * nextBinShift) 321 } 322 return list 323 }