github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/internal/index.go (about) 1 // Copyright ©2014 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package internal provides shared code for BAI and tabix index implementations. 6 package internal 7 8 import ( 9 "errors" 10 "sort" 11 12 "github.com/Schaudge/hts/bgzf" 13 "github.com/Schaudge/hts/bgzf/index" 14 ) 15 16 const ( 17 // TileWidth is the length of the interval tiling used 18 // in BAI and tabix indexes. 19 TileWidth = 0x4000 20 21 // Levels is the number of levels in the index tree. 22 Levels = 6 23 24 // BinLimit is the maximum number of bins available in 25 // a BAI or tabix index. 26 BinLimit = ((1 << (Levels * nextBinShift)) - 1) / 7 27 28 // StatsDummyBin is the bin number of the reference 29 // statistics bin used in BAI and tabix indexes. 30 StatsDummyBin = BinLimit + 1 31 ) 32 33 // Index is a coordinate based index. 34 type Index struct { 35 Refs []RefIndex 36 Unmapped *uint64 37 IsSorted bool 38 LastRecord int 39 } 40 41 // RefIndex is the index of a single reference. 42 type RefIndex struct { 43 Bins []Bin 44 Stats *ReferenceStats 45 Intervals []bgzf.Offset 46 } 47 48 // Bin is an index bin. 49 type Bin struct { 50 Bin uint32 51 Chunks []bgzf.Chunk 52 } 53 54 // ReferenceStats holds mapping statistics for a genomic reference. 55 type ReferenceStats struct { 56 // Chunk is the span of the indexed BGZF 57 // holding alignments to the reference. 58 Chunk bgzf.Chunk 59 60 // Mapped is the count of mapped reads. 61 Mapped uint64 62 63 // Unmapped is the count of unmapped reads. 64 Unmapped uint64 65 } 66 67 // Record wraps types that may be indexed by an Index. 68 type Record interface { 69 RefID() int 70 Start() int 71 End() int 72 } 73 74 // Add records the SAM record as having being located at the given chunk. 75 func (i *Index) Add(r Record, bin uint32, c bgzf.Chunk, placed, mapped bool) error { 76 if !IsValidIndexPos(r.Start()) || !IsValidIndexPos(r.End()) { 77 return errors.New("index: attempt to add record outside indexable range") 78 } 79 80 if i.Unmapped == nil { 81 i.Unmapped = new(uint64) 82 } 83 if !placed { 84 *i.Unmapped++ 85 return nil 86 } 87 88 rid := r.RefID() 89 if rid < len(i.Refs)-1 { 90 return errors.New("index: attempt to add record out of reference ID sort order") 91 } 92 if rid == len(i.Refs) { 93 i.Refs = append(i.Refs, RefIndex{}) 94 i.LastRecord = 0 95 } else if rid > len(i.Refs) { 96 Refs := make([]RefIndex, rid+1) 97 copy(Refs, i.Refs) 98 i.Refs = Refs 99 i.LastRecord = 0 100 } 101 ref := &i.Refs[rid] 102 103 // Record bin information. 104 for i, b := range ref.Bins { 105 if b.Bin == bin { 106 for j, chunk := range ref.Bins[i].Chunks { 107 if vOffset(chunk.End) > vOffset(c.Begin) { 108 ref.Bins[i].Chunks[j].End = c.End 109 goto found 110 } 111 } 112 ref.Bins[i].Chunks = append(ref.Bins[i].Chunks, c) 113 goto found 114 } 115 } 116 i.IsSorted = false // TODO(kortschak) Consider making use of this more effectively for bin search. 117 ref.Bins = append(ref.Bins, Bin{ 118 Bin: bin, 119 Chunks: []bgzf.Chunk{c}, 120 }) 121 found: 122 123 // Record interval tile information. 124 biv := r.Start() / TileWidth 125 if r.Start() < i.LastRecord { 126 return errors.New("index: attempt to add record out of position sort order") 127 } 128 i.LastRecord = r.Start() 129 eiv := r.End() / TileWidth 130 if eiv == len(ref.Intervals) { 131 if eiv > biv { 132 panic("index: unexpected alignment length") 133 } 134 ref.Intervals = append(ref.Intervals, c.Begin) 135 } else if eiv > len(ref.Intervals) { 136 intvs := make([]bgzf.Offset, eiv) 137 if len(ref.Intervals) > biv { 138 biv = len(ref.Intervals) 139 } 140 for iv, offset := range intvs[biv:eiv] { 141 if !isZero(offset) { 142 panic("index: unexpected non-zero offset") 143 } 144 intvs[iv+biv] = c.Begin 145 } 146 copy(intvs, ref.Intervals) 147 ref.Intervals = intvs 148 } 149 150 // Record index stats. 151 if ref.Stats == nil { 152 ref.Stats = &ReferenceStats{ 153 Chunk: c, 154 } 155 } else { 156 ref.Stats.Chunk.End = c.End 157 } 158 if mapped { 159 ref.Stats.Mapped++ 160 } else { 161 ref.Stats.Unmapped++ 162 } 163 164 return nil 165 } 166 167 // Chunks returns a []bgzf.Chunk that corresponds to the given genomic interval. 168 func (i *Index) Chunks(rid, beg, end int) ([]bgzf.Chunk, error) { 169 if rid < 0 || rid >= len(i.Refs) { 170 return nil, index.ErrNoReference 171 } 172 i.sort() 173 ref := i.Refs[rid] 174 175 iv := beg / TileWidth 176 if iv >= len(ref.Intervals) { 177 return nil, index.ErrInvalid 178 } 179 180 // Collect candidate chunks according to the scheme described in 181 // the SAM spec under section 5 Indexing BAM. 182 var chunks []bgzf.Chunk 183 for _, b := range OverlappingBinsFor(beg, end) { 184 c := sort.Search(len(ref.Bins), func(i int) bool { return ref.Bins[i].Bin >= b }) 185 if c < len(ref.Bins) && ref.Bins[c].Bin == b { 186 for _, chunk := range ref.Bins[c].Chunks { 187 // Here we check all tiles starting from the left end of the 188 // query region until we get a non-zero offset. The spec states 189 // that we only need to check tiles that contain beg. That is 190 // not correct since we may have no alignments at the left end 191 // of the query region. 192 chunkEndOffset := vOffset(chunk.End) 193 haveNonZero := false 194 for j, tile := range ref.Intervals[iv:] { 195 // If we have found a non-zero tile, all subsequent active 196 // tiles must also be non-zero, so skip zero tiles. 197 if haveNonZero && isZero(tile) { 198 continue 199 } 200 haveNonZero = true 201 tbeg := (j + iv) * TileWidth 202 tend := tbeg + TileWidth 203 // We allow adjacent alignment since samtools behaviour here 204 // has always irritated me and it is cheap to discard these 205 // later if they are not wanted. 206 if tend >= beg && tbeg <= end && chunkEndOffset > vOffset(tile) { 207 chunks = append(chunks, chunk) 208 break 209 } 210 } 211 } 212 } 213 } 214 215 // Sort and merge overlaps. 216 if !sort.IsSorted(byBeginOffset(chunks)) { 217 sort.Sort(byBeginOffset(chunks)) 218 } 219 220 return chunks, nil 221 } 222 223 func (i *Index) sort() { 224 if !i.IsSorted { 225 for _, ref := range i.Refs { 226 sort.Sort(byBinNumber(ref.Bins)) 227 for _, bin := range ref.Bins { 228 sort.Sort(byBeginOffset(bin.Chunks)) 229 } 230 sort.Sort(byVirtOffset(ref.Intervals)) 231 } 232 i.IsSorted = true 233 } 234 } 235 236 // MergeChunks applies the given MergeStrategy to all bins in the Index. 237 func (i *Index) MergeChunks(s func([]bgzf.Chunk) []bgzf.Chunk) { 238 if s == nil { 239 return 240 } 241 for _, ref := range i.Refs { 242 for b, bin := range ref.Bins { 243 if !sort.IsSorted(byBeginOffset(bin.Chunks)) { 244 sort.Sort(byBeginOffset(bin.Chunks)) 245 } 246 ref.Bins[b].Chunks = s(bin.Chunks) 247 if !sort.IsSorted(byBeginOffset(bin.Chunks)) { 248 sort.Sort(byBeginOffset(bin.Chunks)) 249 } 250 } 251 } 252 } 253 254 const ( 255 indexWordBits = 29 256 nextBinShift = 3 257 ) 258 259 // IsValidIndexPos returns a boolean indicating whether 260 // the given position is in the valid range for BAM/SAM. 261 func IsValidIndexPos(i int) bool { return -1 <= i && i <= (1<<indexWordBits-1)-1 } // 0-based. 262 263 const ( 264 level0 = uint32(((1 << (iota * nextBinShift)) - 1) / 7) 265 level1 266 level2 267 level3 268 level4 269 level5 270 ) 271 272 const ( 273 level0Shift = indexWordBits - (iota * nextBinShift) 274 level1Shift 275 level2Shift 276 level3Shift 277 level4Shift 278 level5Shift 279 ) 280 281 // BinFor returns the bin number for given an interval covering 282 // [beg,end) (zero-based, half-close-half-open). 283 func BinFor(beg, end int) uint32 { 284 end-- 285 switch { 286 case beg>>level5Shift == end>>level5Shift: 287 return level5 + uint32(beg>>level5Shift) 288 case beg>>level4Shift == end>>level4Shift: 289 return level4 + uint32(beg>>level4Shift) 290 case beg>>level3Shift == end>>level3Shift: 291 return level3 + uint32(beg>>level3Shift) 292 case beg>>level2Shift == end>>level2Shift: 293 return level2 + uint32(beg>>level2Shift) 294 case beg>>level1Shift == end>>level1Shift: 295 return level1 + uint32(beg>>level1Shift) 296 } 297 return level0 298 } 299 300 // OverlappingBinsFor returns the bin numbers for all bins overlapping 301 // an interval covering [beg,end) (zero-based, half-close-half-open). 302 func OverlappingBinsFor(beg, end int) []uint32 { 303 end-- 304 list := []uint32{level0} 305 for _, r := range []struct { 306 offset, shift uint32 307 }{ 308 {level1, level1Shift}, 309 {level2, level2Shift}, 310 {level3, level3Shift}, 311 {level4, level4Shift}, 312 {level5, level5Shift}, 313 } { 314 for k := r.offset + uint32(beg>>r.shift); k <= r.offset+uint32(end>>r.shift); k++ { 315 list = append(list, k) 316 } 317 } 318 return list 319 } 320 321 func makeOffset(vOff uint64) bgzf.Offset { 322 return bgzf.Offset{ 323 File: int64(vOff >> 16), 324 Block: uint16(vOff), 325 } 326 } 327 328 func isZero(o bgzf.Offset) bool { 329 return o == bgzf.Offset{} 330 } 331 332 func vOffset(o bgzf.Offset) int64 { 333 return o.File<<16 | int64(o.Block) 334 } 335 336 type byBinNumber []Bin 337 338 func (b byBinNumber) Len() int { return len(b) } 339 func (b byBinNumber) Less(i, j int) bool { return b[i].Bin < b[j].Bin } 340 func (b byBinNumber) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 341 342 type byBeginOffset []bgzf.Chunk 343 344 func (c byBeginOffset) Len() int { return len(c) } 345 func (c byBeginOffset) Less(i, j int) bool { return vOffset(c[i].Begin) < vOffset(c[j].Begin) } 346 func (c byBeginOffset) Swap(i, j int) { c[i], c[j] = c[j], c[i] } 347 348 type byVirtOffset []bgzf.Offset 349 350 func (o byVirtOffset) Len() int { return len(o) } 351 func (o byVirtOffset) Less(i, j int) bool { return vOffset(o[i]) < vOffset(o[j]) } 352 func (o byVirtOffset) Swap(i, j int) { o[i], o[j] = o[j], o[i] }