github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/internal/index.go (about)

     1  // Copyright ©2014 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package internal provides shared code for BAI and tabix index implementations.
     6  package internal
     7  
     8  import (
     9  	"errors"
    10  	"sort"
    11  
    12  	"github.com/Schaudge/hts/bgzf"
    13  	"github.com/Schaudge/hts/bgzf/index"
    14  )
    15  
    16  const (
    17  	// TileWidth is the length of the interval tiling used
    18  	// in BAI and tabix indexes.
    19  	TileWidth = 0x4000
    20  
    21  	// Levels is the number of levels in the index tree.
    22  	Levels = 6
    23  
    24  	// BinLimit is the maximum number of bins available in
    25  	// a BAI or tabix index.
    26  	BinLimit = ((1 << (Levels * nextBinShift)) - 1) / 7
    27  
    28  	// StatsDummyBin is the bin number of the reference
    29  	// statistics bin used in BAI and tabix indexes.
    30  	StatsDummyBin = BinLimit + 1
    31  )
    32  
    33  // Index is a coordinate based index.
    34  type Index struct {
    35  	Refs       []RefIndex
    36  	Unmapped   *uint64
    37  	IsSorted   bool
    38  	LastRecord int
    39  }
    40  
    41  // RefIndex is the index of a single reference.
    42  type RefIndex struct {
    43  	Bins      []Bin
    44  	Stats     *ReferenceStats
    45  	Intervals []bgzf.Offset
    46  }
    47  
    48  // Bin is an index bin.
    49  type Bin struct {
    50  	Bin    uint32
    51  	Chunks []bgzf.Chunk
    52  }
    53  
    54  // ReferenceStats holds mapping statistics for a genomic reference.
    55  type ReferenceStats struct {
    56  	// Chunk is the span of the indexed BGZF
    57  	// holding alignments to the reference.
    58  	Chunk bgzf.Chunk
    59  
    60  	// Mapped is the count of mapped reads.
    61  	Mapped uint64
    62  
    63  	// Unmapped is the count of unmapped reads.
    64  	Unmapped uint64
    65  }
    66  
    67  // Record wraps types that may be indexed by an Index.
    68  type Record interface {
    69  	RefID() int
    70  	Start() int
    71  	End() int
    72  }
    73  
    74  // Add records the SAM record as having being located at the given chunk.
    75  func (i *Index) Add(r Record, bin uint32, c bgzf.Chunk, placed, mapped bool) error {
    76  	if !IsValidIndexPos(r.Start()) || !IsValidIndexPos(r.End()) {
    77  		return errors.New("index: attempt to add record outside indexable range")
    78  	}
    79  
    80  	if i.Unmapped == nil {
    81  		i.Unmapped = new(uint64)
    82  	}
    83  	if !placed {
    84  		*i.Unmapped++
    85  		return nil
    86  	}
    87  
    88  	rid := r.RefID()
    89  	if rid < len(i.Refs)-1 {
    90  		return errors.New("index: attempt to add record out of reference ID sort order")
    91  	}
    92  	if rid == len(i.Refs) {
    93  		i.Refs = append(i.Refs, RefIndex{})
    94  		i.LastRecord = 0
    95  	} else if rid > len(i.Refs) {
    96  		Refs := make([]RefIndex, rid+1)
    97  		copy(Refs, i.Refs)
    98  		i.Refs = Refs
    99  		i.LastRecord = 0
   100  	}
   101  	ref := &i.Refs[rid]
   102  
   103  	// Record bin information.
   104  	for i, b := range ref.Bins {
   105  		if b.Bin == bin {
   106  			for j, chunk := range ref.Bins[i].Chunks {
   107  				if vOffset(chunk.End) > vOffset(c.Begin) {
   108  					ref.Bins[i].Chunks[j].End = c.End
   109  					goto found
   110  				}
   111  			}
   112  			ref.Bins[i].Chunks = append(ref.Bins[i].Chunks, c)
   113  			goto found
   114  		}
   115  	}
   116  	i.IsSorted = false // TODO(kortschak) Consider making use of this more effectively for bin search.
   117  	ref.Bins = append(ref.Bins, Bin{
   118  		Bin:    bin,
   119  		Chunks: []bgzf.Chunk{c},
   120  	})
   121  found:
   122  
   123  	// Record interval tile information.
   124  	biv := r.Start() / TileWidth
   125  	if r.Start() < i.LastRecord {
   126  		return errors.New("index: attempt to add record out of position sort order")
   127  	}
   128  	i.LastRecord = r.Start()
   129  	eiv := r.End() / TileWidth
   130  	if eiv == len(ref.Intervals) {
   131  		if eiv > biv {
   132  			panic("index: unexpected alignment length")
   133  		}
   134  		ref.Intervals = append(ref.Intervals, c.Begin)
   135  	} else if eiv > len(ref.Intervals) {
   136  		intvs := make([]bgzf.Offset, eiv)
   137  		if len(ref.Intervals) > biv {
   138  			biv = len(ref.Intervals)
   139  		}
   140  		for iv, offset := range intvs[biv:eiv] {
   141  			if !isZero(offset) {
   142  				panic("index: unexpected non-zero offset")
   143  			}
   144  			intvs[iv+biv] = c.Begin
   145  		}
   146  		copy(intvs, ref.Intervals)
   147  		ref.Intervals = intvs
   148  	}
   149  
   150  	// Record index stats.
   151  	if ref.Stats == nil {
   152  		ref.Stats = &ReferenceStats{
   153  			Chunk: c,
   154  		}
   155  	} else {
   156  		ref.Stats.Chunk.End = c.End
   157  	}
   158  	if mapped {
   159  		ref.Stats.Mapped++
   160  	} else {
   161  		ref.Stats.Unmapped++
   162  	}
   163  
   164  	return nil
   165  }
   166  
   167  // Chunks returns a []bgzf.Chunk that corresponds to the given genomic interval.
   168  func (i *Index) Chunks(rid, beg, end int) ([]bgzf.Chunk, error) {
   169  	if rid < 0 || rid >= len(i.Refs) {
   170  		return nil, index.ErrNoReference
   171  	}
   172  	i.sort()
   173  	ref := i.Refs[rid]
   174  
   175  	iv := beg / TileWidth
   176  	if iv >= len(ref.Intervals) {
   177  		return nil, index.ErrInvalid
   178  	}
   179  
   180  	// Collect candidate chunks according to the scheme described in
   181  	// the SAM spec under section 5 Indexing BAM.
   182  	var chunks []bgzf.Chunk
   183  	for _, b := range OverlappingBinsFor(beg, end) {
   184  		c := sort.Search(len(ref.Bins), func(i int) bool { return ref.Bins[i].Bin >= b })
   185  		if c < len(ref.Bins) && ref.Bins[c].Bin == b {
   186  			for _, chunk := range ref.Bins[c].Chunks {
   187  				// Here we check all tiles starting from the left end of the
   188  				// query region until we get a non-zero offset. The spec states
   189  				// that we only need to check tiles that contain beg. That is
   190  				// not correct since we may have no alignments at the left end
   191  				// of the query region.
   192  				chunkEndOffset := vOffset(chunk.End)
   193  				haveNonZero := false
   194  				for j, tile := range ref.Intervals[iv:] {
   195  					// If we have found a non-zero tile, all subsequent active
   196  					// tiles must also be non-zero, so skip zero tiles.
   197  					if haveNonZero && isZero(tile) {
   198  						continue
   199  					}
   200  					haveNonZero = true
   201  					tbeg := (j + iv) * TileWidth
   202  					tend := tbeg + TileWidth
   203  					// We allow adjacent alignment since samtools behaviour here
   204  					// has always irritated me and it is cheap to discard these
   205  					// later if they are not wanted.
   206  					if tend >= beg && tbeg <= end && chunkEndOffset > vOffset(tile) {
   207  						chunks = append(chunks, chunk)
   208  						break
   209  					}
   210  				}
   211  			}
   212  		}
   213  	}
   214  
   215  	// Sort and merge overlaps.
   216  	if !sort.IsSorted(byBeginOffset(chunks)) {
   217  		sort.Sort(byBeginOffset(chunks))
   218  	}
   219  
   220  	return chunks, nil
   221  }
   222  
   223  func (i *Index) sort() {
   224  	if !i.IsSorted {
   225  		for _, ref := range i.Refs {
   226  			sort.Sort(byBinNumber(ref.Bins))
   227  			for _, bin := range ref.Bins {
   228  				sort.Sort(byBeginOffset(bin.Chunks))
   229  			}
   230  			sort.Sort(byVirtOffset(ref.Intervals))
   231  		}
   232  		i.IsSorted = true
   233  	}
   234  }
   235  
   236  // MergeChunks applies the given MergeStrategy to all bins in the Index.
   237  func (i *Index) MergeChunks(s func([]bgzf.Chunk) []bgzf.Chunk) {
   238  	if s == nil {
   239  		return
   240  	}
   241  	for _, ref := range i.Refs {
   242  		for b, bin := range ref.Bins {
   243  			if !sort.IsSorted(byBeginOffset(bin.Chunks)) {
   244  				sort.Sort(byBeginOffset(bin.Chunks))
   245  			}
   246  			ref.Bins[b].Chunks = s(bin.Chunks)
   247  			if !sort.IsSorted(byBeginOffset(bin.Chunks)) {
   248  				sort.Sort(byBeginOffset(bin.Chunks))
   249  			}
   250  		}
   251  	}
   252  }
   253  
   254  const (
   255  	indexWordBits = 29
   256  	nextBinShift  = 3
   257  )
   258  
   259  // IsValidIndexPos returns a boolean indicating whether
   260  // the given position is in the valid range for BAM/SAM.
   261  func IsValidIndexPos(i int) bool { return -1 <= i && i <= (1<<indexWordBits-1)-1 } // 0-based.
   262  
   263  const (
   264  	level0 = uint32(((1 << (iota * nextBinShift)) - 1) / 7)
   265  	level1
   266  	level2
   267  	level3
   268  	level4
   269  	level5
   270  )
   271  
   272  const (
   273  	level0Shift = indexWordBits - (iota * nextBinShift)
   274  	level1Shift
   275  	level2Shift
   276  	level3Shift
   277  	level4Shift
   278  	level5Shift
   279  )
   280  
   281  // BinFor returns the bin number for given an interval covering
   282  // [beg,end) (zero-based, half-close-half-open).
   283  func BinFor(beg, end int) uint32 {
   284  	end--
   285  	switch {
   286  	case beg>>level5Shift == end>>level5Shift:
   287  		return level5 + uint32(beg>>level5Shift)
   288  	case beg>>level4Shift == end>>level4Shift:
   289  		return level4 + uint32(beg>>level4Shift)
   290  	case beg>>level3Shift == end>>level3Shift:
   291  		return level3 + uint32(beg>>level3Shift)
   292  	case beg>>level2Shift == end>>level2Shift:
   293  		return level2 + uint32(beg>>level2Shift)
   294  	case beg>>level1Shift == end>>level1Shift:
   295  		return level1 + uint32(beg>>level1Shift)
   296  	}
   297  	return level0
   298  }
   299  
   300  // OverlappingBinsFor returns the bin numbers for all bins overlapping
   301  // an interval covering [beg,end) (zero-based, half-close-half-open).
   302  func OverlappingBinsFor(beg, end int) []uint32 {
   303  	end--
   304  	list := []uint32{level0}
   305  	for _, r := range []struct {
   306  		offset, shift uint32
   307  	}{
   308  		{level1, level1Shift},
   309  		{level2, level2Shift},
   310  		{level3, level3Shift},
   311  		{level4, level4Shift},
   312  		{level5, level5Shift},
   313  	} {
   314  		for k := r.offset + uint32(beg>>r.shift); k <= r.offset+uint32(end>>r.shift); k++ {
   315  			list = append(list, k)
   316  		}
   317  	}
   318  	return list
   319  }
   320  
   321  func makeOffset(vOff uint64) bgzf.Offset {
   322  	return bgzf.Offset{
   323  		File:  int64(vOff >> 16),
   324  		Block: uint16(vOff),
   325  	}
   326  }
   327  
   328  func isZero(o bgzf.Offset) bool {
   329  	return o == bgzf.Offset{}
   330  }
   331  
   332  func vOffset(o bgzf.Offset) int64 {
   333  	return o.File<<16 | int64(o.Block)
   334  }
   335  
   336  type byBinNumber []Bin
   337  
   338  func (b byBinNumber) Len() int           { return len(b) }
   339  func (b byBinNumber) Less(i, j int) bool { return b[i].Bin < b[j].Bin }
   340  func (b byBinNumber) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   341  
   342  type byBeginOffset []bgzf.Chunk
   343  
   344  func (c byBeginOffset) Len() int           { return len(c) }
   345  func (c byBeginOffset) Less(i, j int) bool { return vOffset(c[i].Begin) < vOffset(c[j].Begin) }
   346  func (c byBeginOffset) Swap(i, j int)      { c[i], c[j] = c[j], c[i] }
   347  
   348  type byVirtOffset []bgzf.Offset
   349  
   350  func (o byVirtOffset) Len() int           { return len(o) }
   351  func (o byVirtOffset) Less(i, j int) bool { return vOffset(o[i]) < vOffset(o[j]) }
   352  func (o byVirtOffset) Swap(i, j int)      { o[i], o[j] = o[j], o[i] }