github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/csi/csi.go (about)

     1  // Copyright ©2015 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package csi implements CSIv1 and CSIv2 coordinate sorted indexing.
     6  package csi
     7  
     8  import (
     9  	"errors"
    10  	"sort"
    11  
    12  	"github.com/Schaudge/hts/bgzf"
    13  	"github.com/Schaudge/hts/bgzf/index"
    14  )
    15  
    16  var csiMagic = [3]byte{'C', 'S', 'I'}
    17  
    18  const (
    19  	// DefaultShift is the default minimum shift setting for a CSI.
    20  	DefaultShift = 14
    21  
    22  	// DefaultDepth is the default index depth for a CSI.
    23  	DefaultDepth = 5
    24  )
    25  
    26  const nextBinShift = 3
    27  
    28  // MinimumShiftFor returns the lowest minimum shift value that can be used to index
    29  // the given maximum position with the given index depth.
    30  func MinimumShiftFor(max int64, depth uint32) (uint32, bool) {
    31  	for shift := uint32(0); shift < 32; shift++ {
    32  		if validIndexPos(int(max), shift, depth) {
    33  			return shift, true
    34  		}
    35  	}
    36  	return 0, false
    37  }
    38  
    39  // MinimumDepthFor returns the lowest depth value that can be used to index
    40  // the given maximum position with the given index minimum shift.
    41  func MinimumDepthFor(max int64, shift uint32) (uint32, bool) {
    42  	for depth := uint32(0); depth < 32; depth++ {
    43  		if validIndexPos(int(max), shift, depth) {
    44  			return depth, true
    45  		}
    46  	}
    47  	return 0, false
    48  }
    49  
    50  func validIndexPos(i int, minShift, depth uint32) bool { // 0-based.
    51  	return -1 <= i && i <= (1<<(minShift+depth*nextBinShift)-1)-1
    52  }
    53  
    54  // New returns a CSI index with the given minimum shift and depth.
    55  // The returned index defaults to CSI version 2.
    56  func New(minShift, depth int) *Index {
    57  	if minShift == 0 {
    58  		minShift = DefaultShift
    59  	}
    60  	if depth == 0 {
    61  		depth = DefaultDepth
    62  	}
    63  	return &Index{Version: 0x2, minShift: uint32(minShift), depth: uint32(depth)}
    64  }
    65  
    66  // Index implements coordinate sorted indexing.
    67  type Index struct {
    68  	Auxilliary []byte
    69  	Version    byte
    70  
    71  	refs     []refIndex
    72  	unmapped *uint64
    73  
    74  	minShift uint32
    75  	depth    uint32
    76  
    77  	isSorted   bool
    78  	lastRecord int
    79  }
    80  
    81  type refIndex struct {
    82  	bins  []bin
    83  	stats *index.ReferenceStats
    84  }
    85  
    86  type bin struct {
    87  	bin     uint32
    88  	left    bgzf.Offset
    89  	records uint64
    90  	chunks  []bgzf.Chunk
    91  }
    92  
    93  // NumRefs returns the number of references in the index.
    94  func (i *Index) NumRefs() int {
    95  	return len(i.refs)
    96  }
    97  
    98  // ReferenceStats returns the index statistics for the given reference and true
    99  // if the statistics are valid.
   100  func (i *Index) ReferenceStats(id int) (stats index.ReferenceStats, ok bool) {
   101  	s := i.refs[id].stats
   102  	if s == nil {
   103  		return index.ReferenceStats{}, false
   104  	}
   105  	return *s, true
   106  }
   107  
   108  // Unmapped returns the number of unmapped reads and true if the count is valid.
   109  func (i *Index) Unmapped() (n uint64, ok bool) {
   110  	if i.unmapped == nil {
   111  		return 0, false
   112  	}
   113  	return *i.unmapped, true
   114  }
   115  
   116  // Record wraps types that may be indexed by an Index.
   117  type Record interface {
   118  	RefID() int
   119  	Start() int
   120  	End() int
   121  }
   122  
   123  // Add records the Record as having being located at the given chunk with the given
   124  // mapping and placement status.
   125  func (i *Index) Add(r Record, c bgzf.Chunk, mapped, placed bool) error {
   126  	if !validIndexPos(r.Start(), i.minShift, i.depth) || !validIndexPos(r.End(), i.minShift, i.depth) {
   127  		return errors.New("csi: attempt to add record outside indexable range")
   128  	}
   129  
   130  	if i.unmapped == nil {
   131  		i.unmapped = new(uint64)
   132  	}
   133  	if !placed {
   134  		*i.unmapped++
   135  		return nil
   136  	}
   137  
   138  	rid := r.RefID()
   139  	if rid < len(i.refs)-1 {
   140  		return errors.New("csi: attempt to add record out of reference ID sort order")
   141  	}
   142  	if rid == len(i.refs) {
   143  		i.refs = append(i.refs, refIndex{})
   144  		i.lastRecord = 0
   145  	} else if rid > len(i.refs) {
   146  		refs := make([]refIndex, rid+1)
   147  		copy(refs, i.refs)
   148  		i.refs = refs
   149  		i.lastRecord = 0
   150  	}
   151  	ref := &i.refs[rid]
   152  
   153  	// Record bin information.
   154  	b := reg2bin(int64(r.Start()), int64(r.End()), i.minShift, i.depth)
   155  	for i, bin := range ref.bins {
   156  		if bin.bin == b {
   157  			for j, chunk := range ref.bins[i].chunks {
   158  				if vOffset(chunk.End) > vOffset(c.Begin) {
   159  					ref.bins[i].chunks[j].End = c.End
   160  					ref.bins[i].records++
   161  					goto found
   162  				}
   163  			}
   164  			ref.bins[i].records++
   165  			ref.bins[i].chunks = append(ref.bins[i].chunks, c)
   166  			goto found
   167  		}
   168  	}
   169  	i.isSorted = false // TODO(kortschak) Consider making use of this more effectively for bin search.
   170  	ref.bins = append(ref.bins, bin{
   171  		bin:     b,
   172  		left:    c.Begin,
   173  		records: 1,
   174  		chunks:  []bgzf.Chunk{c},
   175  	})
   176  found:
   177  
   178  	if r.Start() < i.lastRecord {
   179  		return errors.New("csi: attempt to add record out of position sort order")
   180  	}
   181  	i.lastRecord = r.Start()
   182  
   183  	// Record index stats.
   184  	if ref.stats == nil {
   185  		ref.stats = &index.ReferenceStats{
   186  			Chunk: c,
   187  		}
   188  	} else {
   189  		ref.stats.Chunk.End = c.End
   190  	}
   191  	if mapped {
   192  		ref.stats.Mapped++
   193  	} else {
   194  		ref.stats.Unmapped++
   195  	}
   196  
   197  	return nil
   198  }
   199  
   200  // Chunks returns a []bgzf.Chunk that corresponds to the given interval.
   201  func (i *Index) Chunks(rid int, beg, end int) []bgzf.Chunk {
   202  	if rid < 0 || rid >= len(i.refs) {
   203  		return nil
   204  	}
   205  	i.sort()
   206  	ref := i.refs[rid]
   207  
   208  	// Collect candidate chunks according to a scheme modified
   209  	// from the one described in the SAM spec under section 5
   210  	// Indexing BAM.
   211  	var chunks []bgzf.Chunk
   212  	for _, bin := range reg2bins(int64(beg), int64(end), i.minShift, i.depth) {
   213  		b := uint32(bin)
   214  		c := sort.Search(len(ref.bins), func(i int) bool { return ref.bins[i].bin >= b })
   215  		if c < len(ref.bins) && ref.bins[c].bin == b {
   216  			left := vOffset(ref.bins[c].left)
   217  			for _, chunk := range ref.bins[c].chunks {
   218  				if vOffset(chunk.End) > left {
   219  					chunks = append(chunks, chunk)
   220  				}
   221  			}
   222  		}
   223  	}
   224  
   225  	// Sort and merge overlaps.
   226  	if !sort.IsSorted(byBeginOffset(chunks)) {
   227  		sort.Sort(byBeginOffset(chunks))
   228  	}
   229  
   230  	return adjacent(chunks)
   231  }
   232  
   233  var adjacent = index.Adjacent
   234  
   235  func (i *Index) sort() {
   236  	if !i.isSorted {
   237  		for _, ref := range i.refs {
   238  			sort.Sort(byBinNumber(ref.bins))
   239  			for _, bin := range ref.bins {
   240  				sort.Sort(byBeginOffset(bin.chunks))
   241  			}
   242  		}
   243  		i.isSorted = true
   244  	}
   245  }
   246  
   247  // MergeChunks applies the given MergeStrategy to all bins in the Index.
   248  func (i *Index) MergeChunks(s index.MergeStrategy) {
   249  	if s == nil {
   250  		return
   251  	}
   252  	for _, ref := range i.refs {
   253  		for b, bin := range ref.bins {
   254  			if !sort.IsSorted(byBeginOffset(bin.chunks)) {
   255  				sort.Sort(byBeginOffset(bin.chunks))
   256  			}
   257  			ref.bins[b].chunks = s(bin.chunks)
   258  			if !sort.IsSorted(byBeginOffset(bin.chunks)) {
   259  				sort.Sort(byBeginOffset(bin.chunks))
   260  			}
   261  		}
   262  	}
   263  }
   264  
   265  func makeOffset(vOff uint64) bgzf.Offset {
   266  	return bgzf.Offset{
   267  		File:  int64(vOff >> 16),
   268  		Block: uint16(vOff),
   269  	}
   270  }
   271  
   272  func isZero(o bgzf.Offset) bool {
   273  	return o == bgzf.Offset{}
   274  }
   275  
   276  func vOffset(o bgzf.Offset) int64 {
   277  	return o.File<<16 | int64(o.Block)
   278  }
   279  
   280  type byBinNumber []bin
   281  
   282  func (b byBinNumber) Len() int           { return len(b) }
   283  func (b byBinNumber) Less(i, j int) bool { return b[i].bin < b[j].bin }
   284  func (b byBinNumber) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   285  
   286  type byBeginOffset []bgzf.Chunk
   287  
   288  func (c byBeginOffset) Len() int           { return len(c) }
   289  func (c byBeginOffset) Less(i, j int) bool { return vOffset(c[i].Begin) < vOffset(c[j].Begin) }
   290  func (c byBeginOffset) Swap(i, j int)      { c[i], c[j] = c[j], c[i] }
   291  
   292  // calculate bin given an alignment covering [beg,end) (zero-based, half-close-half-open)
   293  func reg2bin(beg, end int64, minShift, depth uint32) uint32 {
   294  	end--
   295  	s := minShift
   296  	t := uint32(((1 << (depth * nextBinShift)) - 1) / 7)
   297  	for level := depth; level > 0; level-- {
   298  		offset := beg >> s
   299  		if offset == end>>s {
   300  			return t + uint32(offset)
   301  		}
   302  		s += nextBinShift
   303  		t -= 1 << (level * nextBinShift)
   304  	}
   305  	return 0
   306  }
   307  
   308  // calculate the list of bins that may overlap with region [beg,end) (zero-based)
   309  func reg2bins(beg, end int64, minShift, depth uint32) []uint32 {
   310  	end--
   311  	var list []uint32
   312  	s := minShift + depth*nextBinShift
   313  	for level, t := uint32(0), uint32(0); level <= depth; level++ {
   314  		b := t + uint32(beg>>s)
   315  		e := t + uint32(end>>s)
   316  		for i := b; i <= e; i++ {
   317  			list = append(list, i)
   318  		}
   319  		s -= nextBinShift
   320  		t += 1 << (level * nextBinShift)
   321  	}
   322  	return list
   323  }