github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/tabix/tabix.go (about)

     1  // Copyright ©2014 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package tabix implements tabix coordinate sorted indexing.
     6  package tabix
     7  
     8  import (
     9  	"encoding/binary"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"strings"
    14  
    15  	"github.com/Schaudge/hts/bgzf"
    16  	"github.com/Schaudge/hts/bgzf/index"
    17  	"github.com/Schaudge/hts/internal"
    18  )
    19  
    20  // Index is a tabix index.
    21  type Index struct {
    22  	Format    byte
    23  	ZeroBased bool
    24  
    25  	NameColumn  int32
    26  	BeginColumn int32
    27  	EndColumn   int32
    28  
    29  	MetaChar rune
    30  	Skip     int32
    31  
    32  	refNames []string
    33  	nameMap  map[string]int
    34  
    35  	idx internal.Index
    36  }
    37  
    38  // New returns a new tabix index.
    39  func New() *Index {
    40  	return &Index{nameMap: make(map[string]int)}
    41  }
    42  
    43  // NumRefs returns the number of references in the index.
    44  func (i *Index) NumRefs() int {
    45  	return len(i.idx.Refs)
    46  }
    47  
    48  // Names returns the reference names in the index. The returned
    49  // slice should not be altered.
    50  func (i *Index) Names() []string {
    51  	return i.refNames
    52  }
    53  
    54  // IDs returns a map of strings to integer IDs. The returned
    55  // map should not be altered.
    56  func (i *Index) IDs() map[string]int {
    57  	return i.nameMap
    58  }
    59  
    60  // ReferenceStats returns the index statistics for the given reference and true
    61  // if the statistics are valid.
    62  func (i *Index) ReferenceStats(id int) (stats index.ReferenceStats, ok bool) {
    63  	s := i.idx.Refs[id].Stats
    64  	if s == nil {
    65  		return index.ReferenceStats{}, false
    66  	}
    67  	return index.ReferenceStats(*s), true
    68  }
    69  
    70  // Unmapped returns the number of unmapped reads and true if the count is valid.
    71  func (i *Index) Unmapped() (n uint64, ok bool) {
    72  	if i.idx.Unmapped == nil {
    73  		return 0, false
    74  	}
    75  	return *i.idx.Unmapped, true
    76  }
    77  
    78  // Record wraps types that may be indexed by an Index.
    79  type Record interface {
    80  	RefName() string
    81  	Start() int
    82  	End() int
    83  }
    84  
    85  type tabixShim struct {
    86  	id, start, end int
    87  }
    88  
    89  func (r tabixShim) RefID() int { return r.id }
    90  func (r tabixShim) Start() int { return r.start }
    91  func (r tabixShim) End() int   { return r.end }
    92  
    93  // Add records the SAM record as having being located at the given chunk.
    94  func (i *Index) Add(r Record, c bgzf.Chunk, placed, mapped bool) error {
    95  	refName := r.RefName()
    96  	rid, ok := i.nameMap[refName]
    97  	if !ok {
    98  		rid = len(i.refNames)
    99  		i.refNames = append(i.refNames, refName)
   100  	}
   101  	shim := tabixShim{id: rid, start: r.Start(), end: r.End()}
   102  	return i.idx.Add(shim, internal.BinFor(r.Start(), r.End()), c, placed, mapped)
   103  }
   104  
   105  // Chunks returns a []bgzf.Chunk that corresponds to the given genomic interval.
   106  func (i *Index) Chunks(ref string, beg, end int) ([]bgzf.Chunk, error) {
   107  	id, ok := i.nameMap[ref]
   108  	if !ok {
   109  		return nil, index.ErrNoReference
   110  	}
   111  	chunks, err := i.idx.Chunks(id, beg, end)
   112  	if err != nil {
   113  		return nil, err
   114  	}
   115  	return adjacent(chunks), nil
   116  }
   117  
   118  var adjacent = index.Adjacent
   119  
   120  // MergeChunks applies the given MergeStrategy to all bins in the Index.
   121  func (i *Index) MergeChunks(s index.MergeStrategy) {
   122  	i.idx.MergeChunks(s)
   123  }
   124  
   125  var tbiMagic = [4]byte{'T', 'B', 'I', 0x1}
   126  
   127  // ReadFrom reads the tabix index from the given io.Reader. Note that
   128  // the tabix specification states that the index is stored as BGZF, but
   129  // ReadFrom does not perform decompression.
   130  func ReadFrom(r io.Reader) (*Index, error) {
   131  	var (
   132  		idx   Index
   133  		magic [4]byte
   134  		err   error
   135  	)
   136  	err = binary.Read(r, binary.LittleEndian, &magic)
   137  	if err != nil {
   138  		return nil, err
   139  	}
   140  	if magic != tbiMagic {
   141  		return nil, errors.New("tabix: magic number mismatch")
   142  	}
   143  
   144  	var n int32
   145  	err = binary.Read(r, binary.LittleEndian, &n)
   146  	if err != nil {
   147  		return nil, err
   148  	}
   149  	if n == 0 {
   150  		return nil, nil
   151  	}
   152  
   153  	err = readTabixHeader(r, &idx)
   154  	if err != nil {
   155  		return nil, err
   156  	}
   157  	if len(idx.refNames) != int(n) {
   158  		return nil, fmt.Errorf("tabix: name count mismatch: %d != %d", len(idx.refNames), n)
   159  	}
   160  	idx.nameMap = make(map[string]int)
   161  	for i, n := range idx.refNames {
   162  		idx.nameMap[n] = i
   163  	}
   164  
   165  	idx.idx, err = internal.ReadIndex(r, n, "tabix")
   166  	if err != nil {
   167  		return nil, err
   168  	}
   169  	return &idx, nil
   170  }
   171  
   172  func readTabixHeader(r io.Reader, idx *Index) error {
   173  	var (
   174  		format int32
   175  		err    error
   176  	)
   177  	err = binary.Read(r, binary.LittleEndian, &format)
   178  	if err != nil {
   179  		return fmt.Errorf("tabix: failed to read format: %v", err)
   180  	}
   181  	idx.Format = byte(format)
   182  	idx.ZeroBased = format&0x10000 != 0
   183  
   184  	err = binary.Read(r, binary.LittleEndian, &idx.NameColumn)
   185  	if err != nil {
   186  		return fmt.Errorf("tabix: failed to read name column index: %v", err)
   187  	}
   188  	err = binary.Read(r, binary.LittleEndian, &idx.BeginColumn)
   189  	if err != nil {
   190  		return fmt.Errorf("tabix: failed to read begin column index: %v", err)
   191  	}
   192  	err = binary.Read(r, binary.LittleEndian, &idx.EndColumn)
   193  	if err != nil {
   194  		return fmt.Errorf("tabix: failed to read end column index: %v", err)
   195  	}
   196  	err = binary.Read(r, binary.LittleEndian, &idx.MetaChar)
   197  	if err != nil {
   198  		return fmt.Errorf("tabix: failed to read metacharacter: %v", err)
   199  	}
   200  	err = binary.Read(r, binary.LittleEndian, &idx.Skip)
   201  	if err != nil {
   202  		return fmt.Errorf("tabix: failed to read skip count: %v", err)
   203  	}
   204  	var n int32
   205  	err = binary.Read(r, binary.LittleEndian, &n)
   206  	if err != nil {
   207  		return fmt.Errorf("tabix: failed to read name lengths: %v", err)
   208  	}
   209  	nameBytes := make([]byte, n)
   210  	_, err = io.ReadFull(r, nameBytes)
   211  	if err != nil {
   212  		return fmt.Errorf("tabix: failed to read names: %v", err)
   213  	}
   214  	names := string(nameBytes)
   215  	if names[len(names)-1] != 0 {
   216  		return errors.New("tabix: last name not zero-terminated")
   217  	}
   218  	idx.refNames = strings.Split(names[:len(names)-1], string(0))
   219  
   220  	return nil
   221  }
   222  
   223  // WriteTo writes the index to the given io.Writer. Note that
   224  // the tabix specification states that the index is stored as BGZF, but
   225  // WriteTo does not perform compression.
   226  func WriteTo(w io.Writer, idx *Index) error {
   227  	err := binary.Write(w, binary.LittleEndian, tbiMagic)
   228  	if err != nil {
   229  		return err
   230  	}
   231  
   232  	err = binary.Write(w, binary.LittleEndian, int32(len(idx.idx.Refs)))
   233  	if err != nil {
   234  		return err
   235  	}
   236  	err = writeTabixHeader(w, idx)
   237  	if err != nil {
   238  		return err
   239  	}
   240  
   241  	return internal.WriteIndex(w, &idx.idx, "tabix")
   242  }
   243  
   244  func writeTabixHeader(w io.Writer, idx *Index) error {
   245  	var err error
   246  	format := int32(idx.Format)
   247  	if idx.ZeroBased {
   248  		format |= 0x10000
   249  	}
   250  	err = binary.Write(w, binary.LittleEndian, format)
   251  	if err != nil {
   252  		return fmt.Errorf("tabix: failed to write format: %v", err)
   253  	}
   254  	err = binary.Write(w, binary.LittleEndian, idx.NameColumn)
   255  	if err != nil {
   256  		return fmt.Errorf("tabix: failed to write name column index: %v", err)
   257  	}
   258  	err = binary.Write(w, binary.LittleEndian, idx.BeginColumn)
   259  	if err != nil {
   260  		return fmt.Errorf("tabix: failed to write begin column index: %v", err)
   261  	}
   262  	err = binary.Write(w, binary.LittleEndian, idx.EndColumn)
   263  	if err != nil {
   264  		return fmt.Errorf("tabix: failed to write end column index: %v", err)
   265  	}
   266  	err = binary.Write(w, binary.LittleEndian, idx.MetaChar)
   267  	if err != nil {
   268  		return fmt.Errorf("tabix: failed to write metacharacter: %v", err)
   269  	}
   270  	err = binary.Write(w, binary.LittleEndian, idx.Skip)
   271  	if err != nil {
   272  		return fmt.Errorf("tabix: failed to write skip count: %v", err)
   273  	}
   274  	var n int32
   275  	for _, name := range idx.refNames {
   276  		n += int32(len(name) + 1)
   277  	}
   278  	err = binary.Write(w, binary.LittleEndian, n)
   279  	if err != nil {
   280  		return fmt.Errorf("tabix: failed to write name lengths: %v", err)
   281  	}
   282  	for _, name := range idx.refNames {
   283  		_, err = w.Write([]byte(name))
   284  		if err != nil {
   285  			return fmt.Errorf("tabix: failed to write name: %v", err)
   286  		}
   287  		_, err = w.Write([]byte{0})
   288  		if err != nil {
   289  			return fmt.Errorf("tabix: failed to write name: %v", err)
   290  		}
   291  	}
   292  	return nil
   293  }