github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/tabix/tabix.go (about) 1 // Copyright ©2014 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package tabix implements tabix coordinate sorted indexing. 6 package tabix 7 8 import ( 9 "encoding/binary" 10 "errors" 11 "fmt" 12 "io" 13 "strings" 14 15 "github.com/Schaudge/hts/bgzf" 16 "github.com/Schaudge/hts/bgzf/index" 17 "github.com/Schaudge/hts/internal" 18 ) 19 20 // Index is a tabix index. 21 type Index struct { 22 Format byte 23 ZeroBased bool 24 25 NameColumn int32 26 BeginColumn int32 27 EndColumn int32 28 29 MetaChar rune 30 Skip int32 31 32 refNames []string 33 nameMap map[string]int 34 35 idx internal.Index 36 } 37 38 // New returns a new tabix index. 39 func New() *Index { 40 return &Index{nameMap: make(map[string]int)} 41 } 42 43 // NumRefs returns the number of references in the index. 44 func (i *Index) NumRefs() int { 45 return len(i.idx.Refs) 46 } 47 48 // Names returns the reference names in the index. The returned 49 // slice should not be altered. 50 func (i *Index) Names() []string { 51 return i.refNames 52 } 53 54 // IDs returns a map of strings to integer IDs. The returned 55 // map should not be altered. 56 func (i *Index) IDs() map[string]int { 57 return i.nameMap 58 } 59 60 // ReferenceStats returns the index statistics for the given reference and true 61 // if the statistics are valid. 62 func (i *Index) ReferenceStats(id int) (stats index.ReferenceStats, ok bool) { 63 s := i.idx.Refs[id].Stats 64 if s == nil { 65 return index.ReferenceStats{}, false 66 } 67 return index.ReferenceStats(*s), true 68 } 69 70 // Unmapped returns the number of unmapped reads and true if the count is valid. 71 func (i *Index) Unmapped() (n uint64, ok bool) { 72 if i.idx.Unmapped == nil { 73 return 0, false 74 } 75 return *i.idx.Unmapped, true 76 } 77 78 // Record wraps types that may be indexed by an Index. 79 type Record interface { 80 RefName() string 81 Start() int 82 End() int 83 } 84 85 type tabixShim struct { 86 id, start, end int 87 } 88 89 func (r tabixShim) RefID() int { return r.id } 90 func (r tabixShim) Start() int { return r.start } 91 func (r tabixShim) End() int { return r.end } 92 93 // Add records the SAM record as having being located at the given chunk. 94 func (i *Index) Add(r Record, c bgzf.Chunk, placed, mapped bool) error { 95 refName := r.RefName() 96 rid, ok := i.nameMap[refName] 97 if !ok { 98 rid = len(i.refNames) 99 i.refNames = append(i.refNames, refName) 100 } 101 shim := tabixShim{id: rid, start: r.Start(), end: r.End()} 102 return i.idx.Add(shim, internal.BinFor(r.Start(), r.End()), c, placed, mapped) 103 } 104 105 // Chunks returns a []bgzf.Chunk that corresponds to the given genomic interval. 106 func (i *Index) Chunks(ref string, beg, end int) ([]bgzf.Chunk, error) { 107 id, ok := i.nameMap[ref] 108 if !ok { 109 return nil, index.ErrNoReference 110 } 111 chunks, err := i.idx.Chunks(id, beg, end) 112 if err != nil { 113 return nil, err 114 } 115 return adjacent(chunks), nil 116 } 117 118 var adjacent = index.Adjacent 119 120 // MergeChunks applies the given MergeStrategy to all bins in the Index. 121 func (i *Index) MergeChunks(s index.MergeStrategy) { 122 i.idx.MergeChunks(s) 123 } 124 125 var tbiMagic = [4]byte{'T', 'B', 'I', 0x1} 126 127 // ReadFrom reads the tabix index from the given io.Reader. Note that 128 // the tabix specification states that the index is stored as BGZF, but 129 // ReadFrom does not perform decompression. 130 func ReadFrom(r io.Reader) (*Index, error) { 131 var ( 132 idx Index 133 magic [4]byte 134 err error 135 ) 136 err = binary.Read(r, binary.LittleEndian, &magic) 137 if err != nil { 138 return nil, err 139 } 140 if magic != tbiMagic { 141 return nil, errors.New("tabix: magic number mismatch") 142 } 143 144 var n int32 145 err = binary.Read(r, binary.LittleEndian, &n) 146 if err != nil { 147 return nil, err 148 } 149 if n == 0 { 150 return nil, nil 151 } 152 153 err = readTabixHeader(r, &idx) 154 if err != nil { 155 return nil, err 156 } 157 if len(idx.refNames) != int(n) { 158 return nil, fmt.Errorf("tabix: name count mismatch: %d != %d", len(idx.refNames), n) 159 } 160 idx.nameMap = make(map[string]int) 161 for i, n := range idx.refNames { 162 idx.nameMap[n] = i 163 } 164 165 idx.idx, err = internal.ReadIndex(r, n, "tabix") 166 if err != nil { 167 return nil, err 168 } 169 return &idx, nil 170 } 171 172 func readTabixHeader(r io.Reader, idx *Index) error { 173 var ( 174 format int32 175 err error 176 ) 177 err = binary.Read(r, binary.LittleEndian, &format) 178 if err != nil { 179 return fmt.Errorf("tabix: failed to read format: %v", err) 180 } 181 idx.Format = byte(format) 182 idx.ZeroBased = format&0x10000 != 0 183 184 err = binary.Read(r, binary.LittleEndian, &idx.NameColumn) 185 if err != nil { 186 return fmt.Errorf("tabix: failed to read name column index: %v", err) 187 } 188 err = binary.Read(r, binary.LittleEndian, &idx.BeginColumn) 189 if err != nil { 190 return fmt.Errorf("tabix: failed to read begin column index: %v", err) 191 } 192 err = binary.Read(r, binary.LittleEndian, &idx.EndColumn) 193 if err != nil { 194 return fmt.Errorf("tabix: failed to read end column index: %v", err) 195 } 196 err = binary.Read(r, binary.LittleEndian, &idx.MetaChar) 197 if err != nil { 198 return fmt.Errorf("tabix: failed to read metacharacter: %v", err) 199 } 200 err = binary.Read(r, binary.LittleEndian, &idx.Skip) 201 if err != nil { 202 return fmt.Errorf("tabix: failed to read skip count: %v", err) 203 } 204 var n int32 205 err = binary.Read(r, binary.LittleEndian, &n) 206 if err != nil { 207 return fmt.Errorf("tabix: failed to read name lengths: %v", err) 208 } 209 nameBytes := make([]byte, n) 210 _, err = io.ReadFull(r, nameBytes) 211 if err != nil { 212 return fmt.Errorf("tabix: failed to read names: %v", err) 213 } 214 names := string(nameBytes) 215 if names[len(names)-1] != 0 { 216 return errors.New("tabix: last name not zero-terminated") 217 } 218 idx.refNames = strings.Split(names[:len(names)-1], string(0)) 219 220 return nil 221 } 222 223 // WriteTo writes the index to the given io.Writer. Note that 224 // the tabix specification states that the index is stored as BGZF, but 225 // WriteTo does not perform compression. 226 func WriteTo(w io.Writer, idx *Index) error { 227 err := binary.Write(w, binary.LittleEndian, tbiMagic) 228 if err != nil { 229 return err 230 } 231 232 err = binary.Write(w, binary.LittleEndian, int32(len(idx.idx.Refs))) 233 if err != nil { 234 return err 235 } 236 err = writeTabixHeader(w, idx) 237 if err != nil { 238 return err 239 } 240 241 return internal.WriteIndex(w, &idx.idx, "tabix") 242 } 243 244 func writeTabixHeader(w io.Writer, idx *Index) error { 245 var err error 246 format := int32(idx.Format) 247 if idx.ZeroBased { 248 format |= 0x10000 249 } 250 err = binary.Write(w, binary.LittleEndian, format) 251 if err != nil { 252 return fmt.Errorf("tabix: failed to write format: %v", err) 253 } 254 err = binary.Write(w, binary.LittleEndian, idx.NameColumn) 255 if err != nil { 256 return fmt.Errorf("tabix: failed to write name column index: %v", err) 257 } 258 err = binary.Write(w, binary.LittleEndian, idx.BeginColumn) 259 if err != nil { 260 return fmt.Errorf("tabix: failed to write begin column index: %v", err) 261 } 262 err = binary.Write(w, binary.LittleEndian, idx.EndColumn) 263 if err != nil { 264 return fmt.Errorf("tabix: failed to write end column index: %v", err) 265 } 266 err = binary.Write(w, binary.LittleEndian, idx.MetaChar) 267 if err != nil { 268 return fmt.Errorf("tabix: failed to write metacharacter: %v", err) 269 } 270 err = binary.Write(w, binary.LittleEndian, idx.Skip) 271 if err != nil { 272 return fmt.Errorf("tabix: failed to write skip count: %v", err) 273 } 274 var n int32 275 for _, name := range idx.refNames { 276 n += int32(len(name) + 1) 277 } 278 err = binary.Write(w, binary.LittleEndian, n) 279 if err != nil { 280 return fmt.Errorf("tabix: failed to write name lengths: %v", err) 281 } 282 for _, name := range idx.refNames { 283 _, err = w.Write([]byte(name)) 284 if err != nil { 285 return fmt.Errorf("tabix: failed to write name: %v", err) 286 } 287 _, err = w.Write([]byte{0}) 288 if err != nil { 289 return fmt.Errorf("tabix: failed to write name: %v", err) 290 } 291 } 292 return nil 293 }