github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bam/reader.go (about) 1 // Copyright ©2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bam 6 7 import ( 8 "encoding/binary" 9 "errors" 10 "fmt" 11 "io" 12 "reflect" 13 "unsafe" 14 15 "github.com/Schaudge/hts/bgzf" 16 "github.com/Schaudge/hts/sam" 17 ) 18 19 // Reader implements BAM data reading. 20 type Reader struct { 21 r *bgzf.Reader 22 h *sam.Header 23 c *bgzf.Chunk 24 25 // references is cached header 26 // reference count. 27 references int32 28 29 // omit specifies how much of the 30 // record should be omitted during 31 // a read of the BAM input. 32 omit int 33 34 lastChunk bgzf.Chunk 35 36 // sizeBuf and sizeStorage are used to read the block size of each record 37 // without having to allocate new storage and a slice everytime. 38 sizeStorage [4]byte 39 sizeBuf []byte 40 } 41 42 const maxBAMRecordSize = 0xffffff 43 44 // NewReader returns a new Reader using the given io.Reader 45 // and setting the read concurrency to rd. If rd is zero 46 // concurrency is set to GOMAXPROCS. The returned Reader 47 // should be closed after use to avoid leaking resources. 48 func NewReader(r io.Reader, rd int) (*Reader, error) { 49 bg, err := bgzf.NewReader(r, rd) 50 if err != nil { 51 return nil, err 52 } 53 h, _ := sam.NewHeader(nil, nil) 54 br := &Reader{ 55 r: bg, 56 h: h, 57 58 references: int32(len(h.Refs())), 59 } 60 err = br.h.DecodeBinary(br.r) 61 if err != nil { 62 return nil, err 63 } 64 br.lastChunk.End = br.r.LastChunk().End 65 br.sizeBuf = br.sizeStorage[:] 66 return br, nil 67 } 68 69 // Header returns the SAM Header held by the Reader. 70 func (br *Reader) Header() *sam.Header { 71 return br.h 72 } 73 74 // BAM record layout. 75 type bamRecordFixed struct { 76 blockSize int32 77 refID int32 78 pos int32 79 nLen uint8 80 mapQ uint8 81 bin uint16 82 nCigar uint16 83 flags sam.Flags 84 lSeq int32 85 nextRefID int32 86 nextPos int32 87 tLen int32 88 } 89 90 var ( 91 lenFieldSize = binary.Size(bamRecordFixed{}.blockSize) 92 bamFixedRemainder = binary.Size(bamRecordFixed{}) - lenFieldSize 93 ) 94 95 func vOffset(o bgzf.Offset) int64 { 96 return o.File<<16 | int64(o.Block) 97 } 98 99 // Omit specifies what portions of the Record to omit reading. 100 // When o is None, a full sam.Record is returned by Read, when o 101 // is AuxTags the auxiliary tag data is omitted and when o is 102 // AllVariableLengthData, sequence, quality and auxiliary data 103 // is omitted. 104 func (br *Reader) Omit(o int) { 105 br.omit = o 106 } 107 108 // None, AuxTags and AllVariableLengthData are values taken 109 // by the Reader Omit method. 110 const ( 111 None = iota // Omit no field data from the record. 112 AuxTags // Omit auxiliary tag data. 113 AllVariableLengthData // Omit sequence, quality and auxiliary data. 114 ) 115 116 // Read returns the next sam.Record in the BAM stream. 117 // 118 // The sam.Record returned will not contain the sequence, quality or 119 // auxiliary tag data if Omit(AllVariableLengthData) has been called 120 // prior to the Read call and will not contain the auxiliary tag data 121 // is Omit(AuxTags) has been called. 122 func (br *Reader) Read() (*sam.Record, error) { 123 if br.c != nil && vOffset(br.r.LastChunk().End) >= vOffset(br.c.End) { 124 return nil, io.EOF 125 } 126 // Use a pool of buffer's to share buffers between concurrent clients 127 // and hence reduce the number of allocations required. 128 buf := bufPool.Get().([]byte) 129 if err := readAlignment(br, &buf); err != nil { 130 bufPool.Put(buf) 131 return nil, err 132 } 133 rec, err := unmarshal(buf, br.h, br.omit) 134 bufPool.Put(buf) 135 return rec, err 136 } 137 138 // Unmarshal a serialized record. Parameter omit is the value of Reader.Omit(). 139 // Most callers should pass zero as omit. 140 func unmarshal(b []byte, header *sam.Header, omit int) (*sam.Record, error) { 141 rec := sam.GetFromFreePool() 142 if len(b) < 32 { 143 return nil, errors.New("bam: record too short") 144 } 145 // Need to use int(int32(uint32)) to ensure 2's complement extension of -1. 146 refID := int(int32(binary.LittleEndian.Uint32(b))) 147 rec.Pos = int(int32(binary.LittleEndian.Uint32(b[4:]))) 148 nLen := int(b[8]) 149 rec.MapQ = b[9] 150 nCigar := int(binary.LittleEndian.Uint16(b[12:])) 151 rec.Flags = sam.Flags(binary.LittleEndian.Uint16(b[14:])) 152 lSeq := int(binary.LittleEndian.Uint32(b[16:])) 153 nextRefID := int(int32(binary.LittleEndian.Uint32(b[20:]))) 154 rec.MatePos = int(int32(binary.LittleEndian.Uint32(b[24:]))) 155 rec.TempLen = int(int32(binary.LittleEndian.Uint32(b[28:]))) 156 157 // Read variable length data. 158 pos := 32 159 160 blen := len(b) - pos 161 cigarOffset := alignOffset(blen) // store the cigar int32s here 162 auxOffset := alignOffset(cigarOffset + (nCigar * 4)) // store the AuxFields here 163 164 nDoubletBytes := (lSeq + 1) >> 1 165 bAuxOffset := pos + nLen + (nCigar * 4) + nDoubletBytes + lSeq 166 if len(b) < bAuxOffset { 167 return nil, fmt.Errorf("Corrupt BAM aux record: len(b)=%d, auxoffset=%d", len(b), bAuxOffset) 168 } 169 nAuxFields, err := countAuxFields(b[bAuxOffset:]) 170 if err != nil { 171 return nil, err 172 } 173 shadowSize := auxOffset + (nAuxFields * sizeofSliceHeader) 174 175 // shadowBuf is used as an 'arena' from which all objects/slices 176 // required to store the result of parsing the bam alignment record. 177 // This reduces the load on GC and consequently allows for better 178 // scalability with the number of cores used by clients of this package. 179 shadowOffset := 0 180 resizeScratch(&rec.Scratch, shadowSize) 181 shadowBuf := rec.Scratch 182 copy(shadowBuf, b[pos:]) 183 184 bufHdr := (*reflect.SliceHeader)(unsafe.Pointer(&shadowBuf)) 185 186 // Note that rec.Name now points to the shadow buffer 187 hdr := (*reflect.StringHeader)(unsafe.Pointer(&rec.Name)) 188 hdr.Data = uintptr(unsafe.Pointer(bufHdr.Data)) 189 hdr.Len = nLen - 1 // drop trailing '\0' 190 shadowOffset += nLen 191 192 var sliceHdr *reflect.SliceHeader 193 194 if nCigar > 0 { 195 for i := 0; i < nCigar; i++ { 196 *(*uint32)(unsafe.Pointer(&shadowBuf[cigarOffset+(i*4)])) = binary.LittleEndian.Uint32(shadowBuf[shadowOffset+(i*4):]) 197 } 198 sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Cigar)) 199 sliceHdr.Data = bufHdr.Data + uintptr(cigarOffset) 200 sliceHdr.Len = nCigar 201 sliceHdr.Cap = sliceHdr.Len 202 shadowOffset += nCigar * 4 203 } else { 204 sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Cigar)) 205 sliceHdr.Data = uintptr(0) 206 sliceHdr.Len = 0 207 sliceHdr.Cap = 0 208 } 209 210 if omit >= AllVariableLengthData { 211 goto done 212 } 213 214 rec.Seq.Length = lSeq 215 216 sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Seq.Seq)) 217 sliceHdr.Data = uintptr(unsafe.Pointer(bufHdr.Data + uintptr(shadowOffset))) 218 sliceHdr.Len = nDoubletBytes 219 sliceHdr.Cap = sliceHdr.Len 220 shadowOffset += nDoubletBytes 221 222 if omit >= AuxTags { 223 goto done 224 } 225 226 sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.Qual)) 227 sliceHdr.Data = uintptr(unsafe.Pointer(bufHdr.Data + uintptr(shadowOffset))) 228 sliceHdr.Len = lSeq 229 sliceHdr.Cap = sliceHdr.Len 230 231 shadowOffset += lSeq 232 233 if nAuxFields > 0 { 234 // Clear the array before updating rec.AuxFields. GC will be 235 // confused otherwise. 236 for i := auxOffset; i < auxOffset+nAuxFields*sizeofSliceHeader; i++ { 237 shadowBuf[i] = 0 238 } 239 sliceHdr = (*reflect.SliceHeader)(unsafe.Pointer(&rec.AuxFields)) 240 sliceHdr.Data = uintptr(unsafe.Pointer(bufHdr.Data + uintptr(auxOffset))) 241 sliceHdr.Len = nAuxFields 242 sliceHdr.Cap = sliceHdr.Len 243 parseAux(shadowBuf[shadowOffset:blen], rec.AuxFields) 244 } 245 246 done: 247 refs := len(header.Refs()) 248 if refID != -1 { 249 if refID < -1 || refID >= refs { 250 return nil, errors.New("bam: reference id out of range") 251 } 252 rec.Ref = header.Refs()[refID] 253 } 254 if nextRefID != -1 { 255 if refID == nextRefID { 256 rec.MateRef = rec.Ref 257 return rec, nil 258 } 259 if nextRefID < -1 || nextRefID >= refs { 260 return nil, errors.New("bam: mate reference id out of range") 261 } 262 rec.MateRef = header.Refs()[nextRefID] 263 } 264 return rec, nil 265 } 266 267 // SetCache sets the cache to be used by the Reader. 268 func (bg *Reader) SetCache(c bgzf.Cache) { 269 bg.r.SetCache(c) 270 } 271 272 // Seek performs a seek to the specified bgzf.Offset. 273 func (br *Reader) Seek(off bgzf.Offset) error { 274 return br.r.Seek(off) 275 } 276 277 // SetChunk sets a limited range of the underlying BGZF file to read, after 278 // seeking to the start of the given chunk. It may be used to iterate over 279 // a defined genomic interval. 280 func (br *Reader) SetChunk(c *bgzf.Chunk) error { 281 if c != nil { 282 err := br.r.Seek(c.Begin) 283 if err != nil { 284 return err 285 } 286 } 287 br.c = c 288 return nil 289 } 290 291 // LastChunk returns the bgzf.Chunk corresponding to the last Read operation. 292 // The bgzf.Chunk returned is only valid if the last Read operation returned a 293 // nil error. 294 func (br *Reader) LastChunk() bgzf.Chunk { 295 return br.lastChunk 296 } 297 298 // Close closes the Reader. 299 func (br *Reader) Close() error { 300 return br.r.Close() 301 } 302 303 // Iterator wraps a Reader to provide a convenient loop interface for reading BAM data. 304 // Successive calls to the Next method will step through the features of the provided 305 // Reader. Iteration stops unrecoverably at EOF or the first error. 306 type Iterator struct { 307 r *Reader 308 309 chunks []bgzf.Chunk 310 311 rec *sam.Record 312 err error 313 } 314 315 // NewIterator returns a Iterator to read from r, limiting the reads to the provided 316 // chunks. 317 // 318 // chunks, err := idx.Chunks(ref, beg, end) 319 // if err != nil { 320 // return err 321 // } 322 // i, err := NewIterator(r, chunks) 323 // if err != nil { 324 // return err 325 // } 326 // for i.Next() { 327 // fn(i.Record()) 328 // } 329 // return i.Close() 330 // 331 func NewIterator(r *Reader, chunks []bgzf.Chunk) (*Iterator, error) { 332 if len(chunks) == 0 { 333 return &Iterator{r: r, err: io.EOF}, nil 334 } 335 err := r.SetChunk(&chunks[0]) 336 if err != nil { 337 return nil, err 338 } 339 chunks = chunks[1:] 340 return &Iterator{r: r, chunks: chunks}, nil 341 } 342 343 // Next advances the Iterator past the next record, which will then be available through 344 // the Record method. It returns false when the iteration stops, either by reaching the end of the 345 // input or an error. After Next returns false, the Error method will return any error that 346 // occurred during iteration, except that if it was io.EOF, Error will return nil. 347 func (i *Iterator) Next() bool { 348 if i.err != nil { 349 return false 350 } 351 i.rec, i.err = i.r.Read() 352 if len(i.chunks) != 0 && i.err == io.EOF { 353 i.err = i.r.SetChunk(&i.chunks[0]) 354 i.chunks = i.chunks[1:] 355 return i.Next() 356 } 357 return i.err == nil 358 } 359 360 // Error returns the first non-EOF error that was encountered by the Iterator. 361 func (i *Iterator) Error() error { 362 if i.err == io.EOF { 363 return nil 364 } 365 return i.err 366 } 367 368 // Record returns the most recent record read by a call to Next. 369 func (i *Iterator) Record() *sam.Record { return i.rec } 370 371 // Close releases the underlying Reader. 372 func (i *Iterator) Close() error { 373 i.r.SetChunk(nil) 374 return i.Error() 375 } 376 377 var jumps = [256]int{ 378 'A': 1, 379 'c': 1, 'C': 1, 380 's': 2, 'S': 2, 381 'i': 4, 'I': 4, 382 'f': 4, 383 'Z': -1, 384 'H': -1, 385 'B': -1, 386 } 387 388 var errCorruptAuxField = errors.New("Corrupt aux field") 389 390 // countAuxFields examines the data of a SAM record's OPT field to determine 391 // the number of auxFields there are. 392 func countAuxFields(aux []byte) (int, error) { 393 naux := 0 394 for i := 0; i+2 < len(aux); { 395 t := aux[i+2] 396 switch j := jumps[t]; { 397 case j > 0: 398 j += 3 399 i += j 400 naux++ 401 case j < 0: 402 switch t { 403 case 'Z', 'H': 404 var ( 405 j int 406 v byte 407 ) 408 for j, v = range aux[i:] { 409 if v == 0 { // C string termination 410 break // Truncate terminal zero. 411 } 412 } 413 i += j + 1 414 naux++ 415 case 'B': 416 if len(aux) < i+8 { 417 return -1, errCorruptAuxField 418 } 419 length := binary.LittleEndian.Uint32(aux[i+4 : i+8]) 420 j = int(length)*jumps[aux[i+3]] + int(unsafe.Sizeof(length)) + 4 421 i += j 422 naux++ 423 } 424 default: 425 return -1, errCorruptAuxField 426 } 427 } 428 return naux, nil 429 } 430 431 // parseAux examines the data of a SAM record's OPT fields, 432 // returning a slice of sam.Aux that are backed by the original data. 433 func parseAux(aux []byte, aa []sam.Aux) { 434 naa := 0 435 /* var sliceHdr *reflect.SliceHeader 436 auxSlice := (*reflect.SliceHeader)(unsafe.Pointer(&aux))*/ 437 for i := 0; i+2 < len(aux); { 438 t := aux[i+2] 439 switch j := jumps[t]; { 440 case j > 0: 441 j += 3 442 aa[naa] = sam.Aux(aux[i : i+j : i+j]) 443 naa++ 444 i += j 445 case j < 0: 446 switch t { 447 case 'Z', 'H': 448 var ( 449 j int 450 v byte 451 ) 452 for j, v = range aux[i:] { 453 if v == 0 { // C string termination 454 break // Truncate terminal zero. 455 } 456 } 457 aa[naa] = sam.Aux(aux[i : i+j : i+j]) 458 naa++ 459 i += j + 1 460 case 'B': 461 length := binary.LittleEndian.Uint32(aux[i+4 : i+8]) 462 j = int(length)*jumps[aux[i+3]] + int(unsafe.Sizeof(length)) + 4 463 aa[naa] = sam.Aux(aux[i : i+j : i+j]) 464 naa++ 465 i += j 466 } 467 default: 468 panic(fmt.Sprintf("bam: unrecognised optional field type: %q", t)) 469 } 470 } 471 } 472 473 // readAlignment reads the alignment record from the Reader's underlying 474 // bgzf.Reader into the supplied bytes.Buffer and updates the Reader's lastChunk 475 // field. 476 func readAlignment(br *Reader, buf *[]byte) error { 477 n, err := io.ReadFull(br.r, br.sizeBuf) 478 // br.r.Chunk() is only valid after the call the Read(), so this 479 // must come after the first read in the record. 480 tx := br.r.Begin() 481 defer func() { 482 br.lastChunk = tx.End() 483 }() 484 if err != nil { 485 return err 486 } 487 if n != 4 { 488 return errors.New("bam: invalid record: short block size") 489 } 490 size := int(binary.LittleEndian.Uint32(br.sizeBuf)) 491 if size > maxBAMRecordSize { 492 return errors.New("bam: record too large") 493 } 494 resizeScratch(buf, size) 495 nn, err := io.ReadFull(br.r, *buf) 496 if err != nil { 497 return err 498 } 499 if nn != size { 500 return errors.New("bam: truncated record") 501 } 502 return nil 503 } 504 505 // buildAux constructs a single byte slice that represents a slice of sam.Aux. 506 // *buf should be an empty slice on call, and it is filled with the result on 507 // return. 508 func buildAux(aa []sam.Aux, buf *[]byte) { 509 for _, a := range aa { 510 // TODO: validate each 'a' 511 *buf = append(*buf, []byte(a)...) 512 switch a.Type() { 513 case 'Z', 'H': 514 *buf = append(*buf, 0) 515 } 516 } 517 } 518 519 type doublets []sam.Doublet 520 521 func (np doublets) Bytes() []byte { return *(*[]byte)(unsafe.Pointer(&np)) }