github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bam/merger.go (about)

     1  // Copyright ©2017 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bam
     6  
     7  import (
     8  	"container/heap"
     9  	"errors"
    10  	"io"
    11  
    12  	"github.com/Schaudge/hts/sam"
    13  )
    14  
    15  // Merger implements merging BAM data with a defined sort
    16  // order. It can be used for sorting, concatenating and
    17  // deduplicating BAM data.
    18  type Merger struct {
    19  	h *sam.Header
    20  	// refLinks is the set of mappings from a RefID in
    21  	// a src Header to a Reference in the dst Header.
    22  	refLinks [][]*sam.Reference
    23  
    24  	less    func(a, b *sam.Record) bool
    25  	readers []*reader
    26  }
    27  
    28  type reader struct {
    29  	id int
    30  	r  *Reader
    31  
    32  	head *sam.Record
    33  	err  error
    34  }
    35  
    36  // NewMerger returns a Merger that reads from the source
    37  // Readers.
    38  //
    39  // The sort order of the stream merge is defined by the sort
    40  // order field of the src Reader headers and the provided less
    41  // function. The header sort order fields must agree.
    42  //
    43  // Sort order is determined using the following rules:
    44  //  - for sam.QueryName the LessByName sam.Record method is used.
    45  //  - for sam.Coordinate the LessByCoordinate sam.Record method is used.
    46  //  - for sam.Unsorted the reader streams are concatenated.
    47  //  - for sam.Unknown the provided less function is used - if nil
    48  //    this is the same as sam.Unsorted.
    49  // For all sort orders other than sam.Unknown, the less parameter
    50  // is ignored.
    51  // The src Readers should be closed individually after use to avoid
    52  // leaking resources.
    53  func NewMerger(less func(a, b *sam.Record) bool, src ...*Reader) (*Merger, error) {
    54  	if len(src) == 0 {
    55  		return nil, io.EOF
    56  	}
    57  
    58  	m := &Merger{readers: make([]*reader, len(src))}
    59  
    60  	headers := make([]*sam.Header, len(src))
    61  	so := src[0].Header().SortOrder
    62  	for i, r := range src {
    63  		h := r.Header()
    64  		if h.SortOrder != so {
    65  			return nil, errors.New("bam: sort order mismatch")
    66  		}
    67  		headers[i] = h
    68  	}
    69  	var err error
    70  	m.h, m.refLinks, err = sam.MergeHeaders(headers)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  	m.h.SortOrder = so
    75  
    76  	readers := make([]reader, len(src))
    77  	switch m.h.SortOrder {
    78  	default:
    79  		fallthrough
    80  	case sam.UnknownOrder:
    81  		m.less = less
    82  	case sam.Unsorted:
    83  	case sam.QueryName:
    84  		m.less = (*sam.Record).LessByName
    85  	case sam.Coordinate:
    86  		m.less = (*sam.Record).LessByCoordinate
    87  	}
    88  	for i, r := range src {
    89  		if m.less == nil {
    90  			readers[i].id = i
    91  			readers[i].r = r
    92  			m.readers[i] = &readers[i]
    93  			continue
    94  		}
    95  		rec, err := r.Read()
    96  		readers[i] = reader{id: i, r: r, head: rec, err: err}
    97  		m.readers[i] = &readers[i]
    98  	}
    99  	if m.less != nil {
   100  		heap.Init((*bySortOrderAndID)(m))
   101  	}
   102  
   103  	return m, nil
   104  }
   105  
   106  // Header returns the SAM Header held by the Reader. This Header is
   107  // constructed using the sam.MergeHeaders function.
   108  func (m *Merger) Header() *sam.Header {
   109  	return m.h
   110  }
   111  
   112  // Read returns the next sam.Record in the BAM stream.
   113  //
   114  // The Read behaviour will depend on the underlying Readers.
   115  func (m *Merger) Read() (rec *sam.Record, err error) {
   116  	if len(m.readers) == 0 {
   117  		return nil, io.EOF
   118  	}
   119  	if m.less == nil {
   120  		return m.cat()
   121  	}
   122  	return m.nextBySortOrder()
   123  }
   124  
   125  func (m *Merger) cat() (rec *sam.Record, err error) {
   126  	id := m.readers[0].id
   127  	rec, err = m.readers[0].r.Read()
   128  	if err == io.EOF && len(m.readers) != 0 {
   129  		m.readers = m.readers[1:]
   130  		err = nil
   131  	}
   132  	if rec == nil {
   133  		return m.Read()
   134  	}
   135  	m.reassignReference(id, rec)
   136  	return rec, err
   137  }
   138  
   139  func (m *Merger) nextBySortOrder() (rec *sam.Record, err error) {
   140  	reader := m.pop()
   141  	rec, err = reader.head, reader.err
   142  	reader.head, reader.err = reader.r.Read()
   143  	if reader.err == nil {
   144  		m.push(reader)
   145  	}
   146  	if rec == nil {
   147  		return m.Read()
   148  	}
   149  	if err == io.EOF {
   150  		err = nil
   151  	}
   152  	m.reassignReference(reader.id, rec)
   153  	return rec, err
   154  }
   155  
   156  func (m *Merger) reassignReference(id int, rec *sam.Record) {
   157  	if rec.Ref == nil || m.refLinks == nil {
   158  		return
   159  	}
   160  	rec.Ref = m.refLinks[id][rec.RefID()]
   161  }
   162  
   163  func (m *Merger) push(r *reader) { heap.Push((*bySortOrderAndID)(m), r) }
   164  func (m *Merger) pop() *reader   { return heap.Pop((*bySortOrderAndID)(m)).(*reader) }
   165  
   166  type bySortOrderAndID Merger
   167  
   168  func (m *bySortOrderAndID) Push(i interface{}) {
   169  	m.readers = append(m.readers, i.(*reader))
   170  }
   171  func (m *bySortOrderAndID) Pop() interface{} {
   172  	var r *reader
   173  	r, m.readers = m.readers[len(m.readers)-1], m.readers[:len(m.readers)-1]
   174  	return r
   175  }
   176  func (m *bySortOrderAndID) Len() int {
   177  	return len(m.readers)
   178  }
   179  func (m *bySortOrderAndID) Less(i, j int) bool {
   180  	if m.less(m.readers[i].head, m.readers[j].head) {
   181  		return true
   182  	}
   183  	return m.readers[i].id < m.readers[j].id && !m.less(m.readers[j].head, m.readers[i].head)
   184  }
   185  func (m *bySortOrderAndID) Swap(i, j int) {
   186  	m.readers[i], m.readers[j] = m.readers[j], m.readers[i]
   187  }