github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bam/merger.go (about) 1 // Copyright ©2017 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bam 6 7 import ( 8 "container/heap" 9 "errors" 10 "io" 11 12 "github.com/Schaudge/hts/sam" 13 ) 14 15 // Merger implements merging BAM data with a defined sort 16 // order. It can be used for sorting, concatenating and 17 // deduplicating BAM data. 18 type Merger struct { 19 h *sam.Header 20 // refLinks is the set of mappings from a RefID in 21 // a src Header to a Reference in the dst Header. 22 refLinks [][]*sam.Reference 23 24 less func(a, b *sam.Record) bool 25 readers []*reader 26 } 27 28 type reader struct { 29 id int 30 r *Reader 31 32 head *sam.Record 33 err error 34 } 35 36 // NewMerger returns a Merger that reads from the source 37 // Readers. 38 // 39 // The sort order of the stream merge is defined by the sort 40 // order field of the src Reader headers and the provided less 41 // function. The header sort order fields must agree. 42 // 43 // Sort order is determined using the following rules: 44 // - for sam.QueryName the LessByName sam.Record method is used. 45 // - for sam.Coordinate the LessByCoordinate sam.Record method is used. 46 // - for sam.Unsorted the reader streams are concatenated. 47 // - for sam.Unknown the provided less function is used - if nil 48 // this is the same as sam.Unsorted. 49 // For all sort orders other than sam.Unknown, the less parameter 50 // is ignored. 51 // The src Readers should be closed individually after use to avoid 52 // leaking resources. 53 func NewMerger(less func(a, b *sam.Record) bool, src ...*Reader) (*Merger, error) { 54 if len(src) == 0 { 55 return nil, io.EOF 56 } 57 58 m := &Merger{readers: make([]*reader, len(src))} 59 60 headers := make([]*sam.Header, len(src)) 61 so := src[0].Header().SortOrder 62 for i, r := range src { 63 h := r.Header() 64 if h.SortOrder != so { 65 return nil, errors.New("bam: sort order mismatch") 66 } 67 headers[i] = h 68 } 69 var err error 70 m.h, m.refLinks, err = sam.MergeHeaders(headers) 71 if err != nil { 72 return nil, err 73 } 74 m.h.SortOrder = so 75 76 readers := make([]reader, len(src)) 77 switch m.h.SortOrder { 78 default: 79 fallthrough 80 case sam.UnknownOrder: 81 m.less = less 82 case sam.Unsorted: 83 case sam.QueryName: 84 m.less = (*sam.Record).LessByName 85 case sam.Coordinate: 86 m.less = (*sam.Record).LessByCoordinate 87 } 88 for i, r := range src { 89 if m.less == nil { 90 readers[i].id = i 91 readers[i].r = r 92 m.readers[i] = &readers[i] 93 continue 94 } 95 rec, err := r.Read() 96 readers[i] = reader{id: i, r: r, head: rec, err: err} 97 m.readers[i] = &readers[i] 98 } 99 if m.less != nil { 100 heap.Init((*bySortOrderAndID)(m)) 101 } 102 103 return m, nil 104 } 105 106 // Header returns the SAM Header held by the Reader. This Header is 107 // constructed using the sam.MergeHeaders function. 108 func (m *Merger) Header() *sam.Header { 109 return m.h 110 } 111 112 // Read returns the next sam.Record in the BAM stream. 113 // 114 // The Read behaviour will depend on the underlying Readers. 115 func (m *Merger) Read() (rec *sam.Record, err error) { 116 if len(m.readers) == 0 { 117 return nil, io.EOF 118 } 119 if m.less == nil { 120 return m.cat() 121 } 122 return m.nextBySortOrder() 123 } 124 125 func (m *Merger) cat() (rec *sam.Record, err error) { 126 id := m.readers[0].id 127 rec, err = m.readers[0].r.Read() 128 if err == io.EOF && len(m.readers) != 0 { 129 m.readers = m.readers[1:] 130 err = nil 131 } 132 if rec == nil { 133 return m.Read() 134 } 135 m.reassignReference(id, rec) 136 return rec, err 137 } 138 139 func (m *Merger) nextBySortOrder() (rec *sam.Record, err error) { 140 reader := m.pop() 141 rec, err = reader.head, reader.err 142 reader.head, reader.err = reader.r.Read() 143 if reader.err == nil { 144 m.push(reader) 145 } 146 if rec == nil { 147 return m.Read() 148 } 149 if err == io.EOF { 150 err = nil 151 } 152 m.reassignReference(reader.id, rec) 153 return rec, err 154 } 155 156 func (m *Merger) reassignReference(id int, rec *sam.Record) { 157 if rec.Ref == nil || m.refLinks == nil { 158 return 159 } 160 rec.Ref = m.refLinks[id][rec.RefID()] 161 } 162 163 func (m *Merger) push(r *reader) { heap.Push((*bySortOrderAndID)(m), r) } 164 func (m *Merger) pop() *reader { return heap.Pop((*bySortOrderAndID)(m)).(*reader) } 165 166 type bySortOrderAndID Merger 167 168 func (m *bySortOrderAndID) Push(i interface{}) { 169 m.readers = append(m.readers, i.(*reader)) 170 } 171 func (m *bySortOrderAndID) Pop() interface{} { 172 var r *reader 173 r, m.readers = m.readers[len(m.readers)-1], m.readers[:len(m.readers)-1] 174 return r 175 } 176 func (m *bySortOrderAndID) Len() int { 177 return len(m.readers) 178 } 179 func (m *bySortOrderAndID) Less(i, j int) bool { 180 if m.less(m.readers[i].head, m.readers[j].head) { 181 return true 182 } 183 return m.readers[i].id < m.readers[j].id && !m.less(m.readers[j].head, m.readers[i].head) 184 } 185 func (m *bySortOrderAndID) Swap(i, j int) { 186 m.readers[i], m.readers[j] = m.readers[j], m.readers[i] 187 }