github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/header.go (about) 1 // Copyright ©2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package sam 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "errors" 11 "fmt" 12 "io" 13 ) 14 15 var ( 16 errDupReference = errors.New("sam: duplicate reference name") 17 errDupReadGroup = errors.New("sam: duplicate read group name") 18 errDupProgram = errors.New("sam: duplicate program name") 19 errUsedReference = errors.New("sam: reference already used") 20 errUsedReadGroup = errors.New("sam: read group already used") 21 errUsedProgram = errors.New("sam: program already used") 22 errInvalidReference = errors.New("sam: reference not owned by header") 23 errInvalidReadGroup = errors.New("sam: read group not owned by header") 24 errInvalidProgram = errors.New("sam: program not owned by header") 25 errBadLen = errors.New("sam: reference length out of range") 26 ) 27 28 // SortOrder indicates the sort order of a SAM or BAM file. 29 type SortOrder int 30 31 const ( 32 UnknownOrder SortOrder = iota 33 Unsorted 34 QueryName 35 Coordinate 36 ) 37 38 var ( 39 sortOrder = [...]string{ 40 UnknownOrder: "unknown", 41 Unsorted: "unsorted", 42 QueryName: "queryname", 43 Coordinate: "coordinate", 44 } 45 sortOrderMap = map[string]SortOrder{ 46 "unknown": UnknownOrder, 47 "unsorted": Unsorted, 48 "queryname": QueryName, 49 "coordinate": Coordinate, 50 } 51 ) 52 53 // String returns the string representation of a SortOrder. 54 func (so SortOrder) String() string { 55 if so < Unsorted || so > Coordinate { 56 return sortOrder[UnknownOrder] 57 } 58 return sortOrder[so] 59 } 60 61 // GroupOrder indicates the grouping order of a SAM or BAM file. 62 type GroupOrder int 63 64 const ( 65 GroupUnspecified GroupOrder = iota 66 GroupNone 67 GroupQuery 68 GroupReference 69 ) 70 71 var ( 72 groupOrder = [...]string{ 73 GroupUnspecified: "none", 74 GroupNone: "none", 75 GroupQuery: "query", 76 GroupReference: "reference", 77 } 78 groupOrderMap = map[string]GroupOrder{ 79 "none": GroupNone, 80 "query": GroupQuery, 81 "reference": GroupReference, 82 } 83 ) 84 85 // String returns the string representation of a GroupOrder. 86 func (g GroupOrder) String() string { 87 if g < GroupNone || g > GroupReference { 88 return groupOrder[GroupUnspecified] 89 } 90 return groupOrder[g] 91 } 92 93 type set map[string]int32 94 95 // Header is a SAM or BAM header. 96 type Header struct { 97 Version string 98 SortOrder SortOrder 99 GroupOrder GroupOrder 100 otherTags []tagPair 101 102 refs []*Reference 103 rgs []*ReadGroup 104 progs []*Program 105 seenRefs set 106 seenGroups set 107 seenProgs set 108 109 Comments []string 110 } 111 112 type tagPair struct { 113 tag Tag 114 value string 115 } 116 117 // NewHeader returns a new Header based on the given text and list 118 // of References. If there is a conflict between the text and the 119 // given References NewHeader will return a non-nil error. 120 func NewHeader(text []byte, r []*Reference) (*Header, error) { 121 var err error 122 bh := &Header{ 123 refs: r, 124 seenRefs: set{}, 125 seenGroups: set{}, 126 seenProgs: set{}, 127 } 128 for i, r := range bh.refs { 129 if r.owner != nil || r.id >= 0 { 130 return nil, errUsedReference 131 } 132 r.owner = bh 133 r.id = int32(i) 134 } 135 if text != nil { 136 err = bh.UnmarshalText(text) 137 if err != nil { 138 return nil, err 139 } 140 } 141 return bh, nil 142 } 143 144 // Tags applies the function fn to each of the tag-value pairs of the Header. 145 // The SO and GO tags are only used if they are set to the non-default values. 146 // The function fn must not add or delete tags held by the receiver during 147 // iteration. 148 func (bh *Header) Tags(fn func(t Tag, value string)) { 149 if fn == nil { 150 return 151 } 152 fn(versionTag, bh.Version) 153 if bh.SortOrder != UnknownOrder { 154 fn(sortOrderTag, bh.SortOrder.String()) 155 } 156 if bh.GroupOrder != GroupNone { 157 fn(groupOrderTag, bh.GroupOrder.String()) 158 } 159 for _, tp := range bh.otherTags { 160 fn(tp.tag, tp.value) 161 } 162 } 163 164 // Get returns the string representation of the value associated with the 165 // given header line tag. If the tag is not present the empty string is returned. 166 func (bh *Header) Get(t Tag) string { 167 switch t { 168 case versionTag: 169 return bh.Version 170 case sortOrderTag: 171 return bh.SortOrder.String() 172 case groupOrderTag: 173 return bh.GroupOrder.String() 174 } 175 for _, tp := range bh.otherTags { 176 if t == tp.tag { 177 return tp.value 178 } 179 } 180 return "" 181 } 182 183 // Set sets the value associated with the given header line tag to the specified 184 // value. If value is the empty string and the tag may be absent, it is deleted 185 // or set to a meaningful default (SO:UnknownOrder and GO:GroupUnspecified), 186 // otherwise an error is returned. 187 func (bh *Header) Set(t Tag, value string) error { 188 switch t { 189 case versionTag: 190 if value == "" { 191 return errBadHeader 192 } 193 bh.Version = value 194 case sortOrderTag: 195 if value == "" { 196 bh.SortOrder = UnknownOrder 197 return nil 198 } 199 sortOrder, ok := sortOrderMap[value] 200 if !ok { 201 return errBadHeader 202 } 203 bh.SortOrder = sortOrder 204 case groupOrderTag: 205 if value == "" { 206 bh.GroupOrder = GroupUnspecified 207 return nil 208 } 209 groupOrder, ok := groupOrderMap[value] 210 if !ok { 211 return errBadHeader 212 } 213 bh.GroupOrder = groupOrder 214 default: 215 if value == "" { 216 for i, tp := range bh.otherTags { 217 if t == tp.tag { 218 copy(bh.otherTags[i:], bh.otherTags[i+1:]) 219 bh.otherTags = bh.otherTags[:len(bh.otherTags)-1] 220 return nil 221 } 222 } 223 } else { 224 for i, tp := range bh.otherTags { 225 if t == tp.tag { 226 bh.otherTags[i].value = value 227 return nil 228 } 229 } 230 bh.otherTags = append(bh.otherTags, tagPair{tag: t, value: value}) 231 } 232 } 233 return nil 234 } 235 236 // Clone returns a deep copy of the receiver. 237 func (bh *Header) Clone() *Header { 238 c := &Header{ 239 Version: bh.Version, 240 SortOrder: bh.SortOrder, 241 GroupOrder: bh.GroupOrder, 242 otherTags: append([]tagPair(nil), bh.otherTags...), 243 Comments: append([]string(nil), bh.Comments...), 244 seenRefs: make(set, len(bh.seenRefs)), 245 seenGroups: make(set, len(bh.seenGroups)), 246 seenProgs: make(set, len(bh.seenProgs)), 247 } 248 if len(bh.refs) != 0 { 249 c.refs = make([]*Reference, len(bh.refs)) 250 } 251 if len(bh.rgs) != 0 { 252 c.rgs = make([]*ReadGroup, len(bh.rgs)) 253 } 254 if len(bh.progs) != 0 { 255 c.progs = make([]*Program, len(bh.progs)) 256 } 257 258 for i, r := range bh.refs { 259 if r == nil { 260 continue 261 } 262 c.refs[i] = new(Reference) 263 *c.refs[i] = *r 264 c.refs[i].owner = c 265 } 266 for i, r := range bh.rgs { 267 c.rgs[i] = new(ReadGroup) 268 *c.rgs[i] = *r 269 c.rgs[i].owner = c 270 } 271 for i, p := range bh.progs { 272 c.progs[i] = new(Program) 273 *c.progs[i] = *p 274 c.progs[i].owner = c 275 } 276 for k, v := range bh.seenRefs { 277 c.seenRefs[k] = v 278 } 279 for k, v := range bh.seenGroups { 280 c.seenGroups[k] = v 281 } 282 for k, v := range bh.seenProgs { 283 c.seenProgs[k] = v 284 } 285 286 return c 287 } 288 289 // MergeHeaders returns a new Header resulting from the merge of the 290 // source Headers, and a mapping between the references in the source 291 // and the References in the returned Header. Sort order is set to 292 // unknown and group order is set to none. If a single Header is passed 293 // to MergeHeaders, the mapping between source and destination headers, 294 // reflink, is returned as nil. 295 // The returned Header contains the read groups and programs of the 296 // first Header in src. 297 func MergeHeaders(src []*Header) (h *Header, reflinks [][]*Reference, err error) { 298 switch len(src) { 299 case 0: 300 return nil, nil, nil 301 case 1: 302 return src[0], nil, nil 303 } 304 reflinks = make([][]*Reference, len(src)) 305 h = src[0].Clone() 306 h.SortOrder = UnknownOrder 307 h.GroupOrder = GroupUnspecified 308 for i, add := range src { 309 if i == 0 { 310 reflinks[i] = h.refs 311 continue 312 } 313 links := make([]*Reference, len(add.refs)) 314 for id, r := range add.refs { 315 r = r.Clone() 316 err := h.AddReference(r) 317 if err != nil { 318 return nil, nil, err 319 } 320 if r.owner != h { 321 // r was not actually added, so use the ref 322 // that h owns. 323 for _, hr := range h.refs { 324 if equalRefs(r, hr) { 325 r = hr 326 break 327 } 328 } 329 } 330 links[id] = r 331 } 332 reflinks[i] = links 333 } 334 335 return h, reflinks, nil 336 } 337 338 // MarshalText implements the encoding.TextMarshaler interface. 339 func (bh *Header) MarshalText() ([]byte, error) { 340 var buf bytes.Buffer 341 if bh.Version != "" { 342 if bh.GroupOrder == GroupUnspecified { 343 fmt.Fprintf(&buf, "@HD\tVN:%s\tSO:%s", bh.Version, bh.SortOrder) 344 } else { 345 fmt.Fprintf(&buf, "@HD\tVN:%s\tSO:%s\tGO:%s", bh.Version, bh.SortOrder, bh.GroupOrder) 346 } 347 for _, tp := range bh.otherTags { 348 fmt.Fprintf(&buf, "\t%s:%s", tp.tag, tp.value) 349 } 350 buf.WriteByte('\n') 351 } 352 for _, r := range bh.refs { 353 fmt.Fprintf(&buf, "%s\n", r) 354 } 355 for _, rg := range bh.rgs { 356 fmt.Fprintf(&buf, "%s\n", rg) 357 } 358 for _, p := range bh.progs { 359 fmt.Fprintf(&buf, "%s\n", p) 360 } 361 for _, co := range bh.Comments { 362 fmt.Fprintf(&buf, "@CO\t%s\n", co) 363 } 364 return buf.Bytes(), nil 365 } 366 367 // MarshalBinary implements the encoding.BinaryMarshaler. 368 func (bh *Header) MarshalBinary() ([]byte, error) { 369 b := &bytes.Buffer{} 370 err := bh.EncodeBinary(b) 371 if err != nil { 372 return nil, err 373 } 374 return b.Bytes(), nil 375 } 376 377 // EncodeBinary writes a binary encoding of the Header to the given io.Writer. 378 // The format of the encoding is defined in the SAM specification, section 4.2. 379 func (bh *Header) EncodeBinary(w io.Writer) error { 380 wb := &errWriter{w: w} 381 382 binary.Write(wb, binary.LittleEndian, bamMagic) 383 text, _ := bh.MarshalText() 384 binary.Write(wb, binary.LittleEndian, int32(len(text))) 385 wb.Write(text) 386 binary.Write(wb, binary.LittleEndian, int32(len(bh.refs))) 387 388 if !validInt32(len(bh.refs)) { 389 return errors.New("sam: value out of range") 390 } 391 var name []byte 392 for _, r := range bh.refs { 393 name = append(name, []byte(r.name)...) 394 name = append(name, 0) 395 binary.Write(wb, binary.LittleEndian, int32(len(name))) 396 wb.Write(name) 397 name = name[:0] 398 binary.Write(wb, binary.LittleEndian, r.lRef) 399 } 400 if wb.err != nil { 401 return wb.err 402 } 403 404 return nil 405 } 406 407 type errWriter struct { 408 w io.Writer 409 err error 410 } 411 412 func (w *errWriter) Write(p []byte) (int, error) { 413 if w.err != nil { 414 return 0, w.err 415 } 416 var n int 417 n, w.err = w.w.Write(p) 418 return n, w.err 419 } 420 421 // Validate checks r against the Header for record validity according to the 422 // SAM specification: 423 // 424 // - a program auxiliary field must refer to a program listed in the header 425 // - a read group auxiliary field must refer to a read group listed in the 426 // header and these must agree on platform unit and library. 427 func (bh *Header) Validate(r *Record) error { 428 rp := r.AuxFields.Get(programTag) 429 found := false 430 for _, hp := range bh.Progs() { 431 if hp.UID() == rp.Value() { 432 found = true 433 break 434 } 435 } 436 if !found && len(bh.Progs()) != 0 { 437 return fmt.Errorf("sam: program uid not found: %v", rp.Value()) 438 } 439 440 rg := r.AuxFields.Get(readGroupTag) 441 found = false 442 for _, hg := range bh.RGs() { 443 if hg.Name() == rg.Value() { 444 rPlatformUnit := r.AuxFields.Get(platformUnitTag).Value() 445 if rPlatformUnit != hg.PlatformUnit() { 446 return fmt.Errorf("sam: mismatched platform for read group %s: %v != %v", hg.Name(), rPlatformUnit, hg.platformUnit) 447 } 448 rLibrary := r.AuxFields.Get(libraryTag).Value() 449 if rLibrary != hg.Library() { 450 return fmt.Errorf("sam: mismatched library for read group %s: %v != %v", hg.Name(), rLibrary, hg.library) 451 } 452 found = true 453 break 454 } 455 } 456 if !found && len(bh.RGs()) != 0 { 457 return fmt.Errorf("sam: read group not found: %v", rg.Value()) 458 } 459 460 return nil 461 } 462 463 // Refs returns the Header's list of References. The returned slice 464 // should not be altered. 465 func (bh *Header) Refs() []*Reference { 466 return bh.refs 467 } 468 469 // RGs returns the Header's list of ReadGroups. The returned slice 470 // should not be altered. 471 func (bh *Header) RGs() []*ReadGroup { 472 return bh.rgs 473 } 474 475 // Progs returns the Header's list of Programs. The returned slice 476 // should not be altered. 477 func (bh *Header) Progs() []*Program { 478 return bh.progs 479 } 480 481 // AddReference adds r to the Header. 482 func (bh *Header) AddReference(r *Reference) error { 483 if dupID, dup := bh.seenRefs[r.name]; dup { 484 er := bh.refs[dupID] 485 if equalRefs(er, r) { 486 return nil 487 } else if !equalRefs(r, &Reference{id: -1, name: er.name, lRef: er.lRef}) { 488 return errDupReference 489 } 490 if r.md5 == "" { 491 r.md5 = er.md5 492 } 493 if r.assemID == "" { 494 r.assemID = er.assemID 495 } 496 if r.species == "" { 497 r.species = er.species 498 } 499 if r.uri == nil { 500 r.uri = er.uri 501 } 502 bh.refs[dupID] = r 503 return nil 504 } 505 if r.owner != nil || r.id >= 0 { 506 return errUsedReference 507 } 508 r.owner = bh 509 r.id = int32(len(bh.refs)) 510 bh.seenRefs[r.name] = r.id 511 bh.refs = append(bh.refs, r) 512 return nil 513 } 514 515 // RemoveReference removes r from the Header and makes it 516 // available to add to another Header. 517 func (bh *Header) RemoveReference(r *Reference) error { 518 if r.id < 0 || int(r.id) >= len(bh.refs) || bh.refs[r.id] != r { 519 return errInvalidReference 520 } 521 bh.refs = append(bh.refs[:r.id], bh.refs[r.id+1:]...) 522 for i := range bh.refs[r.id:] { 523 bh.refs[i+int(r.id)].id-- 524 } 525 r.id = -1 526 delete(bh.seenRefs, r.name) 527 return nil 528 } 529 530 // AddReadGroup adds rg to the Header. 531 func (bh *Header) AddReadGroup(rg *ReadGroup) error { 532 if _, ok := bh.seenGroups[rg.name]; ok { 533 return errDupReadGroup 534 } 535 if rg.owner != nil || rg.id >= 0 { 536 return errUsedReadGroup 537 } 538 rg.owner = bh 539 rg.id = int32(len(bh.rgs)) 540 bh.seenGroups[rg.name] = rg.id 541 bh.rgs = append(bh.rgs, rg) 542 return nil 543 } 544 545 // RemoveReadGroup removes rg from the Header and makes it 546 // available to add to another Header. 547 func (bh *Header) RemoveReadGroup(rg *ReadGroup) error { 548 if rg.id < 0 || int(rg.id) >= len(bh.refs) || bh.rgs[rg.id] != rg { 549 return errInvalidReadGroup 550 } 551 bh.rgs = append(bh.rgs[:rg.id], bh.rgs[rg.id+1:]...) 552 for i := range bh.rgs[rg.id:] { 553 bh.rgs[i+int(rg.id)].id-- 554 } 555 rg.id = -1 556 delete(bh.seenGroups, rg.name) 557 return nil 558 } 559 560 // AddProgram adds p to the Header. 561 func (bh *Header) AddProgram(p *Program) error { 562 if _, ok := bh.seenProgs[p.uid]; ok { 563 return errDupProgram 564 } 565 if p.owner != nil || p.id >= 0 { 566 return errUsedProgram 567 } 568 p.owner = bh 569 p.id = int32(len(bh.progs)) 570 bh.seenProgs[p.uid] = p.id 571 bh.progs = append(bh.progs, p) 572 return nil 573 } 574 575 // RemoveProgram removes p from the Header and makes it 576 // available to add to another Header. 577 func (bh *Header) RemoveProgram(p *Program) error { 578 if p.id < 0 || int(p.id) >= len(bh.progs) || bh.progs[p.id] != p { 579 return errInvalidProgram 580 } 581 bh.progs = append(bh.progs[:p.id], bh.progs[p.id+1:]...) 582 for i := range bh.progs[p.id:] { 583 bh.progs[i+int(p.id)].id-- 584 } 585 p.id = -1 586 delete(bh.seenProgs, p.uid) 587 return nil 588 }