github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/header.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package sam
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  )
    14  
    15  var (
    16  	errDupReference     = errors.New("sam: duplicate reference name")
    17  	errDupReadGroup     = errors.New("sam: duplicate read group name")
    18  	errDupProgram       = errors.New("sam: duplicate program name")
    19  	errUsedReference    = errors.New("sam: reference already used")
    20  	errUsedReadGroup    = errors.New("sam: read group already used")
    21  	errUsedProgram      = errors.New("sam: program already used")
    22  	errInvalidReference = errors.New("sam: reference not owned by header")
    23  	errInvalidReadGroup = errors.New("sam: read group not owned by header")
    24  	errInvalidProgram   = errors.New("sam: program not owned by header")
    25  	errBadLen           = errors.New("sam: reference length out of range")
    26  )
    27  
    28  // SortOrder indicates the sort order of a SAM or BAM file.
    29  type SortOrder int
    30  
    31  const (
    32  	UnknownOrder SortOrder = iota
    33  	Unsorted
    34  	QueryName
    35  	Coordinate
    36  )
    37  
    38  var (
    39  	sortOrder = [...]string{
    40  		UnknownOrder: "unknown",
    41  		Unsorted:     "unsorted",
    42  		QueryName:    "queryname",
    43  		Coordinate:   "coordinate",
    44  	}
    45  	sortOrderMap = map[string]SortOrder{
    46  		"unknown":    UnknownOrder,
    47  		"unsorted":   Unsorted,
    48  		"queryname":  QueryName,
    49  		"coordinate": Coordinate,
    50  	}
    51  )
    52  
    53  // String returns the string representation of a SortOrder.
    54  func (so SortOrder) String() string {
    55  	if so < Unsorted || so > Coordinate {
    56  		return sortOrder[UnknownOrder]
    57  	}
    58  	return sortOrder[so]
    59  }
    60  
    61  // GroupOrder indicates the grouping order of a SAM or BAM file.
    62  type GroupOrder int
    63  
    64  const (
    65  	GroupUnspecified GroupOrder = iota
    66  	GroupNone
    67  	GroupQuery
    68  	GroupReference
    69  )
    70  
    71  var (
    72  	groupOrder = [...]string{
    73  		GroupUnspecified: "none",
    74  		GroupNone:        "none",
    75  		GroupQuery:       "query",
    76  		GroupReference:   "reference",
    77  	}
    78  	groupOrderMap = map[string]GroupOrder{
    79  		"none":      GroupNone,
    80  		"query":     GroupQuery,
    81  		"reference": GroupReference,
    82  	}
    83  )
    84  
    85  // String returns the string representation of a GroupOrder.
    86  func (g GroupOrder) String() string {
    87  	if g < GroupNone || g > GroupReference {
    88  		return groupOrder[GroupUnspecified]
    89  	}
    90  	return groupOrder[g]
    91  }
    92  
    93  type set map[string]int32
    94  
    95  // Header is a SAM or BAM header.
    96  type Header struct {
    97  	Version    string
    98  	SortOrder  SortOrder
    99  	GroupOrder GroupOrder
   100  	otherTags  []tagPair
   101  
   102  	refs       []*Reference
   103  	rgs        []*ReadGroup
   104  	progs      []*Program
   105  	seenRefs   set
   106  	seenGroups set
   107  	seenProgs  set
   108  
   109  	Comments []string
   110  }
   111  
   112  type tagPair struct {
   113  	tag   Tag
   114  	value string
   115  }
   116  
   117  // NewHeader returns a new Header based on the given text and list
   118  // of References. If there is a conflict between the text and the
   119  // given References NewHeader will return a non-nil error.
   120  func NewHeader(text []byte, r []*Reference) (*Header, error) {
   121  	var err error
   122  	bh := &Header{
   123  		refs:       r,
   124  		seenRefs:   set{},
   125  		seenGroups: set{},
   126  		seenProgs:  set{},
   127  	}
   128  	for i, r := range bh.refs {
   129  		if r.owner != nil || r.id >= 0 {
   130  			return nil, errUsedReference
   131  		}
   132  		r.owner = bh
   133  		r.id = int32(i)
   134  	}
   135  	if text != nil {
   136  		err = bh.UnmarshalText(text)
   137  		if err != nil {
   138  			return nil, err
   139  		}
   140  	}
   141  	return bh, nil
   142  }
   143  
   144  // Tags applies the function fn to each of the tag-value pairs of the Header.
   145  // The SO and GO tags are only used if they are set to the non-default values.
   146  // The function fn must not add or delete tags held by the receiver during
   147  // iteration.
   148  func (bh *Header) Tags(fn func(t Tag, value string)) {
   149  	if fn == nil {
   150  		return
   151  	}
   152  	fn(versionTag, bh.Version)
   153  	if bh.SortOrder != UnknownOrder {
   154  		fn(sortOrderTag, bh.SortOrder.String())
   155  	}
   156  	if bh.GroupOrder != GroupNone {
   157  		fn(groupOrderTag, bh.GroupOrder.String())
   158  	}
   159  	for _, tp := range bh.otherTags {
   160  		fn(tp.tag, tp.value)
   161  	}
   162  }
   163  
   164  // Get returns the string representation of the value associated with the
   165  // given header line tag. If the tag is not present the empty string is returned.
   166  func (bh *Header) Get(t Tag) string {
   167  	switch t {
   168  	case versionTag:
   169  		return bh.Version
   170  	case sortOrderTag:
   171  		return bh.SortOrder.String()
   172  	case groupOrderTag:
   173  		return bh.GroupOrder.String()
   174  	}
   175  	for _, tp := range bh.otherTags {
   176  		if t == tp.tag {
   177  			return tp.value
   178  		}
   179  	}
   180  	return ""
   181  }
   182  
   183  // Set sets the value associated with the given header line tag to the specified
   184  // value. If value is the empty string and the tag may be absent, it is deleted
   185  // or set to a meaningful default (SO:UnknownOrder and GO:GroupUnspecified),
   186  // otherwise an error is returned.
   187  func (bh *Header) Set(t Tag, value string) error {
   188  	switch t {
   189  	case versionTag:
   190  		if value == "" {
   191  			return errBadHeader
   192  		}
   193  		bh.Version = value
   194  	case sortOrderTag:
   195  		if value == "" {
   196  			bh.SortOrder = UnknownOrder
   197  			return nil
   198  		}
   199  		sortOrder, ok := sortOrderMap[value]
   200  		if !ok {
   201  			return errBadHeader
   202  		}
   203  		bh.SortOrder = sortOrder
   204  	case groupOrderTag:
   205  		if value == "" {
   206  			bh.GroupOrder = GroupUnspecified
   207  			return nil
   208  		}
   209  		groupOrder, ok := groupOrderMap[value]
   210  		if !ok {
   211  			return errBadHeader
   212  		}
   213  		bh.GroupOrder = groupOrder
   214  	default:
   215  		if value == "" {
   216  			for i, tp := range bh.otherTags {
   217  				if t == tp.tag {
   218  					copy(bh.otherTags[i:], bh.otherTags[i+1:])
   219  					bh.otherTags = bh.otherTags[:len(bh.otherTags)-1]
   220  					return nil
   221  				}
   222  			}
   223  		} else {
   224  			for i, tp := range bh.otherTags {
   225  				if t == tp.tag {
   226  					bh.otherTags[i].value = value
   227  					return nil
   228  				}
   229  			}
   230  			bh.otherTags = append(bh.otherTags, tagPair{tag: t, value: value})
   231  		}
   232  	}
   233  	return nil
   234  }
   235  
   236  // Clone returns a deep copy of the receiver.
   237  func (bh *Header) Clone() *Header {
   238  	c := &Header{
   239  		Version:    bh.Version,
   240  		SortOrder:  bh.SortOrder,
   241  		GroupOrder: bh.GroupOrder,
   242  		otherTags:  append([]tagPair(nil), bh.otherTags...),
   243  		Comments:   append([]string(nil), bh.Comments...),
   244  		seenRefs:   make(set, len(bh.seenRefs)),
   245  		seenGroups: make(set, len(bh.seenGroups)),
   246  		seenProgs:  make(set, len(bh.seenProgs)),
   247  	}
   248  	if len(bh.refs) != 0 {
   249  		c.refs = make([]*Reference, len(bh.refs))
   250  	}
   251  	if len(bh.rgs) != 0 {
   252  		c.rgs = make([]*ReadGroup, len(bh.rgs))
   253  	}
   254  	if len(bh.progs) != 0 {
   255  		c.progs = make([]*Program, len(bh.progs))
   256  	}
   257  
   258  	for i, r := range bh.refs {
   259  		if r == nil {
   260  			continue
   261  		}
   262  		c.refs[i] = new(Reference)
   263  		*c.refs[i] = *r
   264  		c.refs[i].owner = c
   265  	}
   266  	for i, r := range bh.rgs {
   267  		c.rgs[i] = new(ReadGroup)
   268  		*c.rgs[i] = *r
   269  		c.rgs[i].owner = c
   270  	}
   271  	for i, p := range bh.progs {
   272  		c.progs[i] = new(Program)
   273  		*c.progs[i] = *p
   274  		c.progs[i].owner = c
   275  	}
   276  	for k, v := range bh.seenRefs {
   277  		c.seenRefs[k] = v
   278  	}
   279  	for k, v := range bh.seenGroups {
   280  		c.seenGroups[k] = v
   281  	}
   282  	for k, v := range bh.seenProgs {
   283  		c.seenProgs[k] = v
   284  	}
   285  
   286  	return c
   287  }
   288  
   289  // MergeHeaders returns a new Header resulting from the merge of the
   290  // source Headers, and a mapping between the references in the source
   291  // and the References in the returned Header. Sort order is set to
   292  // unknown and group order is set to none. If a single Header is passed
   293  // to MergeHeaders, the mapping between source and destination headers,
   294  // reflink, is returned as nil.
   295  // The returned Header contains the read groups and programs of the
   296  // first Header in src.
   297  func MergeHeaders(src []*Header) (h *Header, reflinks [][]*Reference, err error) {
   298  	switch len(src) {
   299  	case 0:
   300  		return nil, nil, nil
   301  	case 1:
   302  		return src[0], nil, nil
   303  	}
   304  	reflinks = make([][]*Reference, len(src))
   305  	h = src[0].Clone()
   306  	h.SortOrder = UnknownOrder
   307  	h.GroupOrder = GroupUnspecified
   308  	for i, add := range src {
   309  		if i == 0 {
   310  			reflinks[i] = h.refs
   311  			continue
   312  		}
   313  		links := make([]*Reference, len(add.refs))
   314  		for id, r := range add.refs {
   315  			r = r.Clone()
   316  			err := h.AddReference(r)
   317  			if err != nil {
   318  				return nil, nil, err
   319  			}
   320  			if r.owner != h {
   321  				// r was not actually added, so use the ref
   322  				// that h owns.
   323  				for _, hr := range h.refs {
   324  					if equalRefs(r, hr) {
   325  						r = hr
   326  						break
   327  					}
   328  				}
   329  			}
   330  			links[id] = r
   331  		}
   332  		reflinks[i] = links
   333  	}
   334  
   335  	return h, reflinks, nil
   336  }
   337  
   338  // MarshalText implements the encoding.TextMarshaler interface.
   339  func (bh *Header) MarshalText() ([]byte, error) {
   340  	var buf bytes.Buffer
   341  	if bh.Version != "" {
   342  		if bh.GroupOrder == GroupUnspecified {
   343  			fmt.Fprintf(&buf, "@HD\tVN:%s\tSO:%s", bh.Version, bh.SortOrder)
   344  		} else {
   345  			fmt.Fprintf(&buf, "@HD\tVN:%s\tSO:%s\tGO:%s", bh.Version, bh.SortOrder, bh.GroupOrder)
   346  		}
   347  		for _, tp := range bh.otherTags {
   348  			fmt.Fprintf(&buf, "\t%s:%s", tp.tag, tp.value)
   349  		}
   350  		buf.WriteByte('\n')
   351  	}
   352  	for _, r := range bh.refs {
   353  		fmt.Fprintf(&buf, "%s\n", r)
   354  	}
   355  	for _, rg := range bh.rgs {
   356  		fmt.Fprintf(&buf, "%s\n", rg)
   357  	}
   358  	for _, p := range bh.progs {
   359  		fmt.Fprintf(&buf, "%s\n", p)
   360  	}
   361  	for _, co := range bh.Comments {
   362  		fmt.Fprintf(&buf, "@CO\t%s\n", co)
   363  	}
   364  	return buf.Bytes(), nil
   365  }
   366  
   367  // MarshalBinary implements the encoding.BinaryMarshaler.
   368  func (bh *Header) MarshalBinary() ([]byte, error) {
   369  	b := &bytes.Buffer{}
   370  	err := bh.EncodeBinary(b)
   371  	if err != nil {
   372  		return nil, err
   373  	}
   374  	return b.Bytes(), nil
   375  }
   376  
   377  // EncodeBinary writes a binary encoding of the Header to the given io.Writer.
   378  // The format of the encoding is defined in the SAM specification, section 4.2.
   379  func (bh *Header) EncodeBinary(w io.Writer) error {
   380  	wb := &errWriter{w: w}
   381  
   382  	binary.Write(wb, binary.LittleEndian, bamMagic)
   383  	text, _ := bh.MarshalText()
   384  	binary.Write(wb, binary.LittleEndian, int32(len(text)))
   385  	wb.Write(text)
   386  	binary.Write(wb, binary.LittleEndian, int32(len(bh.refs)))
   387  
   388  	if !validInt32(len(bh.refs)) {
   389  		return errors.New("sam: value out of range")
   390  	}
   391  	var name []byte
   392  	for _, r := range bh.refs {
   393  		name = append(name, []byte(r.name)...)
   394  		name = append(name, 0)
   395  		binary.Write(wb, binary.LittleEndian, int32(len(name)))
   396  		wb.Write(name)
   397  		name = name[:0]
   398  		binary.Write(wb, binary.LittleEndian, r.lRef)
   399  	}
   400  	if wb.err != nil {
   401  		return wb.err
   402  	}
   403  
   404  	return nil
   405  }
   406  
   407  type errWriter struct {
   408  	w   io.Writer
   409  	err error
   410  }
   411  
   412  func (w *errWriter) Write(p []byte) (int, error) {
   413  	if w.err != nil {
   414  		return 0, w.err
   415  	}
   416  	var n int
   417  	n, w.err = w.w.Write(p)
   418  	return n, w.err
   419  }
   420  
   421  // Validate checks r against the Header for record validity according to the
   422  // SAM specification:
   423  //
   424  //  - a program auxiliary field must refer to a program listed in the header
   425  //  - a read group auxiliary field must refer to a read group listed in the
   426  //    header and these must agree on platform unit and library.
   427  func (bh *Header) Validate(r *Record) error {
   428  	rp := r.AuxFields.Get(programTag)
   429  	found := false
   430  	for _, hp := range bh.Progs() {
   431  		if hp.UID() == rp.Value() {
   432  			found = true
   433  			break
   434  		}
   435  	}
   436  	if !found && len(bh.Progs()) != 0 {
   437  		return fmt.Errorf("sam: program uid not found: %v", rp.Value())
   438  	}
   439  
   440  	rg := r.AuxFields.Get(readGroupTag)
   441  	found = false
   442  	for _, hg := range bh.RGs() {
   443  		if hg.Name() == rg.Value() {
   444  			rPlatformUnit := r.AuxFields.Get(platformUnitTag).Value()
   445  			if rPlatformUnit != hg.PlatformUnit() {
   446  				return fmt.Errorf("sam: mismatched platform for read group %s: %v != %v", hg.Name(), rPlatformUnit, hg.platformUnit)
   447  			}
   448  			rLibrary := r.AuxFields.Get(libraryTag).Value()
   449  			if rLibrary != hg.Library() {
   450  				return fmt.Errorf("sam: mismatched library for read group %s: %v != %v", hg.Name(), rLibrary, hg.library)
   451  			}
   452  			found = true
   453  			break
   454  		}
   455  	}
   456  	if !found && len(bh.RGs()) != 0 {
   457  		return fmt.Errorf("sam: read group not found: %v", rg.Value())
   458  	}
   459  
   460  	return nil
   461  }
   462  
   463  // Refs returns the Header's list of References. The returned slice
   464  // should not be altered.
   465  func (bh *Header) Refs() []*Reference {
   466  	return bh.refs
   467  }
   468  
   469  // RGs returns the Header's list of ReadGroups. The returned slice
   470  // should not be altered.
   471  func (bh *Header) RGs() []*ReadGroup {
   472  	return bh.rgs
   473  }
   474  
   475  // Progs returns the Header's list of Programs. The returned slice
   476  // should not be altered.
   477  func (bh *Header) Progs() []*Program {
   478  	return bh.progs
   479  }
   480  
   481  // AddReference adds r to the Header.
   482  func (bh *Header) AddReference(r *Reference) error {
   483  	if dupID, dup := bh.seenRefs[r.name]; dup {
   484  		er := bh.refs[dupID]
   485  		if equalRefs(er, r) {
   486  			return nil
   487  		} else if !equalRefs(r, &Reference{id: -1, name: er.name, lRef: er.lRef}) {
   488  			return errDupReference
   489  		}
   490  		if r.md5 == "" {
   491  			r.md5 = er.md5
   492  		}
   493  		if r.assemID == "" {
   494  			r.assemID = er.assemID
   495  		}
   496  		if r.species == "" {
   497  			r.species = er.species
   498  		}
   499  		if r.uri == nil {
   500  			r.uri = er.uri
   501  		}
   502  		bh.refs[dupID] = r
   503  		return nil
   504  	}
   505  	if r.owner != nil || r.id >= 0 {
   506  		return errUsedReference
   507  	}
   508  	r.owner = bh
   509  	r.id = int32(len(bh.refs))
   510  	bh.seenRefs[r.name] = r.id
   511  	bh.refs = append(bh.refs, r)
   512  	return nil
   513  }
   514  
   515  // RemoveReference removes r from the Header and makes it
   516  // available to add to another Header.
   517  func (bh *Header) RemoveReference(r *Reference) error {
   518  	if r.id < 0 || int(r.id) >= len(bh.refs) || bh.refs[r.id] != r {
   519  		return errInvalidReference
   520  	}
   521  	bh.refs = append(bh.refs[:r.id], bh.refs[r.id+1:]...)
   522  	for i := range bh.refs[r.id:] {
   523  		bh.refs[i+int(r.id)].id--
   524  	}
   525  	r.id = -1
   526  	delete(bh.seenRefs, r.name)
   527  	return nil
   528  }
   529  
   530  // AddReadGroup adds rg to the Header.
   531  func (bh *Header) AddReadGroup(rg *ReadGroup) error {
   532  	if _, ok := bh.seenGroups[rg.name]; ok {
   533  		return errDupReadGroup
   534  	}
   535  	if rg.owner != nil || rg.id >= 0 {
   536  		return errUsedReadGroup
   537  	}
   538  	rg.owner = bh
   539  	rg.id = int32(len(bh.rgs))
   540  	bh.seenGroups[rg.name] = rg.id
   541  	bh.rgs = append(bh.rgs, rg)
   542  	return nil
   543  }
   544  
   545  // RemoveReadGroup removes rg from the Header and makes it
   546  // available to add to another Header.
   547  func (bh *Header) RemoveReadGroup(rg *ReadGroup) error {
   548  	if rg.id < 0 || int(rg.id) >= len(bh.refs) || bh.rgs[rg.id] != rg {
   549  		return errInvalidReadGroup
   550  	}
   551  	bh.rgs = append(bh.rgs[:rg.id], bh.rgs[rg.id+1:]...)
   552  	for i := range bh.rgs[rg.id:] {
   553  		bh.rgs[i+int(rg.id)].id--
   554  	}
   555  	rg.id = -1
   556  	delete(bh.seenGroups, rg.name)
   557  	return nil
   558  }
   559  
   560  // AddProgram adds p to the Header.
   561  func (bh *Header) AddProgram(p *Program) error {
   562  	if _, ok := bh.seenProgs[p.uid]; ok {
   563  		return errDupProgram
   564  	}
   565  	if p.owner != nil || p.id >= 0 {
   566  		return errUsedProgram
   567  	}
   568  	p.owner = bh
   569  	p.id = int32(len(bh.progs))
   570  	bh.seenProgs[p.uid] = p.id
   571  	bh.progs = append(bh.progs, p)
   572  	return nil
   573  }
   574  
   575  // RemoveProgram removes p from the Header and makes it
   576  // available to add to another Header.
   577  func (bh *Header) RemoveProgram(p *Program) error {
   578  	if p.id < 0 || int(p.id) >= len(bh.progs) || bh.progs[p.id] != p {
   579  		return errInvalidProgram
   580  	}
   581  	bh.progs = append(bh.progs[:p.id], bh.progs[p.id+1:]...)
   582  	for i := range bh.progs[p.id:] {
   583  		bh.progs[i+int(p.id)].id--
   584  	}
   585  	p.id = -1
   586  	delete(bh.seenProgs, p.uid)
   587  	return nil
   588  }