github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/parse_header.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package sam
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"encoding/hex"
    11  	"errors"
    12  	"fmt"
    13  	"io"
    14  	"net/url"
    15  	"strconv"
    16  	"strings"
    17  	"time"
    18  )
    19  
// Errors returned by the header line parsers: errBadHeader for a line or
// field that does not have the expected structure, and errDupTag for a tag
// that occurs more than once within a single line.
var (
	errBadHeader = errors.New("sam: malformed header line")
	errDupTag    = errors.New("sam: duplicate field")
)

// bamMagic is the four-byte magic number that DecodeBinary requires at the
// start of a binary header.
var bamMagic = [4]byte{'B', 'A', 'M', 0x1}
    26  
    27  // UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
    28  func (bh *Header) UnmarshalBinary(b []byte) error {
    29  	return bh.DecodeBinary(bytes.NewReader(b))
    30  }
    31  
// maxSAMHeaderSize is the largest value accepted for the size and count
// fields read while decoding a binary header. It allows a corrupt header
// to be detected without blowing up memory usage.
const maxSAMHeaderSize = 0xffffff
    35  
    36  // DecodeBinary unmarshals a Header from the given io.Reader. The byte
    37  // stream must be in the format described in the SAM specification,
    38  // section 4.2.
    39  func (bh *Header) DecodeBinary(r io.Reader) error {
    40  	var (
    41  		lText, nRef int32
    42  		err         error
    43  	)
    44  	var magic [4]byte
    45  	err = binary.Read(r, binary.LittleEndian, &magic)
    46  	if err != nil {
    47  		return err
    48  	}
    49  	if magic != bamMagic {
    50  		return errors.New("sam: magic number mismatch")
    51  	}
    52  	err = binary.Read(r, binary.LittleEndian, &lText)
    53  	if err != nil {
    54  		return err
    55  	}
    56  	if lText < 0 || lText >= maxSAMHeaderSize {
    57  		return errors.New("sam: wrong header length")
    58  	}
    59  	text := make([]byte, lText)
    60  	n, err := r.Read(text)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	if n != int(lText) {
    65  		return errors.New("sam: truncated header")
    66  	}
    67  	err = bh.UnmarshalText(text)
    68  	if err != nil {
    69  		return err
    70  	}
    71  	err = binary.Read(r, binary.LittleEndian, &nRef)
    72  	if err != nil {
    73  		return err
    74  	}
    75  	if nRef < 0 || nRef > maxSAMHeaderSize {
    76  		return errors.New("sam: wrong reference length")
    77  	}
    78  	refs, err := readRefRecords(r, nRef)
    79  	if err != nil {
    80  		return err
    81  	}
    82  	for _, r := range refs {
    83  		err = bh.AddReference(r)
    84  		if err != nil {
    85  			return err
    86  		}
    87  	}
    88  	return nil
    89  }
    90  
    91  func readRefRecords(r io.Reader, n int32) ([]*Reference, error) {
    92  	// bootstrapSize is the maximum number of
    93  	// reference records to pre-allocate.
    94  	const bootstrapSize = 1000
    95  
    96  	rr := make([]*Reference, 0, min(n, bootstrapSize))
    97  	var (
    98  		lName int32
    99  		err   error
   100  	)
   101  	for i := 0; i < int(n); i++ {
   102  		rr = append(rr, &Reference{id: int32(i)})
   103  		err = binary.Read(r, binary.LittleEndian, &lName)
   104  		if err != nil {
   105  			return nil, err
   106  		}
   107  		if lName < 1 || lName > maxSAMHeaderSize {
   108  			return nil, errors.New("sam: wrong reference name length")
   109  		}
   110  		name := make([]byte, lName)
   111  		n, err := r.Read(name)
   112  		if err != nil {
   113  			return nil, err
   114  		}
   115  		if n != int(lName) || name[n-1] != 0 {
   116  			return nil, errors.New("sam: truncated reference name")
   117  		}
   118  		rr[i].name = string(name[:n-1])
   119  		err = binary.Read(r, binary.LittleEndian, &rr[i].lRef)
   120  		if err != nil {
   121  			return nil, err
   122  		}
   123  	}
   124  	return rr, nil
   125  }
   126  
   127  func min(a, b int32) int32 {
   128  	if a < b {
   129  		return a
   130  	}
   131  	return b
   132  }
   133  
   134  // UnmarshalText implements the encoding.TextUnmarshaler interface.
   135  func (bh *Header) UnmarshalText(text []byte) error {
   136  	if bh.seenRefs == nil {
   137  		bh.seenRefs = set{}
   138  	}
   139  	if bh.seenGroups == nil {
   140  		bh.seenGroups = set{}
   141  	}
   142  	if bh.seenProgs == nil {
   143  		bh.seenProgs = set{}
   144  	}
   145  	var t Tag
   146  	for i, l := range bytes.Split(text, []byte{'\n'}) {
   147  		if len(l) > 0 && l[len(l)-1] == '\r' {
   148  			l = l[:len(l)-1]
   149  		}
   150  		if len(l) == 0 {
   151  			continue
   152  		}
   153  		if l[0] != '@' || len(l) < 3 {
   154  			return errBadHeader
   155  		}
   156  		copy(t[:], l[1:3])
   157  		var err error
   158  		switch t {
   159  		case headerTag:
   160  			err = headerLine(l, bh)
   161  		case refDictTag:
   162  			err = referenceLine(l, bh)
   163  		case readGroupTag:
   164  			err = readGroupLine(l, bh)
   165  		case programTag:
   166  			err = programLine(l, bh)
   167  		case commentTag:
   168  			err = commentLine(l, bh)
   169  		default:
   170  			return errBadHeader
   171  		}
   172  		if err != nil {
   173  			return fmt.Errorf("%v: line %d: %q", err, i+1, l)
   174  		}
   175  	}
   176  
   177  	return nil
   178  }
   179  
   180  func headerLine(l []byte, bh *Header) error {
   181  	fields := bytes.Split(l, []byte{'\t'})
   182  	if len(fields) < 2 {
   183  		return errBadHeader
   184  	}
   185  
   186  	var t Tag
   187  	for _, f := range fields[1:] {
   188  		if f[2] != ':' {
   189  			return errBadHeader
   190  		}
   191  		copy(t[:], f[:2])
   192  		fs := string(f[3:])
   193  		switch t {
   194  		case versionTag:
   195  			if bh.Version != "" {
   196  				return errBadHeader
   197  			}
   198  			bh.Version = fs
   199  		case sortOrderTag:
   200  			if bh.SortOrder != UnknownOrder {
   201  				return errBadHeader
   202  			}
   203  			bh.SortOrder = sortOrderMap[fs]
   204  		case groupOrderTag:
   205  			if bh.GroupOrder != GroupUnspecified {
   206  				return errBadHeader
   207  			}
   208  			bh.GroupOrder = groupOrderMap[fs]
   209  		default:
   210  			bh.otherTags = append(bh.otherTags, tagPair{tag: t, value: fs})
   211  		}
   212  	}
   213  
   214  	if bh.Version == "" {
   215  		return errBadHeader
   216  	}
   217  
   218  	return nil
   219  }
   220  
   221  func referenceLine(l []byte, bh *Header) error {
   222  	fields := bytes.Split(l, []byte{'\t'})
   223  	if len(fields) < 3 {
   224  		return errBadHeader
   225  	}
   226  
   227  	var (
   228  		t        Tag
   229  		rf       = &Reference{}
   230  		seen     = map[Tag]struct{}{}
   231  		nok, lok bool
   232  		dupID    int32
   233  		dup      bool
   234  	)
   235  
   236  	for _, f := range fields[1:] {
   237  		if f[2] != ':' {
   238  			return errBadHeader
   239  		}
   240  		copy(t[:], f[:2])
   241  		if _, ok := seen[t]; ok {
   242  			return errDupTag
   243  		}
   244  		seen[t] = struct{}{}
   245  		fs := string(f[3:])
   246  		switch t {
   247  		case refNameTag:
   248  			dupID, dup = bh.seenRefs[fs]
   249  			rf.name = fs
   250  			nok = true
   251  		case refLengthTag:
   252  			l, err := strconv.Atoi(fs)
   253  			if err != nil {
   254  				return errBadHeader
   255  			}
   256  			if !validLen(l) {
   257  				return errBadLen
   258  			}
   259  			rf.lRef = int32(l)
   260  			lok = true
   261  		case assemblyIDTag:
   262  			rf.assemID = fs
   263  		case md5Tag:
   264  			hb := [16]byte{}
   265  			n, err := hex.Decode(hb[:], f[3:])
   266  			if err != nil {
   267  				return err
   268  			}
   269  			if n != 16 {
   270  				return errBadHeader
   271  			}
   272  			rf.md5 = string(hb[:])
   273  		case speciesTag:
   274  			rf.species = fs
   275  		case uriTag:
   276  			var err error
   277  			rf.uri, err = url.Parse(fs)
   278  			if err != nil {
   279  				return err
   280  			}
   281  			if rf.uri.Scheme != "http" && rf.uri.Scheme != "ftp" {
   282  				rf.uri.Scheme = "file"
   283  			}
   284  		default:
   285  			rf.otherTags = append(rf.otherTags, tagPair{tag: t, value: fs})
   286  		}
   287  	}
   288  
   289  	if dup {
   290  		if er := bh.refs[dupID]; equalRefs(er, rf) {
   291  			return nil
   292  		} else if !equalRefs(er, &Reference{id: er.id, name: er.name, lRef: er.lRef}) {
   293  			return errDupReference
   294  		}
   295  		old := bh.refs[dupID]
   296  		old.owner = nil
   297  		old.id = -1
   298  		bh.refs[dupID] = rf
   299  		rf.owner = bh
   300  		return nil
   301  	}
   302  	if !nok || !lok {
   303  		return errBadHeader
   304  	}
   305  	id := int32(len(bh.refs))
   306  	rf.owner = bh
   307  	rf.id = id
   308  	bh.seenRefs[rf.name] = id
   309  	bh.refs = append(bh.refs, rf)
   310  
   311  	return nil
   312  }
   313  
// ISO 8601 layouts; see http://en.wikipedia.org/wiki/ISO_8601
//
// Examples of the accepted forms:
//
// Date: 2014-08-13
// Time: 2014-08-13T16:02:01Z
//     : 2014-08-13T16:02:01
//     : 2014-08-13T16:02:01+00:00
//     : 2014-08-13T16:02:01.000+00:00
//
// In the names below a trailing B marks a layout whose date has no '-'
// separators and a trailing E one whose date has them. Z layouts end in a
// literal 'Z', N layouts end in a numeric zone offset and Thou layouts
// include a fractional-second field.
const (
	// This is the ISO8601 format used for output.
	iso8601TimeDateN = "2006-01-02T15:04:05-0700"

	// This is the set of ISO8601 formats we accept.
	// The input values are first converted to a
	// basic ISO8601 form by removing all ':'
	// characters. We cannot do the same thing with
	// '-' since this has two meanings in ISO8601,
	// a separator and a negative time zone offset.
	iso8601DateB          = "20060102"
	iso8601DateE          = "2006-01-02"
	iso8601TimeDateB      = "20060102T150405"
	iso8601TimeDateE      = "2006-01-02T150405"
	iso8601TimeDateZB     = "20060102T150405Z"
	iso8601TimeDateZE     = "2006-01-02T150405Z"
	iso8601TimeDateNB     = "20060102T150405-0700"
	iso8601TimeDateNE     = "2006-01-02T150405-0700"
	iso8601TimeThouDateZB = "20060102T150405.999Z"
	iso8601TimeThouDateZE = "2006-01-02T150405.999Z"
	iso8601TimeThouDateNB = "20060102T150405.999-0700"
	iso8601TimeThouDateNE = "2006-01-02T150405.999-0700"
)
   345  
// iso8601 lists the layouts that parseISO8601 tries, in this order.
// isLocal selects the location passed to time.ParseInLocation for the
// layout: time.Local when true, time.UTC otherwise.
var iso8601 = []struct {
	isLocal bool
	format  string
}{
	{isLocal: true, format: iso8601DateB},
	{isLocal: true, format: iso8601DateE},
	{isLocal: false, format: iso8601TimeDateZB},
	{isLocal: false, format: iso8601TimeDateZE},
	{isLocal: false, format: iso8601TimeDateNB},
	{isLocal: false, format: iso8601TimeDateNE},
	{isLocal: false, format: iso8601TimeThouDateZB},
	{isLocal: false, format: iso8601TimeThouDateZE},
	{isLocal: false, format: iso8601TimeThouDateNB},
	{isLocal: false, format: iso8601TimeThouDateNE},
	{isLocal: true, format: iso8601TimeDateB},
	{isLocal: true, format: iso8601TimeDateE},
}
   363  
   364  func parseISO8601(value string) (time.Time, error) {
   365  	value = strings.Replace(value, ":", "", -1)
   366  	var err error
   367  	for _, format := range iso8601 {
   368  		loc := time.UTC
   369  		if format.isLocal {
   370  			loc = time.Local
   371  		}
   372  		var t time.Time
   373  		t, err = time.ParseInLocation(format.format, value, loc)
   374  		if err == nil {
   375  			return t, nil
   376  		}
   377  	}
   378  	return time.Time{}, err
   379  }
   380  
   381  func readGroupLine(l []byte, bh *Header) error {
   382  	fields := bytes.Split(l, []byte{'\t'})
   383  	if len(fields) < 2 {
   384  		return errBadHeader
   385  	}
   386  
   387  	var (
   388  		t    Tag
   389  		rg   = &ReadGroup{}
   390  		seen = map[Tag]struct{}{}
   391  		idok bool
   392  	)
   393  
   394  	for _, f := range fields[1:] {
   395  		if f[2] != ':' {
   396  			return errBadHeader
   397  		}
   398  		copy(t[:], f[:2])
   399  		if _, ok := seen[t]; ok {
   400  			return errDupTag
   401  		}
   402  		seen[t] = struct{}{}
   403  		fs := string(f[3:])
   404  		switch t {
   405  		case idTag:
   406  			if _, ok := bh.seenGroups[fs]; ok {
   407  				return errDupReadGroup
   408  			}
   409  			rg.name = fs
   410  			idok = true
   411  		case centerTag:
   412  			rg.center = fs
   413  		case descriptionTag:
   414  			rg.description = fs
   415  		case dateTag:
   416  			var err error
   417  			rg.date, err = parseISO8601(fs)
   418  			if err != nil {
   419  				return err
   420  			}
   421  		case flowOrderTag:
   422  			rg.flowOrder = fs
   423  		case keySequenceTag:
   424  			rg.keySeq = fs
   425  		case libraryTag:
   426  			rg.library = fs
   427  		case programTag:
   428  			rg.program = fs
   429  		case insertSizeTag:
   430  			i, err := strconv.Atoi(fs)
   431  			if err != nil {
   432  				return err
   433  			}
   434  			if !validInt32(i) {
   435  				return errBadLen
   436  			}
   437  			rg.insertSize = i
   438  		case platformTag:
   439  			rg.platform = fs
   440  		case platformUnitTag:
   441  			rg.platformUnit = fs
   442  		case sampleTag:
   443  			rg.sample = fs
   444  		default:
   445  			rg.otherTags = append(rg.otherTags, tagPair{tag: t, value: fs})
   446  		}
   447  	}
   448  
   449  	if !idok {
   450  		return errBadHeader
   451  	}
   452  	id := int32(len(bh.rgs))
   453  	rg.owner = bh
   454  	rg.id = id
   455  	bh.seenGroups[rg.name] = id
   456  	bh.rgs = append(bh.rgs, rg)
   457  
   458  	return nil
   459  }
   460  
   461  func programLine(l []byte, bh *Header) error {
   462  	fields := bytes.Split(l, []byte{'\t'})
   463  	if len(fields) < 2 {
   464  		return errBadHeader
   465  	}
   466  
   467  	var (
   468  		t    Tag
   469  		p    = &Program{}
   470  		seen = map[Tag]struct{}{}
   471  		idok bool
   472  	)
   473  
   474  	for _, f := range fields[1:] {
   475  		if f[2] != ':' {
   476  			return errBadHeader
   477  		}
   478  		copy(t[:], f[:2])
   479  		if _, ok := seen[t]; ok {
   480  			return errDupTag
   481  		}
   482  		seen[t] = struct{}{}
   483  		fs := string(f[3:])
   484  		switch t {
   485  		case idTag:
   486  			if _, ok := bh.seenProgs[fs]; ok {
   487  				return errDupProgram
   488  			}
   489  			p.uid = fs
   490  			idok = true
   491  		case programNameTag:
   492  			p.name = fs
   493  		case commandLineTag:
   494  			p.command = fs
   495  		case previousProgTag:
   496  			p.previous = fs
   497  		case versionTag:
   498  			p.version = fs
   499  		default:
   500  			p.otherTags = append(p.otherTags, tagPair{tag: t, value: fs})
   501  		}
   502  	}
   503  
   504  	if !idok {
   505  		return errBadHeader
   506  	}
   507  	id := int32(len(bh.progs))
   508  	p.owner = bh
   509  	p.id = id
   510  	bh.seenProgs[p.uid] = id
   511  	bh.progs = append(bh.progs, p)
   512  
   513  	return nil
   514  }
   515  
   516  func commentLine(l []byte, bh *Header) error {
   517  	fields := bytes.Split(l, []byte{'\t'})
   518  	if len(fields) < 2 {
   519  		return errBadHeader
   520  	}
   521  	bh.Comments = append(bh.Comments, string(fields[1]))
   522  	return nil
   523  }