github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/auxtags.go (about)

     1  // Copyright ©2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package sam
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"encoding/hex"
    11  	"fmt"
    12  	"math"
    13  	"reflect"
    14  	"strconv"
    15  	"unsafe"
    16  )
    17  
    18  // ASCII is a printable ASCII character included in an Aux tag.
    19  type ASCII byte
    20  
    21  // Hex is a byte slice represented as a hex string in an Aux tag.
    22  type Hex []byte
    23  
    24  // Text is a byte slice represented as a string in an Aux tag.
    25  type Text []byte
    26  
    27  // An Aux represents an auxiliary data field from a SAM alignment record.
    28  type Aux []byte
    29  
    30  // NewAux returns a new Aux with the given tag, type and value. Acceptable value
    31  // types and their corresponding SAM type are:
    32  //
    33  //  A - ASCII
    34  //  c - int8
    35  //  C - uint8
    36  //  s - int16
    37  //  S - uint16
    38  //  i - int, uint or int32
    39  //  I - int, uint or uint32
    40  //  f - float32
    41  //  Z - Text or string
    42  //  H - Hex
    43  //  B - []int8, []int16, []int32, []uint8, []uint16, []uint32 or []float32
    44  //
    45  // The handling of int and uint types is provided as a convenience - values must
    46  // fit within either int32 or uint32 and are converted to the smallest possible
    47  // representation.
    48  //
    49  func NewAux(t Tag, value interface{}) (Aux, error) {
    50  	var a Aux
    51  	switch v := value.(type) {
    52  	case ASCII:
    53  		a = Aux{t[0], t[1], 'A', byte(v)}
    54  	case int:
    55  		switch {
    56  		case math.MinInt8 <= v && v <= math.MaxInt8:
    57  			a = Aux{t[0], t[1], 'c', byte(v)}
    58  		case math.MinInt16 <= v && v <= math.MaxInt16:
    59  			a = Aux{t[0], t[1], 's', 0, 0}
    60  			binary.LittleEndian.PutUint16(a[3:5], uint16(v))
    61  		case math.MinInt32 <= v && v <= math.MaxInt32:
    62  			a = Aux{t[0], t[1], 'i', 0, 0, 0, 0}
    63  			binary.LittleEndian.PutUint32(a[3:7], uint32(v))
    64  		default:
    65  			return nil, fmt.Errorf("sam: integer value out of range %d > %d", v, math.MaxInt32)
    66  		}
    67  	case uint:
    68  		switch {
    69  		case v <= math.MaxUint8:
    70  			a = Aux{t[0], t[1], 'C', byte(v)}
    71  		case v <= math.MaxUint16:
    72  			a = Aux{t[0], t[1], 'S', 0, 0}
    73  			binary.LittleEndian.PutUint16(a[3:5], uint16(v))
    74  		case v <= math.MaxUint32:
    75  			a = Aux{t[0], t[1], 'I', 0, 0, 0, 0}
    76  			binary.LittleEndian.PutUint32(a[3:7], uint32(v))
    77  		default:
    78  			return nil, fmt.Errorf("sam: unsigned integer value out of range %d > %d", v, uint(math.MaxUint32))
    79  		}
    80  	case int8:
    81  		a = Aux{t[0], t[1], 'c', byte(v)}
    82  	case uint8:
    83  		a = Aux{t[0], t[1], 'C', v}
    84  	case int16:
    85  		a = Aux{t[0], t[1], 's', 0, 0}
    86  		binary.LittleEndian.PutUint16(a[3:5], uint16(v))
    87  	case uint16:
    88  		a = Aux{t[0], t[1], 'S', 0, 0}
    89  		binary.LittleEndian.PutUint16(a[3:5], v)
    90  	case int32:
    91  		a = Aux{t[0], t[1], 'i', 0, 0, 0, 0}
    92  		binary.LittleEndian.PutUint32(a[3:7], uint32(v))
    93  	case uint32:
    94  		a = Aux{t[0], t[1], 'I', 0, 0, 0, 0}
    95  		binary.LittleEndian.PutUint32(a[3:7], v)
    96  	case float32:
    97  		a = Aux{t[0], t[1], 'f', 0, 0, 0, 0}
    98  		binary.LittleEndian.PutUint32(a[3:7], math.Float32bits(v))
    99  	case Text:
   100  		a = make(Aux, len(v)+3)
   101  		a[0], a[1], a[2] = t[0], t[1], 'Z'
   102  		copy(a[3:], v)
   103  	case string:
   104  		a = make(Aux, len(v)+3)
   105  		a[0], a[1], a[2] = t[0], t[1], 'Z'
   106  		copy(a[3:], v)
   107  	case Hex:
   108  		a = make(Aux, 3, len(v)+3)
   109  		copy(a, Aux{t[0], t[1], 'H'})
   110  		a = append(a, v...)
   111  	default:
   112  		rv := reflect.ValueOf(value)
   113  		rt := rv.Type()
   114  		if k := rt.Kind(); k != reflect.Array && k != reflect.Slice {
   115  			return nil, fmt.Errorf("sam: unknown type %T", value)
   116  		}
   117  		l := rv.Len()
   118  		if uint(l) > math.MaxUint32 {
   119  			return nil, fmt.Errorf("sam: array too long")
   120  		}
   121  		a = Aux{t[0], t[1], 'B', 0xff, 0, 0, 0, 0}
   122  		binary.LittleEndian.PutUint32([]byte(a[4:8]), uint32(l))
   123  
   124  		switch rt.Elem().Kind() {
   125  		case reflect.Int8:
   126  			a[3] = 'c'
   127  			value := value.([]int8)
   128  			b := *(*[]byte)(unsafe.Pointer(&value))
   129  			return append(a, b...), nil
   130  		case reflect.Uint8:
   131  			a[3] = 'C'
   132  			return append(a, value.([]uint8)...), nil
   133  		case reflect.Int16:
   134  			a[3] = 's'
   135  		case reflect.Uint16:
   136  			a[3] = 'S'
   137  		case reflect.Int32:
   138  			a[3] = 'i'
   139  		case reflect.Uint32:
   140  			a[3] = 'I'
   141  		case reflect.Float32:
   142  			a[3] = 'f'
   143  		default:
   144  			return nil, fmt.Errorf("sam: unsupported array type: %T", value)
   145  		}
   146  		buf := bytes.NewBuffer(a)
   147  		err := binary.Write(buf, binary.LittleEndian, value)
   148  		a = buf.Bytes()
   149  		if err != nil {
   150  			return nil, fmt.Errorf("sam: failed to encode array: %v", err)
   151  		}
   152  	}
   153  	return a, nil
   154  }
   155  
   156  // ParseAux returns an AUX parsed from the given text.
   157  func ParseAux(text []byte) (Aux, error) {
   158  	// TG:T:v...
   159  	// 012345...
   160  	if len(text) < 6 || text[2] != ':' || text[4] != ':' {
   161  		return nil, fmt.Errorf("sam: invalid aux tag field: %q", text)
   162  	}
   163  	txt := text[5:]
   164  	var value interface{}
   165  	switch typ := text[3]; typ {
   166  	case 'A':
   167  		if len(txt) != 1 {
   168  			return nil, fmt.Errorf("sam: invalid aux tag field: %q", text)
   169  		}
   170  		value = ASCII(txt[0])
   171  	case 'i':
   172  		i, err := strconv.Atoi(string(txt))
   173  		if err != nil {
   174  			return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   175  		}
   176  		if i < 0 {
   177  			value = i
   178  		} else {
   179  			value = uint(i)
   180  		}
   181  	case 'f':
   182  		f, err := strconv.ParseFloat(string(txt), 32)
   183  		if err != nil {
   184  			return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   185  		}
   186  		value = float32(f)
   187  	case 'Z':
   188  		value = Text(txt)
   189  	case 'H':
   190  		b := make([]byte, hex.DecodedLen(len(txt)))
   191  		_, err := hex.Decode(b, txt)
   192  		if err != nil {
   193  			return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   194  		}
   195  		value = Hex(b)
   196  	case 'B':
   197  		if txt[1] != ',' {
   198  			return nil, fmt.Errorf("sam: invalid aux tag field: %q", text)
   199  		}
   200  		nf := bytes.Split(txt[2:], []byte{','})
   201  		if len(nf) == 0 {
   202  			return nil, fmt.Errorf("sam: invalid aux tag field: %q", text)
   203  		}
   204  		switch txt[0] {
   205  		case 'c':
   206  			a := make([]int8, len(nf))
   207  			for i, n := range nf {
   208  				v, err := strconv.ParseInt(string(n), 0, 8)
   209  				if err != nil {
   210  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   211  				}
   212  				a[i] = int8(v)
   213  			}
   214  			value = a
   215  		case 'C':
   216  			a := make([]uint8, len(nf))
   217  			for i, n := range nf {
   218  				v, err := strconv.ParseUint(string(n), 0, 8)
   219  				if err != nil {
   220  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   221  				}
   222  				a[i] = uint8(v)
   223  			}
   224  			value = a
   225  		case 's':
   226  			a := make([]int16, len(nf))
   227  			for i, n := range nf {
   228  				v, err := strconv.ParseInt(string(n), 0, 16)
   229  				if err != nil {
   230  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   231  				}
   232  				a[i] = int16(v)
   233  			}
   234  			value = a
   235  		case 'S':
   236  			a := make([]uint16, len(nf))
   237  			for i, n := range nf {
   238  				v, err := strconv.ParseUint(string(n), 0, 16)
   239  				if err != nil {
   240  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   241  				}
   242  				a[i] = uint16(v)
   243  			}
   244  			value = a
   245  		case 'i':
   246  			a := make([]int32, len(nf))
   247  			for i, n := range nf {
   248  				v, err := strconv.ParseInt(string(n), 0, 32)
   249  				if err != nil {
   250  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   251  				}
   252  				a[i] = int32(v)
   253  			}
   254  			value = a
   255  		case 'I':
   256  			a := make([]uint32, len(nf))
   257  			for i, n := range nf {
   258  				v, err := strconv.ParseUint(string(n), 0, 32)
   259  				if err != nil {
   260  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   261  				}
   262  				a[i] = uint32(v)
   263  			}
   264  			value = a
   265  		case 'f':
   266  			a := make([]float32, len(nf))
   267  			for i, n := range nf {
   268  				f, err := strconv.ParseFloat(string(n), 32)
   269  				if err != nil {
   270  					return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   271  				}
   272  				a[i] = float32(f)
   273  			}
   274  			value = a
   275  		default:
   276  			return nil, fmt.Errorf("sam: invalid aux tag field: %q", text)
   277  		}
   278  	default:
   279  		return nil, fmt.Errorf("sam: invalid aux tag field: %q", text)
   280  	}
   281  	aux, err := NewAux(Tag{text[0], text[1]}, value)
   282  	if err != nil {
   283  		return nil, fmt.Errorf("sam: invalid aux tag field: %v", err)
   284  	}
   285  	return aux, nil
   286  }
   287  
   288  var auxKind = [256]byte{
   289  	'A': 'A',
   290  	'c': 'i', 'C': 'i',
   291  	's': 'i', 'S': 'i',
   292  	'i': 'i', 'I': 'i',
   293  	'f': 'f',
   294  	'Z': 'Z',
   295  	'H': 'H',
   296  	'B': 'B',
   297  }
   298  
   299  // String returns the string representation of an Aux type.
   300  func (a Aux) String() string {
   301  	switch a.Type() {
   302  	case 'A':
   303  		return fmt.Sprintf("%s:%c:%c", []byte(a[:2]), a.Kind(), a.Value())
   304  	case 'H':
   305  		return fmt.Sprintf("%s:%c:%02x", []byte(a[:2]), a.Kind(), a.Value())
   306  	case 'B':
   307  		return fmt.Sprintf("%s:%c:%c:%v", []byte(a[:2]), a.Kind(), a[3], a.Value())
   308  	}
   309  	return fmt.Sprintf("%s:%c:%v", []byte(a[:2]), a.Kind(), a.Value())
   310  }
   311  
   312  // samAux implements SAM aux field formatting.
   313  type samAux Aux
   314  
   315  // String returns the string representation of an Aux type.
   316  func (sa samAux) String() string {
   317  	a := Aux(sa)
   318  	switch a.Type() {
   319  	case 'A':
   320  		return fmt.Sprintf("%s:%c:%c", []byte(a[:2]), a.Kind(), a.Value())
   321  	case 'H':
   322  		return fmt.Sprintf("%s:%c:%02x", []byte(a[:2]), a.Kind(), a.Value())
   323  	case 'B':
   324  		var buf bytes.Buffer
   325  		fmt.Fprintf(&buf, "%s:%c:%c", []byte(a[:2]), a.Kind(), a[3])
   326  		rv := reflect.ValueOf(a.Value())
   327  		for i := 0; i < rv.Len(); i++ {
   328  			fmt.Fprintf(&buf, ",%v", rv.Index(i).Interface())
   329  		}
   330  		return buf.String()
   331  	}
   332  	return fmt.Sprintf("%s:%c:%v", []byte(a[:2]), a.Kind(), a.Value())
   333  }
   334  
// A Tag represents an auxiliary or header tag label. It holds the two
// bytes of the label.
type Tag [2]byte

// Two byte tag labels used by the package. Several names share the same
// bytes: descriptionTag, progDesc and bagSizeTag are all "DS", and dateTag
// and dupTypeTag are both "DT". They are presumably kept as separate names
// because they label fields in different contexts.
var (
	// Header line tag and its field tags.
	headerTag     = Tag{'H', 'D'}
	versionTag    = Tag{'V', 'N'}
	sortOrderTag  = Tag{'S', 'O'}
	groupOrderTag = Tag{'G', 'O'}

	// Reference dictionary line tag and its field tags.
	refDictTag       = Tag{'S', 'Q'}
	refNameTag       = Tag{'S', 'N'}
	refLengthTag     = Tag{'L', 'N'}
	alternativeLocus = Tag{'A', 'H'} // nolint
	assemblyIDTag    = Tag{'A', 'S'}
	md5Tag           = Tag{'M', '5'}
	speciesTag       = Tag{'S', 'P'}
	uriTag           = Tag{'U', 'R'}

	// Read group line tag and its field tags.
	readGroupTag    = Tag{'R', 'G'}
	centerTag       = Tag{'C', 'N'}
	descriptionTag  = Tag{'D', 'S'}
	dateTag         = Tag{'D', 'T'}
	flowOrderTag    = Tag{'F', 'O'}
	keySequenceTag  = Tag{'K', 'S'}
	libraryTag      = Tag{'L', 'B'}
	insertSizeTag   = Tag{'P', 'I'}
	platformTag     = Tag{'P', 'L'}
	platformUnitTag = Tag{'P', 'U'}
	sampleTag       = Tag{'S', 'M'}

	// Program line tag and its field tags.
	programTag      = Tag{'P', 'G'}
	idTag           = Tag{'I', 'D'}
	programNameTag  = Tag{'P', 'N'}
	commandLineTag  = Tag{'C', 'L'}
	previousProgTag = Tag{'P', 'P'}
	progDesc        = Tag{'D', 'S'} // nolint

	// Bag and duplicate related tags.
	// NOTE(review): these look like aux tags written by duplicate marking;
	// confirm against the code that uses them.
	bagIDTag          = Tag{'D', 'I'}
	bagSizeTag        = Tag{'D', 'S'}
	dupTypeTag        = Tag{'D', 'T'}
	libraryBagSizeTag = Tag{'D', 'L'}
	linearDupTag      = Tag{'L', 'D'}
	linearBagIDTag    = Tag{'L', 'I'}
	linearBagSizeTag  = Tag{'L', 'S'}

	// Comment line tag.
	commentTag = Tag{'C', 'O'}
)
   382  
   383  // NewTag returns a Tag from the tag string. It panics is len(tag) != 2.
   384  func NewTag(tag string) Tag {
   385  	var t Tag
   386  	if copy(t[:], tag) != 2 {
   387  		panic("sam: illegal tag length")
   388  	}
   389  	return t
   390  }
   391  
   392  // String returns a string representation of a Tag.
   393  func (t Tag) String() string { return string(t[:]) }
   394  
   395  // Tag returns the Tag representation of the Aux tag ID.
   396  func (a Aux) Tag() Tag { var t Tag; copy(t[:], a[:2]); return t }
   397  
   398  // Type returns a byte corresponding to the type of the auxiliary tag.
   399  // Returned values are in {'A', 'c', 'C', 's', 'S', 'i', 'I', 'f', 'Z', 'H', 'B'}.
   400  func (a Aux) Type() byte { return a[2] }
   401  
   402  // Kind returns a byte corresponding to the kind of the auxiliary tag.
   403  // Returned values are in {'A', 'i', 'f', 'Z', 'H', 'B'}.
   404  func (a Aux) Kind() byte { return auxKind[a[2]] }
   405  
   406  // Value returns v containing the value of the auxiliary tag.
   407  func (a Aux) Value() interface{} {
   408  	switch t := a.Type(); t {
   409  	case 'A':
   410  		return a[3]
   411  	case 'c':
   412  		return int8(a[3])
   413  	case 'C':
   414  		return uint8(a[3])
   415  	case 's':
   416  		return int16(binary.LittleEndian.Uint16(a[3:5]))
   417  	case 'S':
   418  		return binary.LittleEndian.Uint16(a[3:5])
   419  	case 'i':
   420  		return int32(binary.LittleEndian.Uint32(a[3:7]))
   421  	case 'I':
   422  		return binary.LittleEndian.Uint32(a[3:7])
   423  	case 'f':
   424  		return math.Float32frombits(binary.LittleEndian.Uint32(a[3:7]))
   425  	case 'Z': // Z and H Require that parsing stops before the terminating zero.
   426  		return string(a[3:])
   427  	case 'H':
   428  		return []byte(a[3:])
   429  	case 'B':
   430  		length := int32(binary.LittleEndian.Uint32(a[4:8]))
   431  		switch t := a[3]; t {
   432  		case 'c':
   433  			c := a[8:]
   434  			return *(*[]int8)(unsafe.Pointer(&c))
   435  		case 'C':
   436  			return []uint8(a[8:])
   437  		case 's':
   438  			Bs := make([]int16, length)
   439  			err := binary.Read(bytes.NewBuffer(a[8:]), binary.LittleEndian, &Bs)
   440  			if err != nil {
   441  				panic(fmt.Sprintf("sam: binary.Read of s field failed: %v", err))
   442  			}
   443  			return Bs
   444  		case 'S':
   445  			BS := make([]uint16, length)
   446  			err := binary.Read(bytes.NewBuffer(a[8:]), binary.LittleEndian, &BS)
   447  			if err != nil {
   448  				panic(fmt.Sprintf("sam: binary.Read of S field failed: %v", err))
   449  			}
   450  			return BS
   451  		case 'i':
   452  			Bi := make([]int32, length)
   453  			err := binary.Read(bytes.NewBuffer(a[8:]), binary.LittleEndian, &Bi)
   454  			if err != nil {
   455  				panic(fmt.Sprintf("sam: binary.Read of i field failed: %v", err))
   456  			}
   457  			return Bi
   458  		case 'I':
   459  			BI := make([]uint32, length)
   460  			err := binary.Read(bytes.NewBuffer(a[8:]), binary.LittleEndian, &BI)
   461  			if err != nil {
   462  				panic(fmt.Sprintf("sam: binary.Read of I field failed: %v", err))
   463  			}
   464  			return BI
   465  		case 'f':
   466  			Bf := make([]float32, length)
   467  			err := binary.Read(bytes.NewBuffer(a[8:]), binary.LittleEndian, &Bf)
   468  			if err != nil {
   469  				panic(fmt.Sprintf("sam: binary.Read of f field failed: %v", err))
   470  			}
   471  			return Bf
   472  		default:
   473  			return fmt.Errorf("%%B!(UNKNOWN ARRAY type=%c)", t)
   474  		}
   475  	default:
   476  		return fmt.Errorf("%%?!(UNKNOWN type=%c)", t)
   477  	}
   478  }
   479  
   480  func (a Aux) matches(tag []byte) bool {
   481  	return a[1] == tag[1] && a[0] == tag[0]
   482  }
   483  
   484  // AuxFields is a set of auxiliary fields.
   485  type AuxFields []Aux
   486  
   487  // Get returns the auxiliary field identified by the given tag, or nil
   488  // if no field matches.
   489  func (a AuxFields) Get(tag Tag) Aux {
   490  	for _, f := range a {
   491  		if f.Tag() == tag {
   492  			return f
   493  		}
   494  	}
   495  	return nil
   496  }
   497  
   498  // GetUnique returns an error if the tag appears more than once, and is
   499  // otherwise identical to Get.
   500  func (a AuxFields) GetUnique(tag Tag) (Aux, error) {
   501  	for i, f := range a {
   502  		if f.Tag() == tag {
   503  			for _, f2 := range a[i+1:] {
   504  				if f2.Tag() == tag {
   505  					return nil, fmt.Errorf("sam.GetUnique: tag %v appears multiple times", tag)
   506  				}
   507  			}
   508  			return f, nil
   509  		}
   510  	}
   511  	return nil, nil
   512  }