github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/record.go (about) 1 // Copyright ©2012-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package sam 6 7 import ( 8 "bytes" 9 "errors" 10 "fmt" 11 "strconv" 12 "unsafe" 13 14 "github.com/Schaudge/grailbase/simd" 15 "github.com/Schaudge/grailbio/biosimd" 16 "github.com/Schaudge/hts/internal" 17 ) 18 19 // DupType enumerates the different possible values for the Duplicate 20 // Type (DT) aux tag. 21 type DupType int 22 23 const ( 24 // DupTypeNone specifies duplicate type not present. 25 DupTypeNone DupType = iota 26 27 // DupTypeLB specifies "library" or PCR duplicate type. 28 DupTypeLB 29 30 // DupTypeSQ specifies "sequencer" or optical duplicate type. 31 DupTypeSQ 32 ) 33 34 // LinearDupState enumerates the different possible Linear Duplicate 35 // states for a read pair, stored in the LD aux tag. 36 type LinearDupState int 37 38 const ( 39 // LinearNone specifies linear dup tag not present. 40 LinearNone = iota 41 42 // LinearPrimary specifies linear primary. 43 LinearPrimary 44 45 // LinearDuplicate specifies linear duplicate. 46 LinearDuplicate 47 ) 48 49 // Record represents a SAM/BAM record. 50 type Record struct { 51 Name string 52 Ref *Reference 53 Pos int 54 MapQ byte 55 Cigar Cigar 56 Flags Flags 57 MateRef *Reference 58 MatePos int 59 TempLen int 60 Seq Seq 61 Qual []byte 62 AuxFields AuxFields 63 64 Scratch []byte 65 } 66 67 // NewRecord returns a Record, checking for consistency of the provided 68 // attributes. 69 func NewRecord(name string, ref, mRef *Reference, p, mPos, tLen int, mapQ byte, co []CigarOp, seq, qual []byte, aux []Aux) (*Record, error) { 70 if !(validPos(p) && validPos(mPos) && validTmpltLen(tLen) && validLen(len(seq)) && (qual == nil || validLen(len(qual)))) { 71 return nil, errors.New("sam: value out of range") 72 } 73 if len(name) == 0 || len(name) > 254 { 74 return nil, errors.New("sam: name absent or too long") 75 } 76 if qual != nil && len(qual) != len(seq) { 77 return nil, errors.New("sam: sequence/quality length mismatch") 78 } 79 if ref != nil { 80 if ref.id < 0 { 81 return nil, errors.New("sam: linking to invalid reference") 82 } 83 } else { 84 if p != -1 { 85 return nil, errors.New("sam: specified position != -1 without reference") 86 } 87 } 88 if mRef != nil { 89 if mRef.id < 0 { 90 return nil, errors.New("sam: linking to invalid mate reference") 91 } 92 } else { 93 if mPos != -1 { 94 return nil, errors.New("sam: specified mate position != -1 without mate reference") 95 } 96 } 97 r := GetFromFreePool() 98 r.Name = name 99 r.Ref = ref 100 r.Pos = p 101 r.MapQ = mapQ 102 r.Cigar = co 103 r.Flags = 0 104 r.MateRef = mRef 105 r.MatePos = mPos 106 r.TempLen = tLen 107 r.Seq = NewSeq(seq) 108 r.Qual = qual 109 r.AuxFields = aux 110 return r, nil 111 } 112 113 // IsValidRecord returns whether the record satisfies the conditions that 114 // it has the Unmapped flag set if it not placed; that the MateUnmapped 115 // flag is set if it paired its mate is unplaced; that the CIGAR length 116 // matches the sequence and quality string lengths if they are non-zero; and 117 // that the Paired, ProperPair, Unmapped and MateUnmapped flags are consistent. 118 func IsValidRecord(r *Record) bool { 119 if (r.Ref == nil || r.Pos == -1) && r.Flags&Unmapped == 0 { 120 return false 121 } 122 if r.Flags&Paired != 0 && (r.MateRef == nil || r.MatePos == -1) && r.Flags&MateUnmapped == 0 { 123 return false 124 } 125 if r.Flags&(Unmapped|ProperPair) == Unmapped|ProperPair { 126 return false 127 } 128 if r.Flags&(Paired|MateUnmapped|ProperPair) == Paired|MateUnmapped|ProperPair { 129 return false 130 } 131 if len(r.Qual) != 0 && r.Seq.Length != len(r.Qual) { 132 return false 133 } 134 if cigarLen := r.Len(); cigarLen < 0 || (r.Seq.Length != 0 && r.Seq.Length != cigarLen) { 135 return false 136 } 137 return true 138 } 139 140 // Tag returns an Aux tag whose tag ID matches the first two bytes of tag and true. 141 // If no tag matches, nil and false are returned. 142 func (r *Record) Tag(tag []byte) (v Aux, ok bool) { 143 if len(tag) < 2 { 144 panic("sam: tag too short") 145 } 146 for _, aux := range r.AuxFields { 147 if aux.matches(tag) { 148 return aux, true 149 } 150 } 151 return nil, false 152 } 153 154 // RefID returns the reference ID for the Record. 155 func (r *Record) RefID() int { 156 return r.Ref.ID() 157 } 158 159 // Start returns the lower-coordinate end of the alignment. 160 func (r *Record) Start() int { 161 return r.Pos 162 } 163 164 // Bin returns the BAM index bin of the record. 165 func (r *Record) Bin() int { 166 if r.Flags&(Unmapped|MateUnmapped) == Unmapped|MateUnmapped { 167 return 4680 // reg2bin(-1, 0) 168 } 169 end := r.End() 170 171 // If the alignment length is zero (for example, if the read is 172 // unmapped), then increment end by 1 and treat the read as length 173 // 1 for binning purposes. 174 if end == r.Pos { 175 end++ 176 } 177 178 if !internal.IsValidIndexPos(r.Pos) || !internal.IsValidIndexPos(end) { 179 return -1 180 } 181 return int(internal.BinFor(r.Pos, end)) 182 } 183 184 // Len returns the length of the alignment. 185 func (r *Record) Len() int { 186 return r.End() - r.Start() 187 } 188 189 func max(a, b int) int { 190 if a < b { 191 return b 192 } 193 return a 194 } 195 196 // End returns the highest query-consuming coordinate end of the alignment. 197 // The position returned by End is not valid if r.Cigar.IsValid(r.Seq.Length) 198 // is false. 199 func (r *Record) End() int { 200 if r.Flags&Unmapped != 0 || len(r.Cigar) == 0 { 201 return r.Pos + 1 202 } 203 pos := r.Pos 204 end := pos 205 for _, co := range r.Cigar { 206 pos += co.Len() * co.Type().Consumes().Reference 207 end = max(end, pos) 208 } 209 return end 210 } 211 212 // Strand returns an int8 indicating the strand of the alignment. A positive return indicates 213 // alignment in the forward orientation, a negative returns indicates alignment in the reverse 214 // orientation. 215 func (r *Record) Strand() int8 { 216 if r.Flags&Reverse == Reverse { 217 return -1 218 } 219 return 1 220 } 221 222 // LessByName returns true if the receiver sorts by record name before other. 223 func (r *Record) LessByName(other *Record) bool { 224 return r.Name < other.Name 225 } 226 227 // LessByCoordinate returns true if the receiver sorts by coordinate before other 228 // according to the SAM specification. 229 func (r *Record) LessByCoordinate(other *Record) bool { 230 rRefName := r.Ref.Name() 231 oRefName := other.Ref.Name() 232 switch { 233 case oRefName == "*": 234 return true 235 case rRefName == "*": 236 return false 237 } 238 return (rRefName < oRefName) || (rRefName == oRefName && r.Pos < other.Pos) 239 } 240 241 // BagID returns the bag id (given by aux tag "DI") for r. If the DI 242 // tag is not set, returns (-1, nil). If the tag is present, but malformed, 243 // returns (-1, err). 244 func (r *Record) BagID() (int64, error) { 245 val, found, err := r.auxInt64Value(bagIDTag) 246 if found && val < 0 { 247 return -1, fmt.Errorf("bag id: expected bag id >= 0, not %d", val) 248 } 249 return val, err 250 } 251 252 // BagSize returns the size of the bag as defined in the "DS" aux 253 // tag. For a description of the DS tag, please see 254 // bio-mark-duplicates. If the aux tag is not present, returns (-1, 255 // nil). If the aux tag is malformed, returns (-1, err). 256 func (r *Record) BagSize() (int, error) { 257 val, found, err := r.auxIntValue(bagSizeTag) 258 if found && val <= 0 { 259 return -1, fmt.Errorf("bag size: expected bag size >= 1, not %d", val) 260 } 261 return val, err 262 } 263 264 // DupType returns (DupTypeSQ, nil) if r has the DT tag, and its value 265 // is "SQ" (optical). If the DT tag is present, and its value is "LB" 266 // (PCR), then returns (DupTypeLB, nil). If the DT tag is not present, 267 // then returns (DupTypeNone, nil). If the aux value is malformed, 268 // then returns (DupTypeNone, err). 269 func (r *Record) DupType() (DupType, error) { 270 aux, err := r.AuxFields.GetUnique(dupTypeTag) 271 if err != nil || aux == nil { 272 return DupTypeNone, err 273 } 274 275 s, ok := aux.Value().(string) 276 if !ok { 277 return DupTypeNone, fmt.Errorf("optical dup: unexpected type: %s", aux.String()) 278 } 279 280 switch s { 281 case "SQ": 282 return DupTypeSQ, nil 283 case "LB": 284 return DupTypeLB, nil 285 } 286 return DupTypeNone, fmt.Errorf("optical dup: unexpected value: %s", aux.String()) 287 } 288 289 // LibraryBagSize returns the number of library duplicate fragments in the bag of the given 290 // record, as defined by the DL tag. For a description of the DL tag and how it relates to 291 // the DS tag, please see bio-mark-duplicates. If the DL tag is not present (e.g., earlier 292 // pipeline versions or read pairs with an unmapped read), -1 will be returned without an 293 // error. If the DL tag is malformed, an error will be returned. 294 func (r *Record) LibraryBagSize() (int, error) { 295 val, found, err := r.auxIntValue(libraryBagSizeTag) 296 if found && val < 1 { 297 return -1, fmt.Errorf("%s: expected value >= 1, not %d", libraryBagSizeTag, val) 298 } 299 return val, err 300 } 301 302 // LinearDup returns the linear duplicate state of the record as 303 // specified in the LD aux tag. If the LD tag is not present (e.g., 304 // earlier pipeline versions or read pairs with an unmapped read, or 305 // mapq below threshold), returns (LinearNone, nil). If the LD 306 // tag is present, but has an invalid value, then returns 307 // (LinearNone, err). Otherwise, returns (LinearPrimary, 308 // nil) or (LinearDuplicate, nil) depending on the value of the 309 // LD tag. 310 func (r *Record) LinearDup() (LinearDupState, error) { 311 aux, err := r.AuxFields.GetUnique(linearDupTag) 312 if err != nil || aux == nil { 313 return LinearNone, err 314 } 315 316 s, ok := aux.Value().(string) 317 if !ok { 318 return LinearNone, fmt.Errorf("linear dup: unexpected type: %s", aux.String()) 319 } 320 switch s { 321 case "primary": 322 return LinearPrimary, nil 323 case "duplicate": 324 return LinearDuplicate, nil 325 } 326 return LinearNone, fmt.Errorf("linear dup: unexpected value: %s", aux.String()) 327 } 328 329 // LinearBagID returns the linear bag id (given by aux tag "LI") for r. If the LI 330 // tag is not set, returns (-1, nil). If the tag is present, but malformed, 331 // returns (-1, err). 332 func (r *Record) LinearBagID() (int64, error) { 333 val, found, err := r.auxInt64Value(linearBagIDTag) 334 if found && val < 0 { 335 return -1, fmt.Errorf("linear bag id: expected bag id >= 0, not %d", val) 336 } 337 return val, err 338 } 339 340 // LinearBagSize returns the size of the linear bag as defined in the "LS" aux 341 // tag. If the aux tag is not present, returns (-1, nil). If the aux 342 // tag is malformed, returns (-1, err). 343 func (r *Record) LinearBagSize() (int, error) { 344 val, found, err := r.auxIntValue(linearBagSizeTag) 345 if found && val <= 0 { 346 return -1, fmt.Errorf("linear bag size: expected bag size >= 1, not %d", val) 347 } 348 return val, err 349 } 350 351 // auxIntValue finds the unique specified aux tag. If there is an 352 // error while looking for the aux tag or the type is not an int, 353 // return (-1, false, err). If the aux tag is not present, return (-1, 354 // false, nil). If the aux tag is found, and it is an integer type, 355 // then return (value, true, nil). 356 func (r *Record) auxIntValue(tag Tag) (val int, found bool, err error) { 357 aux, err := r.AuxFields.GetUnique(tag) 358 if err != nil || aux == nil { 359 return -1, false, err 360 } 361 362 switch v := aux.Value().(type) { 363 case uint8: 364 val = int(v) 365 case int8: 366 val = int(v) 367 case int16: 368 val = int(v) 369 case uint16: 370 val = int(v) 371 case int32: 372 val = int(v) 373 default: 374 return -1, false, fmt.Errorf("%s: unexpected type: %T", tag, v) 375 } 376 return val, true, nil 377 } 378 379 // auxInt64Value finds the unique specified aux tag. It is like 380 // auxIntValue, but returns an int64 and also converts a string type 381 // aux tag if it can be parsed as an integer. 382 // 383 // If there is an error while looking for the aux tag or the type is 384 // not an int, return (-1, false, err). If the aux tag is not present, 385 // return (-1, false, nil). If the aux tag is found, and it is an 386 // integer type, then return (value, true, nil). 387 func (r *Record) auxInt64Value(tag Tag) (val int64, found bool, err error) { 388 aux, err := r.AuxFields.GetUnique(tag) 389 if err != nil || aux == nil { 390 return -1, false, err 391 } 392 393 switch v := aux.Value().(type) { 394 case uint8: 395 val = int64(v) 396 case int8: 397 val = int64(v) 398 case int16: 399 val = int64(v) 400 case uint16: 401 val = int64(v) 402 case int32: 403 val = int64(v) 404 case string: 405 val, err = strconv.ParseInt(v, 10, 64) 406 if err != nil { 407 return -1, false, err 408 } 409 default: 410 return -1, false, fmt.Errorf("%s: unexpected type: %T", tag, v) 411 } 412 return val, true, nil 413 } 414 415 // String returns a string representation of the Record. 416 func (r *Record) String() string { 417 end := r.End() 418 return fmt.Sprintf("%s %v %v %d %s:%d..%d (%d) %d %s:%d %d %s %v %v", 419 r.Name, 420 r.Flags, 421 r.Cigar, 422 r.MapQ, 423 r.Ref.Name(), 424 r.Pos, 425 end, 426 r.Bin(), 427 end-r.Pos, 428 r.MateRef.Name(), 429 r.MatePos, 430 r.TempLen, 431 r.Seq.Expand(), 432 r.Qual, 433 r.AuxFields, 434 ) 435 } 436 437 // UnmarshalText implements the encoding.TextUnmarshaler. It calls UnmarshalSAM with 438 // a nil Header. 439 func (r *Record) UnmarshalText(b []byte) error { 440 return r.UnmarshalSAM(nil, b) 441 } 442 443 // UnmarshalSAM parses a SAM format alignment line in the provided []byte, using 444 // references from the provided Header. If a nil Header is passed to UnmarshalSAM 445 // and the SAM data include non-empty refence and mate reference names, fake 446 // references with zero length and an ID of -1 are created to hold the reference 447 // names. 448 func (r *Record) UnmarshalSAM(h *Header, b []byte) error { 449 f := bytes.Split(b, []byte{'\t'}) 450 if len(f) < 11 { 451 return errors.New("sam: missing SAM fields") 452 } 453 *r = Record{Name: string(f[0])} 454 // TODO(kortschak): Consider parsing string format flags. 455 flags, err := strconv.ParseUint(string(f[1]), 0, 16) 456 if err != nil { 457 return fmt.Errorf("sam: failed to parse flags: %v", err) 458 } 459 r.Flags = Flags(flags) 460 r.Ref, err = referenceForName(h, string(f[2])) 461 if err != nil { 462 return fmt.Errorf("sam: failed to assign reference: %v", err) 463 } 464 r.Pos, err = strconv.Atoi(string(f[3])) 465 r.Pos-- 466 if err != nil { 467 return fmt.Errorf("sam: failed to parse position: %v", err) 468 } 469 mapQ, err := strconv.ParseUint(string(f[4]), 10, 8) 470 if err != nil { 471 return fmt.Errorf("sam: failed to parse map quality: %v", err) 472 } 473 r.MapQ = byte(mapQ) 474 r.Cigar, err = ParseCigar(f[5]) 475 if err != nil { 476 return fmt.Errorf("sam: failed to parse cigar string: %v", err) 477 } 478 if bytes.Equal(f[2], f[6]) || bytes.Equal(f[6], []byte{'='}) { 479 r.MateRef = r.Ref 480 } else { 481 r.MateRef, err = referenceForName(h, string(f[6])) 482 if err != nil { 483 return fmt.Errorf("sam: failed to assign mate reference: %v", err) 484 } 485 } 486 r.MatePos, err = strconv.Atoi(string(f[7])) 487 r.MatePos-- 488 if err != nil { 489 return fmt.Errorf("sam: failed to parse mate position: %v", err) 490 } 491 r.TempLen, err = strconv.Atoi(string(f[8])) 492 if err != nil { 493 return fmt.Errorf("sam: failed to parse template length: %v", err) 494 } 495 if !bytes.Equal(f[9], []byte{'*'}) { 496 r.Seq = NewSeq(f[9]) 497 if len(r.Cigar) != 0 && !r.Cigar.IsValid(r.Seq.Length) { 498 return errors.New("sam: sequence/CIGAR length mismatch") 499 } 500 } 501 if !bytes.Equal(f[10], []byte{'*'}) { 502 r.Qual = append(r.Qual, f[10]...) 503 simd.AddConst8Inplace(r.Qual, 256-33) 504 } else if r.Seq.Length != 0 { 505 r.Qual = make([]byte, r.Seq.Length) 506 simd.Memset8(r.Qual, 0xff) 507 } 508 if len(r.Qual) != 0 && len(r.Qual) != r.Seq.Length { 509 return errors.New("sam: sequence/quality length mismatch") 510 } 511 if len(f) > 11 { 512 r.AuxFields = make([]Aux, len(f)-11) 513 for i, aux := range f[11:] { 514 a, err := ParseAux(aux) 515 if err != nil { 516 return err 517 } 518 r.AuxFields[i] = a 519 } 520 } 521 return nil 522 } 523 524 func referenceForName(h *Header, name string) (*Reference, error) { 525 if name == "*" { 526 return nil, nil 527 } 528 if h == nil { 529 // If we don't have a Header, return a fake Reference. 530 return &Reference{ 531 id: -1, 532 name: name, 533 }, nil 534 } 535 536 for _, r := range h.refs { 537 if r.Name() == name { 538 return r, nil 539 } 540 } 541 return nil, fmt.Errorf("no reference with name %q", name) 542 } 543 544 // MarshalText implements encoding.TextMarshaler. It calls MarshalSAM with FlagDecimal. 545 func (r *Record) MarshalText() ([]byte, error) { 546 return r.MarshalSAM(0) 547 } 548 549 // MarshalSAM formats a Record as SAM using the specified flag format. Acceptable 550 // formats are FlagDecimal, FlagHex and FlagString. 551 func (r *Record) MarshalSAM(flags int) ([]byte, error) { 552 if flags < FlagDecimal || flags > FlagString { 553 return nil, errors.New("sam: flag format option out of range") 554 } 555 if r.Qual != nil && len(r.Qual) != r.Seq.Length { 556 return nil, errors.New("sam: sequence/quality length mismatch") 557 } 558 var buf bytes.Buffer 559 fmt.Fprintf(&buf, "%s\t%v\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", 560 r.Name, 561 formatFlags(r.Flags, flags), 562 r.Ref.Name(), 563 r.Pos+1, 564 r.MapQ, 565 r.Cigar, 566 formatMate(r.Ref, r.MateRef), 567 r.MatePos+1, 568 r.TempLen, 569 formatSeq(r.Seq), 570 formatQual(r.Qual), 571 ) 572 for _, t := range r.AuxFields { 573 fmt.Fprintf(&buf, "\t%v", samAux(t)) 574 } 575 return buf.Bytes(), nil 576 } 577 578 // Flag format constants. 579 const ( 580 FlagDecimal = iota 581 FlagHex 582 FlagString 583 ) 584 585 func formatFlags(f Flags, format int) interface{} { 586 switch format { 587 case FlagDecimal: 588 return uint16(f) 589 case FlagHex: 590 return fmt.Sprintf("0x%x", f) 591 case FlagString: 592 // If 0x01 is unset, no assumptions can be made about 0x02, 0x08, 0x20, 0x40 and 0x80 593 const pairedMask = ProperPair | MateUnmapped | MateReverse | MateReverse | Read1 | Read2 594 if f&1 == 0 { 595 f &^= pairedMask 596 } 597 598 const flags = "pPuUrR12sfdS" 599 600 b := make([]byte, 0, len(flags)) 601 for i, c := range flags { 602 if f&(1<<uint(i)) != 0 { 603 b = append(b, byte(c)) 604 } 605 } 606 607 return string(b) 608 default: 609 panic("sam: invalid flag format") 610 } 611 } 612 613 func formatMate(ref, mate *Reference) string { 614 if mate != nil && ref == mate { 615 return "=" 616 } 617 return mate.Name() 618 } 619 620 func formatSeq(s Seq) []byte { 621 if s.Length == 0 { 622 return []byte{'*'} 623 } 624 return s.Expand() 625 } 626 627 func formatQual(q []byte) []byte { 628 for _, v := range q { 629 if v != 0xff { 630 a := make([]byte, len(q)) 631 simd.AddConst8(a, q, 33) 632 return a 633 } 634 } 635 return []byte{'*'} 636 } 637 638 // Doublet is a nybble-encode pair of nucleotide bases. 639 type Doublet byte 640 641 // Seq is a nybble-encode pair of nucleotide sequence. 642 type Seq struct { 643 Length int 644 Seq []Doublet 645 } 646 647 var ( 648 n16TableRev = simd.MakeNibbleLookupTable([16]byte{'=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}) 649 n16Table = [256]Doublet{ 650 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 651 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 652 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 653 0x1, 0x2, 0x4, 0x8, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x0, 0xf, 0xf, 654 0xf, 0x1, 0xe, 0x2, 0xd, 0xf, 0xf, 0x4, 0xb, 0xf, 0xf, 0xc, 0xf, 0x3, 0xf, 0xf, 655 0xf, 0xf, 0x5, 0x6, 0x8, 0xf, 0x7, 0x9, 0xf, 0xa, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 656 0xf, 0x1, 0xe, 0x2, 0xd, 0xf, 0xf, 0x4, 0xb, 0xf, 0xf, 0xc, 0xf, 0x3, 0xf, 0xf, 657 0xf, 0xf, 0x5, 0x6, 0x8, 0xf, 0x7, 0x9, 0xf, 0xa, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 658 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 659 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 660 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 661 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 662 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 663 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 664 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 665 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 666 } 667 ) 668 669 // NewSeq returns a new Seq based on the given byte slice. 670 func NewSeq(s []byte) Seq { 671 return Seq{ 672 Length: len(s), 673 Seq: contract(s), 674 } 675 } 676 677 func contract(s []byte) []Doublet { 678 ns := make([]Doublet, (len(s)+1)>>1) 679 var np Doublet 680 for i, b := range s { 681 if i&1 == 0 { 682 np = n16Table[b] << 4 683 } else { 684 ns[i>>1] = np | n16Table[b] 685 } 686 } 687 // We haven't written the last base if the 688 // sequence was odd length, so do that now. 689 if len(s)&1 != 0 { 690 ns[len(ns)-1] = np 691 } 692 return ns 693 } 694 695 // Expand returns the byte encoded form of the receiver. 696 // 697 // This now has decent performance for ns.Length >= 32 (allocation is now the 698 // main bottleneck in that case), but it should still be avoided in new code. 699 // Base/BaseChar() is better if you are just performing a small number of point 700 // queries. Direct calls to biosimd.UnpackSeq{Unsafe} or 701 // UnpackAndReplaceSeq{Unsafe}, which populate preallocated buffers, are better 702 // when you are iterating through many bases. (The main advantage of the 703 // Unsafe functions is great performance for length < 32.) 704 func (ns Seq) Expand() []byte { 705 s := make([]byte, ns.Length) 706 nsSeqPtr := (*[]byte)(unsafe.Pointer(&ns.Seq)) 707 biosimd.UnpackAndReplaceSeq(s, *nsSeqPtr, &n16TableRev) 708 return s 709 } 710 711 // SeqBase is BAM's 4-bit encoding of nucleotide base types. See section 4.2 of 712 // https://samtools.github.io/hts-specs/SAMv1.pdf 713 type SeqBase byte 714 715 const ( 716 // Commonly used SeqBase constants. 717 BaseA SeqBase = 1 718 BaseC SeqBase = 2 719 BaseG SeqBase = 4 720 BaseT SeqBase = 8 721 BaseS SeqBase = 6 722 BaseN SeqBase = 15 723 724 // NumSeqBaseTypes is number of possible SeqBase values. SeqBase starts 725 // from 0. 726 NumSeqBaseTypes = 16 727 ) 728 729 func CharToSeqBase(char byte) SeqBase { 730 return SeqBase(n16Table[char]) 731 } 732 733 // Base returns the pos'th base of the sequence. 734 // 735 // REQUIRES: 0 <= pos < seq.Length 736 func (ns Seq) Base(pos int) SeqBase { 737 var base SeqBase 738 if pos%2 == 0 { 739 base = SeqBase(ns.Seq[pos/2] >> 4) 740 } else { 741 base = SeqBase(ns.Seq[pos/2] & 0xf) 742 } 743 return base 744 } 745 746 // BaseChar returns the pos'th base of the as a character, such as 'A', 'T'. 747 // 748 // REQUIRES: 0 <= pos < seq.Length 749 func (ns Seq) BaseChar(pos int) byte { 750 return n16TableRev.Get(byte(ns.Base(pos))) 751 } 752 753 // Char converts a SeqBase to a human-readable character. For example, 754 // BaseA.Char() == 'A'. 755 // 756 // REQUIRES: 0 <= b < NumSeqBaseTypes 757 func (b SeqBase) Char() byte { 758 return n16TableRev.Get(byte(b)) 759 }