github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/sam/parse_header.go (about) 1 // Copyright ©2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package sam 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "encoding/hex" 11 "errors" 12 "fmt" 13 "io" 14 "net/url" 15 "strconv" 16 "strings" 17 "time" 18 ) 19 20 var ( 21 errBadHeader = errors.New("sam: malformed header line") 22 errDupTag = errors.New("sam: duplicate field") 23 ) 24 25 var bamMagic = [4]byte{'B', 'A', 'M', 0x1} 26 27 // UnmarshalBinary implements the encoding.BinaryUnmarshaler interface. 28 func (bh *Header) UnmarshalBinary(b []byte) error { 29 return bh.DecodeBinary(bytes.NewReader(b)) 30 } 31 32 // Max possible SAM header size, in bytes. For detecting a corrupt header 33 // without blowing up memory usage. 34 const maxSAMHeaderSize = 0xffffff 35 36 // DecodeBinary unmarshals a Header from the given io.Reader. The byte 37 // stream must be in the format described in the SAM specification, 38 // section 4.2. 39 func (bh *Header) DecodeBinary(r io.Reader) error { 40 var ( 41 lText, nRef int32 42 err error 43 ) 44 var magic [4]byte 45 err = binary.Read(r, binary.LittleEndian, &magic) 46 if err != nil { 47 return err 48 } 49 if magic != bamMagic { 50 return errors.New("sam: magic number mismatch") 51 } 52 err = binary.Read(r, binary.LittleEndian, &lText) 53 if err != nil { 54 return err 55 } 56 if lText < 0 || lText >= maxSAMHeaderSize { 57 return errors.New("sam: wrong header length") 58 } 59 text := make([]byte, lText) 60 n, err := r.Read(text) 61 if err != nil { 62 return err 63 } 64 if n != int(lText) { 65 return errors.New("sam: truncated header") 66 } 67 err = bh.UnmarshalText(text) 68 if err != nil { 69 return err 70 } 71 err = binary.Read(r, binary.LittleEndian, &nRef) 72 if err != nil { 73 return err 74 } 75 if nRef < 0 || nRef > maxSAMHeaderSize { 76 return errors.New("sam: wrong reference length") 77 } 78 refs, err := readRefRecords(r, nRef) 79 if err != nil { 80 return err 81 } 82 for _, r := range refs { 83 err = bh.AddReference(r) 84 if err != nil { 85 return err 86 } 87 } 88 return nil 89 } 90 91 func readRefRecords(r io.Reader, n int32) ([]*Reference, error) { 92 // bootstrapSize is the maximum number of 93 // reference records to pre-allocate. 94 const bootstrapSize = 1000 95 96 rr := make([]*Reference, 0, min(n, bootstrapSize)) 97 var ( 98 lName int32 99 err error 100 ) 101 for i := 0; i < int(n); i++ { 102 rr = append(rr, &Reference{id: int32(i)}) 103 err = binary.Read(r, binary.LittleEndian, &lName) 104 if err != nil { 105 return nil, err 106 } 107 if lName < 1 || lName > maxSAMHeaderSize { 108 return nil, errors.New("sam: wrong reference name length") 109 } 110 name := make([]byte, lName) 111 n, err := r.Read(name) 112 if err != nil { 113 return nil, err 114 } 115 if n != int(lName) || name[n-1] != 0 { 116 return nil, errors.New("sam: truncated reference name") 117 } 118 rr[i].name = string(name[:n-1]) 119 err = binary.Read(r, binary.LittleEndian, &rr[i].lRef) 120 if err != nil { 121 return nil, err 122 } 123 } 124 return rr, nil 125 } 126 127 func min(a, b int32) int32 { 128 if a < b { 129 return a 130 } 131 return b 132 } 133 134 // UnmarshalText implements the encoding.TextUnmarshaler interface. 135 func (bh *Header) UnmarshalText(text []byte) error { 136 if bh.seenRefs == nil { 137 bh.seenRefs = set{} 138 } 139 if bh.seenGroups == nil { 140 bh.seenGroups = set{} 141 } 142 if bh.seenProgs == nil { 143 bh.seenProgs = set{} 144 } 145 var t Tag 146 for i, l := range bytes.Split(text, []byte{'\n'}) { 147 if len(l) > 0 && l[len(l)-1] == '\r' { 148 l = l[:len(l)-1] 149 } 150 if len(l) == 0 { 151 continue 152 } 153 if l[0] != '@' || len(l) < 3 { 154 return errBadHeader 155 } 156 copy(t[:], l[1:3]) 157 var err error 158 switch t { 159 case headerTag: 160 err = headerLine(l, bh) 161 case refDictTag: 162 err = referenceLine(l, bh) 163 case readGroupTag: 164 err = readGroupLine(l, bh) 165 case programTag: 166 err = programLine(l, bh) 167 case commentTag: 168 err = commentLine(l, bh) 169 default: 170 return errBadHeader 171 } 172 if err != nil { 173 return fmt.Errorf("%v: line %d: %q", err, i+1, l) 174 } 175 } 176 177 return nil 178 } 179 180 func headerLine(l []byte, bh *Header) error { 181 fields := bytes.Split(l, []byte{'\t'}) 182 if len(fields) < 2 { 183 return errBadHeader 184 } 185 186 var t Tag 187 for _, f := range fields[1:] { 188 if f[2] != ':' { 189 return errBadHeader 190 } 191 copy(t[:], f[:2]) 192 fs := string(f[3:]) 193 switch t { 194 case versionTag: 195 if bh.Version != "" { 196 return errBadHeader 197 } 198 bh.Version = fs 199 case sortOrderTag: 200 if bh.SortOrder != UnknownOrder { 201 return errBadHeader 202 } 203 bh.SortOrder = sortOrderMap[fs] 204 case groupOrderTag: 205 if bh.GroupOrder != GroupUnspecified { 206 return errBadHeader 207 } 208 bh.GroupOrder = groupOrderMap[fs] 209 default: 210 bh.otherTags = append(bh.otherTags, tagPair{tag: t, value: fs}) 211 } 212 } 213 214 if bh.Version == "" { 215 return errBadHeader 216 } 217 218 return nil 219 } 220 221 func referenceLine(l []byte, bh *Header) error { 222 fields := bytes.Split(l, []byte{'\t'}) 223 if len(fields) < 3 { 224 return errBadHeader 225 } 226 227 var ( 228 t Tag 229 rf = &Reference{} 230 seen = map[Tag]struct{}{} 231 nok, lok bool 232 dupID int32 233 dup bool 234 ) 235 236 for _, f := range fields[1:] { 237 if f[2] != ':' { 238 return errBadHeader 239 } 240 copy(t[:], f[:2]) 241 if _, ok := seen[t]; ok { 242 return errDupTag 243 } 244 seen[t] = struct{}{} 245 fs := string(f[3:]) 246 switch t { 247 case refNameTag: 248 dupID, dup = bh.seenRefs[fs] 249 rf.name = fs 250 nok = true 251 case refLengthTag: 252 l, err := strconv.Atoi(fs) 253 if err != nil { 254 return errBadHeader 255 } 256 if !validLen(l) { 257 return errBadLen 258 } 259 rf.lRef = int32(l) 260 lok = true 261 case assemblyIDTag: 262 rf.assemID = fs 263 case md5Tag: 264 hb := [16]byte{} 265 n, err := hex.Decode(hb[:], f[3:]) 266 if err != nil { 267 return err 268 } 269 if n != 16 { 270 return errBadHeader 271 } 272 rf.md5 = string(hb[:]) 273 case speciesTag: 274 rf.species = fs 275 case uriTag: 276 var err error 277 rf.uri, err = url.Parse(fs) 278 if err != nil { 279 return err 280 } 281 if rf.uri.Scheme != "http" && rf.uri.Scheme != "ftp" { 282 rf.uri.Scheme = "file" 283 } 284 default: 285 rf.otherTags = append(rf.otherTags, tagPair{tag: t, value: fs}) 286 } 287 } 288 289 if dup { 290 if er := bh.refs[dupID]; equalRefs(er, rf) { 291 return nil 292 } else if !equalRefs(er, &Reference{id: er.id, name: er.name, lRef: er.lRef}) { 293 return errDupReference 294 } 295 old := bh.refs[dupID] 296 old.owner = nil 297 old.id = -1 298 bh.refs[dupID] = rf 299 rf.owner = bh 300 return nil 301 } 302 if !nok || !lok { 303 return errBadHeader 304 } 305 id := int32(len(bh.refs)) 306 rf.owner = bh 307 rf.id = id 308 bh.seenRefs[rf.name] = id 309 bh.refs = append(bh.refs, rf) 310 311 return nil 312 } 313 314 // http://en.wikipedia.org/wiki/ISO_8601 315 // 316 // Date: 2014-08-13 317 // Time: 2014-08-13T16:02:01Z 318 // : 2014-08-13T16:02:01 319 // : 2014-08-13T16:02:01+00:00 320 // : 2014-08-13T16:02:01.000+00:00 321 // 322 const ( 323 // This is the ISO8601 format used for output. 324 iso8601TimeDateN = "2006-01-02T15:04:05-0700" 325 326 // This is the set of ISO8601 formats we accept. 327 // The input values are first converted to a 328 // basic ISO8601 form by removing all ':' 329 // characters. We cannot do the same thing with 330 // '-' since this has two meanings in ISO8601, 331 // a separator and a negative time zone offset. 332 iso8601DateB = "20060102" 333 iso8601DateE = "2006-01-02" 334 iso8601TimeDateB = "20060102T150405" 335 iso8601TimeDateE = "2006-01-02T150405" 336 iso8601TimeDateZB = "20060102T150405Z" 337 iso8601TimeDateZE = "2006-01-02T150405Z" 338 iso8601TimeDateNB = "20060102T150405-0700" 339 iso8601TimeDateNE = "2006-01-02T150405-0700" 340 iso8601TimeThouDateZB = "20060102T150405.999Z" 341 iso8601TimeThouDateZE = "2006-01-02T150405.999Z" 342 iso8601TimeThouDateNB = "20060102T150405.999-0700" 343 iso8601TimeThouDateNE = "2006-01-02T150405.999-0700" 344 ) 345 346 var iso8601 = []struct { 347 isLocal bool 348 format string 349 }{ 350 {isLocal: true, format: iso8601DateB}, 351 {isLocal: true, format: iso8601DateE}, 352 {isLocal: false, format: iso8601TimeDateZB}, 353 {isLocal: false, format: iso8601TimeDateZE}, 354 {isLocal: false, format: iso8601TimeDateNB}, 355 {isLocal: false, format: iso8601TimeDateNE}, 356 {isLocal: false, format: iso8601TimeThouDateZB}, 357 {isLocal: false, format: iso8601TimeThouDateZE}, 358 {isLocal: false, format: iso8601TimeThouDateNB}, 359 {isLocal: false, format: iso8601TimeThouDateNE}, 360 {isLocal: true, format: iso8601TimeDateB}, 361 {isLocal: true, format: iso8601TimeDateE}, 362 } 363 364 func parseISO8601(value string) (time.Time, error) { 365 value = strings.Replace(value, ":", "", -1) 366 var err error 367 for _, format := range iso8601 { 368 loc := time.UTC 369 if format.isLocal { 370 loc = time.Local 371 } 372 var t time.Time 373 t, err = time.ParseInLocation(format.format, value, loc) 374 if err == nil { 375 return t, nil 376 } 377 } 378 return time.Time{}, err 379 } 380 381 func readGroupLine(l []byte, bh *Header) error { 382 fields := bytes.Split(l, []byte{'\t'}) 383 if len(fields) < 2 { 384 return errBadHeader 385 } 386 387 var ( 388 t Tag 389 rg = &ReadGroup{} 390 seen = map[Tag]struct{}{} 391 idok bool 392 ) 393 394 for _, f := range fields[1:] { 395 if f[2] != ':' { 396 return errBadHeader 397 } 398 copy(t[:], f[:2]) 399 if _, ok := seen[t]; ok { 400 return errDupTag 401 } 402 seen[t] = struct{}{} 403 fs := string(f[3:]) 404 switch t { 405 case idTag: 406 if _, ok := bh.seenGroups[fs]; ok { 407 return errDupReadGroup 408 } 409 rg.name = fs 410 idok = true 411 case centerTag: 412 rg.center = fs 413 case descriptionTag: 414 rg.description = fs 415 case dateTag: 416 var err error 417 rg.date, err = parseISO8601(fs) 418 if err != nil { 419 return err 420 } 421 case flowOrderTag: 422 rg.flowOrder = fs 423 case keySequenceTag: 424 rg.keySeq = fs 425 case libraryTag: 426 rg.library = fs 427 case programTag: 428 rg.program = fs 429 case insertSizeTag: 430 i, err := strconv.Atoi(fs) 431 if err != nil { 432 return err 433 } 434 if !validInt32(i) { 435 return errBadLen 436 } 437 rg.insertSize = i 438 case platformTag: 439 rg.platform = fs 440 case platformUnitTag: 441 rg.platformUnit = fs 442 case sampleTag: 443 rg.sample = fs 444 default: 445 rg.otherTags = append(rg.otherTags, tagPair{tag: t, value: fs}) 446 } 447 } 448 449 if !idok { 450 return errBadHeader 451 } 452 id := int32(len(bh.rgs)) 453 rg.owner = bh 454 rg.id = id 455 bh.seenGroups[rg.name] = id 456 bh.rgs = append(bh.rgs, rg) 457 458 return nil 459 } 460 461 func programLine(l []byte, bh *Header) error { 462 fields := bytes.Split(l, []byte{'\t'}) 463 if len(fields) < 2 { 464 return errBadHeader 465 } 466 467 var ( 468 t Tag 469 p = &Program{} 470 seen = map[Tag]struct{}{} 471 idok bool 472 ) 473 474 for _, f := range fields[1:] { 475 if f[2] != ':' { 476 return errBadHeader 477 } 478 copy(t[:], f[:2]) 479 if _, ok := seen[t]; ok { 480 return errDupTag 481 } 482 seen[t] = struct{}{} 483 fs := string(f[3:]) 484 switch t { 485 case idTag: 486 if _, ok := bh.seenProgs[fs]; ok { 487 return errDupProgram 488 } 489 p.uid = fs 490 idok = true 491 case programNameTag: 492 p.name = fs 493 case commandLineTag: 494 p.command = fs 495 case previousProgTag: 496 p.previous = fs 497 case versionTag: 498 p.version = fs 499 default: 500 p.otherTags = append(p.otherTags, tagPair{tag: t, value: fs}) 501 } 502 } 503 504 if !idok { 505 return errBadHeader 506 } 507 id := int32(len(bh.progs)) 508 p.owner = bh 509 p.id = id 510 bh.seenProgs[p.uid] = id 511 bh.progs = append(bh.progs, p) 512 513 return nil 514 } 515 516 func commentLine(l []byte, bh *Header) error { 517 fields := bytes.Split(l, []byte{'\t'}) 518 if len(fields) < 2 { 519 return errBadHeader 520 } 521 bh.Comments = append(bh.Comments, string(fields[1])) 522 return nil 523 }