github.com/Schaudge/hts@v0.0.0-20240223063651-737b4d69d68c/bgzf/index/index_test.go (about)

     1  // Copyright ©2013 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package index
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"io"
    11  	"strings"
    12  	"testing"
    13  
    14  	"github.com/Schaudge/hts/bgzf"
    15  
    16  	"gopkg.in/check.v1"
    17  )
    18  
    19  var conc = flag.Int("conc", 1, "sets the level of concurrency for compression")
    20  
    21  func Test(t *testing.T) { check.TestingT(t) }
    22  
    23  type S struct{}
    24  
    25  var _ = check.Suite(&S{})
    26  
    27  // conceptualBAMdata is the BAM corresponding to:
    28  //
    29  // @HD	VN:1.0	SO:coordinate
    30  // @SQ	SN:conceptual	LN:134217728
    31  // 60m66m:bin0	0	conceptual	62914561	40	6291456M	*	0	0	*	*
    32  // 70m76m:bin2	0	conceptual	73400321	40	6291456M	*	0	0	*	*
    33  // 73m75m:bin18	0	conceptual	76546049	40	2097152M	*	0	0	*	*
    34  //
    35  // This is a coordinate-translated version of the conceptual example in the
    36  // SAM spec using binning as actually used by BAM rather than as presented.
    37  var conceptualBAMdata = []byte{
    38  	// sam.Header block [{File:0, Block:0}, {File:0, Block:87}).
    39  	0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
    40  	0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x64, 0x00, 0x73, 0x72,
    41  	0xf4, 0x65, 0xb4, 0x60, 0x60, 0x60, 0x70, 0xf0, 0x70, 0xe1,
    42  	0x0c, 0xf3, 0xb3, 0x32, 0xd4, 0x33, 0xe0, 0x0c, 0xf6, 0xb7,
    43  	0x4a, 0xce, 0xcf, 0x2f, 0x4a, 0xc9, 0xcc, 0x4b, 0x2c, 0x49,
    44  	0xe5, 0x72, 0x08, 0x0e, 0xe4, 0x0c, 0xf6, 0x03, 0x8a, 0xe4,
    45  	0x25, 0xa7, 0x16, 0x94, 0x94, 0x26, 0xe6, 0x70, 0xfa, 0x00,
    46  	0x95, 0x19, 0x9b, 0x18, 0x19, 0x9a, 0x9b, 0x1b, 0x59, 0x70,
    47  	0x31, 0x02, 0xf5, 0x72, 0x03, 0x31, 0x42, 0x1e, 0xc8, 0x61,
    48  	0xe0, 0x00, 0x00, 0x42, 0x51, 0xcc, 0xea, 0x57, 0x00, 0x00,
    49  	0x00,
    50  
    51  	// Record block [{File:101, Block:0}, {File:101, Block:157}).
    52  	0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
    53  	0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x62, 0x00, 0x33, 0x60,
    54  	0x80, 0x81, 0x03, 0xcc, 0x3c, 0x1a, 0x0c, 0x0c, 0x8c, 0x50,
    55  	0xde, 0x7f, 0x28, 0x00, 0xb1, 0xcd, 0x0c, 0x72, 0xcd, 0xcc,
    56  	0x72, 0xad, 0x92, 0x32, 0xf3, 0x0c, 0x40, 0x5c, 0x36, 0x03,
    57  	0xb8, 0x9e, 0x04, 0x16, 0x1e, 0x0d, 0x26, 0xac, 0x7a, 0xcc,
    58  	0x0d, 0x72, 0xcd, 0x21, 0x7a, 0x8c, 0xc0, 0x7a, 0x0c, 0xe1,
    59  	0x7a, 0x26, 0xb0, 0xf0, 0x6a, 0x08, 0x61, 0xd7, 0x63, 0x9c,
    60  	0x6b, 0x6e, 0x0a, 0xd6, 0x63, 0x68, 0x01, 0xe2, 0x33, 0x01,
    61  	0x00, 0x5a, 0x80, 0xfe, 0xec, 0x9d, 0x00, 0x00, 0x00,
    62  
    63  	// Magic block [{File:200, Block:0}, {File:200, Block:0}).
    64  	0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
    65  	0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00,
    66  	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    67  
    68  	// End {File:228, Block:0}
    69  }
    70  
    71  var conceptualChunks = []bgzf.Chunk{
    72  	{Begin: bgzf.Offset{File: 0, Block: 0}, End: bgzf.Offset{File: 0, Block: 87}},        // header
    73  	{Begin: bgzf.Offset{File: 101, Block: 0}, End: bgzf.Offset{File: 101, Block: 52}},    // 60m66m:bin0
    74  	{Begin: bgzf.Offset{File: 101, Block: 52}, End: bgzf.Offset{File: 101, Block: 104}},  // 70m76m:bin2
    75  	{Begin: bgzf.Offset{File: 101, Block: 104}, End: bgzf.Offset{File: 101, Block: 157}}, // 73m75m:bin18
    76  	{Begin: bgzf.Offset{File: 228, Block: 0}, End: bgzf.Offset{File: 228, Block: 0}},     // EOF
    77  }
    78  
    79  // Test for issue #6 https://github.com/biogo/hts/issues/6
    80  func (s *S) TestChunkReader(c *check.C) {
    81  	br, err := bgzf.NewReader(bytes.NewReader(conceptualBAMdata), *conc)
    82  	c.Assert(err, check.Equals, nil)
    83  	defer br.Close()
    84  	cr, err := NewChunkReader(br, conceptualChunks)
    85  	c.Assert(err, check.Equals, nil)
    86  	defer cr.Close()
    87  	// 2 is shorter than the length of the first block.
    88  	// This panics prior to the fix.
    89  	n, err := cr.Read(make([]byte, 2))
    90  	c.Check(n, check.Equals, 2)
    91  	c.Check(err, check.Equals, nil)
    92  }
    93  
    94  // Test for issue #8 https://github.com/biogo/hts/issues/8
    95  func (s *S) TestIssue8(c *check.C) {
    96  	br, err := bgzf.NewReader(bytes.NewReader(conceptualBAMdata), *conc)
    97  	c.Assert(err, check.Equals, nil)
    98  	defer br.Close()
    99  	cr, err := NewChunkReader(br, conceptualChunks[:2])
   100  	c.Assert(err, check.Equals, nil)
   101  	defer cr.Close()
   102  	var last []byte
   103  	for {
   104  		p := make([]byte, 1024)
   105  		n, err := cr.Read(p)
   106  		if n != 0 {
   107  			c.Check(p[:n], check.Not(check.DeepEquals), last[:min(n, len(last))])
   108  		}
   109  		last = p
   110  		if err != nil {
   111  			if err == io.EOF {
   112  				break
   113  			}
   114  			c.Fatalf("unexpected error: %v", err)
   115  		}
   116  	}
   117  }
   118  
   119  // issue10Tests are test cases for https://github.com/biogo/hts/issues/10.
   120  var issue10Tests = []struct {
   121  	words     []wordBlocks
   122  	chunks    []string
   123  	canSquash bool
   124  	canTrunc  bool
   125  }{
   126  	{
   127  		// This is semantically identical to the test case given in issue 10.
   128  		words:     commonWords,
   129  		chunks:    []string{"<three>", "<five>"},
   130  		canSquash: true,
   131  		canTrunc:  false,
   132  	},
   133  	{
   134  		words:     commonWords,
   135  		chunks:    []string{"<one>", "<two>", "<three>"},
   136  		canSquash: true,
   137  		canTrunc:  false,
   138  	},
   139  	{
   140  		words:     commonWords,
   141  		chunks:    []string{"<two>", "<three>", "<four>", "<five>"},
   142  		canSquash: true,
   143  		canTrunc:  true,
   144  	},
   145  	{
   146  		words:     commonWords,
   147  		chunks:    []string{"<three>", "<four>"},
   148  		canSquash: true,
   149  		canTrunc:  true,
   150  	},
   151  	{
   152  		words:     commonWords,
   153  		chunks:    []string{"<seven>", "<eight>"},
   154  		canSquash: true,
   155  		canTrunc:  true,
   156  	},
   157  	{
   158  		words:     commonWords,
   159  		chunks:    []string{"<zero>", "<one>", "<two>", "<three>", "<four>", "<five>", "<six>", "<seven>", "<eight>"},
   160  		canSquash: true,
   161  		canTrunc:  true,
   162  	},
   163  	{
   164  		// This case would never happen with an htslib-like index, but
   165  		// it is a possible use case and not prohibited, so test it.
   166  		words:  commonWords,
   167  		chunks: []string{"<three>", "<zero>", "<five>", "<seven>", "<two>", "<eight>", "<five>"},
   168  
   169  		// Not in order.
   170  		canSquash: false,
   171  		canTrunc:  false,
   172  	},
   173  }
   174  
   175  var commonWords = []wordBlocks{
   176  	// Begin:{File:0 Block:0} End:{File:0 Block:6}
   177  	// Begin:{File:0 Block:6} End:{File:0 Block:11}
   178  	{word: "<zero>"}, {word: "<one>", flush: true},
   179  	// Begin:{File:43 Block:0} End:{File:43 Block:5}
   180  	// Begin:{File:43 Block:5} End:{File:43 Block:12}
   181  	// Begin:{File:43 Block:12} End:{File:43 Block:18}
   182  	{word: "<two>"}, {word: "<three>"}, {word: "<four>", flush: true},
   183  	// Begin:{File:93 Block:0} End:{File:93 Block:6}
   184  	// Begin:{File:93 Block:6} End:{File:93 Block:11}
   185  	{word: "<five>"}, {word: "<six>"}, {word: "<seven>", flush: true},
   186  	// Begin:{File:142 Block:0} End:{File:142 Block:7}
   187  	{word: "<eight>"},
   188  }
   189  
   190  type wordBlocks struct {
   191  	word  string
   192  	flush bool
   193  }
   194  
   195  type word int
   196  
   197  func (w word) RefID() int { return 0 }
   198  func (w word) Start() int { return int(w) }
   199  func (w word) End() int   { return int(w + 1) }
   200  
   201  func (s *S) TestIssue10(c *check.C) {
   202  	for _, test := range issue10Tests {
   203  		var buf bytes.Buffer
   204  
   205  		// Write the set of words to a bgzf stream.
   206  		w := bgzf.NewWriter(&buf, *conc)
   207  		for _, wb := range test.words {
   208  			w.Write([]byte(wb.word))
   209  			if wb.flush {
   210  				w.Flush()
   211  			}
   212  		}
   213  		w.Close()
   214  
   215  		for _, strategy := range []MergeStrategy{nil, adjacent} {
   216  			if strategy != nil && !test.canSquash {
   217  				continue
   218  			}
   219  			for _, clean := range []bool{false, true} {
   220  				for _, truncFinal := range []bool{false, true} {
   221  					if truncFinal && !test.canTrunc {
   222  						continue
   223  					}
   224  					// Build an index into the words.
   225  					r, err := bgzf.NewReader(bytes.NewReader(buf.Bytes()), *conc)
   226  					c.Assert(err, check.Equals, nil)
   227  					idx := make(map[string]bgzf.Chunk)
   228  					for i, wb := range test.words {
   229  						p := make([]byte, len(wb.word))
   230  						n, err := r.Read(p)
   231  						c.Assert(err, check.Equals, nil)
   232  						c.Assert(string(p[:n]), check.Equals, wb.word)
   233  
   234  						last := r.LastChunk()
   235  						if !clean {
   236  							// This simulates the index construction behaviour
   237  							// that appears to be what is done by htslib. The
   238  							// behaviour of bgzf is to elide seeks that will not
   239  							// result in a productive read.
   240  							if i != 0 && test.words[i-1].flush {
   241  								last.Begin = idx[test.words[i-1].word].End
   242  							}
   243  						}
   244  						idx[wb.word] = last
   245  					}
   246  
   247  					var chunks []bgzf.Chunk
   248  					for _, w := range test.chunks {
   249  						chunks = append(chunks, idx[w])
   250  					}
   251  					var want string
   252  					if truncFinal {
   253  						want = strings.Join(test.chunks[:len(test.chunks)-1], "")
   254  						chunks[len(chunks)-2].End = chunks[len(chunks)-1].Begin
   255  						chunks = chunks[:len(chunks)-1]
   256  					} else {
   257  						want = strings.Join(test.chunks, "")
   258  					}
   259  
   260  					if strategy != nil {
   261  						chunks = strategy(chunks)
   262  					}
   263  					cr, err := NewChunkReader(r, chunks)
   264  					c.Assert(err, check.Equals, nil)
   265  
   266  					var got bytes.Buffer
   267  					io.Copy(&got, cr)
   268  					c.Check(got.String(), check.Equals, want,
   269  						check.Commentf("clean=%t merge=%t trunc=%t chunks=%+v", clean, strategy != nil, truncFinal, chunks),
   270  					)
   271  				}
   272  			}
   273  		}
   274  	}
   275  }