github.com/vertgenlab/gonomics@v1.0.0/cmd/mergesort/mergesort.go (about)

     1  // Command Group: "Sorting"
     2  
     3  // Executes an external merge sort of the input file based on desired sort criteria
     4  package main
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"log"
    10  	"path"
    11  	"strings"
    12  
    13  	"github.com/vertgenlab/gonomics/axt"
    14  	"github.com/vertgenlab/gonomics/bed"
    15  	"github.com/vertgenlab/gonomics/dna"
    16  	"github.com/vertgenlab/gonomics/exception"
    17  	"github.com/vertgenlab/gonomics/fileio"
    18  	"github.com/vertgenlab/gonomics/sam"
    19  	"github.com/vertgenlab/gonomics/sort"
    20  	"github.com/vertgenlab/gonomics/vcf"
    21  )
    22  
    23  func usage() {
    24  	fmt.Print(
    25  		"mergesort - Executes an external merge sort of the input file based on desired sort criteria. \n" +
    26  			"\t The input file should have a proper file extension depending on the input file type.\n" +
    27  			"\tDefault sort criteria is byGenomicCoordinates. Chromosome -> StartPos -> EndPos\n" +
    28  			"Usage:\n" +
    29  			" mergesort [options] input.filetype outputFile\n")
    30  	flag.PrintDefaults()
    31  }
    32  
    33  func axtSort(infile, outfile string, numRecordsPerChunk int) {
    34  	data, header := axt.GoReadToChan(infile)
    35  	out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b axt.Axt) bool {
    36  		switch {
    37  		case a.RName < b.RName:
    38  			return true
    39  		case a.RName > b.RName:
    40  			return false
    41  		case a.RStart < b.RStart:
    42  			return true
    43  		case a.RStart > b.RStart:
    44  			return false
    45  		default:
    46  			return a.REnd < b.REnd
    47  		}
    48  	})
    49  
    50  	o := fileio.EasyCreate(outfile)
    51  	if len(header) != 0 {
    52  		_, err := fmt.Fprintln(o, strings.Join(header, "\n"))
    53  		exception.PanicOnErr(err)
    54  	}
    55  	var i int
    56  	for r := range out {
    57  		axt.WriteToFileHandle(o, r, i)
    58  		i++
    59  	}
    60  
    61  	err := o.Close()
    62  	exception.PanicOnErr(err)
    63  }
    64  
    65  func bedSort(infile, outfile string, numRecordsPerChunk int) {
    66  	data := bed.GoReadToChan(infile)
    67  	out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b bed.Bed) bool {
    68  		switch {
    69  		case a.Chrom < b.Chrom:
    70  			return true
    71  		case a.Chrom > b.Chrom:
    72  			return false
    73  		case a.ChromStart < b.ChromStart:
    74  			return true
    75  		case a.ChromStart > b.ChromStart:
    76  			return false
    77  		default:
    78  			return a.ChromEnd < b.ChromEnd
    79  		}
    80  	})
    81  
    82  	o := fileio.EasyCreate(outfile)
    83  	for r := range out {
    84  		bed.WriteToFileHandle(o, r)
    85  	}
    86  
    87  	err := o.Close()
    88  	exception.PanicOnErr(err)
    89  }
    90  
    91  func vcfSort(infile, outfile string, numRecordsPerChunk int) {
    92  	data, header := vcf.GoReadToChan(infile)
    93  	out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b vcf.Vcf) bool {
    94  		switch {
    95  		case a.Chr < b.Chr:
    96  			return true
    97  		case a.Chr > b.Chr:
    98  			return false
    99  		default:
   100  			return a.Pos < b.Pos
   101  		}
   102  	})
   103  
   104  	o := fileio.EasyCreate(outfile)
   105  	if len(header.Text) != 0 {
   106  		_, err := fmt.Fprintln(o, strings.Join(header.Text, "\n"))
   107  		exception.PanicOnErr(err)
   108  	}
   109  	for r := range out {
   110  		vcf.WriteVcf(o, r)
   111  	}
   112  
   113  	err := o.Close()
   114  	exception.PanicOnErr(err)
   115  }
   116  
   117  func samSort(infile, outfile string, numRecordsPerChunk int, sortCriteria string) {
   118  	data, header := sam.GoReadToChan(infile)
   119  	var out <-chan sam.Sam
   120  	if sortCriteria == "singleCellBx" {
   121  		out = sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b sam.Sam) bool {
   122  			iSingle := sam.ToSingleCellAlignment(a)
   123  			jSingle := sam.ToSingleCellAlignment(b)
   124  			return dna.BasesToString(iSingle.Bx) < dna.BasesToString(jSingle.Bx)
   125  		})
   126  	} else {
   127  		out = sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b sam.Sam) bool {
   128  			switch {
   129  			case a.RName < b.RName:
   130  				return true
   131  			case a.RName > b.RName:
   132  				return false
   133  			default:
   134  				return a.Pos < b.Pos
   135  			}
   136  		})
   137  	}
   138  
   139  	o := fileio.EasyCreate(outfile)
   140  	if len(header.Text) != 0 {
   141  		_, err := fmt.Fprintln(o, strings.Join(header.Text, "\n"))
   142  		exception.PanicOnErr(err)
   143  	}
   144  	for r := range out {
   145  		sam.WriteToFileHandle(o, r)
   146  	}
   147  
   148  	err := o.Close()
   149  	exception.PanicOnErr(err)
   150  }
   151  
   152  // TODO remove giraf pointers and uncomment
   153  //func girafSort(infile, outfile string, numRecordsPerChunk int) {
   154  //	data := giraf.GoReadToChan(infile)
   155  //	out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b *giraf.Giraf) bool {
   156  //		// First sort criteria is node
   157  //		if a.GetChrom() < b.GetChrom() {
   158  //			return true
   159  //		} else if a.GetChrom() == b.GetChrom() {
   160  //			// If start nodes are equal then sort by start position
   161  //			if a.GetChromStart() < b.GetChromStart() {
   162  //				return true
   163  //			} else if a.GetChromStart() == b.GetChromStart() {
   164  //				// If start positions are equal then loop through nodes and see if one has priority
   165  //				minPathLength := len(a.Path.Nodes)
   166  //				if len(b.Path.Nodes) < minPathLength {
   167  //					minPathLength = len(b.Path.Nodes)
   168  //				}
   169  //				for k := 0; k < minPathLength; k++ {
   170  //					if a.Path.Nodes[k] < b.Path.Nodes[k] {
   171  //						return true
   172  //					}
   173  //				}
   174  //				// If all nodes match, sort based on longest path
   175  //				if len(a.Path.Nodes) < len(b.Path.Nodes) {
   176  //					return true
   177  //				} else if len(a.Path.Nodes) == len(b.Path.Nodes) {
   178  //					// If nodes are equal length, then sort based on the ending position
   179  //					if a.GetChromEnd() < b.GetChromEnd() {
   180  //						return true
   181  //					}
   182  //				}
   183  //			}
   184  //		}
   185  //		return false
   186  //	})
   187  //
   188  //	o := fileio.EasyCreate(outfile)
   189  //	for r := range out {
   190  //		giraf.WriteGirafToFileHandle(o, r)
   191  //	}
   192  //
   193  //	err := o.Close()
   194  //	exception.PanicOnErr(err)
   195  //}
   196  
   197  func mergeSort(filename string, outFile string, numRecordsPerChunk int, sortCriteria string) {
   198  	// How the file is read is dependent on the file extension
   199  	filetype := path.Ext(filename)
   200  
   201  	if filetype == ".gz" {
   202  		// If terminal extension is ".gz" then trim off the gz and get the next extension
   203  		filetype = path.Ext(filename[0 : len(filename)-len(filetype)])
   204  	}
   205  
   206  	switch filetype {
   207  	case ".axt":
   208  		axtSort(filename, outFile, numRecordsPerChunk)
   209  	case ".bed":
   210  		bedSort(filename, outFile, numRecordsPerChunk)
   211  	case ".vcf":
   212  		vcfSort(filename, outFile, numRecordsPerChunk)
   213  	case ".sam":
   214  		samSort(filename, outFile, numRecordsPerChunk, sortCriteria)
   215  	case ".giraf":
   216  		// TODO enable after giraf pointers are removed
   217  		log.Fatalln("ERROR: giraf sorting in currently disabled")
   218  		//girafSort(filename, outFile, numRecordsPerChunk)
   219  	default:
   220  		if filetype == "" {
   221  			log.Fatalln("ERROR: Input file must have proper file extension")
   222  		} else {
   223  			log.Fatalln("ERROR: Merge sort methods have not been implemented for file type:", filetype)
   224  		}
   225  	}
   226  }
   227  
   228  func main() {
   229  	expectedNumArgs := 2
   230  	var numLinesPerChunk *int = flag.Int("tmpsize", 1000000, "The number of records to read into memory before writing to a tmp file.``")
   231  	var singleCellBx *bool = flag.Bool("singleCellBx", false, "Sort single-cell sam records by barcode.")
   232  	var sortCriteria string = "byGenomicCoordinates" // default the genomicCoordinates criteria.
   233  	flag.Usage = usage
   234  	log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
   235  	flag.Parse()
   236  
   237  	if *singleCellBx {
   238  		sortCriteria = "singleCellBx"
   239  	}
   240  
   241  	if len(flag.Args()) != expectedNumArgs {
   242  		flag.Usage()
   243  		log.Fatalf("ERROR: expecting %d arguments, but got %d\n", expectedNumArgs, len(flag.Args()))
   244  	}
   245  
   246  	inFile := flag.Arg(0)
   247  	outFile := flag.Arg(1)
   248  
   249  	mergeSort(inFile, outFile, *numLinesPerChunk, sortCriteria)
   250  }