github.com/vertgenlab/gonomics@v1.0.0/cmd/mergesort/mergesort.go (about) 1 // Command Group: "Sorting" 2 3 // Executes an external merge sort of the input file based on desired sort criteria 4 package main 5 6 import ( 7 "flag" 8 "fmt" 9 "log" 10 "path" 11 "strings" 12 13 "github.com/vertgenlab/gonomics/axt" 14 "github.com/vertgenlab/gonomics/bed" 15 "github.com/vertgenlab/gonomics/dna" 16 "github.com/vertgenlab/gonomics/exception" 17 "github.com/vertgenlab/gonomics/fileio" 18 "github.com/vertgenlab/gonomics/sam" 19 "github.com/vertgenlab/gonomics/sort" 20 "github.com/vertgenlab/gonomics/vcf" 21 ) 22 23 func usage() { 24 fmt.Print( 25 "mergesort - Executes an external merge sort of the input file based on desired sort criteria. \n" + 26 "\t The input file should have a proper file extension depending on the input file type.\n" + 27 "\tDefault sort criteria is byGenomicCoordinates. Chromosome -> StartPos -> EndPos\n" + 28 "Usage:\n" + 29 " mergesort [options] input.filetype outputFile\n") 30 flag.PrintDefaults() 31 } 32 33 func axtSort(infile, outfile string, numRecordsPerChunk int) { 34 data, header := axt.GoReadToChan(infile) 35 out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b axt.Axt) bool { 36 switch { 37 case a.RName < b.RName: 38 return true 39 case a.RName > b.RName: 40 return false 41 case a.RStart < b.RStart: 42 return true 43 case a.RStart > b.RStart: 44 return false 45 default: 46 return a.REnd < b.REnd 47 } 48 }) 49 50 o := fileio.EasyCreate(outfile) 51 if len(header) != 0 { 52 _, err := fmt.Fprintln(o, strings.Join(header, "\n")) 53 exception.PanicOnErr(err) 54 } 55 var i int 56 for r := range out { 57 axt.WriteToFileHandle(o, r, i) 58 i++ 59 } 60 61 err := o.Close() 62 exception.PanicOnErr(err) 63 } 64 65 func bedSort(infile, outfile string, numRecordsPerChunk int) { 66 data := bed.GoReadToChan(infile) 67 out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b bed.Bed) bool { 68 switch { 69 case a.Chrom < b.Chrom: 70 return true 71 case a.Chrom > b.Chrom: 72 return false 73 case a.ChromStart < b.ChromStart: 74 return true 75 case a.ChromStart > b.ChromStart: 76 return false 77 default: 78 return a.ChromEnd < b.ChromEnd 79 } 80 }) 81 82 o := fileio.EasyCreate(outfile) 83 for r := range out { 84 bed.WriteToFileHandle(o, r) 85 } 86 87 err := o.Close() 88 exception.PanicOnErr(err) 89 } 90 91 func vcfSort(infile, outfile string, numRecordsPerChunk int) { 92 data, header := vcf.GoReadToChan(infile) 93 out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b vcf.Vcf) bool { 94 switch { 95 case a.Chr < b.Chr: 96 return true 97 case a.Chr > b.Chr: 98 return false 99 default: 100 return a.Pos < b.Pos 101 } 102 }) 103 104 o := fileio.EasyCreate(outfile) 105 if len(header.Text) != 0 { 106 _, err := fmt.Fprintln(o, strings.Join(header.Text, "\n")) 107 exception.PanicOnErr(err) 108 } 109 for r := range out { 110 vcf.WriteVcf(o, r) 111 } 112 113 err := o.Close() 114 exception.PanicOnErr(err) 115 } 116 117 func samSort(infile, outfile string, numRecordsPerChunk int, sortCriteria string) { 118 data, header := sam.GoReadToChan(infile) 119 var out <-chan sam.Sam 120 if sortCriteria == "singleCellBx" { 121 out = sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b sam.Sam) bool { 122 iSingle := sam.ToSingleCellAlignment(a) 123 jSingle := sam.ToSingleCellAlignment(b) 124 return dna.BasesToString(iSingle.Bx) < dna.BasesToString(jSingle.Bx) 125 }) 126 } else { 127 out = sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b sam.Sam) bool { 128 switch { 129 case a.RName < b.RName: 130 return true 131 case a.RName > b.RName: 132 return false 133 default: 134 return a.Pos < b.Pos 135 } 136 }) 137 } 138 139 o := fileio.EasyCreate(outfile) 140 if len(header.Text) != 0 { 141 _, err := fmt.Fprintln(o, strings.Join(header.Text, "\n")) 142 exception.PanicOnErr(err) 143 } 144 for r := range out { 145 sam.WriteToFileHandle(o, r) 146 } 147 148 err := o.Close() 149 exception.PanicOnErr(err) 150 } 151 152 // TODO remove giraf pointers and uncomment 153 //func girafSort(infile, outfile string, numRecordsPerChunk int) { 154 // data := giraf.GoReadToChan(infile) 155 // out := sort.GoExternalMergeSort(data, numRecordsPerChunk, func(a, b *giraf.Giraf) bool { 156 // // First sort criteria is node 157 // if a.GetChrom() < b.GetChrom() { 158 // return true 159 // } else if a.GetChrom() == b.GetChrom() { 160 // // If start nodes are equal then sort by start position 161 // if a.GetChromStart() < b.GetChromStart() { 162 // return true 163 // } else if a.GetChromStart() == b.GetChromStart() { 164 // // If start positions are equal then loop through nodes and see if one has priority 165 // minPathLength := len(a.Path.Nodes) 166 // if len(b.Path.Nodes) < minPathLength { 167 // minPathLength = len(b.Path.Nodes) 168 // } 169 // for k := 0; k < minPathLength; k++ { 170 // if a.Path.Nodes[k] < b.Path.Nodes[k] { 171 // return true 172 // } 173 // } 174 // // If all nodes match, sort based on longest path 175 // if len(a.Path.Nodes) < len(b.Path.Nodes) { 176 // return true 177 // } else if len(a.Path.Nodes) == len(b.Path.Nodes) { 178 // // If nodes are equal length, then sort based on the ending position 179 // if a.GetChromEnd() < b.GetChromEnd() { 180 // return true 181 // } 182 // } 183 // } 184 // } 185 // return false 186 // }) 187 // 188 // o := fileio.EasyCreate(outfile) 189 // for r := range out { 190 // giraf.WriteGirafToFileHandle(o, r) 191 // } 192 // 193 // err := o.Close() 194 // exception.PanicOnErr(err) 195 //} 196 197 func mergeSort(filename string, outFile string, numRecordsPerChunk int, sortCriteria string) { 198 // How the file is read is dependent on the file extension 199 filetype := path.Ext(filename) 200 201 if filetype == ".gz" { 202 // If terminal extension is ".gz" then trim off the gz and get the next extension 203 filetype = path.Ext(filename[0 : len(filename)-len(filetype)]) 204 } 205 206 switch filetype { 207 case ".axt": 208 axtSort(filename, outFile, numRecordsPerChunk) 209 case ".bed": 210 bedSort(filename, outFile, numRecordsPerChunk) 211 case ".vcf": 212 vcfSort(filename, outFile, numRecordsPerChunk) 213 case ".sam": 214 samSort(filename, outFile, numRecordsPerChunk, sortCriteria) 215 case ".giraf": 216 // TODO enable after giraf pointers are removed 217 log.Fatalln("ERROR: giraf sorting in currently disabled") 218 //girafSort(filename, outFile, numRecordsPerChunk) 219 default: 220 if filetype == "" { 221 log.Fatalln("ERROR: Input file must have proper file extension") 222 } else { 223 log.Fatalln("ERROR: Merge sort methods have not been implemented for file type:", filetype) 224 } 225 } 226 } 227 228 func main() { 229 expectedNumArgs := 2 230 var numLinesPerChunk *int = flag.Int("tmpsize", 1000000, "The number of records to read into memory before writing to a tmp file.``") 231 var singleCellBx *bool = flag.Bool("singleCellBx", false, "Sort single-cell sam records by barcode.") 232 var sortCriteria string = "byGenomicCoordinates" // default the genomicCoordinates criteria. 233 flag.Usage = usage 234 log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile) 235 flag.Parse() 236 237 if *singleCellBx { 238 sortCriteria = "singleCellBx" 239 } 240 241 if len(flag.Args()) != expectedNumArgs { 242 flag.Usage() 243 log.Fatalf("ERROR: expecting %d arguments, but got %d\n", expectedNumArgs, len(flag.Args())) 244 } 245 246 inFile := flag.Arg(0) 247 outFile := flag.Arg(1) 248 249 mergeSort(inFile, outFile, *numLinesPerChunk, sortCriteria) 250 }