const (
	// defaultBufSize is not referenced in the visible part of this file;
	// presumably the default read-buffer size in bytes — confirm against callers.
	defaultBufSize = 4096
)

// ByteReader implements the io.Reader interface by providing
// the Read(b []byte) method. The struct contains an embedded *bufio.Reader
// and a pointer to os.File for closure when reading is complete.
// The primary advantage of ByteReader over EasyReader is that data
// is read into a shared bytes.Buffer instead of allocating memory for a
// string as is done with EasyReader.
// ByteReader can be used to efficiently parse files on the byte level
// which may be significantly faster and require less memory than using
// strings.Split on a string derived from EasyReader.
// The drawback is that ByteReader is not as easy to use as EasyReader
// and should generally be reserved for performance intensive tasks.
type ByteReader struct {
	*bufio.Reader
	// File is the underlying file handle; it is closed by Close.
	File *os.File
	// internalGzip is non-nil only when the input is gzip-compressed.
	internalGzip *gzip.Reader
	// line holds the slice most recently returned by ReadSlice. It points into
	// the bufio buffer and is overwritten by the next read.
	line []byte
	// Buffer is the reusable buffer that ReadLine fills and hands back.
	Buffer *bytes.Buffer
}

// Read reads data into b and is a method required to implement the io.Reader interface.
// It returns the number of bytes read into b and any error reported by the
// embedded bufio.Reader.
func (reader *ByteReader) Read(b []byte) (n int, err error) {
	// Call the embedded reader explicitly: a bare reader.Read(b) resolves to this
	// very method (it shadows the promoted one) and would recurse forever.
	return reader.Reader.Read(b)
}
50 func NewByteReader(filename string) *ByteReader { 51 var err error 52 file := MustOpen(filename) 53 var answer ByteReader = ByteReader{ 54 File: file, 55 Buffer: &bytes.Buffer{}, 56 } 57 switch true { 58 case strings.HasSuffix(filename, ".gz"): 59 answer.internalGzip, err = gzip.NewReader(file) 60 exception.PanicOnErr(err) 61 answer.Reader = bufio.NewReader(answer.internalGzip) 62 default: 63 answer.Reader = bufio.NewReader(file) 64 } 65 return &answer 66 } 67 68 // ReadLine will return a bytes.Buffer pointing to the internal slice of bytes. Provided this function is called within a loop, 69 // the function will read one line at a time, and return bool to continue reading. Important to note the buffer return points to 70 // the internal slice belonging to the reader, meaning the slice will be overridden if the data is not copied. 71 func ReadLine(reader *ByteReader) (*bytes.Buffer, bool) { 72 var err error 73 reader.line, err = reader.ReadSlice('\n') 74 reader.Buffer.Reset() 75 if err == nil { 76 if reader.line[len(reader.line)-1] == '\n' { 77 return bytesToBuffer(reader), false 78 } else { 79 log.Panicf("Error: end of line did not end with an end of line character...\n") 80 } 81 } else { 82 if err == bufio.ErrBufferFull { 83 reader.line = readMore(reader) 84 return bytesToBuffer(reader), false 85 } else { 86 CatchErrThrowEOF(err) 87 } 88 } 89 return nil, true 90 } 91 92 // readMore is a private helper function to deal with very long lines to 93 // avoid alocating too much memory upfront and only resize the size of the buffer 94 // only when necessary. 
95 func readMore(reader *ByteReader) []byte { 96 _, err := reader.Buffer.Write(reader.line) 97 exception.PanicOnErr(err) 98 reader.line, err = reader.ReadSlice('\n') 99 if err == nil { 100 return reader.line 101 } 102 if err == bufio.ErrBufferFull { 103 _, err = reader.Buffer.Write(reader.line) 104 exception.PanicOnErr(err) 105 // recursive call to read next bytes until reaching end of line character 106 return readMore(reader) 107 } 108 exception.PanicOnErr(err) 109 return reader.line 110 } 111 112 // CatchErrThrowEOF will silently handles and throws the EOF error and will log and exit any other errors. 113 func CatchErrThrowEOF(err error) { 114 if err == io.EOF { 115 return 116 } else { 117 exception.PanicOnErr(err) 118 } 119 } 120 121 // bytesToBuffer will parse []byte and return a pointer to the same underlying bytes.Buffer. 122 func bytesToBuffer(reader *ByteReader) *bytes.Buffer { 123 var err error 124 if reader.line[len(reader.line)-2] == '\r' { 125 _, err = reader.Buffer.Write(reader.line[:len(reader.line)-2]) 126 } else { 127 _, err = reader.Buffer.Write(reader.line[:len(reader.line)-1]) 128 } 129 exception.PanicOnErr(err) 130 return reader.Buffer 131 } 132 133 // Close closes the File, rendering it unusable for I/O. On files that support SetDeadline, 134 // any pending I/O operations will be canceled and return immediately with an error. 135 // Close will return an error if it has already been called. 136 func (br *ByteReader) Close() error { 137 var gzErr, fileErr error 138 if br.internalGzip != nil { 139 gzErr = br.internalGzip.Close() 140 } 141 if br.File != nil { 142 fileErr = br.File.Close() 143 } else { 144 return errors.New("no file found") 145 } 146 147 switch { // Handle error returns. 
Priority is gzErr > fileErr 148 case gzErr != nil: 149 return gzErr 150 151 case fileErr != nil: 152 log.Println("WARNING: attempted to close file, but file already closed") 153 return nil 154 155 default: 156 return nil 157 } 158 } 159 160 // StringToIntSlice will process a row of data separated by commas, convert the slice into a slice of type int. 161 // PSL and genePred formats have a trailing comma we need to account for and the check at the beginning will adjust 162 // the length of the working slice. 163 func StringToIntSlice(line string) []int { 164 work := strings.Split(line, ",") 165 var sliceSize int = len(work) 166 if line[len(line)-1] == ',' { 167 sliceSize-- 168 } 169 var answer []int = make([]int, sliceSize) 170 var err error 171 for i := 0; i < sliceSize; i++ { 172 answer[i], err = strconv.Atoi(work[i]) 173 exception.PanicOnErr(err) 174 } 175 return answer 176 } 177 178 // IntListToString will process a slice of type int as an input and return a each value separated by a comma as a string. 179 // Important Note: string will include a trailing comma to satisfy UCSC's anomalies. 180 func IntSliceToString(nums []int) string { 181 ans := strings.Builder{} 182 ans.Grow(2 * len(nums)) 183 for i := 0; i < len(nums); i++ { 184 ans.WriteString(fmt.Sprintf("%d", nums[i])) 185 ans.WriteByte(',') 186 } 187 return ans.String() 188 } 189 190 // IntToString a function that converts a number of type int and return a string. 191 func IntToString(i int) string { 192 return fmt.Sprintf("%d", i) 193 }