github.com/andrewrech/ih-abstract@v0.0.0-20210322142951-2fec1c8d0f38/filter.go (about) 1 package main 2 3 import ( 4 "log" 5 "regexp" 6 "runtime" 7 "strings" 8 "sync/atomic" 9 ) 10 11 // Pdl1Report is the string form of the regular expression used to match PD-L1 reports of interest. 12 const Pdl1Report = "(?i)pd-?l1" 13 14 // MsiReport is the string form of the regular expression used to match microsatellite instability reports of interest. 15 const MsiReport = "[Mm]icrosatellite[ ]+[Ii]nstability" 16 17 // Pdl1Result is the string form of the regular expression used to extract PD-L1 tumor/cancer score results. 18 const Pdl1Result = "(?i)(tumor proportion score|combined positive score \\(cps\\)|cps score):? ?[><~]* ?[0-9\\-\\.]+ ?%?" 19 20 // MsiResult is the string form of the regular expression used to extract microsatellite instability results. 21 const MsiResult = "[^\\.:]+findings[^\\.]+[Mm]icrosat[^\\.]+." 22 23 // SpacesAndBreaks is the string form of the replace-all regular expression used to normalize whitespace in pathology report strings of interest. 24 const SpacesAndBreaks = `\s+` 25 26 // WbcLymph efficiently selects records that are WbcLymph or lymphocyte counts using a lookup table. 27 func WbcLymph(s string) bool { 28 wbc := map[string]bool{ 29 "WBC": true, 30 "WBC Corr": true, 31 "Lymph Man": true, 32 } 33 34 return wbc[s] 35 } 36 37 // CPD efficiently selects reports that are CPD reports using a lookup table. 38 func CPD(s string) bool { 39 cpd := map[string]bool{ 40 "Solid Tumor NGS Report": true, 41 "Fusion Transcript Report": true, 42 } 43 44 return cpd[s] 45 } 46 47 // PDL1 uses a lookup table to efficiently test if a string should be evaluated via regular expression as a potential PD-L1 report. 48 func PDL1(s string) bool { 49 pats := []string{ 50 "PD", 51 "Pd", 52 "pD", 53 "pd", 54 } 55 56 for _, i := range pats { 57 if strings.Contains(s, i) { 58 return true 59 } 60 } 61 62 return false 63 } 64 65 // MSI uses a lookup table to efficiently test if a string should be evaluated via regular expression as a potential PD-L1 report. 66 func MSI(s string) bool { 67 pats := []string{ 68 "Microsatellite", 69 "microsatellite", 70 } 71 72 for _, i := range pats { 73 if strings.Contains(s, i) { 74 return true 75 } 76 } 77 78 return false 79 } 80 81 // Exclude efficiently excludes unwanted report categories using a lookup table. 82 func Exclude(s string) bool { 83 excl := map[string]bool{ 84 "CMV": true, 85 "RVP": true, 86 "HIVQNT": true, 87 "HCVQNT": true, 88 "SDIFF": true, 89 "CFPLUS Report": true, 90 "Case - HIV Quantitation": true, 91 "Case - Respiratory Virus Panel": true, 92 "Case - Epstein-Barr Virus Quantitation": true, 93 "HBV DNA": true, 94 "Case - Cytomegalovirus Quantitation": true, 95 "HCVGENO": true, 96 "BME Post Report": true, 97 "Case - HCV Quantitation": true, 98 "BCR Quant Report": true, 99 "HyperCoag Report": true, 100 "CML Report": true, 101 "TCRPCR Report": true, 102 "FLT3 Report": true, 103 "Case - Cystic Fibrosis": true, 104 "BRAF Report": true, 105 "BRCA1/BRCA2/ESR1 Report": true, 106 "Heme NGS Report": true, 107 "SPAD": true, 108 "Immunophen Report": true, 109 } 110 111 return excl[s] 112 } 113 114 // Whitespace normalizes whitespace in report strings of interest. 115 func Whitespace(s []string) []string { 116 r := regexp.MustCompile(SpacesAndBreaks) 117 118 for i := range s { 119 s[i] = strings.Trim(r.ReplaceAllString(s[i], " "), " \r\n") 120 } 121 122 return s 123 } 124 125 // filterRow filters a row of input data for matches to patterns of interest. 126 func filterRow(l []string, colNames map[string]int, pat map[string](*regexp.Regexp), channels map[string](chan []string), counter *int64) { 127 switch { 128 129 case Exclude(l[colNames["OrderTypeMnemonic"]]): 130 131 // WBC are sent directly to output 132 // WBC are not counted as 'new data' 133 case WbcLymph(l[colNames["TestTypeMnemonic"]]): 134 channels["wbc"] <- l 135 channels["results"] <- l 136 137 case CPD(l[colNames["OrderTypeMnemonic"]]): 138 channels["cpd"] <- l 139 // CPD reports, PD-L1 reports, and MSI reports count 140 // as "new" data and trigger a new report 141 channels["results"] <- l 142 channels["diff"] <- l 143 144 case PDL1(l[colNames["Value"]]): 145 if pat["pdl1Report"].MatchString(l[colNames["Value"]]) { 146 channels["results"] <- l 147 channels["pdl1"] <- l 148 channels["diff"] <- l 149 150 pdl1Result := pat["pdl1Result"].FindAllString(l[colNames["Value"]], 10) 151 channels["pdl1-to-diff"] <- Whitespace(pdl1Result) 152 } 153 154 case MSI(l[colNames["Value"]]): 155 if pat["msiReport"].MatchString(l[colNames["Value"]]) { 156 channels["results"] <- l 157 channels["diff"] <- l 158 channels["msi"] <- l 159 160 msiResult := pat["msiResult"].FindAllString(l[colNames["Value"]], 10) 161 channels["msi-to-diff"] <- Whitespace(msiResult) 162 } 163 } 164 165 atomic.AddInt64(counter, 1) 166 } 167 168 // filterResults filters a raw data input stream row by row. 169 func filterResults(in chan []string, header []string) (results map[string](chan []string), done chan struct{}) { 170 done = make(chan struct{}) 171 172 var buf int64 = 1e7 173 174 // channels contains communication of rows 175 // between goroutines processing data 176 results = make(map[string](chan []string)) 177 178 // other channels for filtering data are closed in this function 179 resultTypes := []string{ 180 "results", 181 "diff", 182 "wbc", 183 "cpd", 184 "pdl1", 185 "msi", 186 "pdl1-to-diff", 187 "msi-to-diff", 188 } 189 190 for _, name := range resultTypes { 191 results[name] = make(chan []string, buf) 192 } 193 194 ioCores := 2 // save cores for I/O 195 196 nProc := runtime.GOMAXPROCS(0) - ioCores 197 198 // run at least two filtering processes 199 if nProc < 2 { 200 nProc = 2 201 } 202 203 signal := make(chan struct{}, nProc) 204 205 // create patterns to use for filtering 206 pat := make(map[string](*regexp.Regexp)) 207 208 pat["pdl1Report"] = regexp.MustCompile(Pdl1Report) 209 pat["msiReport"] = regexp.MustCompile(MsiReport) 210 pat["pdl1Result"] = regexp.MustCompile(Pdl1Result) 211 pat["msiResult"] = regexp.MustCompile(MsiResult) 212 213 colNames := headerParse(header) 214 215 var counter int64 216 217 // filter records on each core 218 for i := 0; i < nProc; i++ { 219 go func() { 220 for l := range in { 221 filterRow(l, colNames, pat, results, &counter) 222 } 223 signal <- struct{}{} 224 }() 225 } 226 227 stopCounter := make(chan struct{}) 228 count(&counter, "filtered", stopCounter) 229 230 // wait and close 231 go func() { 232 for i := 0; i < nProc; i++ { 233 <-signal 234 } 235 236 stopCounter <- struct{}{} 237 238 log.Println("total filtered:", counter, "records") 239 240 for _, name := range resultTypes { 241 close(results[name]) 242 } 243 244 done <- struct{}{} 245 }() 246 247 return results, done 248 }