github.com/andrewrech/ih-abstract@v0.0.0-20210322142951-2fec1c8d0f38/filter.go (about)

     1  package main
     2  
     3  import (
     4  	"log"
     5  	"regexp"
     6  	"runtime"
     7  	"strings"
     8  	"sync/atomic"
     9  )
    10  
    11  // Pdl1Report is the string form of the regular expression used to match PD-L1 reports of interest.
    12  const Pdl1Report = "(?i)pd-?l1"
    13  
    14  // MsiReport is the string form of the regular expression used to match microsatellite instability reports of interest.
    15  const MsiReport = "[Mm]icrosatellite[ ]+[Ii]nstability"
    16  
    17  // Pdl1Result is the string form of the regular expression used to extract PD-L1 tumor/cancer score results.
    18  const Pdl1Result = "(?i)(tumor proportion score|combined positive score \\(cps\\)|cps score):? ?[><~]* ?[0-9\\-\\.]+ ?%?"
    19  
    20  // MsiResult is the string form of the regular expression used to extract microsatellite instability results.
    21  const MsiResult = "[^\\.:]+findings[^\\.]+[Mm]icrosat[^\\.]+."
    22  
    23  // SpacesAndBreaks is the string form of the replace-all regular expression used to normalize whitespace in pathology report strings of interest.
    24  const SpacesAndBreaks = `\s+`
    25  
    26  // WbcLymph efficiently selects records that are WbcLymph or lymphocyte counts using a lookup table.
    27  func WbcLymph(s string) bool {
    28  	wbc := map[string]bool{
    29  		"WBC":       true,
    30  		"WBC Corr":  true,
    31  		"Lymph Man": true,
    32  	}
    33  
    34  	return wbc[s]
    35  }
    36  
    37  // CPD efficiently selects reports that are CPD reports using a lookup table.
    38  func CPD(s string) bool {
    39  	cpd := map[string]bool{
    40  		"Solid Tumor NGS Report":   true,
    41  		"Fusion Transcript Report": true,
    42  	}
    43  
    44  	return cpd[s]
    45  }
    46  
    47  // PDL1 uses a lookup table to efficiently test if a string should be evaluated via regular expression as a potential PD-L1 report.
    48  func PDL1(s string) bool {
    49  	pats := []string{
    50  		"PD",
    51  		"Pd",
    52  		"pD",
    53  		"pd",
    54  	}
    55  
    56  	for _, i := range pats {
    57  		if strings.Contains(s, i) {
    58  			return true
    59  		}
    60  	}
    61  
    62  	return false
    63  }
    64  
    65  // MSI uses a lookup table to efficiently test if a string should be evaluated via regular expression as a potential PD-L1 report.
    66  func MSI(s string) bool {
    67  	pats := []string{
    68  		"Microsatellite",
    69  		"microsatellite",
    70  	}
    71  
    72  	for _, i := range pats {
    73  		if strings.Contains(s, i) {
    74  			return true
    75  		}
    76  	}
    77  
    78  	return false
    79  }
    80  
    81  // Exclude efficiently excludes unwanted report categories using a lookup table.
    82  func Exclude(s string) bool {
    83  	excl := map[string]bool{
    84  		"CMV":                                    true,
    85  		"RVP":                                    true,
    86  		"HIVQNT":                                 true,
    87  		"HCVQNT":                                 true,
    88  		"SDIFF":                                  true,
    89  		"CFPLUS Report":                          true,
    90  		"Case - HIV Quantitation":                true,
    91  		"Case - Respiratory Virus Panel":         true,
    92  		"Case - Epstein-Barr Virus Quantitation": true,
    93  		"HBV DNA":                                true,
    94  		"Case - Cytomegalovirus Quantitation":    true,
    95  		"HCVGENO":                                true,
    96  		"BME Post Report":                        true,
    97  		"Case - HCV Quantitation":                true,
    98  		"BCR Quant Report":                       true,
    99  		"HyperCoag Report":                       true,
   100  		"CML Report":                             true,
   101  		"TCRPCR Report":                          true,
   102  		"FLT3 Report":                            true,
   103  		"Case - Cystic Fibrosis":                 true,
   104  		"BRAF Report":                            true,
   105  		"BRCA1/BRCA2/ESR1 Report":                true,
   106  		"Heme NGS Report":                        true,
   107  		"SPAD":                                   true,
   108  		"Immunophen Report":                      true,
   109  	}
   110  
   111  	return excl[s]
   112  }
   113  
   114  // Whitespace normalizes whitespace in report strings of interest.
   115  func Whitespace(s []string) []string {
   116  	r := regexp.MustCompile(SpacesAndBreaks)
   117  
   118  	for i := range s {
   119  		s[i] = strings.Trim(r.ReplaceAllString(s[i], " "), " \r\n")
   120  	}
   121  
   122  	return s
   123  }
   124  
   125  // filterRow filters a row of input data for matches to patterns of interest.
   126  func filterRow(l []string, colNames map[string]int, pat map[string](*regexp.Regexp), channels map[string](chan []string), counter *int64) {
   127  	switch {
   128  
   129  	case Exclude(l[colNames["OrderTypeMnemonic"]]):
   130  
   131  	// WBC are sent directly to output
   132  	// WBC are not counted as 'new data'
   133  	case WbcLymph(l[colNames["TestTypeMnemonic"]]):
   134  		channels["wbc"] <- l
   135  		channels["results"] <- l
   136  
   137  	case CPD(l[colNames["OrderTypeMnemonic"]]):
   138  		channels["cpd"] <- l
   139  		// CPD reports, PD-L1 reports, and MSI reports count
   140  		// as "new" data and trigger a new report
   141  		channels["results"] <- l
   142  		channels["diff"] <- l
   143  
   144  	case PDL1(l[colNames["Value"]]):
   145  		if pat["pdl1Report"].MatchString(l[colNames["Value"]]) {
   146  			channels["results"] <- l
   147  			channels["pdl1"] <- l
   148  			channels["diff"] <- l
   149  
   150  			pdl1Result := pat["pdl1Result"].FindAllString(l[colNames["Value"]], 10)
   151  			channels["pdl1-to-diff"] <- Whitespace(pdl1Result)
   152  		}
   153  
   154  	case MSI(l[colNames["Value"]]):
   155  		if pat["msiReport"].MatchString(l[colNames["Value"]]) {
   156  			channels["results"] <- l
   157  			channels["diff"] <- l
   158  			channels["msi"] <- l
   159  
   160  			msiResult := pat["msiResult"].FindAllString(l[colNames["Value"]], 10)
   161  			channels["msi-to-diff"] <- Whitespace(msiResult)
   162  		}
   163  	}
   164  
   165  	atomic.AddInt64(counter, 1)
   166  }
   167  
// filterResults filters a raw data input stream row by row.
//
// Rows are read from in by several concurrent workers, each of which passes
// every row to filterRow. header is converted to a column-name lookup with
// headerParse (defined elsewhere).
//
// It returns immediately with:
//   - results: a map of named, buffered output channels ("results", "diff",
//     "wbc", "cpd", "pdl1", "msi", "pdl1-to-diff", "msi-to-diff"). All of
//     them are closed once in has been closed and fully drained.
//   - done: an unbuffered channel that receives one value after the output
//     channels have been closed. The send blocks until a receiver is ready.
//
// Because several workers consume in concurrently, rows may reach the output
// channels in a different order than they arrived.
func filterResults(in chan []string, header []string) (results map[string](chan []string), done chan struct{}) {
	done = make(chan struct{})

	// capacity of every output channel
	var buf int64 = 1e7

	// channels contains communication of rows
	// between goroutines processing data
	results = make(map[string](chan []string))

	// names of the output channels; every channel listed here is
	// created below and closed by this function once filtering finishes
	resultTypes := []string{
		"results",
		"diff",
		"wbc",
		"cpd",
		"pdl1",
		"msi",
		"pdl1-to-diff",
		"msi-to-diff",
	}

	for _, name := range resultTypes {
		results[name] = make(chan []string, buf)
	}

	ioCores := 2 // save cores for I/O

	nProc := runtime.GOMAXPROCS(0) - ioCores

	// run at least two filtering processes
	if nProc < 2 {
		nProc = 2
	}

	// each worker sends one value on signal when it has finished;
	// the buffer lets workers exit without waiting for the receiver
	signal := make(chan struct{}, nProc)

	// create patterns to use for filtering;
	// compiled once here and shared by all workers
	pat := make(map[string](*regexp.Regexp))

	pat["pdl1Report"] = regexp.MustCompile(Pdl1Report)
	pat["msiReport"] = regexp.MustCompile(MsiReport)
	pat["pdl1Result"] = regexp.MustCompile(Pdl1Result)
	pat["msiResult"] = regexp.MustCompile(MsiResult)

	colNames := headerParse(header)

	// number of rows processed so far; incremented atomically by filterRow
	var counter int64

	// filter records on each core;
	// a worker exits when in is closed and drained
	for i := 0; i < nProc; i++ {
		go func() {
			for l := range in {
				filterRow(l, colNames, pat, results, &counter)
			}
			signal <- struct{}{}
		}()
	}

	// NOTE(review): count is defined elsewhere; presumably it reports
	// progress from counter until a value arrives on stopCounter — confirm
	stopCounter := make(chan struct{})
	count(&counter, "filtered", stopCounter)

	// wait and close
	go func() {
		// wait for every worker to finish
		for i := 0; i < nProc; i++ {
			<-signal
		}

		stopCounter <- struct{}{}

		log.Println("total filtered:", counter, "records")

		// no worker can send any more, so the outputs can be closed
		for _, name := range resultTypes {
			close(results[name])
		}

		// unbuffered: blocks until the caller receives from done
		done <- struct{}{}
	}()

	return results, done
}