github.com/andrewrech/ih-abstract@v0.0.0-20210322142951-2fec1c8d0f38/unique.go (about)

     1  package main
     2  
     3  import (
     4  	"log"
     5  	"os"
     6  	"strings"
     7  )
     8  
     9  // prevUnq adds previously identified unique strings from an existing output file to a hash map.
    10  func prevUnq(f string) (r *Records) {
    11  	var records Records
    12  	records.Store = make(Store)
    13  
    14  	r = &records
    15  
    16  	if _, err := os.Stat(f); err == nil {
    17  		log.Println("reading patterns from existing records file", f)
    18  		r = Existing(&f)
    19  	} else {
    20  		log.Println("existing records file", f, "does not exist, skipping diff")
    21  	}
    22  
    23  	return r
    24  }
    25  
    26  // DiffUnq identifies unique strings from an input stream and compares the unique strings to an existing output file. The function returns 1) unique strings and 2) new strings compared to the existing output file.
    27  func DiffUnq(in chan []string, name string) (channels map[string](chan []string), done chan struct{}) {
    28  	done = make(chan struct{})
    29  
    30  	var buf int64 = 1e7
    31  
    32  	// channels contains communication of rows
    33  	// between goroutines processing data
    34  	channels = make(map[string](chan []string))
    35  
    36  	// add to an existing records map if
    37  	// if CSV output already exists
    38  	unqRecordsName := strings.Join([]string{name, "-unique-strings"}, "")
    39  	unqRecordsNameNew := strings.Join([]string{name, "-unique-strings-new"}, "")
    40  
    41  	channels[unqRecordsName] = make(chan []string, buf)
    42  
    43  	channels[unqRecordsNameNew] = make(chan []string, buf)
    44  
    45  	// read previous output
    46  	f := strings.Join([]string{name, "-unique-strings.csv"}, "")
    47  	prevResults := prevUnq(f)
    48  
    49  	var records Records
    50  	records.Store = make(Store)
    51  	currentResults := &records
    52  
    53  	go func() {
    54  		for l := range in { // for each slice
    55  			for _, s := range l { // each string of slice
    56  
    57  				i := []string{s}
    58  
    59  				existsCurrent, err := currentResults.Check(&i)
    60  				if err != nil {
    61  					log.Fatalln(err)
    62  				}
    63  
    64  				// string does not exist in current records
    65  				if !existsCurrent {
    66  					err = currentResults.Add(&i)
    67  					if err != nil {
    68  						log.Fatalln(err)
    69  					}
    70  
    71  					channels[unqRecordsName] <- []string{s}
    72  				}
    73  
    74  				// string does not exist in previous records
    75  				existsPrev, err := prevResults.Check(&i)
    76  				if err != nil {
    77  					log.Fatalln(err)
    78  				}
    79  
    80  				if !existsPrev {
    81  					err = prevResults.Add(&i)
    82  					if err != nil {
    83  						log.Fatalln(err)
    84  					}
    85  
    86  					log.Println("New string:", s)
    87  					channels[unqRecordsNameNew] <- []string{s}
    88  				}
    89  			}
    90  		}
    91  
    92  		close(channels[unqRecordsName])
    93  		close(channels[unqRecordsNameNew])
    94  		done <- struct{}{}
    95  	}()
    96  
    97  	return channels, done
    98  }