github.com/andrewrech/ih-abstract@v0.0.0-20210322142951-2fec1c8d0f38/records.go (about)

     1  package main
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/gob"
     6  	"errors"
     7  	"log"
     8  	"os"
     9  	"runtime"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  
    14  	"golang.org/x/crypto/blake2b"
    15  )
    16  
// Store is a set of blake2b digests. Keys are blake2b.Size256-byte hashes and
// values are empty structs, so only key membership carries information.
// Add and Check derive keys by hashing the gob encoding of a string slice;
// the string slices themselves are not retained.
type Store map[[blake2b.Size256]byte](struct{})
    19  
// Records provides thread safe access to Store.
// Add and Check hold the embedded Mutex while they touch the map. Because a
// sync.Mutex must not be copied after first use, pass Records by pointer.
type Records struct {
	// Store is the set of record hashes.
	Store
	// Mutex guards Store; Add and Check lock it around every map access.
	sync.Mutex
}
    25  
    26  // Add adds a record.
    27  func (r *Records) Add(l *[]string) (err error) {
    28  	buf := &bytes.Buffer{}
    29  
    30  	var x struct{}
    31  
    32  	err = gob.NewEncoder(buf).Encode(l)
    33  	if err != nil {
    34  		return err
    35  	}
    36  
    37  	bs := buf.Bytes()
    38  
    39  	hash := blake2b.Sum256(bs)
    40  
    41  	r.Lock()
    42  	r.Store[hash] = x
    43  	r.Unlock()
    44  
    45  	return nil
    46  }
    47  
    48  // Check checks that a record exists.
    49  func (r *Records) Check(l *[]string) (exists bool, err error) {
    50  	buf := &bytes.Buffer{}
    51  
    52  	err = gob.NewEncoder(buf).Encode(l)
    53  	if err != nil {
    54  		return false, err
    55  	}
    56  
    57  	bs := buf.Bytes()
    58  
    59  	hash := blake2b.Sum256(bs)
    60  
    61  	r.Lock()
    62  	_, ok := r.Store[hash]
    63  	r.Unlock()
    64  
    65  	return ok, nil
    66  }
    67  
    68  // Existing creates a map of existing records.
    69  func Existing(name *string) (rs *Records) {
    70  	var records Records
    71  	records.Store = make(Store)
    72  
    73  	rs = &records
    74  
    75  	f, err := os.Open(*name)
    76  	if err != nil {
    77  		log.Fatalln(err)
    78  	}
    79  
    80  	log.Println("reading file", *name, "to hash map")
    81  
    82  	r := readCSV(f)
    83  
    84  	signal := make(chan struct{})
    85  
    86  	var counter int64
    87  	stopCounter := make(chan struct{})
    88  	count(&counter, "hashed", stopCounter)
    89  
    90  	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
    91  		go func() {
    92  			for l := range r.out {
    93  				i := l
    94  
    95  				err := rs.Add(&i)
    96  				if err != nil {
    97  					log.Fatalln(err)
    98  				}
    99  
   100  				atomic.AddInt64(&counter, 1)
   101  			}
   102  			signal <- struct{}{}
   103  		}()
   104  	}
   105  
   106  	<-r.done
   107  
   108  	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
   109  		<-signal
   110  	}
   111  
   112  	stopCounter <- struct{}{}
   113  
   114  	log.Println("total:", counter, "records")
   115  
   116  	return rs
   117  }
   118  
   119  // New identifies new Pathology database records based on a record hash.
   120  // For each new record, the corresponding patient identifier to saved to a file.
   121  func New(r *Records, header []string, in chan []string, out chan []string, done chan struct{}) {
   122  	var counter int64
   123  
   124  	n := make(map[string](struct{}))
   125  	w := File("new-ids.txt", []string{"identifier"})
   126  
   127  	id, err := RecordID(header)
   128  	if err != nil {
   129  		log.Fatalln(err)
   130  	}
   131  	colNames := headerParse(header)
   132  	idIdx := colNames[id]
   133  
   134  	go func() {
   135  		for l := range in {
   136  			i := l
   137  			out <- i
   138  
   139  			exists, err := r.Check(&i)
   140  			if err != nil {
   141  				log.Fatalln(err)
   142  			}
   143  
   144  			if exists {
   145  				continue
   146  			}
   147  
   148  			_, ok := n[l[idIdx]] // do not duplicate person instance output
   149  
   150  			if ok {
   151  				continue
   152  			}
   153  
   154  			n[l[idIdx]] = struct{}{}
   155  		}
   156  
   157  		for k := range n {
   158  			err := w.w.Write([]string{k})
   159  			if err != nil {
   160  				log.Fatalln(err)
   161  			}
   162  
   163  			atomic.AddInt64(&counter, 1)
   164  		}
   165  
   166  		w.done()
   167  		close(out)
   168  		close(done)
   169  
   170  		log.Println("Person-instances with new records:", counter)
   171  	}()
   172  }
   173  
   174  // RecordID gets a single input data column name containing a person-instance identifier.
   175  // The person instance identifier is either an MRN (preferred) or UID.
   176  func RecordID(header []string) (id string, err error) {
   177  	for _, id := range header {
   178  		if strings.Contains(id, "MRN") {
   179  			return id, nil
   180  		}
   181  	}
   182  
   183  	for _, id := range header {
   184  		if strings.Contains(id, "UID") {
   185  			return id, nil
   186  		}
   187  	}
   188  
   189  	return "", errors.New("cannot identify patient instance column name")
   190  }
   191  
   192  // Diff diffs old and new record sets.
   193  func Diff(oldFile *string, in chan []string, header []string) (out chan []string, done chan struct{}) {
   194  	var buf int64 = 2e7
   195  	out = make(chan []string, buf)
   196  	done = make(chan struct{})
   197  
   198  	go func() {
   199  		var records Records
   200  
   201  		r := &records
   202  
   203  		r.Store = make(Store)
   204  
   205  		r = Existing(oldFile)
   206  
   207  		New(r, header, in, out, done)
   208  	}()
   209  
   210  	return out, done
   211  }