github.com/andrewrech/ih-abstract@v0.0.0-20210322142951-2fec1c8d0f38/records.go (about) 1 package main 2 3 import ( 4 "bytes" 5 "encoding/gob" 6 "errors" 7 "log" 8 "os" 9 "runtime" 10 "strings" 11 "sync" 12 "sync/atomic" 13 14 "golang.org/x/crypto/blake2b" 15 ) 16 17 // Store is a blake2b hash map that stores string slices. 18 type Store map[[blake2b.Size256]byte](struct{}) 19 20 // Records provides thread safe access to Store. 21 type Records struct { 22 Store 23 sync.Mutex 24 } 25 26 // Add adds a record. 27 func (r *Records) Add(l *[]string) (err error) { 28 buf := &bytes.Buffer{} 29 30 var x struct{} 31 32 err = gob.NewEncoder(buf).Encode(l) 33 if err != nil { 34 return err 35 } 36 37 bs := buf.Bytes() 38 39 hash := blake2b.Sum256(bs) 40 41 r.Lock() 42 r.Store[hash] = x 43 r.Unlock() 44 45 return nil 46 } 47 48 // Check checks that a record exists. 49 func (r *Records) Check(l *[]string) (exists bool, err error) { 50 buf := &bytes.Buffer{} 51 52 err = gob.NewEncoder(buf).Encode(l) 53 if err != nil { 54 return false, err 55 } 56 57 bs := buf.Bytes() 58 59 hash := blake2b.Sum256(bs) 60 61 r.Lock() 62 _, ok := r.Store[hash] 63 r.Unlock() 64 65 return ok, nil 66 } 67 68 // Existing creates a map of existing records. 69 func Existing(name *string) (rs *Records) { 70 var records Records 71 records.Store = make(Store) 72 73 rs = &records 74 75 f, err := os.Open(*name) 76 if err != nil { 77 log.Fatalln(err) 78 } 79 80 log.Println("reading file", *name, "to hash map") 81 82 r := readCSV(f) 83 84 signal := make(chan struct{}) 85 86 var counter int64 87 stopCounter := make(chan struct{}) 88 count(&counter, "hashed", stopCounter) 89 90 for i := 0; i < runtime.GOMAXPROCS(0); i++ { 91 go func() { 92 for l := range r.out { 93 i := l 94 95 err := rs.Add(&i) 96 if err != nil { 97 log.Fatalln(err) 98 } 99 100 atomic.AddInt64(&counter, 1) 101 } 102 signal <- struct{}{} 103 }() 104 } 105 106 <-r.done 107 108 for i := 0; i < runtime.GOMAXPROCS(0); i++ { 109 <-signal 110 } 111 112 stopCounter <- struct{}{} 113 114 log.Println("total:", counter, "records") 115 116 return rs 117 } 118 119 // New identifies new Pathology database records based on a record hash. 120 // For each new record, the corresponding patient identifier to saved to a file. 121 func New(r *Records, header []string, in chan []string, out chan []string, done chan struct{}) { 122 var counter int64 123 124 n := make(map[string](struct{})) 125 w := File("new-ids.txt", []string{"identifier"}) 126 127 id, err := RecordID(header) 128 if err != nil { 129 log.Fatalln(err) 130 } 131 colNames := headerParse(header) 132 idIdx := colNames[id] 133 134 go func() { 135 for l := range in { 136 i := l 137 out <- i 138 139 exists, err := r.Check(&i) 140 if err != nil { 141 log.Fatalln(err) 142 } 143 144 if exists { 145 continue 146 } 147 148 _, ok := n[l[idIdx]] // do not duplicate person instance output 149 150 if ok { 151 continue 152 } 153 154 n[l[idIdx]] = struct{}{} 155 } 156 157 for k := range n { 158 err := w.w.Write([]string{k}) 159 if err != nil { 160 log.Fatalln(err) 161 } 162 163 atomic.AddInt64(&counter, 1) 164 } 165 166 w.done() 167 close(out) 168 close(done) 169 170 log.Println("Person-instances with new records:", counter) 171 }() 172 } 173 174 // RecordID gets a single input data column name containing a person-instance identifier. 175 // The person instance identifier is either an MRN (preferred) or UID. 176 func RecordID(header []string) (id string, err error) { 177 for _, id := range header { 178 if strings.Contains(id, "MRN") { 179 return id, nil 180 } 181 } 182 183 for _, id := range header { 184 if strings.Contains(id, "UID") { 185 return id, nil 186 } 187 } 188 189 return "", errors.New("cannot identify patient instance column name") 190 } 191 192 // Diff diffs old and new record sets. 193 func Diff(oldFile *string, in chan []string, header []string) (out chan []string, done chan struct{}) { 194 var buf int64 = 2e7 195 out = make(chan []string, buf) 196 done = make(chan struct{}) 197 198 go func() { 199 var records Records 200 201 r := &records 202 203 r.Store = make(Store) 204 205 r = Existing(oldFile) 206 207 New(r, header, in, out, done) 208 }() 209 210 return out, done 211 }