github.com/biogo/biogo@v1.0.4/index/kmerindex/kmerindex.go (about) 1 // Copyright ©2011-2013 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package kmerindex performs Kmer indexing package based on Bob Edgar and 6 // Gene Meyers' approach used in PALS. 7 package kmerindex 8 9 import ( 10 "errors" 11 "fmt" 12 "math" 13 "unsafe" 14 15 "github.com/biogo/biogo/alphabet" 16 "github.com/biogo/biogo/seq/linear" 17 "github.com/biogo/biogo/util" 18 ) 19 20 var ( 21 ErrKTooLarge = errors.New("kmerindex: k too large") 22 ErrKTooSmall = errors.New("kmerindex: k too small") 23 ErrShortSeq = errors.New("kmerindex: sequence to short for k") 24 ErrBadAlphabet = errors.New("kmerindex: alphabet size != 4") 25 ErrBadKmer = errors.New("kmerindex: kmer out of range") 26 ErrBadKmerTextLen = errors.New("kmerindex: kmertext length != k") 27 ErrBadKmerText = errors.New("kmerindex: kmertext contains illegal character") 28 ) 29 30 // Constraints on Kmer length. 31 var ( 32 MinKmerLen = 4 // default minimum 33 34 // MaxKmerLen is the maximum Kmer length that can be used. 35 // It is 16 on 64 bit architectures and 14 on 32 bit architectures. 36 MaxKmerLen = 16 - offset 37 ) 38 39 const offset = int(unsafe.Sizeof(int(0))%0x4) / 2 40 41 var Debug = false // Set Debug to true to prevent recovering from panics in ForEachKmer f Eval function. 42 43 // 2-bit per base packed word 44 type Kmer uint32 // Sensible size for word type uint64 will double the size of the index (already large for high k) 45 46 // Kmer index type 47 type Index struct { 48 finger []Kmer 49 pos []int 50 seq *linear.Seq 51 lookUp alphabet.Index 52 k int 53 kMask Kmer 54 indexed bool 55 } 56 57 // Create a new Kmer Index with a word size k based on sequence 58 func New(k int, s *linear.Seq) (*Index, error) { 59 switch { 60 case k > MaxKmerLen: 61 return nil, ErrKTooLarge 62 case k < MinKmerLen: 63 return nil, ErrKTooSmall 64 case k+1 > s.Len(): 65 return nil, ErrShortSeq 66 case s.Alpha.Len() != 4: 67 return nil, ErrBadAlphabet 68 } 69 70 ki := &Index{ 71 finger: make([]Kmer, util.Pow4(k)+1), // Need a Tn+1 finger position so that Tn can be recognised 72 k: k, 73 kMask: Kmer(util.Pow4(k) - 1), 74 seq: s, 75 lookUp: s.Alpha.LetterIndex(), 76 indexed: false, 77 } 78 ki.buildKmerTable() 79 80 return ki, nil 81 } 82 83 // Build the table of Kmer frequencies - called by New 84 func (ki *Index) buildKmerTable() { 85 incrementFinger := func(index *Index, _, kmer int) { 86 index.finger[kmer]++ 87 } 88 ki.ForEachKmerOf(ki.seq, 0, ki.seq.Len(), incrementFinger) 89 } 90 91 // Build the Kmer position table destructively replacing Kmer frequencies 92 func (ki *Index) Build() { 93 var sum Kmer 94 for i, v := range ki.finger { 95 ki.finger[i], sum = sum, sum+v 96 } 97 98 locatePositions := func(index *Index, position, kmer int) { 99 index.pos[index.finger[kmer]] = position 100 index.finger[kmer]++ 101 } 102 ki.pos = make([]int, ki.seq.Len()-ki.k+1) 103 ki.ForEachKmerOf(ki.seq, 0, ki.seq.Len(), locatePositions) 104 105 ki.indexed = true 106 } 107 108 // Return an array of positions for the Kmer string kmertext 109 func (ki *Index) KmerPositionsString(kmertext string) (positions []int, err error) { 110 switch { 111 case len(kmertext) != ki.k: 112 return nil, ErrBadKmerTextLen 113 case !ki.indexed: 114 return nil, errors.New("kmerindex: index not built: call Build()") 115 } 116 117 var kmer Kmer 118 if kmer, err = ki.KmerOf(kmertext); err != nil { 119 return nil, err 120 } 121 122 return ki.KmerPositions(kmer) 123 } 124 125 // Return an array of positions for the Kmer kmer 126 func (ki *Index) KmerPositions(kmer Kmer) (positions []int, err error) { 127 if kmer > ki.kMask { 128 return nil, ErrBadKmer 129 } 130 131 i := Kmer(0) 132 if kmer > 0 { // special case: An has no predecessor 133 i = ki.finger[kmer-1] 134 } 135 j := ki.finger[kmer] 136 if i == j { 137 return 138 } 139 140 positions = make([]int, j-i) 141 for l, p := range ki.pos[i:j] { 142 positions[l] = int(p) 143 } 144 145 return 146 } 147 148 // Return a map containing absolute Kmer frequencies and true if called before Build(). 149 // If called after Build returns a nil map and false. 150 func (ki *Index) KmerFrequencies() (map[Kmer]int, bool) { 151 if ki.indexed { 152 return nil, false 153 } 154 155 m := map[Kmer]int{} 156 157 for i, f := range ki.finger { 158 if f > 0 { 159 m[Kmer(i)] = int(f) // not always safe - perhaps check that Kmer <= MaxInt 160 } 161 } 162 163 return m, true 164 } 165 166 // Return a map containing relative Kmer frequencies and true if called before Build(). 167 // If called after Build returns a nil map and false. 168 func (ki *Index) NormalisedKmerFrequencies() (map[Kmer]float64, bool) { 169 if ki.indexed { 170 return nil, false 171 } 172 173 m := map[Kmer]float64{} 174 175 l := float64(ki.seq.Len()) 176 for i, f := range ki.finger { 177 if f > 0 { 178 m[Kmer(i)] = float64(f) / l 179 } 180 } 181 182 return m, true 183 } 184 185 // Returns a Kmer-keyed map containing slices of kmer positions and true if called after Build, 186 // otherwise nil and false. 187 func (ki *Index) KmerIndex() (map[Kmer][]int, bool) { 188 if !ki.indexed { 189 return nil, false 190 } 191 192 m := make(map[Kmer][]int) 193 194 for i := range ki.finger { 195 if p, _ := ki.KmerPositions(Kmer(i)); len(p) > 0 { 196 m[Kmer(i)] = p 197 } 198 } 199 200 return m, true 201 } 202 203 // Returns a string-keyed map containing slices of kmer positions and true if called after Build, 204 // otherwise nil and false. 205 func (ki *Index) StringKmerIndex() (map[string][]int, bool) { 206 if !ki.indexed { 207 return nil, false 208 } 209 210 m := make(map[string][]int) 211 212 for i := range ki.finger { 213 if p, _ := ki.KmerPositions(Kmer(i)); len(p) > 0 { 214 m[ki.Format(Kmer(i))] = p 215 } 216 } 217 218 return m, true 219 } 220 221 // errors should be handled through a panic which will be recovered by ForEachKmerOf 222 type Eval func(index *Index, j, kmer int) 223 224 // Applies the f Eval func to all kmers in s from start to end. Returns any panic raised by f as an error. 225 func (ki *Index) ForEachKmerOf(s *linear.Seq, start, end int, f Eval) (err error) { 226 if !Debug { 227 defer func() { 228 if r := recover(); r != nil { 229 var ok bool 230 err, ok = r.(error) 231 if !ok { 232 err = fmt.Errorf("kmerindex: %v", r) 233 } 234 } 235 }() 236 } 237 238 kmer := Kmer(0) 239 high := 0 240 var currentBase int 241 242 // Preload the first k-1 bases of the first well defined k-mer or set high to the next position 243 basePosition := start 244 for ; basePosition < start+ki.k-1; basePosition++ { 245 currentBase = ki.lookUp[s.Seq[basePosition]] 246 if currentBase >= 0 { 247 kmer = (kmer << 2) | Kmer(currentBase) 248 } else { 249 kmer = 0 250 high = basePosition + 1 251 } 252 } 253 254 // Call f(position, kmer) for each of the next well defined k-mers 255 for position := basePosition - ki.k + 1; basePosition < end; position++ { 256 currentBase = ki.lookUp[s.Seq[basePosition]] 257 basePosition++ 258 if currentBase >= 0 { 259 kmer = ((kmer << 2) | Kmer(currentBase)) & ki.kMask 260 } else { 261 kmer = 0 262 high = basePosition 263 } 264 if position >= high { 265 f(ki, position, int(kmer)) 266 } 267 } 268 269 return 270 } 271 272 // Return the Kmer length of the Index. 273 func (ki *Index) K() int { 274 return ki.k 275 } 276 277 // Returns a pointer to the indexed seq.Seq. 278 func (ki *Index) Seq() *linear.Seq { 279 return ki.seq 280 } 281 282 // Returns the value of the finger slice at p. This signifies the absolute kmer frequency of the Kmer(p) 283 // if called before Build() and points to the relevant position lookup if called after. 284 func (ki *Index) FingerAt(p int) int { 285 return int(ki.finger[p]) 286 } 287 288 // Returns the value of the pos slice at p. This signifies the position of the pth kmer if called after Build(). 289 // Not valid before Build() - will panic. 290 func (ki *Index) PosAt(p int) int { 291 return ki.pos[p] 292 } 293 294 // Convert a Kmer into a string of bases 295 func (ki *Index) Format(kmer Kmer) string { 296 s, _ := Format(kmer, ki.k, ki.seq.Alpha) 297 return s 298 } 299 300 // Convert a string of bases into a len k Kmer, returns an error if string length does not match k. 301 // lookUp is an index lookup table as returned by alphabet.Alphabet.LetterIndex(). 302 func KmerOf(k int, lookUp alphabet.Index, kmertext string) (kmer Kmer, err error) { 303 if len(kmertext) != k { 304 return 0, ErrBadKmerTextLen 305 } 306 307 for _, v := range kmertext { 308 x := lookUp[v] 309 if x < 0 { 310 return 0, ErrBadKmerText 311 } 312 kmer = (kmer << 2) | Kmer(x) 313 } 314 315 return 316 } 317 318 // Return the GC fraction of a Kmer 319 func (ki *Index) GCof(kmer Kmer) float64 { 320 return GCof(ki.k, kmer) 321 } 322 323 // Return the GC fraction of a Kmer of len k 324 func GCof(k int, kmer Kmer) float64 { 325 gc := 0 326 for i := k - 1; i >= 0; i, kmer = i-1, kmer>>2 { 327 gc += int((kmer & 1) ^ ((kmer & 2) >> 1)) 328 } 329 330 return float64(gc) / float64(k) 331 } 332 333 // Convert a Kmer into a string of bases 334 func Format(kmer Kmer, k int, alpha alphabet.Alphabet) (string, error) { 335 if alpha.Len() != 4 { 336 return "", ErrBadAlphabet 337 } 338 kmertext := make([]byte, k) 339 340 for i := k - 1; i >= 0; i, kmer = i-1, kmer>>2 { 341 kmertext[i] = byte(alpha.Letter(int(kmer & 3))) 342 } 343 344 return string(kmertext), nil 345 } 346 347 // Reverse complement a Kmer. Complementation is performed according to letter index: 348 // 349 // 0, 1, 2, 3 = 3, 2, 1, 0 350 func (ki *Index) ComplementOf(kmer Kmer) (c Kmer) { 351 return ComplementOf(ki.k, kmer) 352 } 353 354 // Reverse complement a Kmer of len k. Complementation is performed according to letter index: 355 // 356 // 0, 1, 2, 3 = 3, 2, 1, 0 357 func ComplementOf(k int, kmer Kmer) (c Kmer) { 358 for i, j := uint(0), uint(k-1)*2; i <= j; i, j = i+2, j-2 { 359 c |= (^(kmer >> (j - i)) & (3 << i)) | (^(kmer>>i)&3)<<j 360 } 361 362 return 363 } 364 365 // Convert a string of bases into a Kmer, returns an error if string length does not match word length 366 func (ki *Index) KmerOf(kmertext string) (kmer Kmer, err error) { 367 if len(kmertext) != ki.k { 368 return 0, ErrBadKmerTextLen 369 } 370 371 for _, v := range kmertext { 372 x := ki.lookUp[v] 373 if x < 0 { 374 return 0, ErrBadKmerText 375 } 376 kmer = (kmer << 2) | Kmer(x) 377 } 378 379 return 380 } 381 382 // Return the Euclidean distance between two sequences measured by abolsolute kmer frequencies. 383 func Distance(a, b map[Kmer]float64) (dist float64) { 384 c := make(map[Kmer]struct{}, len(a)+len(b)) 385 for k := range a { 386 c[k] = struct{}{} 387 } 388 for k := range b { 389 c[k] = struct{}{} 390 } 391 for k := range c { 392 dist += math.Pow(a[k]-b[k], 2) 393 } 394 395 return math.Sqrt(dist) 396 } 397 398 // Confirm that a Build() is correct. Returns boolean indicating this and the number of kmers indexed. 399 func (ki *Index) Check() (ok bool, found int) { 400 ok = true 401 f := func(index *Index, position, kmer int) { 402 hit := false 403 var base Kmer 404 if kmer == 0 { 405 base = 0 406 } else { 407 base = index.finger[kmer-1] 408 } 409 for j := base; j < index.finger[kmer]; j++ { 410 if index.pos[j] == position { 411 found++ 412 hit = true 413 break 414 } 415 } 416 if !hit { 417 ok = false 418 } 419 } 420 421 if err := ki.ForEachKmerOf(ki.seq, 0, ki.seq.Len(), f); err != nil { 422 ok = false 423 } 424 425 return 426 } 427 428 // Return a copy of the internal finger slice. 429 func (ki *Index) Finger() (f []Kmer) { 430 f = make([]Kmer, len(ki.finger)) 431 copy(f, ki.finger) 432 return 433 } 434 435 // Return a copy of the internal pos slice. 436 func (ki *Index) Pos() (p []int) { 437 p = make([]int, len(ki.pos)) 438 copy(p, ki.pos) 439 return 440 }