// Source: github.com/hiyorimi/geobed@v0.0.0-20190227204948-42ebdc6a8871/geobed.go

package geobed

import (
	"archive/zip"
	"bufio"
	"bytes"
	"compress/gzip"
	"encoding/gob"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"

	geohash "github.com/mmcloughlin/geohash"
)

// There are over 2.4 million cities in the world. The Geonames data set only
// contains 143,270 of them, one MaxMind set contains 567,382 and the other
// MaxMind set contains 3,173,959.
// Obviously there is a lot of overlap, and the worldcitiespop.txt file from
// MaxMind contains a lot of dupes, though it is by far the most comprehensive
// in terms of city - lat/lng.
// It may not be possible to have information for all cities, but many of the
// cities are also fairly remote and likely don't have internet access anyway.
// The Geonames data is preferred because it contains additional information
// such as elevation, population, and more. Population is particularly nice
// because it gives applications a sense of the city size, so showing all major
// cities is pretty easy. Though the primary goal of this package is to
// geocode, the additional information is a bonus. So after checking the
// Geonames set, the geocoding functions will then look at MaxMind's.
// Maybe in the future this package will even use the Geonames premium data
// and have functions to look up nearest airports, etc.
// I would simply use just the Geonames data, but there are so many more cities
// in the MaxMind set despite the lack of additional details.
//
// http://download.geonames.org/export/dump/cities1000.zip
// http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip
// http://download.maxmind.com/download/worldcities/worldcitiespop.txt.gz

// A list of data sources.
// Each entry gives the download location ("url"), the local cache file
// ("path") and the identifier ("id") used to select the matching parser.
var dataSetFiles = []map[string]string{
	{
		"url":  "http://download.geonames.org/export/dump/cities1000.zip",
		"path": "./geobed-data/cities1000.zip",
		"id":   "geonamesCities1000",
	},
	{
		"url":  "http://download.geonames.org/export/dump/countryInfo.txt",
		"path": "./geobed-data/countryInfo.txt",
		"id":   "geonamesCountryInfo",
	},
	{
		"url":  "https://github.com/CODAIT/redrock/raw/master/twitter-decahose/src/main/resources/Location/worldcitiespop.txt.gz",
		"path": "./geobed-data/worldcitiespop.txt.gz",
		"id":   "maxmindWorldCities",
	},
	//{"url": "http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip",
	//"path": "./geobed-data/GeoLiteCity-latest.zip", "id": "maxmindLiteCity"},
}

// UsSateCodes is a handy map of US state codes to full names.
// (The misspelled name is kept because it is part of the exported API.)
var UsSateCodes = map[string]string{
	"AL": "Alabama",
	"AK": "Alaska",
	"AZ": "Arizona",
	"AR": "Arkansas",
	"CA": "California",
	"CO": "Colorado",
	"CT": "Connecticut",
	"DE": "Delaware",
	"FL": "Florida",
	"GA": "Georgia",
	"HI": "Hawaii",
	"ID": "Idaho",
	"IL": "Illinois",
	"IN": "Indiana",
	"IA": "Iowa",
	"KS": "Kansas",
	"KY": "Kentucky",
	"LA": "Louisiana",
	"ME": "Maine",
	"MD": "Maryland",
	"MA": "Massachusetts",
	"MI": "Michigan",
	"MN": "Minnesota",
	"MS": "Mississippi",
	"MO": "Missouri",
	"MT": "Montana",
	"NE": "Nebraska",
	"NV": "Nevada",
	"NH": "New Hampshire",
	"NJ": "New Jersey",
	"NM": "New Mexico",
	"NY": "New York",
	"NC": "North Carolina",
	"ND": "North Dakota",
	"OH": "Ohio",
	"OK": "Oklahoma",
	"OR": "Oregon",
	"PA": "Pennsylvania",
	"RI": "Rhode Island",
	"SC": "South Carolina",
	"SD": "South Dakota",
	"TN": "Tennessee",
	"TX": "Texas",
	"UT": "Utah",
	"VT": "Vermont",
	"VA": "Virginia",
	"WA": "Washington",
	"WV": "West Virginia",
	"WI": "Wisconsin",
	"WY": "Wyoming",
	// Territories
	"AS": "American Samoa",
	"DC": "District of Columbia",
	"FM": "Federated States of Micronesia",
	"GU": "Guam",
	"MH": "Marshall Islands",
	"MP": "Northern Mariana Islands",
	"PW": "Palau",
	"PR": "Puerto Rico",
	"VI": "Virgin Islands",
	// Armed Forces (AE includes Europe, Africa, Canada, and the Middle East)
	"AA": "Armed Forces Americas",
	"AE": "Armed Forces Europe",
	"AP": "Armed Forces Pacific",
}

// GeoBed contains all of the city and country data: c is one flat slice of
// cities and co holds the per-country records.
type GeoBed struct {
	c  Cities
	co []CountryInfo
}

// Cities is a slice of GeobedCity that implements sort.Interface, ordering
// the cities by their ASCII-lowercased name.
type Cities []GeobedCity

// Len returns the number of cities.
func (c Cities) Len() int {
	return len(c)
}

// Swap exchanges the cities at positions i and j.
func (c Cities) Swap(i, j int) {
	c[i], c[j] = c[j], c[i]
}

// Less reports whether city i sorts before city j. Names are compared byte by
// byte after folding ASCII upper case to lower case; non-ASCII bytes are
// compared as they are. The comparison is done in place so that sorting a
// multi-million element slice does not allocate two strings per comparison.
func (c Cities) Less(i, j int) bool {
	a, b := c[i].City, c[j].City
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	for k := 0; k < n; k++ {
		x, y := a[k], b[k]
		if x >= 'A' && x <= 'Z' {
			x += 'a' - 'A'
		}
		if y >= 'A' && y <= 'Z' {
			y += 'a' - 'A'
		}
		if x != y {
			return x < y
		}
	}
	// One name is a prefix of the other: the shorter one sorts first.
	return len(a) < len(b)
}

// GeobedCity is combined city struct (the various data sets have different
// fields, this combines what's available and keeps things smaller).
type GeobedCity struct {
	City    string
	CityAlt string
	// TODO: Think about converting this to a small int to save on memory
	// allocation. Lookup requests can have the strings converted to the
	// same int if there are any matches.
	// This could make lookup more accurate, easier, and faster even. IF
	// the int uses less bytes than the two letter code string.
	Country    string
	Region     string
	Latitude   float64
	Longitude  float64
	Population int32
	Geohash    string
}

// TODO: String interning? (much like converting country code to int)
// https://gist.github.com/karlseguin/6570372

// TODO: Store the cities in mmap...???
// https://github.com/boltdb/bolt/blob/master/bolt_unix.go#L42-L69
// Maybe even use bolt?

// maxMindCityDedupeIdx is a temporary index used while importing the MaxMind
// world cities file; it maps country+region+city to the raw record fields.
var maxMindCityDedupeIdx map[string][]string

// Holds information about the index ranges for city names (1st and 2nd
// characters) to help narrow down sets of the GeobedCity slice to scan
// when looking for a match.
var cityNameIdx map[string]int

// locationDedupeIdx is a temporary set of geohashes already seen during an
// import.
var locationDedupeIdx map[string]bool

// CountryInfo contains information about each country from Geonames
// including; ISO codes, FIPS, country capital, area (sq km), population, etc.
// Particularly useful for validating a location string contains a country
// name which can help the search process.
// Adding to this info, a slice of partial geohashes to help narrow down
// reverse geocoding lookups (maps to country buckets).
type CountryInfo struct {
	Country            string
	Capital            string
	Area               int32
	Population         int32
	GeonameID          int32
	ISONumeric         int16
	ISO                string
	ISO3               string
	Fips               string
	Continent          string
	Tld                string
	CurrencyCode       string
	CurrencyName       string
	Phone              string
	PostalCodeFormat   string
	PostalCodeRegex    string
	Languages          string
	Neighbours         string
	EquivalentFipsCode string
}

// GeocodeOptions contains options for geocoding.
// For now just an exact match on city name, but
// there will be potentially other options that can be set to adjust how
// searching/matching works.
type GeocodeOptions struct {
	ExactCity bool
}

// An index range struct that's used for narrowing down ranges over the
// large Cities struct.
type r struct {
	f int
	t int
}

// NewGeobed creates a new Geobed instance. You do not need more than one.
// Should be a singleton. There's a fair bit of data to load into memory.
224 func NewGeobed() GeoBed { 225 g := GeoBed{} 226 227 var err error 228 g.c, err = loadGeobedCityData() 229 if err != nil { 230 log.Println("Got err", err, "loading GeobedCity Data into memory.") 231 } 232 g.co, err = loadGeobedCountryData() 233 if err != nil { 234 log.Println("Got err", err, "loading GeobedCountry Data into memory.") 235 } 236 err = loadGeobedCityNameIdx() 237 if err != nil || len(g.c) == 0 { 238 log.Println("Got err", err, "loading data into memory. Will try to download now.") 239 g.downloadDataSets() 240 g.loadDataSets() 241 g.store() 242 } 243 244 return g 245 } 246 247 // Downloads the data sets if needed. 248 func (g *GeoBed) downloadDataSets() { 249 os.Mkdir("./geobed-data", 0777) 250 for _, f := range dataSetFiles { 251 _, err := os.Stat(f["path"]) 252 if err != nil { 253 if os.IsNotExist(err) { 254 // log.Println(f["path"] + " does not exist, downloading...") 255 out, oErr := os.Create(f["path"]) 256 defer out.Close() 257 if oErr == nil { 258 r, rErr := http.Get(f["url"]) 259 if r.StatusCode == 404 { 260 log.Println("Got 404 downloading ", f["url"], "file. Try placing it in", f["path"], 261 "manually.") 262 return 263 } 264 if rErr == nil { 265 _, nErr := io.Copy(out, r.Body) 266 if nErr != nil { 267 log.Println("Failed to copy data file, it will be tried again" + 268 "on next application start.") 269 // remove file so another attempt can be made, should something fail 270 err = os.Remove(f["path"]) 271 if err != nil { 272 log.Println("Encountered an error on file deletion:", err) 273 } 274 } 275 r.Body.Close() 276 } 277 defer r.Body.Close() 278 out.Close() 279 } else { 280 log.Println(oErr) 281 } 282 } 283 } 284 } 285 } 286 287 func (g *GeoBed) createCityNamesLocationsIndex() { 288 // Index the locations of city names in the g.c []GeoCity slice. This way when 289 // searching the range can be limited so it will be faster. 
290 cityNameIdx = make(map[string]int) 291 for k, v := range g.c { 292 // Get the index key for the first character of the city name. 293 ik := toLower(string(v.City[0])) 294 if val, ok := cityNameIdx[ik]; ok { 295 // If this key number is greater than what was previously recorded, 296 // then set it as the new indexed key. 297 if val < k { 298 cityNameIdx[ik] = k 299 } 300 } else { 301 // If the index key has not yet been set for this value, then set it. 302 cityNameIdx[ik] = k 303 } 304 305 // Get the index key for the first two characters of the city name. 306 // if len(v.CityLower) >= 2 { 307 // ik2 := v.CityLower[0:2] 308 // if val, ok := cityNameIdx[ik2]; ok { 309 // // If this key number is greater than what was previously 310 // // recorded, then set it as the new indexed key. 311 // if val < k { 312 // cityNameIdx[ik2] = k 313 // } 314 // } else { 315 // // If the index key has not yet been set for this value, then set it. 316 // cityNameIdx[ik2] = k 317 // } 318 // } 319 } 320 } 321 322 func (g *GeoBed) loadGeonamesCities1000(f map[string]string) { 323 rz, err := zip.OpenReader(f["path"]) 324 if err != nil { 325 log.Fatal(err) 326 } 327 defer rz.Close() 328 329 for _, uF := range rz.File { 330 fi, err := uF.Open() 331 332 if err != nil { 333 log.Fatal(err) 334 } 335 defer fi.Close() 336 337 // Geonames uses a tab delineated format and it's not even 338 // consistent. No CSV reader that I've found for Go can understand this. 339 // I'm not expecting any reader to either because it's an 340 // invalid CSV to be frank. However, we can still split up each row by \t 341 scanner := bufio.NewScanner(fi) 342 scanner.Split(bufio.ScanLines) 343 344 i := 1 345 for scanner.Scan() { 346 i++ 347 348 // So regexp, sadly, must be used (well, unless I wanted parse 349 // each string byte by byte, pushing each into a buffer to 350 // append to a slice until a tab is reached, etc.). 
351 // But I'd have to also then put in a condition if the next 352 // byte was a \t rune, then append an empty string, etc. This 353 // just, for now, seems nicer (easier). 354 // This is only an import/update, so it shouldn't be an issue 355 // for performance. If it is, then I'll look into other solutions. 356 fields := regexp.MustCompile("\t").Split(scanner.Text(), 19) 357 358 // NOTE: Now using a combined GeobedCity struct since not all 359 // data sets have the same fields. 360 // Plus, the entire point was to geocode forward and reverse. 361 // Bonus information like elevation and such is just superfluous. 362 // Leaving it here because it may be configurable... If options 363 // are passed to NewGeobed() then maybe Geobed can simply be a Geonames search. 364 // Don't even load in MaxMind data...And if that's the case, 365 // maybe that bonus information is desired. 366 if len(fields) == 19 { 367 //id, _ := strconv.Atoi(fields[0]) 368 lat, _ := strconv.ParseFloat(fields[4], 64) 369 lng, _ := strconv.ParseFloat(fields[5], 64) 370 pop, _ := strconv.Atoi(fields[14]) 371 //elv, _ := strconv.Atoi(fields[15]) 372 //dem, _ := strconv.Atoi(fields[16]) 373 374 gh := geohash.Encode(lat, lng) 375 // This is produced with empty lat/lng values - don't store it. 376 if gh == "7zzzzzzzzzzz" { 377 gh = "" 378 } 379 380 var c GeobedCity 381 c.City = strings.Trim(string(fields[1]), " ") 382 c.CityAlt = string(fields[3]) 383 c.Country = string(fields[8]) 384 c.Region = string(fields[10]) 385 c.Latitude = lat 386 c.Longitude = lng 387 c.Population = int32(pop) 388 c.Geohash = gh 389 390 // Don't include entries without a city name. If we want to 391 // geocode the centers of countries and states, then we can 392 // do that faster through other means. 
393 if len(c.City) > 0 { 394 g.c = append(g.c, c) 395 } 396 } 397 } 398 } 399 } 400 401 func (g *GeoBed) loadMaxmindWorldCities(f map[string]string) { 402 // It also has a lot of dupes 403 maxMindCityDedupeIdx = make(map[string][]string) 404 405 fi, err := os.Open(f["path"]) 406 if err != nil { 407 log.Println(err) 408 } 409 defer fi.Close() 410 411 fz, err := gzip.NewReader(fi) 412 if err != nil { 413 log.Println(err) 414 } 415 defer fz.Close() 416 417 scanner := bufio.NewScanner(fz) 418 scanner.Split(bufio.ScanLines) 419 420 i := 1 421 for scanner.Scan() { 422 i++ 423 t := scanner.Text() 424 425 fields := strings.Split(t, ",") 426 if len(fields) == 7 { 427 var b bytes.Buffer 428 b.WriteString(fields[0]) // country 429 b.WriteString(fields[3]) // region 430 b.WriteString(fields[1]) // city 431 432 idx := b.String() 433 b.Reset() 434 maxMindCityDedupeIdx[idx] = fields 435 } 436 } 437 438 // Loop the map of fields after dupes have been removed (about 1/5th 439 // less... 2.6m vs 3.1m inreases lookup performance). 440 for _, fields := range maxMindCityDedupeIdx { 441 if fields[0] != "" && fields[0] != "0" { 442 if fields[2] != "AccentCity" { 443 pop, _ := strconv.Atoi(fields[4]) 444 lat, _ := strconv.ParseFloat(fields[5], 64) 445 lng, _ := strconv.ParseFloat(fields[6], 64) 446 // MaxMind's data set is a bit dirty. I've seen city names 447 // surrounded by parenthesis in a few places. 448 cn := strings.Trim(string(fields[2]), " ") 449 cn = strings.Trim(cn, "( )") 450 451 // Don't take any city names with erroneous punctuation either. 452 if strings.Contains(cn, "!") || strings.Contains(cn, "@") { 453 continue 454 } 455 456 gh := geohash.Encode(lat, lng) 457 // This is produced with empty lat/lng values - don't store it. 458 if gh == "7zzzzzzzzzzz" { 459 gh = "" 460 } 461 462 // If the geohash was seen before... 
463 _, ok := locationDedupeIdx[gh] 464 if !ok { 465 locationDedupeIdx[gh] = true 466 467 var c GeobedCity 468 c.City = cn 469 c.Country = toUpper(string(fields[0])) 470 c.Region = string(fields[3]) 471 c.Latitude = lat 472 c.Longitude = lng 473 c.Population = int32(pop) 474 c.Geohash = gh 475 476 // Don't include entries without a city name. If we want 477 // to geocode the centers of countries and states, then 478 // we can do that faster through other means. 479 if len(c.City) > 0 && len(c.Country) > 0 { 480 g.c = append(g.c, c) 481 } 482 } 483 } 484 } 485 } 486 // Clear out the temrporary index (set to nil, it does get re-created) 487 // so that Go can garbage collect it at some point whenever it feels the need. 488 maxMindCityDedupeIdx = nil 489 locationDedupeIdx = nil 490 } 491 492 func (g *GeoBed) loadGeonamesCountryInfo(f map[string]string) { 493 fi, err := os.Open(f["path"]) 494 495 if err != nil { 496 log.Fatal(err) 497 } 498 defer fi.Close() 499 500 scanner := bufio.NewScanner(fi) 501 scanner.Split(bufio.ScanLines) 502 503 i := 1 504 for scanner.Scan() { 505 t := scanner.Text() 506 // There are a bunch of lines in this file that are comments, they start with # 507 if string(t[0]) != "#" { 508 i++ 509 fields := regexp.MustCompile("\t").Split(t, 19) 510 511 if len(fields) == 19 { 512 if fields[0] != "" && fields[0] != "0" { 513 isoNumeric, _ := strconv.Atoi(fields[2]) 514 area, _ := strconv.Atoi(fields[6]) 515 pop, _ := strconv.Atoi(fields[7]) 516 gid, _ := strconv.Atoi(fields[16]) 517 518 var ci CountryInfo 519 ci.ISO = string(fields[0]) 520 ci.ISO3 = string(fields[1]) 521 ci.ISONumeric = int16(isoNumeric) 522 ci.Fips = string(fields[3]) 523 ci.Country = string(fields[4]) 524 ci.Capital = string(fields[5]) 525 ci.Area = int32(area) 526 ci.Population = int32(pop) 527 ci.Continent = string(fields[8]) 528 ci.Tld = string(fields[9]) 529 ci.CurrencyCode = string(fields[10]) 530 ci.CurrencyName = string(fields[11]) 531 ci.Phone = string(fields[12]) 532 
ci.PostalCodeFormat = string(fields[13]) 533 ci.PostalCodeRegex = string(fields[14]) 534 ci.Languages = string(fields[15]) 535 ci.GeonameID = int32(gid) 536 ci.Neighbours = string(fields[17]) 537 ci.EquivalentFipsCode = string(fields[18]) 538 539 g.co = append(g.co, ci) 540 } 541 } 542 } 543 } 544 } 545 546 // Unzips the data sets and loads the data. 547 func (g *GeoBed) loadDataSets() { 548 locationDedupeIdx = make(map[string]bool) 549 550 for _, f := range dataSetFiles { 551 // This one is zipped 552 if f["id"] == "geonamesCities1000" { 553 g.loadGeonamesCities1000(f) 554 } 555 556 // ...And this one is Gzipped (and this one may have worked with the CSV 557 // package, but parse it the same way as the others line by line) 558 if f["id"] == "maxmindWorldCities" { 559 g.loadMaxmindWorldCities(f) 560 } 561 562 // ...And this one is just plain text 563 if f["id"] == "geonamesCountryInfo" { 564 g.loadGeonamesCountryInfo(f) 565 } 566 } 567 568 // Sort []GeobedCity by city names to help with binary search (the City field is the 569 // most searched upon field and the matching names can be easily filtered down from there). 570 sort.Sort(g.c) 571 572 //debug 573 //log.Println("TOTAL RECORDS:") 574 //log.Println(len(g.c)) 575 576 g.createCityNamesLocationsIndex() 577 } 578 579 // Geocode forward geocode, location string to lat/lng (returns a struct though). 580 // Calls exactMatchCity / fuzzyMatchLocation to perform a search. 581 func (g *GeoBed) Geocode(n string, opts ...GeocodeOptions) GeobedCity { 582 var c GeobedCity 583 n = strings.TrimSpace(n) 584 if n == "" { 585 return c 586 } 587 // variadic optional argument trick 588 options := GeocodeOptions{} 589 if len(opts) > 0 { 590 options = opts[0] 591 } 592 593 if options.ExactCity { 594 c = g.exactMatchCity(n) 595 } else { 596 // NOTE: The downside of this (currently) is that something is basically 597 // always returned. It's a best guess. 
598 // There's not much chance of it returning "not found" (or an empty 599 // GeobedCity struct). 600 // If you'd rather have nothing returned if not found, look at 601 // more exact matching options. 602 c = g.fuzzyMatchLocation(n) 603 } 604 605 return c 606 } 607 608 func filterMatchingCities(nCo, nSt string, matchingCities []GeobedCity) GeobedCity { 609 var c GeobedCity 610 // Then range over those matching cities and try to figure out which 611 // one it is - city names are unfortunately not unique of course. 612 // There shouldn't be very many so I don't mind the multiple loops. 613 for _, city := range matchingCities { 614 // Was the state abbreviation present? That sounds promising. 615 if strings.EqualFold(nSt, city.Region) { 616 c = city 617 } 618 } 619 620 for _, city := range matchingCities { 621 // Matches the state and country? Likely the best scenario, 622 // I'd call it the best match. 623 if strings.EqualFold(nSt, city.Region) && strings.EqualFold(nCo, city.Country) { 624 c = city 625 } 626 } 627 628 // If we still don't have a city, maybe we have a country with the 629 // city name, ie. "New York, USA" 630 // This is tougher because there's a "New York" in Florida, Kentucky, 631 // and more. Let's use population to assist if we can. 632 if c.City == "" { 633 matchingCountryCities := []GeobedCity{} 634 for _, city := range matchingCities { 635 if strings.EqualFold(nCo, city.Country) { 636 matchingCountryCities = append(matchingCountryCities, city) 637 } 638 } 639 640 // If someone says, "New York, USA" they most likely mean 641 // New York, NY because it's the largest city. 642 // Specific locations are often implied based on size or 643 // popularity even though the names aren't unique. 
644 biggestCity := GeobedCity{} 645 for _, city := range matchingCountryCities { 646 if city.Population > biggestCity.Population { 647 biggestCity = city 648 } 649 } 650 c = biggestCity 651 } 652 return c 653 } 654 655 // Returns a GeobedCity only if there is an exact city name match. A stricter 656 // match, though if state or country are missing a guess will be made. 657 func (g *GeoBed) exactMatchCity(n string) GeobedCity { 658 var c GeobedCity 659 // Ignore the `abbrevSlice` value for now. Use `nCo` and `nSt` for more accuracy. 660 nCo, nSt, _, nSlice := g.extractLocationPieces(n) 661 nWithoutAbbrev := strings.Join(nSlice, " ") 662 ranges := g.getSearchRange(nSlice) 663 664 matchingCities := []GeobedCity{} 665 666 // First, get everything that matches the city exactly (case insensitive). 667 for _, rng := range ranges { 668 // When adjusting the range, the keys become out of sync. Start from rng.f 669 currentKey := rng.f 670 for _, v := range g.c[rng.f:rng.t] { 671 currentKey++ 672 // The full string (ie. "New York" or "Las Vegas") 673 if strings.EqualFold(n, v.City) { 674 matchingCities = append(matchingCities, v) 675 } 676 // The pieces with abbreviations removed 677 if strings.EqualFold(nWithoutAbbrev, v.City) { 678 matchingCities = append(matchingCities, v) 679 } 680 // Each piece - doesn't make sense for now. May revisit this. 681 // ie. "New York" or "New" and "York" ... well, "York" is going 682 // to match a different city. 683 // While that might be weeded out next, who knows. It's starting 684 // to get more fuzzy than I'd like for this function. 685 // for _, np := range nSlice { 686 // if strings.EqualFold(np, v.City) { 687 // matchingCities = append(matchingCities, v) 688 // } 689 // } 690 } 691 } 692 693 // If only one was found, we can stop right here. 694 if len(matchingCities) == 1 { 695 return matchingCities[0] 696 // If more than one was found, we need to guess. 
697 } else if len(matchingCities) > 1 { 698 c = filterMatchingCities(nCo, nSt, matchingCities) 699 } 700 701 return c 702 } 703 704 func scoreCountryMatch(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, nCo string) { 705 // A discovered country name converted into a country code 706 if nCo != "" { 707 if nCo == v.Country { 708 if val, ok := bestMatchingKeys[currentKey]; ok { 709 bestMatchingKeys[currentKey] = val + 4 710 } else { 711 bestMatchingKeys[currentKey] = 4 712 } 713 } 714 } 715 } 716 717 func scoreStateMatch(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, nSt string) { 718 // A discovered state name converted into a region code 719 if nSt != "" { 720 if nSt == v.Region { 721 if val, ok := bestMatchingKeys[currentKey]; ok { 722 bestMatchingKeys[currentKey] = val + 4 723 } else { 724 bestMatchingKeys[currentKey] = 4 725 } 726 } 727 } 728 } 729 730 func scoreAlternativeNames(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, query string) { 731 // If any alternate names can be discovered, take them into consideration. 732 if v.CityAlt != "" { 733 alts := strings.Fields(v.CityAlt) 734 for _, altV := range alts { 735 if strings.EqualFold(altV, query) { 736 if val, ok := bestMatchingKeys[currentKey]; ok { 737 bestMatchingKeys[currentKey] = val + 3 738 } else { 739 bestMatchingKeys[currentKey] = 3 740 } 741 } 742 // Exact, a case-sensitive match means a lot. 743 if altV == query { 744 if val, ok := bestMatchingKeys[currentKey]; ok { 745 bestMatchingKeys[currentKey] = val + 5 746 } else { 747 bestMatchingKeys[currentKey] = 5 748 } 749 } 750 } 751 } 752 } 753 754 func scoreFuzzyMatches(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, query, nCo, nSt string, abbrevSlice, nSlice []string) { 755 756 // Special case. Airport codes and other short 3 letter abbreviations, 757 // ie. NYC and SFO 758 // Country codes could present problems here. 
It seems to work for NYC, 759 // but not SFO (which there are multiple SFOs actually). 760 // Leaving it for now, but airport codes are tricky (though they are 761 // popular on Twitter). These must be exact (case sensitive) matches. 762 // if len(n) == 3 { 763 // alts := strings.Split(v.CityAlt, ",") 764 // for _, altV := range alts { 765 // if altV != "" { 766 // if altV == n { 767 // if val, ok := bestMatchingKeys[currentKey]; ok { 768 // bestMatchingKeys[currentKey] = val + 4 769 // } else { 770 // bestMatchingKeys[currentKey] = 4 771 // } 772 // } 773 // } 774 // } 775 // } 776 777 // Abbreviations for state/country 778 // Region (state/province) 779 for _, av := range abbrevSlice { 780 lowerAv := toLower(av) 781 if len(av) == 2 && strings.EqualFold(v.Region, lowerAv) { 782 if val, ok := bestMatchingKeys[currentKey]; ok { 783 bestMatchingKeys[currentKey] = val + 5 784 } else { 785 bestMatchingKeys[currentKey] = 5 786 } 787 } 788 789 // Country (worth 2 points if exact match) 790 if len(av) == 2 && strings.EqualFold(v.Country, lowerAv) { 791 if val, ok := bestMatchingKeys[currentKey]; ok { 792 bestMatchingKeys[currentKey] = val + 3 793 } else { 794 bestMatchingKeys[currentKey] = 3 795 } 796 } 797 } 798 799 scoreCountryMatch(v, currentKey, bestMatchingKeys, nCo) 800 scoreStateMatch(v, currentKey, bestMatchingKeys, nSt) 801 scoreAlternativeNames(v, currentKey, bestMatchingKeys, query) 802 803 // Exact city name matches mean a lot. 804 if strings.EqualFold(query, v.City) { 805 if val, ok := bestMatchingKeys[currentKey]; ok { 806 bestMatchingKeys[currentKey] = val + 7 807 } else { 808 bestMatchingKeys[currentKey] = 7 809 } 810 } 811 812 } 813 814 func (g *GeoBed) getBestFuzzyMatches(ranges []r, query, nCo, nSt string, abbrevSlice, nSlice []string) (map[int]int, int) { 815 816 var bestMatchingKeys = map[int]int{} 817 var bestMatchingKey = 0 818 819 for _, rng := range ranges { 820 // When adjusting the range, the keys become out of sync. 
Start from rng.f 821 currentKey := rng.f 822 823 for _, v := range g.c[rng.f:rng.t] { 824 currentKey++ 825 826 // Mainly useful for strings like: "Austin, TX" or "Austin TX" 827 // (locations with US state codes). Smile if your location string is this simple. 828 if nSt != "" { 829 if strings.EqualFold(query, v.City) && strings.EqualFold(nSt, v.Region) { 830 bestMatchingKeys[0] = currentKey 831 bestMatchingKey = currentKey 832 return bestMatchingKeys, bestMatchingKey 833 } 834 } 835 836 scoreFuzzyMatches(v, currentKey, bestMatchingKeys, query, nCo, nSt, abbrevSlice, nSlice) 837 838 for _, ns := range nSlice { 839 ns = strings.TrimSuffix(ns, ",") 840 841 // City (worth 2 points if contains part of string) 842 if strings.Contains(toLower(v.City), toLower(ns)) { 843 if val, ok := bestMatchingKeys[currentKey]; ok { 844 bestMatchingKeys[currentKey] = val + 2 845 } else { 846 bestMatchingKeys[currentKey] = 2 847 } 848 } 849 850 // If there's an exact match, maybe there was noise in the string 851 // so it could be the full city name, but unlikely. For 852 // example, "New" or "Los" is in many city names. 853 // Still, give it a point because it could be the bulkier part 854 // of a city name (or the city name could be one word). 855 // This has helped in some cases. 856 if strings.EqualFold(v.City, ns) { 857 if val, ok := bestMatchingKeys[currentKey]; ok { 858 bestMatchingKeys[currentKey] = val + 1 859 } else { 860 bestMatchingKeys[currentKey] = 1 861 } 862 } 863 864 } 865 } 866 } 867 868 return bestMatchingKeys, bestMatchingKey 869 } 870 871 // When geocoding, this provides a scored best match. 872 func (g *GeoBed) fuzzyMatchLocation(n string) GeobedCity { 873 nCo, nSt, abbrevSlice, nSlice := g.extractLocationPieces(n) 874 // Take the renaming unclassified pieces (those not likely to be 875 // abbreviations) and get our search range. 876 // These pieces are likely contain the city name. Narrowing down 877 // the search range will make the lookup faster. 
878 ranges := g.getSearchRange(nSlice) 879 880 bestMatchingKeys, bestMatchingKey := g.getBestFuzzyMatches(ranges, n, nCo, nSt, abbrevSlice, nSlice) 881 if len(bestMatchingKeys) == 1 { 882 return g.c[bestMatchingKey] 883 } 884 885 // If no country was found, look at population as a factor. Is it obvious? 886 if nCo == "" { 887 hp := int32(0) 888 hpk := 0 889 for k, v := range bestMatchingKeys { 890 // Add bonus point for having a population 1,000+ 891 if g.c[k].Population >= 1000 { 892 bestMatchingKeys[k] = v + 1 893 } 894 // Now just add a bonus for having the highest population and points 895 if g.c[k].Population > hp { 896 hpk = k 897 hp = g.c[k].Population 898 } 899 } 900 // Add a point for having the highest population (if any of the results 901 // had population data available). 902 if g.c[hpk].Population > 0 { 903 bestMatchingKeys[hpk] = bestMatchingKeys[hpk] + 1 904 } 905 } 906 907 m := 0 908 for k, v := range bestMatchingKeys { 909 if v > m { 910 m = v 911 bestMatchingKey = k 912 } 913 914 // If there is a tie breaker, use the city with the higher population 915 // (if known) because it's more likely to be what is meant. 916 // For example, when people say "New York" they typically mean 917 // New York, NY...Though there are many New Yorks. 918 if v == m { 919 if g.c[k].Population > g.c[bestMatchingKey].Population { 920 bestMatchingKey = k 921 } 922 } 923 } 924 925 // debug 926 // log.Println("Possible results:") 927 // log.Println(len(bestMatchingKeys)) 928 // for _, kv := range bestMatchingKeys { 929 // log.Println(g.c[kv]) 930 // } 931 // log.Println("Best match:") 932 // log.Println(g.c[bestMatchingKey]) 933 // log.Println("Scored:") 934 // log.Println(m) 935 936 return g.c[bestMatchingKey] 937 } 938 939 // Splits a string up looking for potential abbreviations by matching against 940 // a shorter list of abbreviations. 
941 // Returns country, state, a slice of strings with potential abbreviations 942 // (based on size; 2 or 3 characters), and then a slice of the remaning pieces. 943 // This does a good job at separating things that are clearly abbreviations 944 // from the city so that searching is faster and more accuarate. 945 func (g *GeoBed) extractLocationPieces(n string) (string, string, []string, []string) { 946 var re = regexp.MustCompile("") 947 948 // Extract all potential abbreviations. 949 re = regexp.MustCompile(`[\S]{2,3}`) 950 abbrevSlice := re.FindStringSubmatch(n) 951 952 // Convert country to country code and pull it out. We'll use it as a 953 // secondary form of validation. Remove the code from the original query. 954 nCo := "" 955 for _, co := range g.co { 956 re = regexp.MustCompile("(?i)^" + co.Country + ",?\\s|\\s" + co.Country + ",?\\s" + co.Country + "\\s$") 957 if re.MatchString(n) { 958 nCo = co.ISO 959 // And remove it so we have a cleaner query string for a city. 960 n = re.ReplaceAllString(n, "") 961 } 962 } 963 964 // Find US State codes and pull them out as well (do not convert 965 // state names, they can also easily be city names). 966 nSt := "" 967 for sc := range UsSateCodes { 968 re = regexp.MustCompile("(?i)^" + sc + ",?\\s|\\s" + sc + ",?\\s|\\s" + sc + "$") 969 if re.MatchString(n) { 970 nSt = sc 971 // And remove it too. 972 n = re.ReplaceAllString(n, "") 973 } 974 } 975 // Trim spaces and commas off the modified string. 976 n = strings.Trim(n, " ,") 977 978 // Now extract words (potential city names) into a slice. With this, 979 // the index will be referenced to pinpoint sections of 980 // the g.c []GeobedCity slice to scan. 981 // This results in a much faster lookup. This is over a simple 982 // binary search with strings.Search() etc. because the city name 983 // may not be the first word. 984 // This should not contain any known country code or US state codes. 
985 nSlice := strings.Split(n, " ") 986 987 return nCo, nSt, abbrevSlice, nSlice 988 } 989 990 // There's potentially 2.7 million items to range though, let's see if we can 991 // reduce that by taking slices of the slice in alphabetical order. 992 func (g *GeoBed) getSearchRange(nSlice []string) []r { 993 // NOTE: A simple binary search was not helping here since 994 // we aren't looking for one specific thing. 995 // We have multiple elements, city, state, country. 996 // So we'd end up with multiple binary searches to piece together which 997 // could be quite a few exponentially given the possible 998 // combinations...And so it was slower. 999 1000 ranges := []r{} 1001 for _, ns := range nSlice { 1002 ns = strings.TrimSuffix(ns, ",") 1003 1004 if len(ns) > 0 { 1005 // Get the first character in the string, this tells us where to stop. 1006 fc := toLower(string(ns[0])) 1007 // Get the previous index key (by getting the previous 1008 // character in the alphabet) to figure out where to start. 1009 pik := string(prev(rune(fc[0]))) 1010 1011 // To/from key 1012 fk := 0 1013 tk := 0 1014 if val, ok := cityNameIdx[pik]; ok { 1015 fk = val 1016 } 1017 if val, ok := cityNameIdx[fc]; ok { 1018 tk = val 1019 } 1020 // Don't let the to key be out of range. 1021 if tk == 0 { 1022 tk = (len(g.c) - 1) 1023 } 1024 ranges = append(ranges, r{fk, tk}) 1025 } 1026 } 1027 1028 return ranges 1029 } 1030 1031 func prev(r rune) rune { 1032 return r - 1 1033 } 1034 1035 // ReverseGeocode finds place name by latitude and longitude. 1036 func (g *GeoBed) ReverseGeocode(lat float64, lng float64) GeobedCity { 1037 c := GeobedCity{} 1038 1039 gh := geohash.Encode(lat, lng) 1040 // This is produced with empty lat/lng values - don't look for anything. 1041 if gh == "7zzzzzzzzzzz" { 1042 return c 1043 } 1044 1045 // Note: All geohashes are going to be 12 characters long. Even if the 1046 // precision on the lat/lng isn't great. The geohash package will center things. 
1047 // Obviously lat/lng like 37, -122 is a guess. That's no where near 1048 // the resolution of a city. Though we're going to allow guesses. 1049 mostMatched := 0 1050 matched := 0 1051 for k, v := range g.c { 1052 // check first two characters to reduce the number of loops 1053 if v.Geohash[0] == gh[0] && v.Geohash[1] == gh[1] { 1054 matched = 2 1055 for i := 2; i <= len(gh); i++ { 1056 //log.Println(gh[0:i]) 1057 if v.Geohash[0:i] == gh[0:i] { 1058 matched++ 1059 } 1060 } 1061 // tie breakers go to city with larger population (NOTE: There's 1062 // still a chance that the next pass will uncover a better match) 1063 if matched == mostMatched && g.c[k].Population > c.Population { 1064 c = g.c[k] 1065 //log.Println("MATCHES") 1066 //log.Println(matched) 1067 //log.Println("CITY") 1068 //log.Println(c.City) 1069 //log.Println("POPULATION") 1070 //log.Println(c.Population) 1071 } 1072 if matched > mostMatched { 1073 c = g.c[k] 1074 mostMatched = matched 1075 } 1076 } 1077 } 1078 1079 return c 1080 } 1081 1082 // A slightly faster lowercase function. 1083 func toLower(s string) string { 1084 b := make([]byte, len(s)) 1085 for i := range b { 1086 c := s[i] 1087 if c >= 'A' && c <= 'Z' { 1088 c += 'a' - 'A' 1089 } 1090 b[i] = c 1091 } 1092 return string(b) 1093 } 1094 1095 // A slightly faster uppercase function. 1096 func toUpper(s string) string { 1097 b := make([]byte, len(s)) 1098 for i := range b { 1099 c := s[i] 1100 if c >= 'a' && c <= 'z' { 1101 c -= 'a' - 'A' 1102 } 1103 b[i] = c 1104 } 1105 return string(b) 1106 } 1107 1108 // Dumps the Geobed data to disk. This speeds up startup time on subsequent 1109 // runs (or if calling NewGeobed() multiple times which should be avoided 1110 // if possible). 
1111 // TODO: Refactor 1112 func (g GeoBed) store() error { 1113 b := new(bytes.Buffer) 1114 1115 // Store the city info 1116 enc := gob.NewEncoder(b) 1117 err := enc.Encode(g.c) 1118 if err != nil { 1119 b.Reset() 1120 return err 1121 } 1122 1123 fh, eopen := os.OpenFile("./geobed-data/g.c.dmp", os.O_CREATE|os.O_WRONLY, 0666) 1124 defer fh.Close() 1125 if eopen != nil { 1126 b.Reset() 1127 return eopen 1128 } 1129 n, e := fh.Write(b.Bytes()) 1130 if e != nil { 1131 b.Reset() 1132 return e 1133 } 1134 log.Printf("%d bytes successfully written to cache file\n", n) 1135 1136 // Store the country info as well (this is all now repetition - refactor) 1137 b.Reset() 1138 //enc = gob.NewEncoder(b) 1139 err = enc.Encode(g.co) 1140 if err != nil { 1141 b.Reset() 1142 return err 1143 } 1144 1145 fh, eopen = os.OpenFile("./geobed-data/g.co.dmp", os.O_CREATE|os.O_WRONLY, 0666) 1146 defer fh.Close() 1147 if eopen != nil { 1148 b.Reset() 1149 return eopen 1150 } 1151 n, e = fh.Write(b.Bytes()) 1152 if e != nil { 1153 b.Reset() 1154 return e 1155 } 1156 log.Printf("%d bytes successfully written to cache file\n", n) 1157 1158 // Store the index info (again there's some repetition here) 1159 b.Reset() 1160 //enc = gob.NewEncoder(b) 1161 err = enc.Encode(cityNameIdx) 1162 if err != nil { 1163 b.Reset() 1164 return err 1165 } 1166 1167 fh, eopen = os.OpenFile("./geobed-data/cityNameIdx.dmp", os.O_CREATE|os.O_WRONLY, 0666) 1168 defer fh.Close() 1169 if eopen != nil { 1170 b.Reset() 1171 return eopen 1172 } 1173 n, e = fh.Write(b.Bytes()) 1174 if e != nil { 1175 b.Reset() 1176 return e 1177 } 1178 log.Printf("%d bytes successfully written to cache file\n", n) 1179 1180 b.Reset() 1181 return nil 1182 } 1183 1184 // Loads a GeobedCity dump, which saves a bit of time. 
1185 func loadGeobedCityData() ([]GeobedCity, error) { 1186 fh, err := os.Open("./geobed-data/g.c.dmp") 1187 if err != nil { 1188 return nil, err 1189 } 1190 gc := []GeobedCity{} 1191 dec := gob.NewDecoder(fh) 1192 err = dec.Decode(&gc) 1193 if err != nil { 1194 return nil, err 1195 } 1196 return gc, nil 1197 } 1198 1199 func loadGeobedCountryData() ([]CountryInfo, error) { 1200 fh, err := os.Open("./geobed-data/g.co.dmp") 1201 if err != nil { 1202 return nil, err 1203 } 1204 co := []CountryInfo{} 1205 dec := gob.NewDecoder(fh) 1206 err = dec.Decode(&co) 1207 if err != nil { 1208 return nil, err 1209 } 1210 return co, nil 1211 } 1212 1213 func loadGeobedCityNameIdx() error { 1214 fh, err := os.Open("./geobed-data/cityNameIdx.dmp") 1215 if err != nil { 1216 return err 1217 } 1218 dec := gob.NewDecoder(fh) 1219 cityNameIdx = make(map[string]int) 1220 err = dec.Decode(&cityNameIdx) 1221 if err != nil { 1222 return err 1223 } 1224 return nil 1225 }