git.dev.gkxim.com/golang-packages/geobed@v0.0.0-20190708072815-e989ac62624a/geobed.go (about) 1 package geobed 2 3 import ( 4 "archive/zip" 5 "bufio" 6 "bytes" 7 "compress/gzip" 8 "encoding/gob" 9 geohash "github.com/TomiHiltunen/geohash-golang" 10 "io" 11 "log" 12 "net/http" 13 "os" 14 "regexp" 15 "sort" 16 "strconv" 17 "strings" 18 ) 19 20 // There are over 2.4 million cities in the world. The Geonames data set only contains 143,270 and the MaxMind set contains 567,382 and 3,173,959 in the other MaxMind set. 21 // Obviously there's a lot of overlap and the worldcitiespop.txt from MaxMind contains a lot of dupes, though it by far the most comprehensive in terms of city - lat/lng. 22 // It may not be possible to have information for all cities, but many of the cities are also fairly remote and likely don't have internet access anyway. 23 // The Geonames data is preferred because it contains additional information such as elevation, population, and more. Population is good particuarly nice because a sense for 24 // the city size can be understood by applications. So showing all major cities is pretty easy. Though the primary goal of this package is to geocode, the additional information 25 // is bonus. So after checking the Geonames set, the geocoding functions will then look at MaxMind's. 26 // Maybe in the future this package will even use the Geonames premium data and have functions to look up nearest airports, etc. 27 // I would simply use just Geonames data, but there's so many more cities in the MaxMind set despite the lack of additional details. 28 // 29 // http://download.geonames.org/export/dump/cities1000.zip 30 // http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip 31 // http://download.maxmind.com/download/worldcities/worldcitiespop.txt.gz 32 33 // A list of data sources. 34 var dataSetFiles = []map[string]string{ 35 {"url": "http://download.geonames.org/export/dump/cities1000.zip", "path": "./geobed-data/cities1000.zip", "id": "geonamesCities1000"}, 36 {"url": "http://download.geonames.org/export/dump/countryInfo.txt", "path": "./geobed-data/countryInfo.txt", "id": "geonamesCountryInfo"}, 37 //{"url": "http://download.maxmind.com/download/worldcities/worldcitiespop.txt.gz", "path": "./geobed-data/worldcitiespop.txt.gz", "id": "maxmindWorldCities"}, 38 //{"url": "http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip", "path": "./geobed-data/GeoLiteCity-latest.zip", "id": "maxmindLiteCity"}, 39 } 40 41 // A handy map of US state codes to full names. 42 var UsSateCodes = map[string]string{ 43 "AL": "Alabama", 44 "AK": "Alaska", 45 "AZ": "Arizona", 46 "AR": "Arkansas", 47 "CA": "California", 48 "CO": "Colorado", 49 "CT": "Connecticut", 50 "DE": "Delaware", 51 "FL": "Florida", 52 "GA": "Georgia", 53 "HI": "Hawaii", 54 "ID": "Idaho", 55 "IL": "Illinois", 56 "IN": "Indiana", 57 "IA": "Iowa", 58 "KS": "Kansas", 59 "KY": "Kentucky", 60 "LA": "Louisiana", 61 "ME": "Maine", 62 "MD": "Maryland", 63 "MA": "Massachusetts", 64 "MI": "Michigan", 65 "MN": "Minnesota", 66 "MS": "Mississippi", 67 "MO": "Missouri", 68 "MT": "Montana", 69 "NE": "Nebraska", 70 "NV": "Nevada", 71 "NH": "New Hampshire", 72 "NJ": "New Jersey", 73 "NM": "New Mexico", 74 "NY": "New York", 75 "NC": "North Carolina", 76 "ND": "North Dakota", 77 "OH": "Ohio", 78 "OK": "Oklahoma", 79 "OR": "Oregon", 80 "PA": "Pennsylvania", 81 "RI": "Rhode Island", 82 "SC": "South Carolina", 83 "SD": "South Dakota", 84 "TN": "Tennessee", 85 "TX": "Texas", 86 "UT": "Utah", 87 "VT": "Vermont", 88 "VA": "Virginia", 89 "WA": "Washington", 90 "WV": "West Virginia", 91 "WI": "Wisconsin", 92 "WY": "Wyoming", 93 // Territories 94 "AS": "American Samoa", 95 "DC": "District of Columbia", 96 "FM": "Federated States of Micronesia", 97 "GU": "Guam", 98 "MH": "Marshall Islands", 99 "MP": "Northern Mariana Islands", 100 "PW": "Palau", 101 "PR": "Puerto Rico", 102 "VI": "Virgin Islands", 103 // Armed Forces (AE includes Europe, Africa, Canada, and the Middle East) 104 "AA": "Armed Forces Americas", 105 "AE": "Armed Forces Europe", 106 "AP": "Armed Forces Pacific", 107 } 108 109 // Contains all of the city and country data. Cities are split into buckets by country to increase lookup speed when the country is known. 110 type GeoBed struct { 111 c Cities 112 co []CountryInfo 113 } 114 115 type Cities []GeobedCity 116 117 func (c Cities) Len() int { 118 return len(c) 119 } 120 func (c Cities) Swap(i, j int) { 121 c[i], c[j] = c[j], c[i] 122 } 123 func (c Cities) Less(i, j int) bool { 124 return toLower(c[i].City) < toLower(c[j].City) 125 } 126 127 // A combined city struct (the various data sets have different fields, this combines what's available and keeps things smaller). 128 type GeobedCity struct { 129 City string 130 CityAlt string 131 // TODO: Think about converting this to a small int to save on memory allocation. Lookup requests can have the strings converted to the same int if there are any matches. 132 // This could make lookup more accurate, easier, and faster even. IF the int uses less bytes than the two letter code string. 133 Country string 134 Region string 135 Latitude float64 136 Longitude float64 137 Population int32 138 Geohash string 139 } 140 141 // TODO: String interning? (much like converting country code to int) 142 // https://gist.github.com/karlseguin/6570372 143 144 // TODO: Store the cities in mmap...??? 145 // https://github.com/boltdb/bolt/blob/master/bolt_unix.go#L42-L69 146 // Maybe even use bolt? 147 148 var maxMindCityDedupeIdx map[string][]string 149 150 // Holds information about the index ranges for city names (1st and 2nd characters) to help narrow down sets of the GeobedCity slice to scan when looking for a match. 151 var cityNameIdx map[string]int 152 var locationDedupeIdx map[string]bool 153 154 // Information about each country from Geonames including; ISO codes, FIPS, country capital, area (sq km), population, and more. 155 // Particularly useful for validating a location string contains a country name which can help the search process. 156 // Adding to this info, a slice of partial geohashes to help narrow down reverse geocoding lookups (maps to country buckets). 157 type CountryInfo struct { 158 Country string 159 Capital string 160 Area int32 161 Population int32 162 GeonameId int32 163 ISONumeric int16 164 ISO string 165 ISO3 string 166 Fips string 167 Continent string 168 Tld string 169 CurrencyCode string 170 CurrencyName string 171 Phone string 172 PostalCodeFormat string 173 PostalCodeRegex string 174 Languages string 175 Neighbours string 176 EquivalentFipsCode string 177 } 178 179 // Options when geocoding. For now just an exact match on city name, but there will be potentially other options that can be set to adjust how searching/matching works. 180 type GeocodeOptions struct { 181 ExactCity bool 182 } 183 184 // An index range struct that's used for narrowing down ranges over the large Cities struct. 185 type r struct { 186 f int 187 t int 188 } 189 190 // Creates a new Geobed instance. You do not need more than one. You do not want more than one. There's a fair bit of data to load into memory. 191 func NewGeobed() GeoBed { 192 g := GeoBed{} 193 194 var err error 195 g.c, err = loadGeobedCityData() 196 g.co, err = loadGeobedCountryData() 197 err = loadGeobedCityNameIdx() 198 if err != nil || len(g.c) == 0 { 199 g.downloadDataSets() 200 g.loadDataSets() 201 g.store() 202 } 203 204 return g 205 } 206 207 // Downloads the data sets if needed. 208 func (g *GeoBed) downloadDataSets() { 209 os.Mkdir("./geobed-data", 0777) 210 for _, f := range dataSetFiles { 211 _, err := os.Stat(f["path"]) 212 if err != nil { 213 if os.IsNotExist(err) { 214 // log.Println(f["path"] + " does not exist, downloading...") 215 out, oErr := os.Create(f["path"]) 216 defer out.Close() 217 if oErr == nil { 218 r, rErr := http.Get(f["url"]) 219 defer r.Body.Close() 220 if rErr == nil { 221 _, nErr := io.Copy(out, r.Body) 222 if nErr != nil { 223 // log.Println("Failed to copy data file, it will be tried again on next application start.") 224 // remove file so another attempt can be made, should something fail 225 err = os.Remove(f["path"]) 226 } 227 r.Body.Close() 228 } 229 out.Close() 230 } else { 231 log.Println(oErr) 232 } 233 } 234 } 235 } 236 } 237 238 // Unzips the data sets and loads the data. 239 func (g *GeoBed) loadDataSets() { 240 locationDedupeIdx = make(map[string]bool) 241 242 for _, f := range dataSetFiles { 243 // This one is zipped 244 if f["id"] == "geonamesCities1000" { 245 rz, err := zip.OpenReader(f["path"]) 246 if err != nil { 247 log.Fatal(err) 248 } 249 defer rz.Close() 250 251 for _, uF := range rz.File { 252 fi, err := uF.Open() 253 254 if err != nil { 255 log.Fatal(err) 256 } 257 defer fi.Close() 258 259 // Geonames uses a tab delineated format and it's not even consistent. No CSV reader that I've found for Go can understand this. 260 // I'm not expecting any reader to either because it's an invalid CSV to be frank. However, we can still split up each row by \t 261 scanner := bufio.NewScanner(fi) 262 scanner.Split(bufio.ScanLines) 263 264 i := 1 265 for scanner.Scan() { 266 i++ 267 268 // So regexp, sadly, must be used (well, unless I wanted parse each string byte by byte, pushing each into a buffer to append to a slice until a tab is reached, etc.). 269 // But I'd have to also then put in a condition if the next byte was a \t rune, then append an empty string, etc. This just, for now, seems nicer (easier). 270 // This is only an import/update, so it shouldn't be an issue for performance. If it is, then I'll look into other solutions. 271 fields := regexp.MustCompile("\t").Split(scanner.Text(), 19) 272 273 // NOTE: Now using a combined GeobedCity struct since not all data sets have the same fields. 274 // Plus, the entire point was to geocode forward and reverse. Bonus information like elevation and such is just superfluous. 275 // Leaving it here because it may be configurable... If options are passed to NewGeobed() then maybe Geobed can simply be a Geonames search. 276 // Don't even load in MaxMind data...And if that's the case, maybe that bonus information is desired. 277 if len(fields) == 19 { 278 //id, _ := strconv.Atoi(fields[0]) 279 lat, _ := strconv.ParseFloat(fields[4], 64) 280 lng, _ := strconv.ParseFloat(fields[5], 64) 281 pop, _ := strconv.Atoi(fields[14]) 282 //elv, _ := strconv.Atoi(fields[15]) 283 //dem, _ := strconv.Atoi(fields[16]) 284 285 gh := geohash.Encode(lat, lng) 286 // This is produced with empty lat/lng values - don't store it. 287 if gh == "7zzzzzzzzzzz" { 288 gh = "" 289 } 290 291 var c GeobedCity 292 c.City = strings.Trim(string(fields[1]), " ") 293 c.CityAlt = string(fields[3]) 294 c.Country = string(fields[8]) 295 c.Region = string(fields[10]) 296 c.Latitude = lat 297 c.Longitude = lng 298 c.Population = int32(pop) 299 c.Geohash = gh 300 301 // Don't include entries without a city name. If we want to geocode the centers of countries and states, then we can do that faster through other means. 302 if len(c.City) > 0 { 303 g.c = append(g.c, c) 304 } 305 } 306 } 307 } 308 } 309 310 // ...And this one is Gzipped (and this one may have worked with the CSV package, but parse it the same way as the others line by line) 311 if f["id"] == "maxmindWorldCities" { 312 // It also has a lot of dupes 313 maxMindCityDedupeIdx = make(map[string][]string) 314 315 fi, err := os.Open(f["path"]) 316 if err != nil { 317 log.Println(err) 318 } 319 defer fi.Close() 320 321 fz, err := gzip.NewReader(fi) 322 if err != nil { 323 log.Println(err) 324 } 325 defer fz.Close() 326 327 scanner := bufio.NewScanner(fz) 328 scanner.Split(bufio.ScanLines) 329 330 i := 1 331 for scanner.Scan() { 332 i++ 333 t := scanner.Text() 334 335 fields := strings.Split(t, ",") 336 if len(fields) == 7 { 337 var b bytes.Buffer 338 b.WriteString(fields[0]) // country 339 b.WriteString(fields[3]) // region 340 b.WriteString(fields[1]) // city 341 342 idx := b.String() 343 b.Reset() 344 maxMindCityDedupeIdx[idx] = fields 345 } 346 } 347 348 // Loop the map of fields after dupes have been removed (about 1/5th less... 2.6m vs 3.1m inreases lookup performance). 349 for _, fields := range maxMindCityDedupeIdx { 350 if fields[0] != "" && fields[0] != "0" { 351 if fields[2] != "AccentCity" { 352 pop, _ := strconv.Atoi(fields[4]) 353 lat, _ := strconv.ParseFloat(fields[5], 64) 354 lng, _ := strconv.ParseFloat(fields[6], 64) 355 // MaxMind's data set is a bit dirty. I've seen city names surrounded by parenthesis in a few places. 356 cn := strings.Trim(string(fields[2]), " ") 357 cn = strings.Trim(cn, "( )") 358 359 // Don't take any city names with erroneous punctuation either. 360 if strings.Contains(cn, "!") || strings.Contains(cn, "@") { 361 continue 362 } 363 364 gh := geohash.Encode(lat, lng) 365 // This is produced with empty lat/lng values - don't store it. 366 if gh == "7zzzzzzzzzzz" { 367 gh = "" 368 } 369 370 // If the geohash was seen before... 371 _, ok := locationDedupeIdx[gh] 372 if !ok { 373 locationDedupeIdx[gh] = true 374 375 var c GeobedCity 376 c.City = cn 377 c.Country = toUpper(string(fields[0])) 378 c.Region = string(fields[3]) 379 c.Latitude = lat 380 c.Longitude = lng 381 c.Population = int32(pop) 382 c.Geohash = gh 383 384 // Don't include entries without a city name. If we want to geocode the centers of countries and states, then we can do that faster through other means. 385 if len(c.City) > 0 && len(c.Country) > 0 { 386 g.c = append(g.c, c) 387 } 388 } 389 } 390 } 391 } 392 // Clear out the temrporary index (set to nil, it does get re-created) so that Go can garbage collect it at some point whenever it feels the need. 393 maxMindCityDedupeIdx = nil 394 locationDedupeIdx = nil 395 } 396 397 // ...And this one is just plain text 398 if f["id"] == "geonamesCountryInfo" { 399 fi, err := os.Open(f["path"]) 400 401 if err != nil { 402 log.Fatal(err) 403 } 404 defer fi.Close() 405 406 scanner := bufio.NewScanner(fi) 407 scanner.Split(bufio.ScanLines) 408 409 i := 1 410 for scanner.Scan() { 411 t := scanner.Text() 412 // There are a bunch of lines in this file that are comments, they start with # 413 if string(t[0]) != "#" { 414 i++ 415 fields := regexp.MustCompile("\t").Split(t, 19) 416 417 if len(fields) == 19 { 418 if fields[0] != "" && fields[0] != "0" { 419 isoNumeric, _ := strconv.Atoi(fields[2]) 420 area, _ := strconv.Atoi(fields[6]) 421 pop, _ := strconv.Atoi(fields[7]) 422 gid, _ := strconv.Atoi(fields[16]) 423 424 var ci CountryInfo 425 ci.ISO = string(fields[0]) 426 ci.ISO3 = string(fields[1]) 427 ci.ISONumeric = int16(isoNumeric) 428 ci.Fips = string(fields[3]) 429 ci.Country = string(fields[4]) 430 ci.Capital = string(fields[5]) 431 ci.Area = int32(area) 432 ci.Population = int32(pop) 433 ci.Continent = string(fields[8]) 434 ci.Tld = string(fields[9]) 435 ci.CurrencyCode = string(fields[10]) 436 ci.CurrencyName = string(fields[11]) 437 ci.Phone = string(fields[12]) 438 ci.PostalCodeFormat = string(fields[13]) 439 ci.PostalCodeRegex = string(fields[14]) 440 ci.Languages = string(fields[15]) 441 ci.GeonameId = int32(gid) 442 ci.Neighbours = string(fields[17]) 443 ci.EquivalentFipsCode = string(fields[18]) 444 445 g.co = append(g.co, ci) 446 } 447 } 448 } 449 } 450 } 451 } 452 453 // Sort []GeobedCity by city names to help with binary search (the City field is the most searched upon field and the matching names can be easily filtered down from there). 454 sort.Sort(g.c) 455 456 //debug 457 //log.Println("TOTAL RECORDS:") 458 //log.Println(len(g.c)) 459 460 // Index the locations of city names in the g.c []GeoCity slice. This way when searching the range can be limited so it will be faster. 461 cityNameIdx = make(map[string]int) 462 for k, v := range g.c { 463 // Get the index key for the first character of the city name. 464 ik := toLower(string(v.City[0])) 465 if val, ok := cityNameIdx[ik]; ok { 466 // If this key number is greater than what was previously recorded, then set it as the new indexed key. 467 if val < k { 468 cityNameIdx[ik] = k 469 } 470 } else { 471 // If the index key has not yet been set for this value, then set it. 472 cityNameIdx[ik] = k 473 } 474 475 // Get the index key for the first two characters of the city name. 476 // if len(v.CityLower) >= 2 { 477 // ik2 := v.CityLower[0:2] 478 // if val, ok := cityNameIdx[ik2]; ok { 479 // // If this key number is greater than what was previously recorded, then set it as the new indexed key. 480 // if val < k { 481 // cityNameIdx[ik2] = k 482 // } 483 // } else { 484 // // If the index key has not yet been set for this value, then set it. 485 // cityNameIdx[ik2] = k 486 // } 487 // } 488 } 489 } 490 491 // Forward geocode, location string to lat/lng (returns a struct though) 492 func (g *GeoBed) Geocode(n string, opts ...GeocodeOptions) GeobedCity { 493 var c GeobedCity 494 n = strings.TrimSpace(n) 495 if n == "" { 496 return c 497 } 498 // variadic optional argument trick 499 options := GeocodeOptions{} 500 if len(opts) > 0 { 501 options = opts[0] 502 } 503 504 if options.ExactCity { 505 c = g.exactMatchCity(n) 506 } else { 507 // NOTE: The downside of this (currently) is that something is basically always returned. It's a best guess. 508 // There's not much chance of it returning "not found" (or an empty GeobedCity struct). 509 // If you'd rather have nothing returned if not found, look at more exact matching options. 510 c = g.fuzzyMatchLocation(n) 511 } 512 513 return c 514 } 515 516 // Returns a GeobedCity only if there is an exact city name match. A stricter match, though if state or country are missing a guess will be made. 517 func (g *GeoBed) exactMatchCity(n string) GeobedCity { 518 var c GeobedCity 519 // Ignore the `abbrevSlice` value for now. Use `nCo` and `nSt` for more accuracy. 520 nCo, nSt, _, nSlice := g.extractLocationPieces(n) 521 nWithoutAbbrev := strings.Join(nSlice, " ") 522 ranges := g.getSearchRange(nSlice) 523 524 matchingCities := []GeobedCity{} 525 526 // First, get everything that matches the city exactly (case insensitive). 527 for _, rng := range ranges { 528 // When adjusting the range, the keys become out of sync. Start from rng.f 529 currentKey := rng.f 530 for _, v := range g.c[rng.f:rng.t] { 531 currentKey++ 532 // The full string (ie. "New York" or "Las Vegas") 533 if strings.EqualFold(n, v.City) { 534 matchingCities = append(matchingCities, v) 535 } 536 // The pieces with abbreviations removed 537 if strings.EqualFold(nWithoutAbbrev, v.City) { 538 matchingCities = append(matchingCities, v) 539 } 540 // Each piece - doesn't make sense for now. May revisit this. 541 // ie. "New York" or "New" and "York" ... well, "York" is going to match a different city. 542 // While that might be weeded out next, who knows. It's starting to get more fuzzy than I'd like for this function. 543 // for _, np := range nSlice { 544 // if strings.EqualFold(np, v.City) { 545 // matchingCities = append(matchingCities, v) 546 // } 547 // } 548 } 549 } 550 551 // If only one was found, we can stop right here. 552 if len(matchingCities) == 1 { 553 return matchingCities[0] 554 // If more than one was found, we need to guess. 555 } else if len(matchingCities) > 1 { 556 // Then range over those matching cities and try to figure out which one it is - city names are unfortunately not unique of course. 557 // There shouldn't be very many so I don't mind the multiple loops. 558 for _, city := range matchingCities { 559 // Was the state abbreviation present? That sounds promising. 560 if strings.EqualFold(nSt, city.Region) { 561 c = city 562 } 563 } 564 565 for _, city := range matchingCities { 566 // Matches the state and country? Likely the best scenario, I'd call it the best match. 567 if strings.EqualFold(nSt, city.Region) && strings.EqualFold(nCo, city.Country) { 568 c = city 569 } 570 } 571 572 // If we still don't have a city, maybe we have a country with the city name, ie. "New York, USA" 573 // This is tougher because there's a "New York" in Florida, Kentucky, and more. Let's use population to assist if we can. 574 if c.City == "" { 575 matchingCountryCities := []GeobedCity{} 576 for _, city := range matchingCities { 577 if strings.EqualFold(nCo, city.Country) { 578 matchingCountryCities = append(matchingCountryCities, city) 579 } 580 } 581 582 // If someone says, "New York, USA" they most likely mean New York, NY because it's the largest city. 583 // Specific locations are often implied based on size or popularity even though the names aren't unique. 584 biggestCity := GeobedCity{} 585 for _, city := range matchingCountryCities { 586 if city.Population > biggestCity.Population { 587 biggestCity = city 588 } 589 } 590 c = biggestCity 591 } 592 } 593 594 return c 595 } 596 597 // When geocoding, this provides a scored best match. 598 func (g *GeoBed) fuzzyMatchLocation(n string) GeobedCity { 599 nCo, nSt, abbrevSlice, nSlice := g.extractLocationPieces(n) 600 // Take the reamining unclassified pieces (those not likely to be abbreviations) and get our search range. 601 // These pieces are likely contain the city name. Narrowing down the search range will make the lookup faster. 602 ranges := g.getSearchRange(nSlice) 603 604 var bestMatchingKeys = map[int]int{} 605 var bestMatchingKey = 0 606 for _, rng := range ranges { 607 // When adjusting the range, the keys become out of sync. Start from rng.f 608 currentKey := rng.f 609 610 for _, v := range g.c[rng.f:rng.t] { 611 currentKey++ 612 613 // Mainly useful for strings like: "Austin, TX" or "Austin TX" (locations with US state codes). Smile if your location string is this simple. 614 if nSt != "" { 615 if strings.EqualFold(n, v.City) && strings.EqualFold(nSt, v.Region) { 616 return v 617 } 618 } 619 620 // Special case. Airport codes and other short 3 letter abbreviations, ie. NYC and SFO 621 // Country codes could present problems here. It seems to work for NYC, but not SFO (which there are multiple SFOs actually). 622 // Leaving it for now, but airport codes are tricky (though they are popular on Twitter). These must be exact (case sensitive) matches. 623 // if len(n) == 3 { 624 // alts := strings.Split(v.CityAlt, ",") 625 // for _, altV := range alts { 626 // if altV != "" { 627 // if altV == n { 628 // if val, ok := bestMatchingKeys[currentKey]; ok { 629 // bestMatchingKeys[currentKey] = val + 4 630 // } else { 631 // bestMatchingKeys[currentKey] = 4 632 // } 633 // } 634 // } 635 // } 636 // } 637 638 // Abbreviations for state/country 639 // Region (state/province) 640 for _, av := range abbrevSlice { 641 lowerAv := toLower(av) 642 if len(av) == 2 && strings.EqualFold(v.Region, lowerAv) { 643 if val, ok := bestMatchingKeys[currentKey]; ok { 644 bestMatchingKeys[currentKey] = val + 5 645 } else { 646 bestMatchingKeys[currentKey] = 5 647 } 648 } 649 650 // Country (worth 2 points if exact match) 651 if len(av) == 2 && strings.EqualFold(v.Country, lowerAv) { 652 if val, ok := bestMatchingKeys[currentKey]; ok { 653 bestMatchingKeys[currentKey] = val + 3 654 } else { 655 bestMatchingKeys[currentKey] = 3 656 } 657 } 658 } 659 660 // A discovered country name converted into a country code 661 if nCo != "" { 662 if nCo == v.Country { 663 if val, ok := bestMatchingKeys[currentKey]; ok { 664 bestMatchingKeys[currentKey] = val + 4 665 } else { 666 bestMatchingKeys[currentKey] = 4 667 } 668 } 669 } 670 671 // A discovered state name converted into a region code 672 if nSt != "" { 673 if nSt == v.Region { 674 if val, ok := bestMatchingKeys[currentKey]; ok { 675 bestMatchingKeys[currentKey] = val + 4 676 } else { 677 bestMatchingKeys[currentKey] = 4 678 } 679 } 680 } 681 682 // If any alternate names can be discovered, take them into consideration. 683 if v.CityAlt != "" { 684 alts := strings.Fields(v.CityAlt) 685 for _, altV := range alts { 686 if strings.EqualFold(altV, n) { 687 if val, ok := bestMatchingKeys[currentKey]; ok { 688 bestMatchingKeys[currentKey] = val + 3 689 } else { 690 bestMatchingKeys[currentKey] = 3 691 } 692 } 693 // Exact, a case-sensitive match means a lot. 694 if altV == n { 695 if val, ok := bestMatchingKeys[currentKey]; ok { 696 bestMatchingKeys[currentKey] = val + 5 697 } else { 698 bestMatchingKeys[currentKey] = 5 699 } 700 } 701 } 702 } 703 704 // Exact city name matches mean a lot. 705 if strings.EqualFold(n, v.City) { 706 if val, ok := bestMatchingKeys[currentKey]; ok { 707 bestMatchingKeys[currentKey] = val + 7 708 } else { 709 bestMatchingKeys[currentKey] = 7 710 } 711 } 712 713 for _, ns := range nSlice { 714 ns = strings.TrimSuffix(ns, ",") 715 716 // City (worth 2 points if contians part of string) 717 if strings.Contains(toLower(v.City), toLower(ns)) { 718 if val, ok := bestMatchingKeys[currentKey]; ok { 719 bestMatchingKeys[currentKey] = val + 2 720 } else { 721 bestMatchingKeys[currentKey] = 2 722 } 723 } 724 725 // If there's an exat match, maybe there was noise in the string so it could be the full city name, but unlikely. For example, "New" or "Los" is in many city names. 726 // Still, give it a point because it could be the bulkier part of a city name (or the city name could be one word). This has helped in some cases. 727 if strings.EqualFold(v.City, ns) { 728 if val, ok := bestMatchingKeys[currentKey]; ok { 729 bestMatchingKeys[currentKey] = val + 1 730 } else { 731 bestMatchingKeys[currentKey] = 1 732 } 733 } 734 735 } 736 } 737 } 738 739 // If no country was found, look at population as a factor. Is it obvious? 740 if nCo == "" { 741 hp := int32(0) 742 hpk := 0 743 for k, v := range bestMatchingKeys { 744 // Add bonus point for having a population 1,000+ 745 if g.c[k].Population >= 1000 { 746 bestMatchingKeys[k] = v + 1 747 } 748 // Now just add a bonus for having the highest population and points 749 if g.c[k].Population > hp { 750 hpk = k 751 hp = g.c[k].Population 752 } 753 } 754 // Add a point for having the highest population (if any of the results had population data available). 755 if g.c[hpk].Population > 0 { 756 bestMatchingKeys[hpk] = bestMatchingKeys[hpk] + 1 757 } 758 } 759 760 m := 0 761 for k, v := range bestMatchingKeys { 762 if v > m { 763 m = v 764 bestMatchingKey = k 765 } 766 767 // If there is a tie breaker, use the city with the higher population (if known) because it's more likely to be what is meant. 768 // For example, when people say "New York" they typically mean New York, NY...Though there are many New Yorks. 769 if v == m { 770 if g.c[k].Population > g.c[bestMatchingKey].Population { 771 bestMatchingKey = k 772 } 773 } 774 } 775 776 // debug 777 // log.Println("Possible results:") 778 // log.Println(len(bestMatchingKeys)) 779 // for _, kv := range bestMatchingKeys { 780 // log.Println(g.c[kv]) 781 // } 782 // log.Println("Best match:") 783 // log.Println(g.c[bestMatchingKey]) 784 // log.Println("Scored:") 785 // log.Println(m) 786 787 return g.c[bestMatchingKey] 788 } 789 790 // Splits a string up looking for potential abbreviations by matching against a shorter list of abbreviations. 791 // Returns country, state, a slice of strings with potential abbreviations (based on size; 2 or 3 characters), and then a slice of the remaning pieces. 792 // This does a good job at separating things that are clearly abbreviations from the city so that searching is faster and more accuarate. 793 func (g *GeoBed) extractLocationPieces(n string) (string, string, []string, []string) { 794 var re = regexp.MustCompile("") 795 796 // Extract all potential abbreviations. 797 re = regexp.MustCompile(`[\S]{2,3}`) 798 abbrevSlice := re.FindStringSubmatch(n) 799 800 // Convert country to country code and pull it out. We'll use it as a secondary form of validation. Remove the code from the original query. 801 nCo := "" 802 for _, co := range g.co { 803 re = regexp.MustCompile("(?i)^" + co.Country + ",?\\s|\\s" + co.Country + ",?\\s" + co.Country + "\\s$") 804 if re.MatchString(n) { 805 nCo = co.ISO 806 // And remove it so we have a cleaner query string for a city. 807 n = re.ReplaceAllString(n, "") 808 } 809 } 810 811 // Find US State codes and pull them out as well (do not convert state names, they can also easily be city names). 812 nSt := "" 813 for sc, _ := range UsSateCodes { 814 re = regexp.MustCompile("(?i)^" + sc + ",?\\s|\\s" + sc + ",?\\s|\\s" + sc + "$") 815 if re.MatchString(n) { 816 nSt = sc 817 // And remove it too. 818 n = re.ReplaceAllString(n, "") 819 } 820 } 821 // Trim spaces and commas off the modified string. 822 n = strings.Trim(n, " ,") 823 824 // Now extract words (potential city names) into a slice. With this, the index will be referenced to pinpoint sections of the g.c []GeobedCity slice to scan. 825 // This results in a much faster lookup. This is over a simple binary search with strings.Search() etc. because the city name may not be the first word. 826 // This should not contain any known country code or US state codes. 827 nSlice := strings.Split(n, " ") 828 829 return nCo, nSt, abbrevSlice, nSlice 830 } 831 832 // There's potentially 2.7 million items to range though, let's see if we can reduce that by taking slices of the slice in alphabetical order. 833 func (g *GeoBed) getSearchRange(nSlice []string) []r { 834 // NOTE: A simple binary search was not helping here since we aren't looking for one specific thing. We have multiple elements, city, state, country. 835 // So we'd end up with multiple binary searches to piece together which could be quite a few exponentially given the possible combinations...And so it was slower. 836 837 ranges := []r{} 838 for _, ns := range nSlice { 839 ns = strings.TrimSuffix(ns, ",") 840 841 if len(ns) > 0 { 842 // Get the first character in the string, this tells us where to stop. 843 fc := toLower(string(ns[0])) 844 // Get the previous index key (by getting the previous character in the alphabet) to figure out where to start. 845 pik := string(prev(rune(fc[0]))) 846 847 // To/from key 848 fk := 0 849 tk := 0 850 if val, ok := cityNameIdx[pik]; ok { 851 fk = val 852 } 853 if val, ok := cityNameIdx[fc]; ok { 854 tk = val 855 } 856 // Don't let the to key be out of range. 857 if tk == 0 { 858 tk = (len(g.c) - 1) 859 } 860 ranges = append(ranges, r{fk, tk}) 861 } 862 } 863 864 return ranges 865 } 866 867 func prev(r rune) rune { 868 return r - 1 869 } 870 871 // Reverse geocode 872 func (g *GeoBed) ReverseGeocode(lat float64, lng float64) GeobedCity { 873 c := GeobedCity{} 874 875 gh := geohash.Encode(lat, lng) 876 // This is produced with empty lat/lng values - don't look for anything. 877 if gh == "7zzzzzzzzzzz" { 878 return c 879 } 880 881 // Note: All geohashes are going to be 12 characters long. Even if the precision on the lat/lng isn't great. The geohash package will center things. 882 // Obviously lat/lng like 37, -122 is a guess. That's no where near the resolution of a city. Though we're going to allow guesses. 883 mostMatched := 0 884 matched := 0 885 for k, v := range g.c { 886 // check first two characters to reduce the number of loops 887 if v.Geohash[0] == gh[0] && v.Geohash[1] == gh[1] { 888 matched = 2 889 for i := 2; i <= len(gh); i++ { 890 //log.Println(gh[0:i]) 891 if v.Geohash[0:i] == gh[0:i] { 892 matched++ 893 } 894 } 895 // tie breakers go to city with larger population (NOTE: There's still a chance that the next pass will uncover a better match) 896 if matched == mostMatched && g.c[k].Population > c.Population { 897 c = g.c[k] 898 // log.Println("MATCHES") 899 // log.Println(matched) 900 // log.Println("CITY") 901 // log.Println(c.City) 902 // log.Println("POPULATION") 903 // log.Println(c.Population) 904 } 905 if matched > mostMatched { 906 c = g.c[k] 907 mostMatched = matched 908 } 909 } 910 } 911 912 return c 913 } 914 915 // A slightly faster lowercase function. 916 func toLower(s string) string { 917 b := make([]byte, len(s)) 918 for i := range b { 919 c := s[i] 920 if c >= 'A' && c <= 'Z' { 921 c += 'a' - 'A' 922 } 923 b[i] = c 924 } 925 return string(b) 926 } 927 928 // A slightly faster uppercase function. 929 func toUpper(s string) string { 930 b := make([]byte, len(s)) 931 for i := range b { 932 c := s[i] 933 if c >= 'a' && c <= 'z' { 934 c -= 'a' - 'A' 935 } 936 b[i] = c 937 } 938 return string(b) 939 } 940 941 // Dumps the Geobed data to disk. This speeds up startup time on subsequent runs (or if calling NewGeobed() multiple times which should be avoided if possible). 942 // TODO: Refactor 943 func (g GeoBed) store() error { 944 b := new(bytes.Buffer) 945 946 // Store the city info 947 enc := gob.NewEncoder(b) 948 err := enc.Encode(g.c) 949 if err != nil { 950 b.Reset() 951 return err 952 } 953 954 fh, eopen := os.OpenFile("./geobed-data/g.c.dmp", os.O_CREATE|os.O_WRONLY, 0666) 955 defer fh.Close() 956 if eopen != nil { 957 b.Reset() 958 return eopen 959 } 960 n, e := fh.Write(b.Bytes()) 961 if e != nil { 962 b.Reset() 963 return e 964 } 965 log.Printf("%d bytes successfully written to cache file\n", n) 966 967 // Store the country info as well (this is all now repetition - refactor) 968 b.Reset() 969 //enc = gob.NewEncoder(b) 970 err = enc.Encode(g.co) 971 if err != nil { 972 b.Reset() 973 return err 974 } 975 976 fh, eopen = os.OpenFile("./geobed-data/g.co.dmp", os.O_CREATE|os.O_WRONLY, 0666) 977 defer fh.Close() 978 if eopen != nil { 979 b.Reset() 980 return eopen 981 } 982 n, e = fh.Write(b.Bytes()) 983 if e != nil { 984 b.Reset() 985 return e 986 } 987 log.Printf("%d bytes successfully written to cache file\n", n) 988 989 // Store the index info (again there's some repetition here) 990 b.Reset() 991 //enc = gob.NewEncoder(b) 992 err = enc.Encode(cityNameIdx) 993 if err != nil { 994 b.Reset() 995 return err 996 } 997 998 fh, eopen = os.OpenFile("./geobed-data/cityNameIdx.dmp", os.O_CREATE|os.O_WRONLY, 0666) 999 defer fh.Close() 1000 if eopen != nil { 1001 b.Reset() 1002 return eopen 1003 } 1004 n, e = fh.Write(b.Bytes()) 1005 if e != nil { 1006 b.Reset() 1007 return e 1008 } 1009 log.Printf("%d bytes successfully written to cache file\n", n) 1010 1011 b.Reset() 1012 return nil 1013 } 1014 1015 // Loads a GeobedCity dump, which saves a bit of time. 1016 func loadGeobedCityData() ([]GeobedCity, error) { 1017 fh, err := os.Open("./geobed-data/g.c.dmp") 1018 if err != nil { 1019 return nil, err 1020 } 1021 gc := []GeobedCity{} 1022 dec := gob.NewDecoder(fh) 1023 err = dec.Decode(&gc) 1024 if err != nil { 1025 return nil, err 1026 } 1027 return gc, nil 1028 } 1029 1030 func loadGeobedCountryData() ([]CountryInfo, error) { 1031 fh, err := os.Open("./geobed-data/g.co.dmp") 1032 if err != nil { 1033 return nil, err 1034 } 1035 co := []CountryInfo{} 1036 dec := gob.NewDecoder(fh) 1037 err = dec.Decode(&co) 1038 if err != nil { 1039 return nil, err 1040 } 1041 return co, nil 1042 } 1043 1044 func loadGeobedCityNameIdx() error { 1045 fh, err := os.Open("./geobed-data/cityNameIdx.dmp") 1046 if err != nil { 1047 return err 1048 } 1049 dec := gob.NewDecoder(fh) 1050 cityNameIdx = make(map[string]int) 1051 err = dec.Decode(&cityNameIdx) 1052 if err != nil { 1053 return err 1054 } 1055 return nil 1056 }