git.dev.gkxim.com/golang-packages/geobed@v0.0.0-20190708072815-e989ac62624a/geobed.go (about)

     1  package geobed
     2  
     3  import (
     4  	"archive/zip"
     5  	"bufio"
     6  	"bytes"
     7  	"compress/gzip"
     8  	"encoding/gob"
     9  	geohash "github.com/TomiHiltunen/geohash-golang"
    10  	"io"
    11  	"log"
    12  	"net/http"
    13  	"os"
    14  	"regexp"
    15  	"sort"
    16  	"strconv"
    17  	"strings"
    18  )
    19  
    20  // There are over 2.4 million cities in the world. The Geonames data set only contains 143,270 and the MaxMind set contains 567,382 and 3,173,959 in the other MaxMind set.
    21  // Obviously there's a lot of overlap and the worldcitiespop.txt from MaxMind contains a lot of dupes, though it is by far the most comprehensive in terms of city - lat/lng.
    22  // It may not be possible to have information for all cities, but many of the cities are also fairly remote and likely don't have internet access anyway.
    23  // The Geonames data is preferred because it contains additional information such as elevation, population, and more. Population is particularly nice because a sense of
    24  // the city size can be understood by applications. So showing all major cities is pretty easy. Though the primary goal of this package is to geocode, the additional information
    25  // is bonus. So after checking the Geonames set, the geocoding functions will then look at MaxMind's.
    26  // Maybe in the future this package will even use the Geonames premium data and have functions to look up nearest airports, etc.
    27  // I would simply use just Geonames data, but there's so many more cities in the MaxMind set despite the lack of additional details.
    28  //
    29  // http://download.geonames.org/export/dump/cities1000.zip
    30  // http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip
    31  // http://download.maxmind.com/download/worldcities/worldcitiespop.txt.gz
    32  
    33  // A list of data sources.
    34  var dataSetFiles = []map[string]string{
    35  	{"url": "http://download.geonames.org/export/dump/cities1000.zip", "path": "./geobed-data/cities1000.zip", "id": "geonamesCities1000"},
    36  	{"url": "http://download.geonames.org/export/dump/countryInfo.txt", "path": "./geobed-data/countryInfo.txt", "id": "geonamesCountryInfo"},
    37  	//{"url": "http://download.maxmind.com/download/worldcities/worldcitiespop.txt.gz", "path": "./geobed-data/worldcitiespop.txt.gz", "id": "maxmindWorldCities"},
    38  	//{"url": "http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip", "path": "./geobed-data/GeoLiteCity-latest.zip", "id": "maxmindLiteCity"},
    39  }
    40  
// UsSateCodes maps two-letter US postal codes to their full names. Besides the
// fifty states it also lists territories and the armed-forces pseudo-states.
// extractLocationPieces iterates over the keys to spot state codes in a query.
//
// NOTE(review): the identifier is misspelled ("Sate" instead of "State"); it is
// exported, so renaming it would break callers.
var UsSateCodes = map[string]string{
	"AL": "Alabama",
	"AK": "Alaska",
	"AZ": "Arizona",
	"AR": "Arkansas",
	"CA": "California",
	"CO": "Colorado",
	"CT": "Connecticut",
	"DE": "Delaware",
	"FL": "Florida",
	"GA": "Georgia",
	"HI": "Hawaii",
	"ID": "Idaho",
	"IL": "Illinois",
	"IN": "Indiana",
	"IA": "Iowa",
	"KS": "Kansas",
	"KY": "Kentucky",
	"LA": "Louisiana",
	"ME": "Maine",
	"MD": "Maryland",
	"MA": "Massachusetts",
	"MI": "Michigan",
	"MN": "Minnesota",
	"MS": "Mississippi",
	"MO": "Missouri",
	"MT": "Montana",
	"NE": "Nebraska",
	"NV": "Nevada",
	"NH": "New Hampshire",
	"NJ": "New Jersey",
	"NM": "New Mexico",
	"NY": "New York",
	"NC": "North Carolina",
	"ND": "North Dakota",
	"OH": "Ohio",
	"OK": "Oklahoma",
	"OR": "Oregon",
	"PA": "Pennsylvania",
	"RI": "Rhode Island",
	"SC": "South Carolina",
	"SD": "South Dakota",
	"TN": "Tennessee",
	"TX": "Texas",
	"UT": "Utah",
	"VT": "Vermont",
	"VA": "Virginia",
	"WA": "Washington",
	"WV": "West Virginia",
	"WI": "Wisconsin",
	"WY": "Wyoming",
	// Territories
	"AS": "American Samoa",
	"DC": "District of Columbia",
	"FM": "Federated States of Micronesia",
	"GU": "Guam",
	"MH": "Marshall Islands",
	"MP": "Northern Mariana Islands",
	"PW": "Palau",
	"PR": "Puerto Rico",
	"VI": "Virgin Islands",
	// Armed Forces (AE includes Europe, Africa, Canada, and the Middle East)
	"AA": "Armed Forces Americas",
	"AE": "Armed Forces Europe",
	"AP": "Armed Forces Pacific",
}
   108  
// GeoBed contains all of the city and country data used for geocoding.
type GeoBed struct {
	// c is a single flat slice of every known city; loadDataSets sorts it by
	// lower-cased city name.
	c Cities
	// co holds one record per country parsed from the Geonames country info file.
	co []CountryInfo
}
   114  
   115  type Cities []GeobedCity
   116  
   117  func (c Cities) Len() int {
   118  	return len(c)
   119  }
   120  func (c Cities) Swap(i, j int) {
   121  	c[i], c[j] = c[j], c[i]
   122  }
   123  func (c Cities) Less(i, j int) bool {
   124  	return toLower(c[i].City) < toLower(c[j].City)
   125  }
   126  
// GeobedCity is a combined city record (the various data sets have different fields, this combines what's available and keeps things smaller).
type GeobedCity struct {
	// City is the city name with surrounding spaces trimmed; records without a name are never stored.
	City string
	// CityAlt is the raw alternate-names column of the Geonames record (left empty for MaxMind records).
	CityAlt string
	// TODO: Think about converting this to a small int to save on memory allocation. Lookup requests can have the strings converted to the same int if there are any matches.
	// This could make lookup more accurate, easier, and faster even. IF the int uses less bytes than the two letter code string.
	Country string
	// Region is the state/province code as found in the source data.
	Region string
	// Latitude and Longitude are parsed from the source row; unparseable values end up as 0.
	Latitude  float64
	Longitude float64
	// Population is 0 when the source row has no usable population figure.
	Population int32
	// Geohash is the geohash of Latitude/Longitude, or "" when the coordinates were empty.
	Geohash string
}
   140  
   141  // TODO: String interning? (much like converting country code to int)
   142  // https://gist.github.com/karlseguin/6570372
   143  
   144  // TODO: Store the cities in mmap...???
   145  // https://github.com/boltdb/bolt/blob/master/bolt_unix.go#L42-L69
   146  // Maybe even use bolt?
   147  
   148  var maxMindCityDedupeIdx map[string][]string
   149  
   150  // Holds information about the index ranges for city names (1st and 2nd characters) to help narrow down sets of the GeobedCity slice to scan when looking for a match.
   151  var cityNameIdx map[string]int
   152  var locationDedupeIdx map[string]bool
   153  
// CountryInfo holds information about each country from Geonames including; ISO codes, FIPS, country capital, area (sq km), population, and more.
// Particularly useful for validating a location string contains a country name which can help the search process.
// The fields are filled from the tab-separated columns of the Geonames country info file; numeric columns that
// fail to parse are stored as 0.
type CountryInfo struct {
	Country            string // country name (column 4)
	Capital            string // capital city (column 5)
	Area               int32  // area (column 6)
	Population         int32  // population (column 7)
	GeonameId          int32  // Geonames id (column 16)
	ISONumeric         int16  // numeric ISO code (column 2)
	ISO                string // ISO code (column 0); compared against GeobedCity.Country when geocoding
	ISO3               string // three-letter ISO code (column 1)
	Fips               string // FIPS code (column 3)
	Continent          string // continent code (column 8)
	Tld                string // top-level domain (column 9)
	CurrencyCode       string // currency code (column 10)
	CurrencyName       string // currency name (column 11)
	Phone              string // phone prefix (column 12)
	PostalCodeFormat   string // postal code format (column 13)
	PostalCodeRegex    string // postal code regular expression (column 14)
	Languages          string // languages, stored as the raw column text (column 15)
	Neighbours         string // neighbouring countries, stored as the raw column text (column 17)
	EquivalentFipsCode string // equivalent FIPS code (column 18)
}
   178  
// GeocodeOptions are the options accepted by Geocode. For now just an exact match on city name, but there will be
// potentially other options that can be set to adjust how searching/matching works.
type GeocodeOptions struct {
	// ExactCity makes Geocode use the strict exact-name matcher instead of the scored fuzzy matcher.
	ExactCity bool
}
   183  
// r is an index range that's used for narrowing down ranges over the large Cities slice.
// The matchers use it as a half-open slice expression: g.c[f:t].
type r struct {
	f int // from: first index of the range
	t int // to: index one past the last element scanned
}
   189  
   190  // Creates a new Geobed instance. You do not need more than one. You do not want more than one. There's a fair bit of data to load into memory.
   191  func NewGeobed() GeoBed {
   192  	g := GeoBed{}
   193  
   194  	var err error
   195  	g.c, err = loadGeobedCityData()
   196  	g.co, err = loadGeobedCountryData()
   197  	err = loadGeobedCityNameIdx()
   198  	if err != nil || len(g.c) == 0 {
   199  		g.downloadDataSets()
   200  		g.loadDataSets()
   201  		g.store()
   202  	}
   203  
   204  	return g
   205  }
   206  
   207  // Downloads the data sets if needed.
   208  func (g *GeoBed) downloadDataSets() {
   209  	os.Mkdir("./geobed-data", 0777)
   210  	for _, f := range dataSetFiles {
   211  		_, err := os.Stat(f["path"])
   212  		if err != nil {
   213  			if os.IsNotExist(err) {
   214  				// log.Println(f["path"] + " does not exist, downloading...")
   215  				out, oErr := os.Create(f["path"])
   216  				defer out.Close()
   217  				if oErr == nil {
   218  					r, rErr := http.Get(f["url"])
   219  					defer r.Body.Close()
   220  					if rErr == nil {
   221  						_, nErr := io.Copy(out, r.Body)
   222  						if nErr != nil {
   223  							// log.Println("Failed to copy data file, it will be tried again on next application start.")
   224  							// remove file so another attempt can be made, should something fail
   225  							err = os.Remove(f["path"])
   226  						}
   227  						r.Body.Close()
   228  					}
   229  					out.Close()
   230  				} else {
   231  					log.Println(oErr)
   232  				}
   233  			}
   234  		}
   235  	}
   236  }
   237  
   238  // Unzips the data sets and loads the data.
   239  func (g *GeoBed) loadDataSets() {
   240  	locationDedupeIdx = make(map[string]bool)
   241  
   242  	for _, f := range dataSetFiles {
   243  		// This one is zipped
   244  		if f["id"] == "geonamesCities1000" {
   245  			rz, err := zip.OpenReader(f["path"])
   246  			if err != nil {
   247  				log.Fatal(err)
   248  			}
   249  			defer rz.Close()
   250  
   251  			for _, uF := range rz.File {
   252  				fi, err := uF.Open()
   253  
   254  				if err != nil {
   255  					log.Fatal(err)
   256  				}
   257  				defer fi.Close()
   258  
   259  				// Geonames uses a tab delineated format and it's not even consistent. No CSV reader that I've found for Go can understand this.
   260  				// I'm not expecting any reader to either because it's an invalid CSV to be frank. However, we can still split up each row by \t
   261  				scanner := bufio.NewScanner(fi)
   262  				scanner.Split(bufio.ScanLines)
   263  
   264  				i := 1
   265  				for scanner.Scan() {
   266  					i++
   267  
   268  					// So regexp, sadly, must be used (well, unless I wanted parse each string byte by byte, pushing each into a buffer to append to a slice until a tab is reached, etc.).
   269  					// But I'd have to also then put in a condition if the next byte was a \t rune, then append an empty string, etc. This just, for now, seems nicer (easier).
   270  					// This is only an import/update, so it shouldn't be an issue for performance. If it is, then I'll look into other solutions.
   271  					fields := regexp.MustCompile("\t").Split(scanner.Text(), 19)
   272  
   273  					// NOTE: Now using a combined GeobedCity struct since not all data sets have the same fields.
   274  					// Plus, the entire point was to geocode forward and reverse. Bonus information like elevation and such is just superfluous.
   275  					// Leaving it here because it may be configurable... If options are passed to NewGeobed() then maybe Geobed can simply be a Geonames search.
   276  					// Don't even load in MaxMind data...And if that's the case, maybe that bonus information is desired.
   277  					if len(fields) == 19 {
   278  						//id, _ := strconv.Atoi(fields[0])
   279  						lat, _ := strconv.ParseFloat(fields[4], 64)
   280  						lng, _ := strconv.ParseFloat(fields[5], 64)
   281  						pop, _ := strconv.Atoi(fields[14])
   282  						//elv, _ := strconv.Atoi(fields[15])
   283  						//dem, _ := strconv.Atoi(fields[16])
   284  
   285  						gh := geohash.Encode(lat, lng)
   286  						// This is produced with empty lat/lng values - don't store it.
   287  						if gh == "7zzzzzzzzzzz" {
   288  							gh = ""
   289  						}
   290  
   291  						var c GeobedCity
   292  						c.City = strings.Trim(string(fields[1]), " ")
   293  						c.CityAlt = string(fields[3])
   294  						c.Country = string(fields[8])
   295  						c.Region = string(fields[10])
   296  						c.Latitude = lat
   297  						c.Longitude = lng
   298  						c.Population = int32(pop)
   299  						c.Geohash = gh
   300  
   301  						// Don't include entries without a city name. If we want to geocode the centers of countries and states, then we can do that faster through other means.
   302  						if len(c.City) > 0 {
   303  							g.c = append(g.c, c)
   304  						}
   305  					}
   306  				}
   307  			}
   308  		}
   309  
   310  		// ...And this one is Gzipped (and this one may have worked with the CSV package, but parse it the same way as the others line by line)
   311  		if f["id"] == "maxmindWorldCities" {
   312  			// It also has a lot of dupes
   313  			maxMindCityDedupeIdx = make(map[string][]string)
   314  
   315  			fi, err := os.Open(f["path"])
   316  			if err != nil {
   317  				log.Println(err)
   318  			}
   319  			defer fi.Close()
   320  
   321  			fz, err := gzip.NewReader(fi)
   322  			if err != nil {
   323  				log.Println(err)
   324  			}
   325  			defer fz.Close()
   326  
   327  			scanner := bufio.NewScanner(fz)
   328  			scanner.Split(bufio.ScanLines)
   329  
   330  			i := 1
   331  			for scanner.Scan() {
   332  				i++
   333  				t := scanner.Text()
   334  
   335  				fields := strings.Split(t, ",")
   336  				if len(fields) == 7 {
   337  					var b bytes.Buffer
   338  					b.WriteString(fields[0]) // country
   339  					b.WriteString(fields[3]) // region
   340  					b.WriteString(fields[1]) // city
   341  
   342  					idx := b.String()
   343  					b.Reset()
   344  					maxMindCityDedupeIdx[idx] = fields
   345  				}
   346  			}
   347  
   348  			// Loop the map of fields after dupes have been removed (about 1/5th less... 2.6m vs 3.1m inreases lookup performance).
   349  			for _, fields := range maxMindCityDedupeIdx {
   350  				if fields[0] != "" && fields[0] != "0" {
   351  					if fields[2] != "AccentCity" {
   352  						pop, _ := strconv.Atoi(fields[4])
   353  						lat, _ := strconv.ParseFloat(fields[5], 64)
   354  						lng, _ := strconv.ParseFloat(fields[6], 64)
   355  						// MaxMind's data set is a bit dirty. I've seen city names surrounded by parenthesis in a few places.
   356  						cn := strings.Trim(string(fields[2]), " ")
   357  						cn = strings.Trim(cn, "( )")
   358  
   359  						// Don't take any city names with erroneous punctuation either.
   360  						if strings.Contains(cn, "!") || strings.Contains(cn, "@") {
   361  							continue
   362  						}
   363  
   364  						gh := geohash.Encode(lat, lng)
   365  						// This is produced with empty lat/lng values - don't store it.
   366  						if gh == "7zzzzzzzzzzz" {
   367  							gh = ""
   368  						}
   369  
   370  						// If the geohash was seen before...
   371  						_, ok := locationDedupeIdx[gh]
   372  						if !ok {
   373  							locationDedupeIdx[gh] = true
   374  
   375  							var c GeobedCity
   376  							c.City = cn
   377  							c.Country = toUpper(string(fields[0]))
   378  							c.Region = string(fields[3])
   379  							c.Latitude = lat
   380  							c.Longitude = lng
   381  							c.Population = int32(pop)
   382  							c.Geohash = gh
   383  
   384  							// Don't include entries without a city name. If we want to geocode the centers of countries and states, then we can do that faster through other means.
   385  							if len(c.City) > 0 && len(c.Country) > 0 {
   386  								g.c = append(g.c, c)
   387  							}
   388  						}
   389  					}
   390  				}
   391  			}
   392  			// Clear out the temrporary index (set to nil, it does get re-created) so that Go can garbage collect it at some point whenever it feels the need.
   393  			maxMindCityDedupeIdx = nil
   394  			locationDedupeIdx = nil
   395  		}
   396  
   397  		// ...And this one is just plain text
   398  		if f["id"] == "geonamesCountryInfo" {
   399  			fi, err := os.Open(f["path"])
   400  
   401  			if err != nil {
   402  				log.Fatal(err)
   403  			}
   404  			defer fi.Close()
   405  
   406  			scanner := bufio.NewScanner(fi)
   407  			scanner.Split(bufio.ScanLines)
   408  
   409  			i := 1
   410  			for scanner.Scan() {
   411  				t := scanner.Text()
   412  				// There are a bunch of lines in this file that are comments, they start with #
   413  				if string(t[0]) != "#" {
   414  					i++
   415  					fields := regexp.MustCompile("\t").Split(t, 19)
   416  
   417  					if len(fields) == 19 {
   418  						if fields[0] != "" && fields[0] != "0" {
   419  							isoNumeric, _ := strconv.Atoi(fields[2])
   420  							area, _ := strconv.Atoi(fields[6])
   421  							pop, _ := strconv.Atoi(fields[7])
   422  							gid, _ := strconv.Atoi(fields[16])
   423  
   424  							var ci CountryInfo
   425  							ci.ISO = string(fields[0])
   426  							ci.ISO3 = string(fields[1])
   427  							ci.ISONumeric = int16(isoNumeric)
   428  							ci.Fips = string(fields[3])
   429  							ci.Country = string(fields[4])
   430  							ci.Capital = string(fields[5])
   431  							ci.Area = int32(area)
   432  							ci.Population = int32(pop)
   433  							ci.Continent = string(fields[8])
   434  							ci.Tld = string(fields[9])
   435  							ci.CurrencyCode = string(fields[10])
   436  							ci.CurrencyName = string(fields[11])
   437  							ci.Phone = string(fields[12])
   438  							ci.PostalCodeFormat = string(fields[13])
   439  							ci.PostalCodeRegex = string(fields[14])
   440  							ci.Languages = string(fields[15])
   441  							ci.GeonameId = int32(gid)
   442  							ci.Neighbours = string(fields[17])
   443  							ci.EquivalentFipsCode = string(fields[18])
   444  
   445  							g.co = append(g.co, ci)
   446  						}
   447  					}
   448  				}
   449  			}
   450  		}
   451  	}
   452  
   453  	// Sort []GeobedCity by city names to help with binary search (the City field is the most searched upon field and the matching names can be easily filtered down from there).
   454  	sort.Sort(g.c)
   455  
   456  	//debug
   457  	//log.Println("TOTAL RECORDS:")
   458  	//log.Println(len(g.c))
   459  
   460  	// Index the locations of city names in the g.c []GeoCity slice. This way when searching the range can be limited so it will be faster.
   461  	cityNameIdx = make(map[string]int)
   462  	for k, v := range g.c {
   463  		// Get the index key for the first character of the city name.
   464  		ik := toLower(string(v.City[0]))
   465  		if val, ok := cityNameIdx[ik]; ok {
   466  			// If this key number is greater than what was previously recorded, then set it as the new indexed key.
   467  			if val < k {
   468  				cityNameIdx[ik] = k
   469  			}
   470  		} else {
   471  			// If the index key has not yet been set for this value, then set it.
   472  			cityNameIdx[ik] = k
   473  		}
   474  
   475  		// Get the index key for the first two characters of the city name.
   476  		// if len(v.CityLower) >= 2 {
   477  		// 	ik2 := v.CityLower[0:2]
   478  		// 	if val, ok := cityNameIdx[ik2]; ok {
   479  		// 		// If this key number is greater than what was previously recorded, then set it as the new indexed key.
   480  		// 		if val < k {
   481  		// 			cityNameIdx[ik2] = k
   482  		// 		}
   483  		// 	} else {
   484  		// 		// If the index key has not yet been set for this value, then set it.
   485  		// 		cityNameIdx[ik2] = k
   486  		// 	}
   487  		// }
   488  	}
   489  }
   490  
   491  // Forward geocode, location string to lat/lng (returns a struct though)
   492  func (g *GeoBed) Geocode(n string, opts ...GeocodeOptions) GeobedCity {
   493  	var c GeobedCity
   494  	n = strings.TrimSpace(n)
   495  	if n == "" {
   496  		return c
   497  	}
   498  	// variadic optional argument trick
   499  	options := GeocodeOptions{}
   500  	if len(opts) > 0 {
   501  		options = opts[0]
   502  	}
   503  
   504  	if options.ExactCity {
   505  		c = g.exactMatchCity(n)
   506  	} else {
   507  		// NOTE: The downside of this (currently) is that something is basically always returned. It's a best guess.
   508  		// There's not much chance of it returning "not found" (or an empty GeobedCity struct).
   509  		// If you'd rather have nothing returned if not found, look at more exact matching options.
   510  		c = g.fuzzyMatchLocation(n)
   511  	}
   512  
   513  	return c
   514  }
   515  
   516  // Returns a GeobedCity only if there is an exact city name match. A stricter match, though if state or country are missing a guess will be made.
   517  func (g *GeoBed) exactMatchCity(n string) GeobedCity {
   518  	var c GeobedCity
   519  	// Ignore the `abbrevSlice` value for now. Use `nCo` and `nSt` for more accuracy.
   520  	nCo, nSt, _, nSlice := g.extractLocationPieces(n)
   521  	nWithoutAbbrev := strings.Join(nSlice, " ")
   522  	ranges := g.getSearchRange(nSlice)
   523  
   524  	matchingCities := []GeobedCity{}
   525  
   526  	// First, get everything that matches the city exactly (case insensitive).
   527  	for _, rng := range ranges {
   528  		// When adjusting the range, the keys become out of sync. Start from rng.f
   529  		currentKey := rng.f
   530  		for _, v := range g.c[rng.f:rng.t] {
   531  			currentKey++
   532  			// The full string (ie. "New York" or "Las Vegas")
   533  			if strings.EqualFold(n, v.City) {
   534  				matchingCities = append(matchingCities, v)
   535  			}
   536  			// The pieces with abbreviations removed
   537  			if strings.EqualFold(nWithoutAbbrev, v.City) {
   538  				matchingCities = append(matchingCities, v)
   539  			}
   540  			// Each piece - doesn't make sense for now. May revisit this.
   541  			// ie. "New York" or "New" and "York" ... well, "York" is going to match a different city.
   542  			// While that might be weeded out next, who knows. It's starting to get more fuzzy than I'd like for this function.
   543  			// for _, np := range nSlice {
   544  			// 	if strings.EqualFold(np, v.City) {
   545  			// 		matchingCities = append(matchingCities, v)
   546  			// 	}
   547  			// }
   548  		}
   549  	}
   550  
   551  	// If only one was found, we can stop right here.
   552  	if len(matchingCities) == 1 {
   553  		return matchingCities[0]
   554  		// If more than one was found, we need to guess.
   555  	} else if len(matchingCities) > 1 {
   556  		// Then range over those matching cities and try to figure out which one it is - city names are unfortunately not unique of course.
   557  		// There shouldn't be very many so I don't mind the multiple loops.
   558  		for _, city := range matchingCities {
   559  			// Was the state abbreviation present? That sounds promising.
   560  			if strings.EqualFold(nSt, city.Region) {
   561  				c = city
   562  			}
   563  		}
   564  
   565  		for _, city := range matchingCities {
   566  			// Matches the state and country? Likely the best scenario, I'd call it the best match.
   567  			if strings.EqualFold(nSt, city.Region) && strings.EqualFold(nCo, city.Country) {
   568  				c = city
   569  			}
   570  		}
   571  
   572  		// If we still don't have a city, maybe we have a country with the city name, ie. "New York, USA"
   573  		// This is tougher because there's a "New York" in Florida, Kentucky, and more. Let's use population to assist if we can.
   574  		if c.City == "" {
   575  			matchingCountryCities := []GeobedCity{}
   576  			for _, city := range matchingCities {
   577  				if strings.EqualFold(nCo, city.Country) {
   578  					matchingCountryCities = append(matchingCountryCities, city)
   579  				}
   580  			}
   581  
   582  			// If someone says, "New York, USA" they most likely mean New York, NY because it's the largest city.
   583  			// Specific locations are often implied based on size or popularity even though the names aren't unique.
   584  			biggestCity := GeobedCity{}
   585  			for _, city := range matchingCountryCities {
   586  				if city.Population > biggestCity.Population {
   587  					biggestCity = city
   588  				}
   589  			}
   590  			c = biggestCity
   591  		}
   592  	}
   593  
   594  	return c
   595  }
   596  
   597  // When geocoding, this provides a scored best match.
   598  func (g *GeoBed) fuzzyMatchLocation(n string) GeobedCity {
   599  	nCo, nSt, abbrevSlice, nSlice := g.extractLocationPieces(n)
   600  	// Take the reamining unclassified pieces (those not likely to be abbreviations) and get our search range.
   601  	// These pieces are likely contain the city name. Narrowing down the search range will make the lookup faster.
   602  	ranges := g.getSearchRange(nSlice)
   603  
   604  	var bestMatchingKeys = map[int]int{}
   605  	var bestMatchingKey = 0
   606  	for _, rng := range ranges {
   607  		// When adjusting the range, the keys become out of sync. Start from rng.f
   608  		currentKey := rng.f
   609  
   610  		for _, v := range g.c[rng.f:rng.t] {
   611  			currentKey++
   612  
   613  			// Mainly useful for strings like: "Austin, TX" or "Austin TX" (locations with US state codes). Smile if your location string is this simple.
   614  			if nSt != "" {
   615  				if strings.EqualFold(n, v.City) && strings.EqualFold(nSt, v.Region) {
   616  					return v
   617  				}
   618  			}
   619  
   620  			// Special case. Airport codes and other short 3 letter abbreviations, ie. NYC and SFO
   621  			// Country codes could present problems here. It seems to work for NYC, but not SFO (which there are multiple SFOs actually).
   622  			// Leaving it for now, but airport codes are tricky (though they are popular on Twitter). These must be exact (case sensitive) matches.
   623  			// if len(n) == 3 {
   624  			// 	alts := strings.Split(v.CityAlt, ",")
   625  			// 	for _, altV := range alts {
   626  			// 		if altV != "" {
   627  			// 			if altV == n {
   628  			// 				if val, ok := bestMatchingKeys[currentKey]; ok {
   629  			// 					bestMatchingKeys[currentKey] = val + 4
   630  			// 				} else {
   631  			// 					bestMatchingKeys[currentKey] = 4
   632  			// 				}
   633  			// 			}
   634  			// 		}
   635  			// 	}
   636  			// }
   637  
   638  			// Abbreviations for state/country
   639  			// Region (state/province)
   640  			for _, av := range abbrevSlice {
   641  				lowerAv := toLower(av)
   642  				if len(av) == 2 && strings.EqualFold(v.Region, lowerAv) {
   643  					if val, ok := bestMatchingKeys[currentKey]; ok {
   644  						bestMatchingKeys[currentKey] = val + 5
   645  					} else {
   646  						bestMatchingKeys[currentKey] = 5
   647  					}
   648  				}
   649  
   650  				// Country (worth 2 points if exact match)
   651  				if len(av) == 2 && strings.EqualFold(v.Country, lowerAv) {
   652  					if val, ok := bestMatchingKeys[currentKey]; ok {
   653  						bestMatchingKeys[currentKey] = val + 3
   654  					} else {
   655  						bestMatchingKeys[currentKey] = 3
   656  					}
   657  				}
   658  			}
   659  
   660  			// A discovered country name converted into a country code
   661  			if nCo != "" {
   662  				if nCo == v.Country {
   663  					if val, ok := bestMatchingKeys[currentKey]; ok {
   664  						bestMatchingKeys[currentKey] = val + 4
   665  					} else {
   666  						bestMatchingKeys[currentKey] = 4
   667  					}
   668  				}
   669  			}
   670  
   671  			// A discovered state name converted into a region code
   672  			if nSt != "" {
   673  				if nSt == v.Region {
   674  					if val, ok := bestMatchingKeys[currentKey]; ok {
   675  						bestMatchingKeys[currentKey] = val + 4
   676  					} else {
   677  						bestMatchingKeys[currentKey] = 4
   678  					}
   679  				}
   680  			}
   681  
   682  			// If any alternate names can be discovered, take them into consideration.
   683  			if v.CityAlt != "" {
   684  				alts := strings.Fields(v.CityAlt)
   685  				for _, altV := range alts {
   686  					if strings.EqualFold(altV, n) {
   687  						if val, ok := bestMatchingKeys[currentKey]; ok {
   688  							bestMatchingKeys[currentKey] = val + 3
   689  						} else {
   690  							bestMatchingKeys[currentKey] = 3
   691  						}
   692  					}
   693  					// Exact, a case-sensitive match means a lot.
   694  					if altV == n {
   695  						if val, ok := bestMatchingKeys[currentKey]; ok {
   696  							bestMatchingKeys[currentKey] = val + 5
   697  						} else {
   698  							bestMatchingKeys[currentKey] = 5
   699  						}
   700  					}
   701  				}
   702  			}
   703  
   704  			// Exact city name matches mean a lot.
   705  			if strings.EqualFold(n, v.City) {
   706  				if val, ok := bestMatchingKeys[currentKey]; ok {
   707  					bestMatchingKeys[currentKey] = val + 7
   708  				} else {
   709  					bestMatchingKeys[currentKey] = 7
   710  				}
   711  			}
   712  
   713  			for _, ns := range nSlice {
   714  				ns = strings.TrimSuffix(ns, ",")
   715  
   716  				// City (worth 2 points if contians part of string)
   717  				if strings.Contains(toLower(v.City), toLower(ns)) {
   718  					if val, ok := bestMatchingKeys[currentKey]; ok {
   719  						bestMatchingKeys[currentKey] = val + 2
   720  					} else {
   721  						bestMatchingKeys[currentKey] = 2
   722  					}
   723  				}
   724  
   725  				// If there's an exat match, maybe there was noise in the string so it could be the full city name, but unlikely. For example, "New" or "Los" is in many city names.
   726  				// Still, give it a point because it could be the bulkier part of a city name (or the city name could be one word). This has helped in some cases.
   727  				if strings.EqualFold(v.City, ns) {
   728  					if val, ok := bestMatchingKeys[currentKey]; ok {
   729  						bestMatchingKeys[currentKey] = val + 1
   730  					} else {
   731  						bestMatchingKeys[currentKey] = 1
   732  					}
   733  				}
   734  
   735  			}
   736  		}
   737  	}
   738  
   739  	// If no country was found, look at population as a factor. Is it obvious?
   740  	if nCo == "" {
   741  		hp := int32(0)
   742  		hpk := 0
   743  		for k, v := range bestMatchingKeys {
   744  			// Add bonus point for having a population 1,000+
   745  			if g.c[k].Population >= 1000 {
   746  				bestMatchingKeys[k] = v + 1
   747  			}
   748  			// Now just add a bonus for having the highest population and points
   749  			if g.c[k].Population > hp {
   750  				hpk = k
   751  				hp = g.c[k].Population
   752  			}
   753  		}
   754  		// Add a point for having the highest population (if any of the results had population data available).
   755  		if g.c[hpk].Population > 0 {
   756  			bestMatchingKeys[hpk] = bestMatchingKeys[hpk] + 1
   757  		}
   758  	}
   759  
   760  	m := 0
   761  	for k, v := range bestMatchingKeys {
   762  		if v > m {
   763  			m = v
   764  			bestMatchingKey = k
   765  		}
   766  
   767  		// If there is a tie breaker, use the city with the higher population (if known) because it's more likely to be what is meant.
   768  		// For example, when people say "New York" they typically mean New York, NY...Though there are many New Yorks.
   769  		if v == m {
   770  			if g.c[k].Population > g.c[bestMatchingKey].Population {
   771  				bestMatchingKey = k
   772  			}
   773  		}
   774  	}
   775  
   776  	// debug
   777  	// log.Println("Possible results:")
   778  	// log.Println(len(bestMatchingKeys))
   779  	// for _, kv := range bestMatchingKeys {
   780  	// 	log.Println(g.c[kv])
   781  	// }
   782  	// log.Println("Best match:")
   783  	// log.Println(g.c[bestMatchingKey])
   784  	// log.Println("Scored:")
   785  	// log.Println(m)
   786  
   787  	return g.c[bestMatchingKey]
   788  }
   789  
   790  // Splits a string up looking for potential abbreviations by matching against a shorter list of abbreviations.
   791  // Returns country, state, a slice of strings with potential abbreviations (based on size; 2 or 3 characters), and then a slice of the remaning pieces.
   792  // This does a good job at separating things that are clearly abbreviations from the city so that searching is faster and more accuarate.
   793  func (g *GeoBed) extractLocationPieces(n string) (string, string, []string, []string) {
   794  	var re = regexp.MustCompile("")
   795  
   796  	// Extract all potential abbreviations.
   797  	re = regexp.MustCompile(`[\S]{2,3}`)
   798  	abbrevSlice := re.FindStringSubmatch(n)
   799  
   800  	// Convert country to country code and pull it out. We'll use it as a secondary form of validation. Remove the code from the original query.
   801  	nCo := ""
   802  	for _, co := range g.co {
   803  		re = regexp.MustCompile("(?i)^" + co.Country + ",?\\s|\\s" + co.Country + ",?\\s" + co.Country + "\\s$")
   804  		if re.MatchString(n) {
   805  			nCo = co.ISO
   806  			// And remove it so we have a cleaner query string for a city.
   807  			n = re.ReplaceAllString(n, "")
   808  		}
   809  	}
   810  
   811  	// Find US State codes and pull them out as well (do not convert state names, they can also easily be city names).
   812  	nSt := ""
   813  	for sc, _ := range UsSateCodes {
   814  		re = regexp.MustCompile("(?i)^" + sc + ",?\\s|\\s" + sc + ",?\\s|\\s" + sc + "$")
   815  		if re.MatchString(n) {
   816  			nSt = sc
   817  			// And remove it too.
   818  			n = re.ReplaceAllString(n, "")
   819  		}
   820  	}
   821  	// Trim spaces and commas off the modified string.
   822  	n = strings.Trim(n, " ,")
   823  
   824  	// Now extract words (potential city names) into a slice. With this, the index will be referenced to pinpoint sections of the g.c []GeobedCity slice to scan.
   825  	// This results in a much faster lookup. This is over a simple binary search with strings.Search() etc. because the city name may not be the first word.
   826  	// This should not contain any known country code or US state codes.
   827  	nSlice := strings.Split(n, " ")
   828  
   829  	return nCo, nSt, abbrevSlice, nSlice
   830  }
   831  
   832  // There's potentially 2.7 million items to range though, let's see if we can reduce that by taking slices of the slice in alphabetical order.
   833  func (g *GeoBed) getSearchRange(nSlice []string) []r {
   834  	// NOTE: A simple binary search was not helping here since we aren't looking for one specific thing. We have multiple elements, city, state, country.
   835  	// So we'd end up with multiple binary searches to piece together which could be quite a few exponentially given the possible combinations...And so it was slower.
   836  
   837  	ranges := []r{}
   838  	for _, ns := range nSlice {
   839  		ns = strings.TrimSuffix(ns, ",")
   840  
   841  		if len(ns) > 0 {
   842  			// Get the first character in the string, this tells us where to stop.
   843  			fc := toLower(string(ns[0]))
   844  			// Get the previous index key (by getting the previous character in the alphabet) to figure out where to start.
   845  			pik := string(prev(rune(fc[0])))
   846  
   847  			// To/from key
   848  			fk := 0
   849  			tk := 0
   850  			if val, ok := cityNameIdx[pik]; ok {
   851  				fk = val
   852  			}
   853  			if val, ok := cityNameIdx[fc]; ok {
   854  				tk = val
   855  			}
   856  			// Don't let the to key be out of range.
   857  			if tk == 0 {
   858  				tk = (len(g.c) - 1)
   859  			}
   860  			ranges = append(ranges, r{fk, tk})
   861  		}
   862  	}
   863  
   864  	return ranges
   865  }
   866  
   867  func prev(r rune) rune {
   868  	return r - 1
   869  }
   870  
   871  // Reverse geocode
   872  func (g *GeoBed) ReverseGeocode(lat float64, lng float64) GeobedCity {
   873  	c := GeobedCity{}
   874  
   875  	gh := geohash.Encode(lat, lng)
   876  	// This is produced with empty lat/lng values - don't look for anything.
   877  	if gh == "7zzzzzzzzzzz" {
   878  		return c
   879  	}
   880  
   881  	// Note: All geohashes are going to be 12 characters long. Even if the precision on the lat/lng isn't great. The geohash package will center things.
   882  	// Obviously lat/lng like 37, -122 is a guess. That's no where near the resolution of a city. Though we're going to allow guesses.
   883  	mostMatched := 0
   884  	matched := 0
   885  	for k, v := range g.c {
   886  		// check first two characters to reduce the number of loops
   887  		if v.Geohash[0] == gh[0] && v.Geohash[1] == gh[1] {
   888  			matched = 2
   889  			for i := 2; i <= len(gh); i++ {
   890  				//log.Println(gh[0:i])
   891  				if v.Geohash[0:i] == gh[0:i] {
   892  					matched++
   893  				}
   894  			}
   895  			// tie breakers go to city with larger population (NOTE: There's still a chance that the next pass will uncover a better match)
   896  			if matched == mostMatched && g.c[k].Population > c.Population {
   897  				c = g.c[k]
   898  				// log.Println("MATCHES")
   899  				// log.Println(matched)
   900  				// log.Println("CITY")
   901  				// log.Println(c.City)
   902  				// log.Println("POPULATION")
   903  				// log.Println(c.Population)
   904  			}
   905  			if matched > mostMatched {
   906  				c = g.c[k]
   907  				mostMatched = matched
   908  			}
   909  		}
   910  	}
   911  
   912  	return c
   913  }
   914  
   915  // A slightly faster lowercase function.
   916  func toLower(s string) string {
   917  	b := make([]byte, len(s))
   918  	for i := range b {
   919  		c := s[i]
   920  		if c >= 'A' && c <= 'Z' {
   921  			c += 'a' - 'A'
   922  		}
   923  		b[i] = c
   924  	}
   925  	return string(b)
   926  }
   927  
   928  // A slightly faster uppercase function.
   929  func toUpper(s string) string {
   930  	b := make([]byte, len(s))
   931  	for i := range b {
   932  		c := s[i]
   933  		if c >= 'a' && c <= 'z' {
   934  			c -= 'a' - 'A'
   935  		}
   936  		b[i] = c
   937  	}
   938  	return string(b)
   939  }
   940  
   941  // Dumps the Geobed data to disk. This speeds up startup time on subsequent runs (or if calling NewGeobed() multiple times which should be avoided if possible).
   942  // TODO: Refactor
   943  func (g GeoBed) store() error {
   944  	b := new(bytes.Buffer)
   945  
   946  	// Store the city info
   947  	enc := gob.NewEncoder(b)
   948  	err := enc.Encode(g.c)
   949  	if err != nil {
   950  		b.Reset()
   951  		return err
   952  	}
   953  
   954  	fh, eopen := os.OpenFile("./geobed-data/g.c.dmp", os.O_CREATE|os.O_WRONLY, 0666)
   955  	defer fh.Close()
   956  	if eopen != nil {
   957  		b.Reset()
   958  		return eopen
   959  	}
   960  	n, e := fh.Write(b.Bytes())
   961  	if e != nil {
   962  		b.Reset()
   963  		return e
   964  	}
   965  	log.Printf("%d bytes successfully written to cache file\n", n)
   966  
   967  	// Store the country info as well (this is all now repetition - refactor)
   968  	b.Reset()
   969  	//enc = gob.NewEncoder(b)
   970  	err = enc.Encode(g.co)
   971  	if err != nil {
   972  		b.Reset()
   973  		return err
   974  	}
   975  
   976  	fh, eopen = os.OpenFile("./geobed-data/g.co.dmp", os.O_CREATE|os.O_WRONLY, 0666)
   977  	defer fh.Close()
   978  	if eopen != nil {
   979  		b.Reset()
   980  		return eopen
   981  	}
   982  	n, e = fh.Write(b.Bytes())
   983  	if e != nil {
   984  		b.Reset()
   985  		return e
   986  	}
   987  	log.Printf("%d bytes successfully written to cache file\n", n)
   988  
   989  	// Store the index info (again there's some repetition here)
   990  	b.Reset()
   991  	//enc = gob.NewEncoder(b)
   992  	err = enc.Encode(cityNameIdx)
   993  	if err != nil {
   994  		b.Reset()
   995  		return err
   996  	}
   997  
   998  	fh, eopen = os.OpenFile("./geobed-data/cityNameIdx.dmp", os.O_CREATE|os.O_WRONLY, 0666)
   999  	defer fh.Close()
  1000  	if eopen != nil {
  1001  		b.Reset()
  1002  		return eopen
  1003  	}
  1004  	n, e = fh.Write(b.Bytes())
  1005  	if e != nil {
  1006  		b.Reset()
  1007  		return e
  1008  	}
  1009  	log.Printf("%d bytes successfully written to cache file\n", n)
  1010  
  1011  	b.Reset()
  1012  	return nil
  1013  }
  1014  
  1015  // Loads a GeobedCity dump, which saves a bit of time.
  1016  func loadGeobedCityData() ([]GeobedCity, error) {
  1017  	fh, err := os.Open("./geobed-data/g.c.dmp")
  1018  	if err != nil {
  1019  		return nil, err
  1020  	}
  1021  	gc := []GeobedCity{}
  1022  	dec := gob.NewDecoder(fh)
  1023  	err = dec.Decode(&gc)
  1024  	if err != nil {
  1025  		return nil, err
  1026  	}
  1027  	return gc, nil
  1028  }
  1029  
  1030  func loadGeobedCountryData() ([]CountryInfo, error) {
  1031  	fh, err := os.Open("./geobed-data/g.co.dmp")
  1032  	if err != nil {
  1033  		return nil, err
  1034  	}
  1035  	co := []CountryInfo{}
  1036  	dec := gob.NewDecoder(fh)
  1037  	err = dec.Decode(&co)
  1038  	if err != nil {
  1039  		return nil, err
  1040  	}
  1041  	return co, nil
  1042  }
  1043  
  1044  func loadGeobedCityNameIdx() error {
  1045  	fh, err := os.Open("./geobed-data/cityNameIdx.dmp")
  1046  	if err != nil {
  1047  		return err
  1048  	}
  1049  	dec := gob.NewDecoder(fh)
  1050  	cityNameIdx = make(map[string]int)
  1051  	err = dec.Decode(&cityNameIdx)
  1052  	if err != nil {
  1053  		return err
  1054  	}
  1055  	return nil
  1056  }